In [1]:
import pandas as pd

In [2]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()

def pre_process(X):
    X = X.str.split('|').str.join('')

    tokenizer.fit_on_texts(X)
    X_sequences = tokenizer.texts_to_sequences(X)

    return X_sequences

In [3]:
train = pd.read_csv('datasets/train.csv')
dev = pd.read_csv('datasets/dev.csv')
test = pd.read_csv('datasets/test.csv')

#get only 3% of the data
train = train.sample(frac=0.01)
dev = dev.sample(frac=0.01)
test = test.sample(frac=0.01)

train.shape, dev.shape, test.shape

((354, 2), (45, 2), (73, 2))

In [4]:
X_train, y_train = train['triple'], train['sentence']
X_dev, y_dev = dev['triple'], dev['sentence']
X_test, y_test = test['triple'], test['sentence']

In [5]:
#add X_dev to X_train
X_train = X_train.append(X_dev)
y_train = y_train.append(y_dev)

  X_train = X_train.append(X_dev)
  y_train = y_train.append(y_dev)


In [6]:
X_train = pre_process(X_train)
X_test = pre_process(X_test)
y_train = pre_process(y_train)
y_test = pre_process(y_test)

In [7]:
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
print('Found %s unique tokens.' % len(word_index))

Found 3029 unique tokens.


In [8]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

max_len = 100

X_train = pad_sequences(X_train, maxlen = max_len)
X_test = pad_sequences(X_test, maxlen = max_len)
y_train = pad_sequences(y_train, maxlen = max_len)
y_test = pad_sequences(y_test, maxlen = max_len)

y_train_one_hot = to_categorical(y_train, num_classes=vocab_size)
y_test_one_hot = to_categorical(y_test, num_classes=vocab_size)

X_train.shape

(399, 100)

## Modello LSTM

In [9]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM

model = Sequential()

model.add(Embedding(vocab_size, 50, input_length=max_len))
model.add(LSTM(32 ,return_sequences=True))
model.add(Dense(vocab_size, activation='softmax'))  # Softmax per la classificazione delle parole

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 50)           151500    
                                                                 
 lstm (LSTM)                 (None, 100, 32)           10624     
                                                                 
 dense (Dense)               (None, 100, 3030)         99990     
                                                                 
Total params: 262114 (1023.88 KB)
Trainable params: 262114 (1023.88 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [12]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.fit(X_train, y_train_one_hot, batch_size=4, validation_split=0.2, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1e7f454e890>

In [33]:
model.evaluate(X_test, y_test_one_hot)



[1.8067818880081177, 0.7767123579978943]