# Load Libraries

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pickle



# Change the directory

In [2]:
import os
os.chdir("../../../")
# from google.colab import drive
# drive.mount('/content/drive/')
# %cd drive/MyDrive/poleval_emotion/

# Determine constants

In [3]:
TRAIN_DIR = 'data/train/'
IN_FILENAME = 'in_prep_gpt.tsv'
EXPECTED_FILENAME = 'expected.tsv'

In [4]:
MODEL_DIR = 'models/'
TOKENIZERS_DIR = 'models/tokenizers/'
OUTPUT_MODEL_FILENAME = 'lstm_prep_gpt'

# Load & preprocess data functions

In [5]:
def load_data(file_path):
    data = pd.read_csv(file_path, sep='\t')
    return data

In [6]:
def preprocess_data(text_series, num_words=5000):
    text_series = text_series.astype(str)
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(text_series)
    sequences = tokenizer.texts_to_sequences(text_series)
    max_len = max(len(x) for x in sequences)
    padded_sequences = pad_sequences(sequences, maxlen=max_len)
    return padded_sequences, tokenizer

# Define the LSTM model

In [7]:
def create_model(input_length):
    model = Sequential([
        Embedding(input_dim=5000, output_dim=128, input_length=input_length),
        LSTM(64),
        Dense(11, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Load & preprocess the data

In [8]:
in_data = load_data(TRAIN_DIR + IN_FILENAME)
expected_data = load_data(TRAIN_DIR + EXPECTED_FILENAME)

In [9]:
X, tokenizer = preprocess_data(in_data['text'])
y = expected_data.values

# Main operation

In [10]:
input_length = X.shape[1]
model = create_model(input_length)



In [11]:
model.fit(X, y, batch_size=32, epochs=32)

Epoch 1/32
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 189ms/step - accuracy: 0.0746 - loss: 0.5384
Epoch 2/32
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 199ms/step - accuracy: 0.0681 - loss: 0.4087
Epoch 3/32
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 196ms/step - accuracy: 0.0993 - loss: 0.3318
Epoch 4/32
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 189ms/step - accuracy: 0.1151 - loss: 0.2911
Epoch 5/32
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 186ms/step - accuracy: 0.1325 - loss: 0.2664
Epoch 6/32
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 197ms/step - accuracy: 0.1686 - loss: 0.2292
Epoch 7/32
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 188ms/step - accuracy: 0.1852 - loss: 0.2069
Epoch 8/32
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 199ms/step - accuracy: 0.1772 - loss: 0.1872
Epoch 9/32
[1m2

<keras.src.callbacks.history.History at 0x1e26a6a7130>

# Save model

In [12]:
model.save(MODEL_DIR + OUTPUT_MODEL_FILENAME + '.h5')



# Tokenizer save

In [13]:
with open(TOKENIZERS_DIR + OUTPUT_MODEL_FILENAME + '.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)