In [507]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# text preprocessing modules
import re
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import WordNetLemmatizer
from nltk import PorterStemmer
from nltk.corpus import stopwords

#LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score


# Read the training split and the Task-2 validation split from the working directory
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./Training-dataset.csv', './Task-2-validation-dataset.csv')
)


In [508]:
# Stop-word set and lemmatizer are created lazily on first use and then reused,
# instead of being rebuilt for every single document.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it on first call."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise a raw text into a space-separated string of lemmatized tokens.

    Steps: replace every non-ASCII-letter character with a space, lowercase,
    tokenise with ``word_tokenize``, drop English stop words, lemmatize each
    remaining token with ``WordNetLemmatizer`` and join the tokens with single
    spaces.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or the float NaN that
        pandas produces for a missing cell) is treated as an empty document.

    Returns
    -------
    str
        The cleaned text; ``''`` when nothing alphabetic is left.
    """
    # A missing value would otherwise make re.sub raise a TypeError.
    if not isinstance(text, str):
        return ''

    # Keep letters only and lowercase
    cleaned = re.sub('[^a-zA-Z]', ' ', text).lower()

    # Nothing to tokenise: same result as running the full pipeline on blanks
    if not cleaned.strip():
        return ''

    stop_words, lemmatizer = _get_preprocess_resources()

    # Remove stop words first, then lemmatize what is left
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(cleaned)
        if word not in stop_words
    ]
    return ' '.join(tokens)

In [509]:
# Merge title and synopsis into one document per row, then replace the two
# source columns with a single preprocessed 'text' column (appended last).
train_text_raw = train_df['title'] + ' ' + train_df['plot_synopsis']
train_df = train_df.drop(columns=['title', 'plot_synopsis'])
train_df['text'] = train_text_raw.apply(preprocess_text)

In [510]:
# Collect the distinct tokens of the preprocessed training texts.
# One set is grown in place; rebuilding the whole vocabulary list/set after
# every document (as before) is quadratic in the size of the corpus.
vocab_set = set()
for text in train_df['text']:
    # split() without an argument never yields '' tokens, so an empty
    # document does not add an empty string to the vocabulary.
    vocab_set.update(text.split())

# Sorted so that the list order is reproducible between runs
vocab = sorted(vocab_set)
vocab_length = len(vocab)

# Show a small sample only; printing the full vocabulary floods the output
print(vocab[:20])
print(vocab_length)

81091


In [511]:
# Same treatment as the training frame: merge title and synopsis, drop the
# two source columns and keep a single preprocessed 'text' column (appended last).
test_text_raw = test_df['title'] + ' ' + test_df['plot_synopsis']
test_df = test_df.drop(columns=['title', 'plot_synopsis'])
test_df['text'] = test_text_raw.apply(preprocess_text)

In [512]:
# Plain Python lists of the preprocessed documents for each split
texts_train, texts_test = list(train_df['text']), list(test_df['text'])
print(len(texts_train), len(texts_test))

# Genre names are taken by position: columns 1..9 of the training frame
genres = list(train_df.columns[1:10])
print(genres)

8257 1188
['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']


In [513]:
# Multi-label target matrices: one row per sample, one column per genre,
# in the order given by `genres`.
# Selecting the genre columns directly already yields that layout. The former
# approach stacked the columns genre-major and then applied rot90 followed by
# flip(axis=0), which together are nothing more than a transpose.
labels_train = train_df[genres].to_numpy()
labels_test = test_df[genres].to_numpy()
print(labels_train.shape)
print(labels_test.shape)


(8257, 9)
(1188, 9)


In [547]:
def max_length(x):
    """Return the number of whitespace-separated tokens in the string ``x``."""
    return len(x.split())

# Hyperparameters

# -- text representation
max_words = 10000       # vocabulary size limit (number of distinct words kept)
# Alternative: max_len = max(train_df['text'].apply(max_length))
max_len = 200           # number of tokens kept per text
embedding_dim = 100     # size of each word-embedding vector

# -- network
lstm_units = 64         # units in each LSTM layer
num_classes = len(genres)   # one output per genre

# -- training schedule
epochs, batch_size = 10, 32

In [548]:
# Build the word index from the training texts only, then encode both splits
# with that same index.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts_train)
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (texts_train, texts_test)
)

In [549]:
# Bring every encoded text to a common length of max_len
X_train, X_test = (
    pad_sequences(encoded, maxlen=max_len)
    for encoded in (train_sequences, test_sequences)
)

# Targets are the multi-label genre matrices as they are (no one-hot encoding step)
Y_train, Y_test = labels_train, labels_test

# Report the shapes: inputs first, then targets
for split_array in (X_train, X_test, Y_train, Y_test):
    print(split_array.shape)


(8257, 800)
(1188, 800)
(8257, 9)
(1188, 9)


In [550]:
# Network: embedding -> two stacked LSTM layers (dropout after each) -> one
# sigmoid output per genre.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_len),
    # return_sequences=True so the second LSTM receives the full sequence
    LSTM(lstm_units, return_sequences=True),
    Dropout(0.5),
    LSTM(lstm_units),
    Dropout(0.5),
    Dense(num_classes, activation='sigmoid'),
])

model.summary()

Model: "sequential_42"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_42 (Embedding)    (None, 800, 400)          8000000   
                                                                 
 lstm_61 (LSTM)              (None, 800, 64)           119040    
                                                                 
 dropout_38 (Dropout)        (None, 800, 64)           0         
                                                                 
 lstm_62 (LSTM)              (None, 64)                33024     
                                                                 
 dropout_39 (Dropout)        (None, 64)                0         
                                                                 
 dense_41 (Dense)            (None, 9)                 585       
                                                                 
Total params: 8152649 (31.10 MB)
Trainable params: 81

In [551]:
# Configure training: Adam optimiser with binary cross-entropy loss
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
)

In [552]:

# Fit on the training split; (X_test, Y_test) is passed as validation data
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, Y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x15f8d03f010>

In [553]:
# Per-genre scores predicted by the model for the held-out inputs
predictions = model.predict(X_test)
thresholds=[0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

# One (threshold, precision, recall, f1) row per candidate threshold
threshold_scores = []
for val in thresholds:
    # Binarise with a single comparison instead of two in-place mask writes
    # whose correctness depended on the order in which they were applied.
    pred = (predictions >= val).astype(int)

    # zero_division=0: a threshold that yields no positive prediction scores 0
    # without emitting an undefined-metric warning.
    precision = precision_score(Y_test, pred, average='micro', zero_division=0)
    recall = recall_score(Y_test, pred, average='micro', zero_division=0)
    f1 = f1_score(Y_test, pred, average='micro', zero_division=0)

    threshold_scores.append((val, precision, recall, f1))

# Each best_* is a [score, threshold] pair. max() returns the first maximal
# row, so ties go to the lowest threshold (as with the former strict '>'
# comparison), and a threshold is always selected - even when every score is
# 0 - so the formatted prints below can no longer fail on a None threshold.
best_precision = max(([row[1], row[0]] for row in threshold_scores), key=lambda pair: pair[0])
best_recall = max(([row[2], row[0]] for row in threshold_scores), key=lambda pair: pair[0])
best_f1 = max(([row[3], row[0]] for row in threshold_scores), key=lambda pair: pair[0])

print()
print("For parameters:\nmax_words: {},\nmax_len {},\nembedding_dim {},\nlstm_units {},\nbatch_size {},\nepochs {}".format(max_words,max_len,embedding_dim,lstm_units,batch_size,epochs))
print()
print("Best precision score: {:.4f}, found at threshold {:.2f}".format(best_precision[0], best_precision[1]))
print("Best recall score: {:.4f}, found at threshold {:.2f}".format(best_recall[0], best_recall[1]))
print("Best f1 score: {:.4f}, found at threshold {:.2f}".format(best_f1[0], best_f1[1]))



For parameters:
max_words: 20000,
max_len 800,
embedding_dim 400,
lstm_units 64,
batch_size 32,
epochs 10

Best precision score: 0.5377, found at threshold 0.90
Best recall score: 0.6755, found at threshold 0.01
Best f1 score: 0.4580, found at threshold 0.04


In [554]:
# Binarise the predicted scores at a fixed cut-off of 0.1 (1 = genre assigned)
pred = (predictions >= 0.1).astype(int)
print(pred.shape)

# Put the sample IDs in a leading column next to the 0/1 predictions.
# NOTE: `id` and `all` shadow Python builtins; the names are kept because a
# later cell reads `all`.
id = test_df['ID'].to_numpy().reshape(-1, 1)
all = np.hstack((id, pred))
all

(1188, 9)


array([['cf32cb00-172d-40f2-a3c1-936e8a0d89d7', 0, 0, ..., 1, 0, 0],
       ['df7e125e-2d59-40e4-a126-9397e3a0ef21', 0, 0, ..., 0, 0, 0],
       ['49bc73f3-9179-41cd-9774-905c7a3ac91b', 0, 0, ..., 0, 0, 1],
       ...,
       ['3d291d3b-c0b5-47cc-8dc8-127dc93162e3', 0, 0, ..., 0, 0, 0],
       ['6c9b3034-56b3-42f6-874e-a821c9fd1a89', 1, 1, ..., 0, 0, 0],
       ['fbd1d334-e979-465c-9fb0-e173d2642630', 0, 1, ..., 0, 0, 0]],
      dtype=object)

In [555]:
# Wrap the ID + prediction matrix in a DataFrame (columns get default integer labels)
output_df = pd.DataFrame(data=all)
output_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,cf32cb00-172d-40f2-a3c1-936e8a0d89d7,0,0,0,0,0,0,1,0,0
1,df7e125e-2d59-40e4-a126-9397e3a0ef21,0,0,0,0,1,0,0,0,0
2,49bc73f3-9179-41cd-9774-905c7a3ac91b,0,0,0,0,1,0,0,0,1
3,0ed4822b-87af-44bc-a677-7f7abfdaccf3,0,0,0,0,1,0,0,0,0
4,0b1b0fa4-43bc-41ba-9598-b3401894b96d,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
1183,d32be875-41c7-4e84-ac04-e1d3bc3df0fe,0,0,0,0,1,0,0,0,1
1184,84e025dd-4b4e-403c-a3dd-34818b210857,0,0,0,0,1,0,0,0,0
1185,3d291d3b-c0b5-47cc-8dc8-127dc93162e3,0,0,0,0,1,0,0,0,0
1186,6c9b3034-56b3-42f6-874e-a821c9fd1a89,1,1,1,0,0,0,0,0,0


In [556]:
from pathlib import Path

# Destination file for the predictions; create its folder if it is missing
filepath = Path('./10861383-Task2-method-b.csv')
filepath.parent.mkdir(exist_ok=True, parents=True)

print(output_df)

# Written without the index column and without a header row
output_df.to_csv(filepath, header=False, index=False)

                                         0  1  2  3  4  5  6  7  8  9
0     cf32cb00-172d-40f2-a3c1-936e8a0d89d7  0  0  0  0  0  0  1  0  0
1     df7e125e-2d59-40e4-a126-9397e3a0ef21  0  0  0  0  1  0  0  0  0
2     49bc73f3-9179-41cd-9774-905c7a3ac91b  0  0  0  0  1  0  0  0  1
3     0ed4822b-87af-44bc-a677-7f7abfdaccf3  0  0  0  0  1  0  0  0  0
4     0b1b0fa4-43bc-41ba-9598-b3401894b96d  0  0  0  0  1  0  0  0  0
...                                    ... .. .. .. .. .. .. .. .. ..
1183  d32be875-41c7-4e84-ac04-e1d3bc3df0fe  0  0  0  0  1  0  0  0  1
1184  84e025dd-4b4e-403c-a3dd-34818b210857  0  0  0  0  1  0  0  0  0
1185  3d291d3b-c0b5-47cc-8dc8-127dc93162e3  0  0  0  0  1  0  0  0  0
1186  6c9b3034-56b3-42f6-874e-a821c9fd1a89  1  1  1  0  0  0  0  0  0
1187  fbd1d334-e979-465c-9fb0-e173d2642630  0  1  1  0  1  1  0  0  0

[1188 rows x 10 columns]
