# Task 2

First we import all the libraries needed for both methods, and load the training, validation and test CSVs.

In [116]:
import numpy as np
import pandas as pd
from pathlib import Path
import time

# text preprocessing modules
import re
from nltk import word_tokenize
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

# naive bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

#LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import Sequential
from keras.utils import to_categorical


# Read the training, validation and test splits from the working directory
train_df, validation_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './Training-dataset.csv',
        './Task-2-validation-dataset.csv',
        './Task-2-test-dataset1.csv',
    )
)


Let's just do some analysis to look at the balance of our data

In [48]:
# Per-genre totals: how many synopses carry each label
# (columns from position 3 onwards are treated as the label columns)
label_count = train_df.iloc[:,3:].sum()
# Per-synopsis totals: how many labels are assigned to each synopsis
movie_label_count = train_df.iloc[:,3:].sum(axis=1)

# Count the synopses that have no label at all.
# Series.items() yields (index, value) pairs, so comparing each pair with 0
# can never match; compare the values themselves instead.
no_label_count = int((movie_label_count == 0).sum())

print("Total number of movies =",len(train_df))
print("Total number of movies without label =",no_label_count)
print("Total labels =",label_count.sum())
print(label_count)

Total number of movies = 8257
Total number of movies without label = 0
Total labels = 16193
comedy        1262
cult          1801
flashback     1994
historical     186
murder        4019
revenge       1657
romantic      2006
scifi          204
violence      3064
dtype: int64


# Preprocessing

The same function will be used by both LSTM & Naive Bayes

In [49]:
# Lazily-filled cache for the stop-word set and lemmatizer, so they are
# built once on first use instead of once per document.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the cached (stop_words, lemmatizer) pair, creating it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalise a raw text string into a space-joined string of lemmatized tokens.

    Steps, in order:
      1. replace every non-alphabetic character with a space and lowercase,
      2. tokenise with ``word_tokenize``,
      3. drop English stop words,
      4. lemmatize each remaining token with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    stop_words, lemmatizer = _preprocess_resources()

    # Remove non-alphabetic characters and convert to lowercase
    cleaned = re.sub('[^a-zA-Z]', ' ', text).lower()
    # Stop words are filtered before lemmatizing, as in the original pipeline
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(cleaned)
        if word not in stop_words
    ]
    return ' '.join(tokens)

In [50]:
# Merge title and synopsis into one field, discard the raw columns,
# then store the cleaned text as the last column
train_raw_text = train_df['title'] + ' ' + train_df['plot_synopsis']
train_df = train_df.drop(columns=['title','plot_synopsis'])
train_df['text'] = train_raw_text.apply(preprocess_text)

In [117]:
# Merge title and synopsis into one field, discard the raw columns,
# then store the cleaned text as the last column
validation_raw_text = validation_df['title'] + ' ' + validation_df['plot_synopsis']
validation_df = validation_df.drop(columns=['title','plot_synopsis'])
validation_df['text'] = validation_raw_text.apply(preprocess_text)

In [118]:
# Merge title and synopsis into one field, discard the raw columns,
# then store the cleaned text as the last column
test_raw_text = test_df['title'] + ' ' + test_df['plot_synopsis']
test_df = test_df.drop(columns=['title','plot_synopsis'])
test_df['text'] = test_raw_text.apply(preprocess_text)

In [78]:
# Genre label names: the columns at positions 1..9 of the training frame
genres = train_df.columns[1:10].tolist()
print(genres)

['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']


# Method (a) - Naive Bayes
First, let's create a bag-of-words (BoW) vectorizer and use it to prepare the input and output data for the training, validation and test sets.

In [58]:
# Fit the bag-of-words vocabulary on the training text only, then reuse it
# to encode the validation and test text
countvectorizer = CountVectorizer(min_df=2, max_df=0.8)
X_train_nb = countvectorizer.fit_transform(train_df['text'])
X_validation_nb = countvectorizer.transform(validation_df['text'])
X_test_nb = countvectorizer.transform(test_df['text'])
for feature_matrix in (X_train_nb, X_validation_nb, X_test_nb):
    print(feature_matrix.shape)

# One binary target vector per genre, in the same order as `genres`
Y_train_nb = [train_df[genre_name].to_numpy() for genre_name in genres]
Y_validation_nb = [validation_df[genre_name].to_numpy() for genre_name in genres]
print(Y_train_nb[0].shape)
print(Y_validation_nb[0].shape)

(8257, 42514)
(1188, 42514)
(1200, 42514)
(8257,)
(1188,)


Now make a Naive Bayes model for each genre

In [75]:
# One independent binary MultinomialNB classifier is trained per genre.
models_nb = []
predictions_val_nb = []
predictions_test_nb = []

# Accumulated durations (seconds) summed over all genre models.
# perf_counter is used because each interval is only a few milliseconds;
# it is a monotonic, high-resolution clock intended for measuring short durations.
train_time = 0
classify_time = 0
test_time = 0
for genre, genre_labels in zip(genres, Y_train_nb):
    print('Training & predicting with ' + genre + ' model . . .')

    # Train the classifier
    genre_model = MultinomialNB()
    start = time.perf_counter()
    genre_model.fit(X_train_nb, genre_labels)
    end = time.perf_counter()
    train_time += (end-start)

    # Predict validation data
    start = time.perf_counter()
    pred_val = genre_model.predict(X_validation_nb)
    end = time.perf_counter()
    classify_time += (end-start)

    # Predict test data
    start = time.perf_counter()
    pred_test = genre_model.predict(X_test_nb)
    end = time.perf_counter()
    test_time += (end-start)

    models_nb.append(genre_model)
    predictions_val_nb.append(pred_val)
    predictions_test_nb.append(pred_test)

print(train_time)
print(classify_time)
print(test_time)


Training & predicting with comedy model . . .
Training & predicting with cult model . . .
Training & predicting with flashback model . . .
Training & predicting with historical model . . .
Training & predicting with murder model . . .
Training & predicting with revenge model . . .
Training & predicting with romantic model . . .
Training & predicting with scifi model . . .
Training & predicting with violence model . . .
0.12770509719848633
0.024897098541259766
0.024585485458374023


Get the predictions for both validation and test dfs and do some final data manipulation so it's output in the correct format!

In [71]:
# The per-genre prediction lists are (n_genres, n_rows); transpose so each row
# is one movie, then prepend the ID column.
predictions_val_nb = np.array(predictions_val_nb).T
id_val_nb = validation_df['ID'].to_numpy().reshape(len(predictions_val_nb), 1)
all_val_nb = np.concatenate((id_val_nb, predictions_val_nb), axis=1)
val_nb_df = pd.DataFrame(all_val_nb)

predictions_test_nb = np.array(predictions_test_nb).T
id_test_nb = test_df['ID'].to_numpy().reshape(len(predictions_test_nb), 1)
all_test_nb = np.concatenate((id_test_nb, predictions_test_nb), axis=1)
test_nb_df = pd.DataFrame(all_test_nb)

In [72]:
# Write the validation predictions first, then the test predictions.
# Each frame is echoed and saved with no header row and no index column.
for output_df, output_name in (
    (val_nb_df, './10861383-Task2-method-a-validation.csv'),
    (test_nb_df, './10861383-Task2-method-a.csv'),
):
    filepath = Path(output_name)
    filepath.parent.mkdir(parents=True, exist_ok=True)
    print(output_df)
    output_df.to_csv(filepath, index=False, header=False)

                                         0  1  2  3  4  5  6  7  8  9
0     cf32cb00-172d-40f2-a3c1-936e8a0d89d7  0  0  0  0  0  0  1  0  0
1     df7e125e-2d59-40e4-a126-9397e3a0ef21  0  0  0  0  1  0  0  0  1
2     49bc73f3-9179-41cd-9774-905c7a3ac91b  0  0  1  0  0  0  1  0  0
3     0ed4822b-87af-44bc-a677-7f7abfdaccf3  0  0  0  0  0  1  1  0  0
4     0b1b0fa4-43bc-41ba-9598-b3401894b96d  1  0  0  0  1  1  0  0  0
...                                    ... .. .. .. .. .. .. .. .. ..
1183  d32be875-41c7-4e84-ac04-e1d3bc3df0fe  0  0  1  0  0  0  1  0  0
1184  84e025dd-4b4e-403c-a3dd-34818b210857  0  0  0  0  1  0  0  0  0
1185  3d291d3b-c0b5-47cc-8dc8-127dc93162e3  0  1  0  0  0  0  0  0  1
1186  6c9b3034-56b3-42f6-874e-a821c9fd1a89  0  0  1  0  0  0  1  0  0
1187  fbd1d334-e979-465c-9fb0-e173d2642630  0  1  1  0  0  1  0  0  1

[1188 rows x 10 columns]
                                         0  1  2  3  4  5  6  7  8  9
0     9484ac61-0e30-4799-9998-6f74f4cbb204  0  1  1  0  0  0  0 

# Method (b) - LSTM


Prepare the texts and labels for the train, validation and test sets

In [81]:
# Plain Python lists of the cleaned text for each split
texts_train = train_df['text'].tolist()
texts_validation = validation_df['text'].tolist()
texts_test = test_df['text'].tolist()
print(len(texts_train), len(texts_validation), len(texts_test))

# Multi-hot label matrices of shape (n_rows, n_genres): one column per genre,
# in the order given by `genres`
labels_train = np.column_stack([train_df[genre_name].to_numpy() for genre_name in genres])
labels_validation = np.column_stack([validation_df[genre_name].to_numpy() for genre_name in genres])
print(labels_train.shape)
print(labels_validation.shape)


8257 1188 1200
(8257, 9)
(1188, 9)


Set the hyperparameters

In [122]:
# Hyperparameters

# Vocabulary size cap passed to the tokenizer and the embedding layer
max_words = 10000
# Sequence length (in tokens) that every text is padded/truncated to
max_len = 200
# Dimension of the word embeddings
embedding_dim = 100
# Number of units in each LSTM layer
lstm_units = 64
# One output unit per genre
num_classes = len(genres)
# Training schedule
epochs = 10
batch_size = 32

In [123]:
# Build the vocabulary from the training texts only, then map every split
# to sequences of integer word indices
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts_train)
train_sequences, val_sequences, test_sequences = (
    tokenizer.texts_to_sequences(corpus)
    for corpus in (texts_train, texts_validation, texts_test)
)


In [124]:
# Bring every sequence to exactly max_len tokens so the inputs form a rectangular array
X_train, X_val, X_test = (
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (train_sequences, val_sequences, test_sequences)
)
for padded in (X_train, X_val, X_test):
    print(padded.shape)

# Targets are the multi-hot genre matrices prepared earlier
Y_train, Y_val = labels_train, labels_validation
for targets in (Y_train, Y_val):
    print(targets.shape)


(8257, 200)
(1188, 200)
(1200, 200)
(8257, 9)
(1188, 9)


In [125]:
# Build the model: embedding -> two stacked LSTMs (each followed by dropout)
# -> one sigmoid unit per genre, so each genre is scored independently
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_len),
    # return_sequences=True so the second LSTM receives the full sequence
    LSTM(lstm_units, return_sequences=True),
    Dropout(0.5),
    LSTM(lstm_units),
    Dropout(0.5),
    Dense(num_classes, activation='sigmoid'),
])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 100)          1000000   
                                                                 
 lstm_2 (LSTM)               (None, 200, 64)           42240     
                                                                 
 dropout_2 (Dropout)         (None, 200, 64)           0         
                                                                 
 lstm_3 (LSTM)               (None, 64)                33024     
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 9)                 585       
                                                                 
Total params: 1075849 (4.10 MB)
Trainable params: 1075

In [126]:
# Compile the model: binary cross-entropy matches the per-genre sigmoid outputs
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
)

In [127]:
# Train the model and report the elapsed wall time in seconds.
# `epochs` and `batch_size` both come from the hyperparameter cell, so changing
# them there actually changes the training run.
start = time.time()
model.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    batch_size=batch_size,
    epochs=epochs,
)
end = time.time()
print(end-start)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
255.8164303302765


Let's predict on the validation and test sets

In [128]:
# Score the validation set and report how long it took (seconds)
start = time.time()
pred_val = model.predict(X_val)
end = time.time()
val_predict_seconds = end - start
print(val_predict_seconds)

# Score the test set and report how long it took (seconds)
start = time.time()
pred_test = model.predict(X_test)
end = time.time()
test_predict_seconds = end - start
print(test_predict_seconds)

4.366817474365234
1.6558811664581299


In [129]:
# Cut-off applied to every sigmoid output: scores >= threshold become 1, others 0.
# NOTE(review): 0.1 is well below 0.5, presumably to favour recall — confirm the choice.
threshold = 0.1

# A single vectorised comparison binarises the whole matrix at once; no
# per-genre loop is needed because the same cut-off applies to every column.
pred_val = (pred_val >= threshold).astype(int)
pred_test = (pred_test >= threshold).astype(int)
print(pred_val.shape)
print(pred_test.shape)

# Prepend the ID column so each output row is: ID followed by one 0/1 flag per genre
id_val = validation_df['ID'].to_numpy().reshape(pred_val.shape[0], 1)
id_test = test_df['ID'].to_numpy().reshape(pred_test.shape[0], 1)
all_val = np.hstack((id_val, pred_val))
all_test = np.hstack((id_test, pred_test))

(1188, 9)
(1200, 9)


In [130]:
# Wrap the ID + prediction arrays in frames, then save each one with
# no header row and no index column
val_df_output = pd.DataFrame(all_val)
test_df_output = pd.DataFrame(all_test)

for output_df, output_name in (
    (val_df_output, './10861383-Task2-method-b-validation.csv'),
    (test_df_output, './10861383-Task2-method-b.csv'),
):
    filepath = Path(output_name)
    filepath.parent.mkdir(parents=True, exist_ok=True)
    output_df.to_csv(filepath, index=False, header=False)

In [None]:
# Sanity check: show the model's output shape.
# `output_shape` replaces the legacy node-index lookup `get_output_shape_at(0)`.
model.output_shape