In [None]:
import pandas as pd

In [None]:
# Load the raw movie metadata; later cells read its "overview", "title" and "genres" columns.
data_raw = pd.read_csv("data/movies_metadata.csv")

In [None]:
# Keep only the columns that are needed and drop every row without an overview.
data = data_raw[["overview", "title", "genres"]].dropna(subset=["overview"])
# Boolean mask that is True wherever the genre entry is not the empty list "[]".
has_genres_mask = data["genres"].ne("[]")
# Genre entries of the rows selected by the mask.
genres = data.loc[has_genres_mask, "genres"]

In [None]:
"""
`ast.literal_eval` lets us evaluate the list literal that each genre entry stores as a string.
In other words, it turns a string containing list syntax into an actual Python list.
"""
import ast

def make_labels(strings):
    """Return the genre names contained in one stringified genre entry.

    Args:
        strings: A string holding a Python list literal of dicts, each of
            which has a 'name' key.

    Returns:
        A list with the 'name' value of every dict, in the original order.
    """
    names = []
    for genre in ast.literal_eval(strings):
        names.append(genre['name'])
    return names

# Parse every selected genre entry into a list of genre names (one list per row).
genres_list = genres.apply(make_labels)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
"""
The MultiLabelBinarizer looks through every label list it is given and collects the set of unique labels.
"""
# Create the binarizer and learn the label set from the genre lists in one step
# (fit returns the fitted estimator itself).
labeler = MultiLabelBinarizer().fit(genres_list)

In [None]:
from joblib import dump

# Persist the fitted binarizer to disk with joblib.
dump(labeler, "model/class_labler.joblib")
# Show the labels the binarizer learned during fit.
print(labeler.classes_)


In [None]:
# For the next steps it is really important that both text inputs are of string type.
# NOTE(review): a missing title would be stringified rather than dropped — confirm that is intended.
pre_X1 = data.loc[has_genres_mask, 'title'].astype(str)
pre_X2 = data.loc[has_genres_mask, 'overview'].astype(str)

In [None]:
"""
Transforming the genre lists with the fitted labeler gives a binary indicator row for every entry.
This can be used directly as the target in the training.
"""
# Encode each genre list against the labels learned by the fitted binarizer.
y = labeler.transform(genres_list)

In [None]:
"""
Check the size of all entries: equal lengths indicate that the labels and both text inputs are aligned correctly.
"""
# Report the number of samples in the labels and in both text inputs.
print(len(y))
print(len(pre_X1))
print(len(pre_X2))
# Fail fast instead of relying on a visual comparison of the printed numbers:
# the labels, titles and overviews must have the same number of rows.
assert len(y) == len(pre_X1) == len(pre_X2), "labels and text inputs have different lengths"

In [None]:
from tensorflow import keras
import tensorflow as tf

In [None]:

# The maximum number of words to be used.
MAX_NB_WORDS = 50000
EMBEDDING_DIM = 100 # This is a fixed value, in this case
# Characters stripped from the texts before tokenising. The backslash is written
# as "\\" because "\]" is not a valid escape sequence; the resulting filter
# string is unchanged.
tokenizer = keras.preprocessing.text.Tokenizer(
    num_words=MAX_NB_WORDS,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
    lower=True,
)
# Fit on title and overview together, joined by a space. A plain
# `pre_X1 + pre_X2` glues each title directly onto its overview, so the last
# word of the title and the first word of the overview merge into one bogus
# token and neither real word is counted.
tokenizer.fit_on_texts(pre_X1 + " " + pre_X2)
word_index = tokenizer.word_index

# Persist the fitted tokenizer to disk with joblib.
dump(tokenizer, "model/tokenizer.joblib")


In [None]:
import numpy as np
def max_len(array_):
    """Return the length of the longest item in ``array_``.

    Args:
        array_: An iterable of sized items (anything that supports ``len``).

    Returns:
        The largest ``len(item)`` found, or 0 when ``array_`` is empty.
    """
    # default=0 keeps the empty-input result that the manual loop produced.
    return max((len(item) for item in array_), default=0)
# Turn the titles and the overviews into sequences of word indices.
title_sequences = tokenizer.texts_to_sequences(pre_X1)
overview_sequences = tokenizer.texts_to_sequences(pre_X2)

# Length of the longest sequence of each input.
X1_max_len = max_len(title_sequences)
X2_max_len = max_len(overview_sequences)

# Pad every sequence of an input to the length of that input's longest sequence.
pad_sequences = keras.preprocessing.sequence.pad_sequences
X1 = pad_sequences(title_sequences, maxlen=X1_max_len)
X2 = pad_sequences(overview_sequences, maxlen=X2_max_len)

In [None]:
"""
This section was originally meant to split the data into training and validation sets.
That was scrapped in favor of letting the fit function
deal with the validation split.

The split is still kept because its test part is used as the dataset for the evaluation.
"""
from sklearn.model_selection import train_test_split

# Split all three arrays with a single call. With shuffle=False every array is
# cut at the same position, so the rows stay aligned across titles, overviews
# and labels.
X1_train, X1_test, X2_train, X2_test, Y_train, Y_test = train_test_split(
    X1, X2, y, shuffle=False
)

# print(X1_train.shape,Y_train.shape)
# print(X1_test.shape,Y_test.shape)

In [None]:
"""
load the classifier we constructed
"""
from model import GenreClassifier

# Build the classifier.
# NOTE(review): the arguments look like (number of genre classes, vocabulary size,
# longest title length, longest overview length) — confirm against model.GenreClassifier.
model = GenreClassifier(len(labeler.classes_), MAX_NB_WORDS, X1_max_len, X2_max_len)


In [None]:
"""
train the classifier
the specified batch size requires lots of memory, consider reducing the value on first run
"""
EPOCHS = 33
BATCH_SIZE = 1024
# Fit on the training split only. Fitting on the full X1 / X2 / y would let the
# model see the rows of X1_test / X2_test / Y_test during fitting, so the
# evaluation on those rows further down would not be a held-out evaluation.
# validation_split still lets fit handle the validation data, now taken from
# the training split.
history = model.fit(X1_train, X2_train, Y_train,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    validation_split=0.2,
                    shuffle=True)

In [None]:
# Save the trained model to disk.
# NOTE(review): the "33" in the file name presumably mirrors EPOCHS — keep them in sync by hand if EPOCHS changes.
model.save("model/simple_text_classifier_33.h5")

In [None]:
from matplotlib import pyplot as plt
# Draw the training curve first and the validation curve second, so the
# legend entries below match the plotted lines.
# NOTE(review): the 'acc' / 'val_acc' keys depend on the metric name the model
# was compiled with — confirm they exist in history.history.
for curve_key in ('acc', 'val_acc'):
    plt.plot(history.history[curve_key])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Run the model on the test slices produced by train_test_split (titles first, overviews second).
predictions = model.predict(X1_test, X2_test)

In [None]:
# Print the shape of the predictions and of the test labels, one per line, for comparison.
print(predictions.shape, Y_test.shape, sep="\n")

In [None]:
"""
The confusion matrix below shows how the predictions compare to the ground truth.
Each row is first reduced to a single class index with argmax.
The labels are omitted from this plot to avoid clutter.
"""
from sklearn import metrics
import matplotlib.pyplot as plt

# Reduce every row to the index of its largest entry so that a single-label
# confusion matrix can be computed.
# NOTE(review): for rows with several genres this keeps only one of them —
# confirm that this simplification is acceptable for the evaluation.
true_classes = Y_test.argmax(axis=1)
predicted_classes = predictions.argmax(axis=1)
matrix = metrics.confusion_matrix(true_classes, predicted_classes)

# Show the matrix as an image with a colour bar next to it.
fig, ax = plt.subplots()
cax = ax.matshow(matrix)
ax.set_title('Confusion matrix of the classifier')
fig.colorbar(cax)

ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()