In [93]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# text preprocessing modules
import re
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import WordNetLemmatizer
from nltk import PorterStemmer
from nltk.corpus import stopwords

# ml
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

# Read the training set and the Task-2 validation set from the notebook's
# working directory, in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./Training-dataset.csv', './Task-2-validation-dataset.csv')
)

In [65]:
# Per-label totals: for every column from position 3 onward (treated here as
# the label columns), how many films carry that label.
label_count = train_df.iloc[:, 3:].sum()
# Per-movie totals: how many labels each film has been assigned.
movie_label_count = train_df.iloc[:, 3:].sum(axis=1)

# Count the movies that carry no label at all. The comparison is done on the
# whole Series at once: iterating `Series.items()` yields (index, value)
# tuples, which never compare equal to 0, and using `sum` as a loop variable
# would shadow the builtin for the rest of the session.
no_label_count = int((movie_label_count == 0).sum())

print("Total number of movies =", len(train_df))
print("Total number of movies without label =", no_label_count)
print("Total labels =", label_count.sum())
print(label_count)

Total number of movies = 8257
Total number of movies without label = 0
Total labels = 16193
comedy        1262
cult          1801
flashback     1994
historical     186
murder        4019
revenge       1657
romantic      2006
scifi          204
violence      3064
dtype: int64


In [66]:
def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use only."""
    cached = getattr(_preprocess_resources, '_cached', None)
    if cached is None:
        # Loading the stop-word list and constructing the lemmatizer once is
        # enough; preprocess_text is applied to every row of both datasets.
        cached = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        _preprocess_resources._cached = cached
    return cached


def preprocess_text(text):
    """Normalise raw text into a single space-joined string of lemmatised tokens.

    Processing steps, in order:
      1. replace every character that is not an ASCII letter with a space,
      2. lowercase the result,
      3. tokenise with ``word_tokenize``,
      4. drop English stop words,
      5. lemmatise each remaining token with ``WordNetLemmatizer``,
      6. join the tokens with single spaces.

    Args:
        text: The string to clean. It is passed straight to ``re.sub``, so
            non-string values (e.g. a missing value) are not handled here.

    Returns:
        The cleaned text as one string.
    """
    stop_words, lemmatizer = _preprocess_resources()

    # Strip non-alphabetic characters before tokenising so that digits and
    # punctuation never become tokens.
    cleaned = re.sub('[^a-zA-Z]', ' ', text).lower()
    tokens = word_tokenize(cleaned)

    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )


In [67]:
# Merge title and synopsis into one free-text field, discard the two source
# columns, then store the normalised text in a new 'text' column.
train_text = train_df['title'] + ' ' + train_df['plot_synopsis']
train_df.drop(columns=['title', 'plot_synopsis'], inplace=True)
train_df['text'] = train_text.apply(preprocess_text)

In [68]:
# Same treatment for the validation frame: merge title and synopsis, drop the
# two source columns, and keep only the normalised text.
test_text = test_df['title'] + ' ' + test_df['plot_synopsis']
test_df.drop(columns=['title', 'plot_synopsis'], inplace=True)
test_df['text'] = test_text.apply(preprocess_text)

In [69]:
# The label names are the column headers at positions 1 through 9 of the
# training frame (after the title/synopsis columns have been dropped).
genres = train_df.columns[1:10].tolist()
print(genres)

['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']


In [70]:
# Bag-of-words features. The vocabulary is learned from the training text
# only and then reused, unchanged, to encode the validation text.
countvectorizer = CountVectorizer(max_df=0.8, min_df=2)
X_train = countvectorizer.fit_transform(train_df['text'])
X_test = countvectorizer.transform(test_df['text'])
print(X_train.shape, X_test.shape, sep='\n')

(8257, 42514)
(1188, 42514)


In [77]:
# One target vector per genre, in the same order as `genres`, for both the
# training and the validation frames.
Y_train = [train_df[genre].to_numpy() for genre in genres]
Y_test = [test_df[genre].to_numpy() for genre in genres]
print(Y_train[0].shape, Y_test[0].shape, sep='\n')

(8257,)
(1188,)


In [83]:
# Fit an independent Naive Bayes classifier for every genre and keep both the
# fitted model and its predictions on the validation features.
models, predictions = [], []

for genre, genre_targets in zip(genres, Y_train):
    print('Training & predicting with ' + genre + ' model . . .')

    # Fit on the training matrix against this genre's target vector
    genre_model = MultinomialNB().fit(X_train, genre_targets)
    models.append(genre_model)

    # Score the validation matrix with the freshly fitted model
    predictions.append(genre_model.predict(X_test))


Training & predicting with comedy model . . .
Training & predicting with cult model . . .
Training & predicting with flashback model . . .
Training & predicting with historical model . . .
Training & predicting with murder model . . .
Training & predicting with revenge model . . .
Training & predicting with romantic model . . .
Training & predicting with scifi model . . .
Training & predicting with violence model . . .


In [94]:
# Report accuracy, precision and recall for each entry of `predictions`,
# paired position-by-position with the matching validation targets.
for y_true, y_pred in zip(Y_test, predictions):
    print("ACCURACY", accuracy_score(y_true, y_pred))
    print("PRECISION", precision_score(y_true, y_pred))
    print("RECALL", recall_score(y_true, y_pred))


ACCURACY 0.7811447811447811
PRECISION 0.33067729083665337
RECALL 0.4742857142857143
ACCURACY 0.7533670033670034
PRECISION 0.4212328767123288
RECALL 0.4979757085020243
ACCURACY 0.6877104377104377
PRECISION 0.3762057877813505
RECALL 0.3979591836734694
ACCURACY 0.9587542087542088
PRECISION 0.16216216216216217
RECALL 0.25
ACCURACY 0.6978114478114478
PRECISION 0.685
RECALL 0.7074010327022375
ACCURACY 0.7407407407407407
PRECISION 0.35269709543568467
RECALL 0.35864978902953587
ACCURACY 0.7693602693602694
PRECISION 0.5220994475138122
RECALL 0.6517241379310345
ACCURACY 0.9587542087542088
PRECISION 0.2857142857142857
RECALL 0.3870967741935484
ACCURACY 0.7297979797979798
PRECISION 0.6037735849056604
RECALL 0.6857142857142857


In [95]:
# Stack the collected prediction vectors into a 2-D array and swap its axes.
# NOTE: this rebinds `predictions`, so running the cell a second time swaps
# the axes back again.
predictions = np.array(predictions).T
predictions

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 1, ..., 1, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 1],
       [0, 0, 1, ..., 1, 0, 0],
       [0, 1, 1, ..., 0, 0, 1]], dtype=int64)

In [96]:
# Put the ID column in front of the prediction columns.
# NOTE(review): `id` and `all` shadow Python builtins for the rest of the
# session; the names are kept because a later cell reads `all`.
n_rows = predictions.shape[0]
id = test_df['ID'].to_numpy().reshape(n_rows, 1)
all = np.concatenate((id, predictions), axis=1)
all

array([['cf32cb00-172d-40f2-a3c1-936e8a0d89d7', 0, 0, ..., 1, 0, 0],
       ['df7e125e-2d59-40e4-a126-9397e3a0ef21', 0, 0, ..., 0, 0, 1],
       ['49bc73f3-9179-41cd-9774-905c7a3ac91b', 0, 0, ..., 1, 0, 0],
       ...,
       ['3d291d3b-c0b5-47cc-8dc8-127dc93162e3', 0, 1, ..., 0, 0, 1],
       ['6c9b3034-56b3-42f6-874e-a821c9fd1a89', 0, 0, ..., 1, 0, 0],
       ['fbd1d334-e979-465c-9fb0-e173d2642630', 0, 1, ..., 0, 0, 1]],
      dtype=object)

In [97]:
# Wrap the combined ID/prediction array in a DataFrame (default integer
# column labels) and display it.
output_df = pd.DataFrame(data=all)
output_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,cf32cb00-172d-40f2-a3c1-936e8a0d89d7,0,0,0,0,0,0,1,0,0
1,df7e125e-2d59-40e4-a126-9397e3a0ef21,0,0,0,0,1,0,0,0,1
2,49bc73f3-9179-41cd-9774-905c7a3ac91b,0,0,1,0,0,0,1,0,0
3,0ed4822b-87af-44bc-a677-7f7abfdaccf3,0,0,0,0,0,1,1,0,0
4,0b1b0fa4-43bc-41ba-9598-b3401894b96d,1,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
1183,d32be875-41c7-4e84-ac04-e1d3bc3df0fe,0,0,1,0,0,0,1,0,0
1184,84e025dd-4b4e-403c-a3dd-34818b210857,0,0,0,0,1,0,0,0,0
1185,3d291d3b-c0b5-47cc-8dc8-127dc93162e3,0,1,0,0,0,0,0,0,1
1186,6c9b3034-56b3-42f6-874e-a821c9fd1a89,0,0,1,0,0,0,1,0,0


In [98]:
from pathlib import Path

# Destination of the submission file; make sure its folder exists first.
filepath = Path('./10861383-Task2-method-a.csv')
output_dir = filepath.parent
output_dir.mkdir(exist_ok=True, parents=True)

print(output_df)
# Written without the index column and without a header row.
output_df.to_csv(path_or_buf=filepath, header=False, index=False)

                                         0  1  2  3  4  5  6  7  8  9
0     cf32cb00-172d-40f2-a3c1-936e8a0d89d7  0  0  0  0  0  0  1  0  0
1     df7e125e-2d59-40e4-a126-9397e3a0ef21  0  0  0  0  1  0  0  0  1
2     49bc73f3-9179-41cd-9774-905c7a3ac91b  0  0  1  0  0  0  1  0  0
3     0ed4822b-87af-44bc-a677-7f7abfdaccf3  0  0  0  0  0  1  1  0  0
4     0b1b0fa4-43bc-41ba-9598-b3401894b96d  1  0  0  0  1  1  0  0  0
...                                    ... .. .. .. .. .. .. .. .. ..
1183  d32be875-41c7-4e84-ac04-e1d3bc3df0fe  0  0  1  0  0  0  1  0  0
1184  84e025dd-4b4e-403c-a3dd-34818b210857  0  0  0  0  1  0  0  0  0
1185  3d291d3b-c0b5-47cc-8dc8-127dc93162e3  0  1  0  0  0  0  0  0  1
1186  6c9b3034-56b3-42f6-874e-a821c9fd1a89  0  0  1  0  0  0  1  0  0
1187  fbd1d334-e979-465c-9fb0-e173d2642630  0  1  1  0  0  1  0  0  1

[1188 rows x 10 columns]
