In [12]:
import json
import nltk
import re
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd

pd.set_option('display.max_colwidth', 300)

# Read data
df = pd.read_pickle('Data\MovieSummaries\plots_genres_reduced_to_60.pkl')

# Preprocessing functions
def clean_text(text):
    text = re.sub("\'", "", text)
    text = re.sub("[^a-zA-Z]", " ", text)
    text = " ".join(text.split())
    text = text.lower()
    return text

def remove_stopwords(text):
    no_stop = []
    for i in text.split():
        if i not in stopwords:
            no_stop.append(i)
    return " ".join(no_stop)

def freq_plot(text):
    
    words = " ".join([x for x in text])
    words = words.split()
    fdist = nltk.FreqDist(words)
    return fdist



In [15]:
df.reset_index(inplace=True)
df

Unnamed: 0,index,id,text,title,genre
0,0,330,in order to prepare the role of an important old actress a theatre student interviews three actresses who were her pupils an international diva a television star and a dubbing director,Actrius,"[drama, comedydrama]"
1,1,3217,after being pulled through a time portal ash williams lands in ad where he is almost immediately captured by lord arthurs men who suspect him to be an agent for duke henry with whom arthur is at war he is enslaved along with the captured henry his gun and chainsaw confiscated and is taken to a c...,Army of Darkness,"[cult, horror, drama, actionadventure, fantasy, comedy, action]"
2,2,3333,the film follows two juxtaposed families the northern stonemans consisting of the abolitionist congressman austin stoneman his two sons and his daughter elsie and the southern camerons a family including two daughters margaret and flora and three sons most notably ben the stoneman brothers visi...,The Birth of a Nation,"[silent, indie, drama, blackandwhite, war]"
3,3,3746,hatnote in los angeles november retired police officer rick deckard is detained by officer gaff and brought to meet with his former supervisor bryant deckard whose job as a blade runner was to track down bioengineered beings known as replicants and retire them is told by bryant that several have...,Blade Runner,"[thriller, science fiction, cult, drama, noir, crime, chinese]"
4,4,3837,in the american old west of construction on a new railroad led by lyle runs into quicksand the route has to be changed which will require it to go through rock ridge a frontier town where everyone has the last name of johnson the conniving state attorney general hedley lamarr wants to buy the la...,Blazing Saddles,"[western, satire, comedy]"
...,...,...,...,...,...
41544,41788,37373877,according to horrorfest online six people with a common past converge years later at the funeral of an old friend while putting the affairs in order a map is presented to them they follow the directions to find a longlost time capsule from their youth the search is requested by the dead friend i...,Crazy Eights,"[cult, horror]"
41545,41789,37473592,thoppul kodihttpblogspotinthoppulkodihtml proclaims the pathetic life about the cylon refugees music done by music director thomas rathnamhttpwwwflipkartcomthoppulkodip he was also known as isai aruvi thomas rathnam he is an upcoming music composer from india he has done tamil films karuvarai po...,Thoppul Kodi,[drama]
41546,41790,37478048,anand verma a widower and father of a child admits a woman in the hospital who had lost her memory since he admitted her in the hospital the doctor asks him to take care of her until she recovers from amnesia the doctor names her asha which is the name of anands wife and makes her believe that s...,Mr. Bechara,[comedy]
41547,41791,37492363,when clovers childhood friend cherries returns to his hometown after a ten year separation he finds himself immediately attracted to her still outgoing spontaneous personality it isnt long before their rekindled friendship develops into a sweet quirky romance but as time passes and passions fade...,Cherries and Clover,"[comedy, drama, romance]"


In [16]:
#clean text
df["text"] = df["text"].apply(lambda x : clean_text(x))

# Get stopwords
from nltk.corpus import stopwords
#if stopwords not downloaded
#nltk.download('stopwords')
stopwords = set(stopwords.words("english"))
df["text"] = df["text"].apply(lambda x : remove_stopwords(x))

# Binarize genres
from sklearn.preprocessing import MultiLabelBinarizer
multilabel_bina = MultiLabelBinarizer()
multilabel_bina.fit(df["genre"])
y = multilabel_bina.transform(df["genre"])

# TF-IDF Vectorizer
tfidf_vect = TfidfVectorizer(max_df= 0.8, max_features=10000)
xtrain, xval, ytrain, yval = train_test_split(df["text"], y, test_size = 0.2, random_state= 9)
xtrain_tfidf = tfidf_vect.fit_transform(xtrain)
xval_tfidf = tfidf_vect.transform(xval)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Felix\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [17]:
#make list aff all genre
genre_list = []
for i in range(len(df["genre"])):
    genre_list.append(df["genre"].iloc[i][0])

#unique genre
unique_genre = list(set(genre_list))
len(unique_genre)

57

In [18]:
# Documents Example with 5 Plots
documents = df["text"][0:5].reset_index(drop=True)

# Term Frequency-Inverse Document Frequency vectorizer
Tfid_vect = TfidfVectorizer()

# Transform the documents using the vectorizer
documents_vect = Tfid_vect.fit_transform(documents)

# Convert the transformed for better visability. Normally sparse matrix
#df = pd.DataFrame(documents_vect.toarray(), columns=Tfid_vect.get_feature_names_out())

#df

# Multi label logistic Regression

In [19]:

# Logistic Regression
logistic_mod = LogisticRegression()

# Separate binary classifier for each class label for multi-label classification
onevsall = OneVsRestClassifier(logistic_mod) 

# Train model
onevsall.fit(xtrain_tfidf, ytrain)

# Predict and evaluate
y_pred = onevsall.predict(xval_tfidf)
print(classification_report(yval, y_pred))

# Sample predictions
def new_val(x):  
    x = remove_stopwords(clean_text(x))
    x_vec = tfidf_vect.transform([x])
    x_pred = onevsall.predict(x_vec)
    return multilabel_bina.inverse_transform(x_pred)


              precision    recall  f1-score   support

           0       0.68      0.27      0.39      1126
           1       0.70      0.13      0.22       719
           2       0.00      0.00      0.00       229
           3       0.63      0.15      0.24       632
           4       0.86      0.29      0.43       514
           5       0.00      0.00      0.00        58
           6       0.00      0.00      0.00       108
           7       0.00      0.00      0.00       115
           8       0.00      0.00      0.00        60
           9       0.82      0.10      0.17       783
          10       0.00      0.00      0.00        59
          11       0.64      0.06      0.11       233
          12       0.00      0.00      0.00       125
          13       0.00      0.00      0.00       156
          14       0.85      0.24      0.37       196
          15       0.72      0.45      0.55      2534
          16       0.00      0.00      0.00       247
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
for i in range(5): 
    k = xval.sample(1).index[0] 
    print("Movie: ", df['title'][k], "\nPredicted genre: ", new_val(xval[k]))
    print("Actual genre: ",df['genre'][k], "\n")

Movie:  Until Death 
Predicted genre:  [('crime', 'drama', 'thriller')]
Actual genre:  ['thriller', 'crime', 'action', 'actionadventure'] 

Movie:  The Little American 
Predicted genre:  [('war',)]
Actual genre:  ['silent', 'drama', 'indie', 'blackandwhite', 'war'] 

Movie:  The Ghost Goes West 
Predicted genre:  [()]
Actual genre:  ['comedy', 'fantasy', 'blackandwhite'] 

Movie:  The Castle of Cagliostro 
Predicted genre:  [()]
Actual genre:  ['japanese', 'adventure', 'childrensfamily', 'animation'] 

Movie:  The Mangler 
Predicted genre:  [('horror', 'science fiction', 'thriller')]
Actual genre:  ['cult', 'science fiction', 'horror', 'creature', 'adaptation'] 



In [21]:
for i in range(5): 
    k = xval.sample(1).index[0] 
    print("Movie: ", df['title'][k], "\nPredicted genre: ", new_val(xval[k]))
    print("Actual genre: ",df['genre'][k], "\n")

Movie:  Mr. Sardonicus 
Predicted genre:  [()]
Actual genre:  ['horror'] 

Movie:  Der Fuehrer's Face 
Predicted genre:  [('short',)]
Actual genre:  ['short', 'comedy'] 

Movie:  Joyride 
Predicted genre:  [('comedy', 'crime')]
Actual genre:  ['thriller', 'crime', 'adventure', 'actionadventure', 'comedy', 'action'] 

Movie:  The Garden of the Finzi-Continis 
Predicted genre:  [('drama', 'war')]
Actual genre:  ['war', 'history', 'drama', 'romance', 'political'] 

Movie:  The Last Days of Frankie the Fly 
Predicted genre:  [('drama',)]
Actual genre:  ['crime', 'thriller', 'drama', 'action'] 



In [22]:
for i in range(5): 
    print(k)
    k = xval.sample(1).index[0] 
    print("Movie: ", df['title'][k], "\nPredicted genre: ", new_val(xval[k]))
    print("Actual genre: ",df['genre'][k], "\n")

35123
Movie:  Finding John Christmas 
Predicted genre:  [()]
Actual genre:  ['drama', 'fantasy', 'television'] 

9585
Movie:  Hirasat 
Predicted genre:  [('drama', 'war')]
Actual genre:  ['crime', 'action', 'drama'] 

39427
Movie:  Will Penny 
Predicted genre:  [('drama',)]
Actual genre:  ['actionadventure', 'drama', 'western'] 

1624
Movie:  Desperadas 
Predicted genre:  [('drama',)]
Actual genre:  ['drama', 'comedy', 'war'] 

21117
Movie:  Snow Business 
Predicted genre:  [('animation', 'comedy', 'family', 'short')]
Actual genre:  ['short', 'family', 'comedy', 'animation'] 



In [23]:
for i in range(5): 
    print(k)
    k = xval.sample(1).index[0] 
    print("Movie: ", df['title'][k], "\nPredicted genre: ", new_val(xval[k]))
    print("Actual genre: ",df['genre'][k], "\n")

13870
Movie:  Desperate for Love 
Predicted genre:  [('drama',)]
Actual genre:  ['drama', 'romance'] 

31339
Movie:  Salome 
Predicted genre:  [('drama',)]
Actual genre:  ['drama', 'period piece'] 

14242
Movie:  The Call of the Entrepreneur 
Predicted genre:  [()]
Actual genre:  ['documentary'] 

20220
Movie:  The Little Ones 
Predicted genre:  [('drama',)]
Actual genre:  ['adventure', 'childrensfamily'] 

12642
Movie:  Beautiful Girl 
Predicted genre:  [('comedy', 'drama')]
Actual genre:  ['family', 'comedy'] 



In [None]:
f = pd.DataFrame(xval)[0:100]
f['generated_genre'] = pd.DataFrame(xval)['text'].apply(new_val)
f.merge(df, left_index=True, how='left', right_index=True)

SVM

In [24]:
#do state vector machine
import sklearn.svm as svm
from sklearn.multiclass import OneVsRestClassifier

# Create a classifier: a support vector classifier
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
                                 random_state=0))

# Train the classifier on the training set
classifier.fit(xtrain_tfidf, ytrain)

# Predict and evaluate
y_pred = classifier.predict(xval_tfidf)
print(classification_report(yval, y_pred))

# Sample predictions
def new_val(x):  
    x = remove_stopwords(clean_text(x))
    x_vec = tfidf_vect.transform([x])
    x_pred = classifier.predict(x_vec)
    return multilabel_bina.inverse_transform(x_pred)

In [None]:
for i in range(5): 
    k = xval.sample(1).index[0] 
    print("Movie: ", df['title'][k], "\nPredicted genre: ", new_val(xval[k]))
    print("Actual genre: ",df['genre'][k], "\n")

Naive Bayes

In [None]:
# do with naive bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.multiclass import OneVsRestClassifier

# Create a classifier: a support vector classifier
classifier = OneVsRestClassifier(GaussianNB())

# Train the classifier on the training set
classifier.fit(xtrain_tfidf.toarray(), ytrain)

# Predict and evaluate
y_pred = classifier.predict(xval_tfidf.toarray())
print(classification_report(yval, y_pred))

# Sample predictions
def new_val(x):  
    x = remove_stopwords(clean_text(x))
    x_vec = tfidf_vect.transform([x])
    x_pred = classifier.predict(x_vec.toarray())
    return multilabel_bina.inverse_transform(x_pred)

In [None]:
for i in range(5): 
    k = xval.sample(1).index[0] 
    print("Movie: ", df['title'][k], "\nPredicted genre: ", new_val(xval[k]))
    print("Actual genre: ",df['genre'][k], "\n")