In [64]:
import json
import nltk
import re
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd

pd.set_option('display.max_colwidth', 300)

# Read data
df = pd.read_pickle('Data\MovieSummaries\plots_genres_reduced_to_60.pkl')

# Preprocessing functions
def clean_text(text):
    text = re.sub("\'", "", text)
    text = re.sub("[^a-zA-Z]", " ", text)
    text = " ".join(text.split())
    text = text.lower()
    return text

def remove_stopwords(text):
    no_stop = []
    for i in text.split():
        if i not in stopwords:
            no_stop.append(i)
    return " ".join(no_stop)

def freq_plot(text):
    
    words = " ".join([x for x in text])
    words = words.split()
    fdist = nltk.FreqDist(words)
    return fdist



In [67]:
df.reset_index(inplace=True)
df

Unnamed: 0,index,id,text,title,genre
0,0,330,in order to prepare the role of an important old actress a theatre student interviews three actresses who were her pupils an international diva a television star and a dubbing director,Actrius,"[drama, comedydrama]"
1,1,3217,after being pulled through a time portal ash williams lands in ad where he is almost immediately captured by lord arthurs men who suspect him to be an agent for duke henry with whom arthur is at war he is enslaved along with the captured henry his gun and chainsaw confiscated and is taken to a c...,Army of Darkness,"[cult, horror, drama, actionadventure, fantasy, comedy, action]"
2,2,3333,the film follows two juxtaposed families the northern stonemans consisting of the abolitionist congressman austin stoneman his two sons and his daughter elsie and the southern camerons a family including two daughters margaret and flora and three sons most notably ben the stoneman brothers visi...,The Birth of a Nation,"[silent, indie, drama, blackandwhite, war]"
3,3,3746,hatnote in los angeles november retired police officer rick deckard is detained by officer gaff and brought to meet with his former supervisor bryant deckard whose job as a blade runner was to track down bioengineered beings known as replicants and retire them is told by bryant that several have...,Blade Runner,"[thriller, science fiction, cult, drama, noir, crime, chinese]"
4,4,3837,in the american old west of construction on a new railroad led by lyle runs into quicksand the route has to be changed which will require it to go through rock ridge a frontier town where everyone has the last name of johnson the conniving state attorney general hedley lamarr wants to buy the la...,Blazing Saddles,"[western, satire, comedy]"
...,...,...,...,...,...
41544,41788,37373877,according to horrorfest online six people with a common past converge years later at the funeral of an old friend while putting the affairs in order a map is presented to them they follow the directions to find a longlost time capsule from their youth the search is requested by the dead friend i...,Crazy Eights,"[cult, horror]"
41545,41789,37473592,thoppul kodihttpblogspotinthoppulkodihtml proclaims the pathetic life about the cylon refugees music done by music director thomas rathnamhttpwwwflipkartcomthoppulkodip he was also known as isai aruvi thomas rathnam he is an upcoming music composer from india he has done tamil films karuvarai po...,Thoppul Kodi,[drama]
41546,41790,37478048,anand verma a widower and father of a child admits a woman in the hospital who had lost her memory since he admitted her in the hospital the doctor asks him to take care of her until she recovers from amnesia the doctor names her asha which is the name of anands wife and makes her believe that s...,Mr. Bechara,[comedy]
41547,41791,37492363,when clovers childhood friend cherries returns to his hometown after a ten year separation he finds himself immediately attracted to her still outgoing spontaneous personality it isnt long before their rekindled friendship develops into a sweet quirky romance but as time passes and passions fade...,Cherries and Clover,"[comedy, drama, romance]"


In [68]:
#clean text
df["text"] = df["text"].apply(lambda x : clean_text(x))

# Get stopwords
from nltk.corpus import stopwords
stopwords = set(stopwords.words("english"))
df["text"] = df["text"].apply(lambda x : remove_stopwords(x))

# Binarize genres
from sklearn.preprocessing import MultiLabelBinarizer
multilabel_bina = MultiLabelBinarizer()
multilabel_bina.fit(df["genre"])
y = multilabel_bina.transform(df["genre"])

# TF-IDF Vectorizer
tfidf_vect = TfidfVectorizer(max_df= 0.8, max_features=10000)
xtrain, xval, ytrain, yval = train_test_split(df["text"], y, test_size = 0.2, random_state= 9)
xtrain_tfidf = tfidf_vect.fit_transform(xtrain)
xval_tfidf = tfidf_vect.transform(xval)

In [69]:
#make list aff all genre
genre_list = []
for i in range(len(df["genre"])):
    genre_list.append(df["genre"].iloc[i][0])

#unique genre
unique_genre = list(set(genre_list))
len(unique_genre)

57

In [70]:
# Documents Example with 5 Plots
documents = df["text"][0:5].reset_index(drop=True)

# Term Frequency-Inverse Document Frequency vectorizer
Tfid_vect = TfidfVectorizer()

# Transform the documents using the vectorizer
documents_vect = Tfid_vect.fit_transform(documents)

# Multi label logistic Regression

In [71]:

# Logistic Regression
logistic_mod = LogisticRegression()

# Separate binary classifier for each class label for multi-label classification
onevsall = OneVsRestClassifier(logistic_mod) 

# Train model
onevsall.fit(xtrain_tfidf, ytrain)

# Predict and evaluate
y_pred = onevsall.predict(xval_tfidf)
print(classification_report(yval, y_pred))

# Sample predictions
def new_val(x):  
    x = remove_stopwords(clean_text(x))
    x_vec = tfidf_vect.transform([x])
    x_pred = onevsall.predict(x_vec)
    return multilabel_bina.inverse_transform(x_pred)


              precision    recall  f1-score   support

           0       0.68      0.27      0.39      1126
           1       0.70      0.13      0.22       719
           2       0.00      0.00      0.00       229
           3       0.63      0.15      0.24       632
           4       0.86      0.29      0.43       514
           5       0.00      0.00      0.00        58
           6       0.00      0.00      0.00       108
           7       0.00      0.00      0.00       115
           8       0.00      0.00      0.00        60
           9       0.82      0.10      0.17       783
          10       0.00      0.00      0.00        59
          11       0.64      0.06      0.11       233
          12       0.00      0.00      0.00       125
          13       0.00      0.00      0.00       156
          14       0.85      0.24      0.37       196
          15       0.72      0.45      0.55      2534
          16       0.00      0.00      0.00       247
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [135]:
f = pd.DataFrame(xval)[0:100]
f['generated_genre'] = pd.DataFrame(xval)['text'].apply(new_val)

In [136]:
f.merge(df, left_index=True, how='left', right_index=True)

Unnamed: 0,text_x,generated_genre,index,id,text_y,title,genre
16875,movie opens appears another average day hoagies alley top cat gang today posing boy scouts good deeds hopes getting rewards course benny ball saves life bag lady unbeknownst benny revealed afterwards actually rich woman named gertrude vandergelt plans leave fortune missing niece amy meanwhile of...,"[(comedy, crime)]",16941,9257011,movie opens appears another average day hoagies alley top cat gang today posing boy scouts good deeds hopes getting rewards course benny ball saves life bag lady unbeknownst benny revealed afterwards actually rich woman named gertrude vandergelt plans leave fortune missing niece amy meanwhile of...,Top Cat and the Beverly Hills Cats,"[comedy, animation]"
39710,yearold annie parker living perfect young life loved especially mother father older sister none knows something horrible stalking perfect family fall afternoon young annie hears noise upstairs mother collapsed died agonizing downward spiral begins interweave story another far away brilliant rese...,"[(drama,)]",39936,33687379,yearold annie parker living perfect young life loved especially mother father older sister none knows something horrible stalking perfect family fall afternoon young annie hears noise upstairs mother collapsed died agonizing downward spiral begins interweave story another far away brilliant rese...,Decoding Annie Parker,[drama]
23265,two brothers discuss positive negative aspects adultery mother lies beside coma brother r mi attempts discourage conversations become explicit time passes,"[(drama,)]",23365,15510725,two brothers discuss positive negative aspects adultery mother lies beside coma brother r mi attempts discourage conversations become explicit time passes,Les 3 P'tits Cochons,"[family, comedydrama, comedy]"
4519,five people englishwoman evelyn wingate american reporter jonathan clark chinese peasant su tan german physicist klaus bechner soviet soldier ivan godofsky plucked everyday lives transported spacecraft confronted humanoid referring alien arnold moss explains representative world orbiting sun go ...,"[(science fiction, thriller)]",4529,1613385,five people englishwoman evelyn wingate american reporter jonathan clark chinese peasant su tan german physicist klaus bechner soviet soldier ivan godofsky plucked everyday lives transported spacecraft confronted humanoid referring alien arnold moss explains representative world orbiting sun go ...,The 27th Day,[science fiction]
33088,descendant impoverished polish noble family young wokulski forced work waiter hopfers warsaw restaurant dreaming life science taking part failed uprising tsarist russia sentenced exile siberia eventual return warsaw becomes salesman mincels haberdashery marrying late owners widow comes money use...,"[(comedy, drama)]",33251,26409391,descendant impoverished polish noble family young wokulski forced work waiter hopfers warsaw restaurant dreaming life science taking part failed uprising tsarist russia sentenced exile siberia eventual return warsaw becomes salesman mincels haberdashery marrying late owners widow comes money use...,The Doll,"[romance, drama]"
...,...,...,...,...,...,...,...
28235,kulsoom noor engaged sarwar shah deeply love zilleshah whorm knows childhood kulsooms marriage sealed encounter zill shah kills sarwar therefore sent jail unforthunete incident occurs kulsoom marries abid shah elder brother sarwar noor discomfort zille gives put life realms dancing girls alcohol...,"[(drama,)]",28354,21534495,kulsoom noor engaged sarwar shah deeply love zilleshah whorm knows childhood kulsooms marriage sealed encounter zill shah kills sarwar therefore sent jail unforthunete incident occurs kulsoom marries abid shah elder brother sarwar noor discomfort zille gives put life realms dancing girls alcohol...,Zill-e-Shah,[romance]
11557,mary mccloud marries seemingly peaceful kansas schoolteacher william cantrell finding harbors dark secret actually outlaw leader attacks sides civil war profit capturing wagon loaded confederate uniforms decides pass confederate officer naive idealistic brother fletcher joins believes rebel guer...,"[(western,)]",11593,5263902,mary mccloud marries seemingly peaceful kansas schoolteacher william cantrell finding harbors dark secret actually outlaw leader attacks sides civil war profit capturing wagon loaded confederate uniforms decides pass confederate officer naive idealistic brother fletcher joins believes rebel guer...,Dark Command,"[actionadventure, western]"
6861,bridges plays exgi frank pryor arrives london visit wartime girlfriend hasnt seen six years arrival airport coincides man killed sniper finds suspect pryor detained scotland yard questioning released goes girlfriends apartment learns intimate association slain man scotland yard keeps watch coupl...,"[(drama, thriller)]",6884,2580504,bridges plays exgi frank pryor arrives london visit wartime girlfriend hasnt seen six years arrival airport coincides man killed sniper finds suspect pryor detained scotland yard questioning released goes girlfriends apartment learns intimate association slain man scotland yard keeps watch coupl...,The Limping Man,[noir]
32908,aniket one finest filmmakers bengal deepti actress fallen love casting one films sacrificed career love marriage apratim son perfect family plot thickens aniket auditions young actress shikha bears uncanny resemblance wife younger deepti enthusiastically begins coach shikha husbands film mdash m...,"[(drama, romance)]",33071,26282445,aniket one finest filmmakers bengal deepti actress fallen love casting one films sacrificed career love marriage apratim son perfect family plot thickens aniket auditions young actress shikha bears uncanny resemblance wife younger deepti enthusiastically begins coach shikha husbands film mdash m...,Abahoman,[drama]


In [138]:
f.to_pickle("Results/logistic_regression_predictions.pkl")

In [120]:
for row in pd.DataFrame(xval)['text'][0:100]:
    #print(index(row))
    print(new_val(row))

[('comedy', 'crime')]
[('drama',)]
[('drama',)]
[('science fiction', 'thriller')]
[('comedy', 'drama')]
[('drama',)]
[('drama',)]
[('animation', 'comedy', 'short')]
[()]
[()]
[('drama',)]
[('drama',)]
[('drama',)]
[()]
[('horror', 'thriller')]
[('drama',)]
[('drama', 'war')]
[()]
[('documentary', 'drama')]
[('action', 'actionadventure', 'adventure', 'science fiction', 'thriller')]
[('action', 'comedy', 'crime')]
[('drama',)]
[('comedy',)]
[('drama',)]
[()]
[()]
[('drama',)]
[('horror', 'science fiction')]
[('comedy', 'short')]
[('comedy',)]
[('thriller',)]
[('animation', 'comedy', 'family', 'short')]
[('drama',)]
[('drama',)]
[()]
[('drama', 'indie')]
[('drama',)]
[('drama',)]
[('crime', 'drama', 'thriller')]
[()]
[()]
[('drama', 'romance')]
[('drama',)]
[('drama',)]
[('comedy',)]
[('drama',)]
[()]
[('drama',)]
[('thriller',)]
[()]
[('action', 'thriller')]
[('horror',)]
[()]
[('drama',)]
[('drama',)]
[('comedy', 'drama', 'romance')]
[('drama',)]
[('drama', 'war')]
[('musical',)]
[('cri

In [102]:
xval.sample(1).index[0]

8656

In [78]:
for i in range(5): 
    k = xval.sample(1).index[0] 
    print("Movie: ", df['title'][k], "\nPredicted genre: ", new_val(xval[k]))
    print("Actual genre: ",df['genre'][k], "\n")

Movie:  Madeline: Lost in Paris 
Predicted genre:  [('comedy',)]
Actual genre:  ['family', 'animation'] 

Movie:  Paper Chasers 
Predicted genre:  [('documentary',)]
Actual genre:  ['biography', 'indie', 'music', 'documentary'] 

Movie:  Phoebe in Wonderland 
Predicted genre:  [('drama',)]
Actual genre:  ['drama', 'fantasy', 'indie'] 

Movie:  The Amazing Spiderman 
Predicted genre:  [()]
Actual genre:  ['crime', 'science fiction', 'drama', 'fantasy', 'family', 'action'] 

Movie:  Psych 9 
Predicted genre:  [('drama', 'thriller')]
Actual genre:  ['thriller', 'mystery', 'horror', 'psychological thriller', 'suspense'] 



In [79]:
for i in range(5): 
    k = xval.sample(1).index[0] 
    print("Movie: ", df['title'][k], "\nPredicted genre: ", new_val(xval[k]))
    print("Actual genre: ",df['genre'][k], "\n")

Movie:  Box of Moonlight 
Predicted genre:  [()]
Actual genre:  ['indie', 'comedy', 'comedydrama', 'cult', 'drama'] 

Movie:  Kochu Kochu Santhoshangal 
Predicted genre:  [('drama', 'romance')]
Actual genre:  ['drama', 'comedy'] 

Movie:  Neuilly sa mère ! 
Predicted genre:  [('drama',)]
Actual genre:  ['comedy'] 

Movie:  Blues Busters 
Predicted genre:  [('comedy',)]
Actual genre:  ['comedy'] 

Movie:  Prince Daewon 
Predicted genre:  [('drama',)]
Actual genre:  ['drama'] 



In [81]:
for i in range(5): 
    print(k)
    k = xval.sample(1).index[0] 
    print("Movie: ", df['title'][k], "\nPredicted genre: ", new_val(xval[k]))
    print("Actual genre: ",df['genre'][k], "\n")

20017
Movie:  Shaadi Karke Phas Gaya Yaar 
Predicted genre:  [('drama', 'romance')]
Actual genre:  ['romance'] 

16673
Movie:  High Plains Drifter 
Predicted genre:  [('actionadventure', 'western')]
Actual genre:  ['thriller', 'actionadventure', 'western', 'action'] 

3852
Movie:  Studio Stoops 
Predicted genre:  [('blackandwhite', 'comedy', 'short')]
Actual genre:  ['short', 'comedy', 'blackandwhite'] 

22592
Movie:  Assassination Games 
Predicted genre:  [('action', 'actionadventure', 'crime', 'thriller')]
Actual genre:  ['actionadventure', 'thriller', 'action'] 

37336
Movie:  Shrimps for a Day 
Predicted genre:  [('blackandwhite', 'comedy', 'family')]
Actual genre:  ['short', 'family', 'comedy', 'blackandwhite'] 



In [62]:
for i in range(5): 
    print(k)
    k = xval.sample(1).index[0] 
    print("Movie: ", df['title'][k], "\nPredicted genre: ", new_val(xval[k]))
    print("Actual genre: ",df['genre'][k], "\n")

1640
Movie:  The Nightmare Before Christmas 
Predicted genre:  [('comedy', 'family')]
Actual genre:  ['childrensfamily', 'creature', 'musical', 'animation', 'fantasy', 'childrens', 'family'] 

1538
Movie:  Betty Boop's Hallowe'en Party 
Predicted genre:  [('comedy', 'short')]
Actual genre:  ['short', 'family', 'horror', 'musical', 'animation'] 

14716
Movie:  Querelle 
Predicted genre:  [('drama',)]
Actual genre:  ['lgbt', 'drama', 'war', 'gay', 'gay', 'drama', 'gay', 'drama', 'romance'] 

927
Movie:  It's Trad, Dad! 
Predicted genre:  [('comedy',)]
Actual genre:  ['musical', 'comedy', 'blackandwhite', 'comedy'] 

16109
Movie:  En Kadamai 
Predicted genre:  [('drama',)]
Actual genre:  ['drama'] 



In [84]:
for i in range(5): 
    print(k)
    k = xval.sample(1).index[0] 
    print("Movie: ", df['title'][k], "\nPredicted genre: ", new_val(xval[k]))
    print("Actual genre: ",df['genre'][k], "\n")

11589
Movie:  Armageddon 
Predicted genre:  [('adventure', 'science fiction')]
Actual genre:  ['thriller', 'science fiction', 'action', 'adventure'] 

174
Movie:  The Way 
Predicted genre:  [('drama',)]
Actual genre:  ['drama', 'adventure', 'comedy'] 

35115
Movie:  Little Vera 
Predicted genre:  [('drama',)]
Actual genre:  ['war', 'family', 'drama', 'romance'] 

11038
Movie:  Mr. Vampire II 
Predicted genre:  [('horror',)]
Actual genre:  ['horror', 'comedy', 'war', 'chinese'] 

33894
Movie:  Pink Pull 
Predicted genre:  [('animation', 'comedy', 'short')]
Actual genre:  ['short', 'comedy', 'animation'] 



In [85]:
for i in range(5): 
    print(k)
    k = xval.sample(1).index[0] 
    print("Movie: ", df['title'][k], "\nPredicted genre: ", new_val(xval[k]))
    print("Actual genre: ",df['genre'][k], "\n")

39773
Movie:  Words in Blue 
Predicted genre:  [('drama',)]
Actual genre:  ['drama', 'period piece'] 

20293
Movie:  Sauda 
Predicted genre:  [('drama',)]
Actual genre:  ['ensemble', 'melodrama', 'drama'] 

30125
Movie:  Republik Twitter 
Predicted genre:  [('drama', 'romance')]
Actual genre:  ['romance', 'drama'] 

41381
Movie:  Why Be Good? 
Predicted genre:  [('comedy', 'drama')]
Actual genre:  ['comedy'] 

33926
Movie:  Dance Flick 
Predicted genre:  [('comedy',)]
Actual genre:  ['parody', 'musical', 'action', 'comedy'] 



In [86]:
for i in range(5): 
    print(k)
    k = xval.sample(1).index[0] 
    print("Movie: ", df['title'][k], "\nPredicted genre: ", new_val(xval[k]))
    print("Actual genre: ",df['genre'][k], "\n")

27025
Movie:  Grabbers 
Predicted genre:  [('horror',)]
Actual genre:  ['thriller', 'science fiction', 'comedy'] 

35942
Movie:  Golmaal 
Predicted genre:  [('comedy',)]
Actual genre:  ['romance', 'action', 'comedy'] 

26179
Movie:  Checkered Flag or Crash 
Predicted genre:  [()]
Actual genre:  ['comedy', 'adventure'] 

37597
Movie:  It Runs in the Family 
Predicted genre:  [('drama',)]
Actual genre:  ['drama', 'family', 'comedydrama', 'comedy'] 

11962
Movie:  Daddy Cool: Join the Fun 
Predicted genre:  [('comedy', 'drama')]
Actual genre:  ['drama', 'war', 'bollywood', 'comedy'] 

