In [19]:
import json
import nltk
import re
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd

pd.set_option('display.max_colwidth', 300)

# Read data
df = pd.read_pickle('Data\MovieSummaries\plots_genres_balanced.pkl')

# Preprocessing functions
def clean_text(text):
    text = re.sub("\'", "", text)
    text = re.sub("[^a-zA-Z]", " ", text)
    text = " ".join(text.split())
    text = text.lower()
    return text

def remove_stopwords(text):
    no_stop = []
    for i in text.split():
        if i not in stopwords:
            no_stop.append(i)
    return " ".join(no_stop)

def freq_plot(text):
    
    words = " ".join([x for x in text])
    words = words.split()
    fdist = nltk.FreqDist(words)
    return fdist



In [20]:
df.reset_index(inplace=True)
df

Unnamed: 0,index,id,text,title,genre
0,0,2761187,danyael rosales a street preacher and the child of valerie rosales and the angel danyael from the previous film is forced to face his destiny as a nephilim he has some of the angels abilities such as regeneration and can only be killed if his heart is removed one night a blind assassin shoots da...,The Prophecy 3: The Ascent,"[cult, horror]"
1,1,21975745,the story begins with young bobby and his friends having a sundae at a restaurant but he refuses to because of his extremely strict and mean father when returning home bobby hears a bark and realizes that a beagle dog is lost although he decides to keep him his father wont let him bobby cleans t...,Boy Meets Dog,[short]
2,2,29820807,we all get dressed for bill says vogue editor anna wintour the bill in question is new york times photographer bill cunningham for decades this schwinnriding cultural anthropologist has been obsessively and inventively chronicling fashion trends and highsociety charity soirees for the timess sty...,Bill Cunningham New York,"[biography, documentary]"
3,3,9571945,plot the movie begins in jerusalem where a rabbi named rostenburg is typing a code from the bible onto his laptop computer he is shot and killed by an assassin wearing a rabbis outfit who then takes the computer disk containing the code following this two mysterious men take a page out of rosten...,The Omega Code,"[thriller, melodrama, adventure, supernatural, mystery, drama, suspense, action, indie]"
4,4,8801073,snegurochka the daughter of spring and frost yearns for the companionship of mortal humans she grows to like a shepherd named lel but her heart is unable to know love her mother takes pity and gives her this ability but as soon as she falls in love her heart warms up and she melts,The Snow Maiden,"[childrensfamily, fantasy, war, childrens]"
...,...,...,...,...,...
19489,19489,1925631,in between sprees featuring drugs fights sexual assault loud revving harley chopper engines and bongo drums the angels ride out to mecca california in the desert to look for the losers stolen motorcycle they blame a group of mexicans in a repair shop and the two groups brawl the police arrive ch...,The Wild Angels,"[action, indie, actionadventure, drama]"
19490,19490,19369363,an island run by a crazed with power duke is in turmoil the peasants plan a revolt with two buddies including cliff planning to overthrow the corrupt duke cliff invites his friend jacques to help though jacques spends most of his time with his love vanette meanwhile cliff dresses up as a female ...,The Isle of Love,"[comedy, silent, adventure, blackandwhite]"
19491,19491,5349121,after fleeing west germany following a prison break which involved the fatal shooting of a correctional officer the desperate and fastmoving fugitive raf members speed across the border as fast as they can and get an offer from the state security officer erwin hull to remain in the gdr the stasi...,The Legend of Rita,"[lgbt, thriller, war, history, drama, romance, political]"
19492,19492,10071292,see also after washington dc detective forensic psychologist and author alex cross loses control of a sting operation resulting in the death of his partner he opts to retire from the force he finds himself drawn back to police work when megan rose the daughter of a united states senator is kidna...,Along Came a Spider,"[crime, thriller, psychological thriller, mystery, suspense, adaptation]"


In [21]:
#clean text
df["text"] = df["text"].apply(lambda x : clean_text(x))

# Get stopwords
from nltk.corpus import stopwords
#if stopwords not downloaded
#nltk.download('stopwords')
stopwords = set(stopwords.words("english"))
df["text"] = df["text"].apply(lambda x : remove_stopwords(x))

# Binarize genres
from sklearn.preprocessing import MultiLabelBinarizer
multilabel_bina = MultiLabelBinarizer()
multilabel_bina.fit(df["genre"])
y = multilabel_bina.transform(df["genre"])

# TF-IDF Vectorizer
tfidf_vect = TfidfVectorizer(max_df= 0.8, max_features=10000)
xtrain, xval, ytrain, yval = train_test_split(df["text"], y, test_size = 0.2, random_state= 9)
xtrain_tfidf = tfidf_vect.fit_transform(xtrain)
xval_tfidf = tfidf_vect.transform(xval)

In [22]:
#make list aff all genre
genre_list = []
for i in range(len(df["genre"])):
    genre_list.append(df["genre"].iloc[i][0])

#unique genre
unique_genre = list(set(genre_list))
len(unique_genre)

57

In [23]:
# Documents Example with 5 Plots
documents = df["text"][0:5].reset_index(drop=True)

# Term Frequency-Inverse Document Frequency vectorizer
Tfid_vect = TfidfVectorizer()

# Transform the documents using the vectorizer
documents_vect = Tfid_vect.fit_transform(documents)

# Convert the transformed for better visability. Normally sparse matrix
#df = pd.DataFrame(documents_vect.toarray(), columns=Tfid_vect.get_feature_names_out())

#df

# Multi label logistic Regression

In [24]:

# Logistic Regression
logistic_mod = LogisticRegression()

# Separate binary classifier for each class label for multi-label classification
onevsall = OneVsRestClassifier(logistic_mod) 

# Train model
onevsall.fit(xtrain_tfidf, ytrain)

# Predict and evaluate
y_pred = onevsall.predict(xval_tfidf)
print(classification_report(yval, y_pred))

# Sample predictions
def new_val(x):  
    x = remove_stopwords(clean_text(x))
    x_vec = tfidf_vect.transform([x])
    x_pred = onevsall.predict(x_vec)
    return multilabel_bina.inverse_transform(x_pred)



KeyboardInterrupt: 

In [None]:
#write classification report to csv
report = classification_report(yval, y_pred, output_dict=True)
df_report = pd.DataFrame(report).transpose()
df_report.to_csv('Results\Metrics\classification_report_balanced_log_reg.csv', index=True, sep=';')

In [None]:
for i in range(5): 
    k = xval.sample(1).index[0] 
    print("Movie: ", df['title'][k], "\nPredicted genre: ", new_val(xval[k]))
    print("Actual genre: ",df['genre'][k], "\n")

Movie:  My Girl 2 
Predicted genre:  [('comedy', 'drama', 'romance')]
Actual genre:  ['family', 'comedydrama', 'coming of age', 'comedy', 'drama', 'romance', 'teen'] 

Movie:  The Happy Elf 
Predicted genre:  [('comedy', 'family')]
Actual genre:  ['childrensfamily', 'animation', 'musical', 'childrens', 'music', 'family'] 

Movie:  Hear and Now 
Predicted genre:  [()]
Actual genre:  ['biography', 'indie', 'documentary'] 

Movie:  Lantana 
Predicted genre:  [('drama',)]
Actual genre:  ['thriller', 'crime', 'war', 'ensemble', 'psychological thriller', 'mystery', 'drama'] 

Movie:  Magic in the Water 
Predicted genre:  [()]
Actual genre:  ['adventure', 'childrensfamily', 'family', 'fantasy', 'childrens', 'drama'] 



In [None]:
for i in range(5): 
    k = xval.sample(1).index[0] 
    print("Movie: ", df['title'][k], "\nPredicted genre: ", new_val(xval[k]))
    print("Actual genre: ",df['genre'][k], "\n")

Movie:  Michael Collins 
Predicted genre:  [('drama',)]
Actual genre:  ['biography', 'political', 'biopic feature', 'drama', 'war', 'biographical'] 

Movie:  Talk to Me 
Predicted genre:  [('drama',)]
Actual genre:  ['drama', 'television'] 

Movie:  The Hitchhiker's Guide to the Galaxy 
Predicted genre:  [('adventure', 'science fiction')]
Actual genre:  ['science fiction', 'adventure', 'comedy'] 

Movie:  Union Pacific 
Predicted genre:  [('drama',)]
Actual genre:  ['western', 'actionadventure', 'drama', 'action'] 

Movie:  The Tenth Straw 
Predicted genre:  [()]
Actual genre:  ['silent'] 



In [None]:
for i in range(5): 
    print(k)
    k = xval.sample(1).index[0] 
    print("Movie: ", df['title'][k], "\nPredicted genre: ", new_val(xval[k]))
    print("Actual genre: ",df['genre'][k], "\n")

19399
Movie:  It's a Mad, Mad, Mad, Mad World 
Predicted genre:  [('comedy', 'crime')]
Actual genre:  ['crime', 'ensemble', 'adventure', 'comedy', 'family', 'action'] 

5692
Movie:  Dawn! 
Predicted genre:  [('drama',)]
Actual genre:  ['sports', 'biographical', 'drama'] 

14927
Movie:  The White Sheik 
Predicted genre:  [('comedy',)]
Actual genre:  ['war', 'comedydrama', 'blackandwhite', 'satire', 'drama', 'comedy'] 

84
Movie:  Metropolis 
Predicted genre:  [('drama',)]
Actual genre:  ['thriller', 'science fiction', 'indie', 'war', 'silent', 'blackandwhite', 'fantasy', 'drama', 'romance', 'action', 'adventure'] 

8515
Movie:  Hysterical Blindness 
Predicted genre:  [('drama',)]
Actual genre:  ['television', 'period piece', 'drama', 'romance'] 



In [None]:
for i in range(5): 
    print(k)
    k = xval.sample(1).index[0] 
    print("Movie: ", df['title'][k], "\nPredicted genre: ", new_val(xval[k]))
    print("Actual genre: ",df['genre'][k], "\n")

6359
Movie:  Chicago 
Predicted genre:  [('drama',)]
Actual genre:  ['crime', 'silent', 'indie', 'comedydrama', 'blackandwhite', 'drama', 'comedy'] 

3600
Movie:  Medea 
Predicted genre:  [('drama',)]
Actual genre:  ['drama', 'art'] 

9225
Movie:  The Song of Bernadette 
Predicted genre:  [('drama',)]
Actual genre:  ['biography', 'period piece', 'drama', 'blackandwhite'] 

14775
Movie:  The Land Unknown 
Predicted genre:  [('adventure', 'science fiction')]
Actual genre:  ['science fiction', 'fantasy', 'adventure', 'blackandwhite'] 

10100
Movie:  Dangerous Beauty 
Predicted genre:  [('drama',)]
Actual genre:  ['drama', 'biopic feature', 'period piece', 'adaptation', 'romance', 'biographical'] 



In [None]:
f = pd.DataFrame(xval)[0:100]
f['generated_genre'] = pd.DataFrame(xval)['text'].apply(new_val)
f.merge(df, left_index=True, how='left', right_index=True)

Unnamed: 0,text_x,generated_genre,index,id,text_y,title,genre
10736,two starcrossed lovers set eloping forced big wedding bride develops plan unfortunately groom privy plan meanwhile family friends start take sides reception wedding never took placehttpothervenicefilmfestcomprogramhtml,"[(comedy,)]",10736,9854621,two starcrossed lovers set eloping forced big wedding bride develops plan unfortunately groom privy plan meanwhile family friends start take sides reception wedding never took placehttpothervenicefilmfestcomprogramhtml,Cake: A Wedding Story,"[comedy, romance, indie]"
16919,story begins much like classic fairy tale red riding hood see grandmother lives woods present plans bring grandmother tweety sylvester sees reds cargo immediately begins going primary interest tweety red boards bus sylvester continues drives woods inattentive cat striking road sign along way woo...,"[(animation, comedy, family, short)]",16919,18847566,story begins much like classic fairy tale red riding hood see grandmother lives woods present plans bring grandmother tweety sylvester sees reds cargo immediately begins going primary interest tweety red boards bus sylvester continues drives woods inattentive cat striking road sign along way woo...,Red Riding Hoodwinked,"[short, family, comedy, animation]"
11875,ellen ripley survivor space freighter nostromo rescued revived drifting fiftyseven years stasis interview panel executives employer weylandyutani corporation testimony regarding alien met extreme skepticism physical evidence ripley loses spaceflight license result questionable judgment learns lv...,"[(science fiction, thriller)]",11875,213472,ellen ripley survivor space freighter nostromo rescued revived drifting fiftyseven years stasis interview panel executives employer weylandyutani corporation testimony regarding alien met extreme skepticism physical evidence ripley loses spaceflight license result questionable judgment learns lv...,Aliens,"[thriller, science fiction, horror, adventure, creature, actionadventure, action]"
8327,film centers around sixyearold girl named savannah whose father running united states senate savannahs parents pay attention decides run away leaving note flees unfortunately father fearing note may hurt chances winning election burns without even reading aunt picks go park savannah switches car...,"[(drama,)]",8327,6794558,film centers around sixyearold girl named savannah whose father running united states senate savannahs parents pay attention decides run away leaving note flees unfortunately father fearing note may hurt chances winning election burns without even reading aunt picks go park savannah switches car...,Savannah Smiles,"[childrensfamily, childrens, crime, comedy, drama]"
10403,stoker thompson yearold hasbeen boxer tiny stokers manager sure continue lose fights takes money dive mobster sure thompson lose doesnt tell boxer setup beginning fourth last round vicious boxing match much younger heavilyfavored tiger nelson stoker learns fix even though learns little boy feare...,"[(drama,)]",10403,1570532,stoker thompson yearold hasbeen boxer tiny stokers manager sure continue lose fights takes money dive mobster sure thompson lose doesnt tell boxer setup beginning fourth last round vicious boxing match much younger heavilyfavored tiger nelson stoker learns fix even though learns little boy feare...,The Set-Up,"[sports, drama, blackandwhite, noir]"
...,...,...,...,...,...,...,...
11402,san diego california william miller teenaged aspiring rock journalist mother elaine local college professor strange mix new age conservative beliefs wants become lawyer miller writes underground papers sharing love rock music instilled gift albums given sister anita left home disgust elaines hou...,"[(drama,)]",11402,431562,san diego california william miller teenaged aspiring rock journalist mother elaine local college professor strange mix new age conservative beliefs wants become lawyer miller writes underground papers sharing love rock music instilled gift albums given sister anita left home disgust elaines hou...,Almost Famous,"[comedydrama, period piece, musical, coming of age, comedy, drama]"
16727,young spunky piglet named gordy lives meadow brook farm somewhere arkansas father taken north undetermined fate piglet learns farms rooster mother siblings taken pursued father determined locate family return farm gordy heads alone find eventually ends care jinnie sue macallister young upbeat co...,"[(comedy,)]",16727,2589373,young spunky piglet named gordy lives meadow brook farm somewhere arkansas father taken north undetermined fate piglet learns farms rooster mother siblings taken pursued father determined locate family return farm gordy heads alone find eventually ends care jinnie sue macallister young upbeat co...,Gordy,"[childrensfamily, family, comedy, childrens]"
16273,failing uhf tv station krud channel reborn christian television station kgod new format big success attracts incompatible mix fringe ministries broadcasters wanting time station series humorous vignettes show different religious shows station broadcasts faith healer radical black nationalist pre...,"[(comedy,)]",16273,5939943,failing uhf tv station krud channel reborn christian television station kgod new format big success attracts incompatible mix fringe ministries broadcasters wanting time station series humorous vignettes show different religious shows station broadcasts faith healer radical black nationalist pre...,Pray TV,"[parody, comedy]"
9220,phillip bellamy leading barrister meets wife doctor anne dyson psychiatrist lets drop case involved defending young hoodlum apparently shot policemen deserted road found police still holding smoking gun bellamy convinced guilt compelled defend legal code wife suggests take look patient devotes m...,"[(crime, drama, mystery, thriller)]",9220,24304179,phillip bellamy leading barrister meets wife doctor anne dyson psychiatrist lets drop case involved defending young hoodlum apparently shot policemen deserted road found police still holding smoking gun bellamy convinced guilt compelled defend legal code wife suggests take look patient devotes m...,Mix Me a Person,"[crime, mystery, melodrama, drama]"


Naive Bayes

In [None]:
# do with naive bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.multiclass import OneVsRestClassifier

# Create a classifier: a support vector classifier
classifier = OneVsRestClassifier(GaussianNB())

# Train the classifier on the training set
classifier.fit(xtrain_tfidf.toarray(), ytrain)

# Predict and evaluate
y_pred = classifier.predict(xval_tfidf.toarray())
print(classification_report(yval, y_pred))

# Sample predictions
def new_val(x):  
    x = remove_stopwords(clean_text(x))
    x_vec = tfidf_vect.transform([x])
    x_pred = classifier.predict(x_vec.toarray())
    return multilabel_bina.inverse_transform(x_pred)

In [None]:
#write classification report to csv
report = classification_report(yval, y_pred, output_dict=True)
df_report = pd.DataFrame(report).transpose()
df_report.to_csv('Results\Metrics\classification_report_balanced_NB.csv', index=True, sep=';')

In [None]:
for i in range(5): 
    k = xval.sample(1).index[0] 
    print("Movie: ", df['title'][k], "\nPredicted genre: ", new_val(xval[k]))
    print("Actual genre: ",df['genre'][k], "\n")

SVM

In [None]:
#do state vector machine
import sklearn.svm as svm
from sklearn.multiclass import OneVsRestClassifier

# Create a classifier: a support vector classifier
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
                                 random_state=0))

# Train the classifier on the training set
classifier.fit(xtrain_tfidf, ytrain)

# Predict and evaluate
y_pred = classifier.predict(xval_tfidf)
print(classification_report(yval, y_pred))

# Sample predictions
def new_val(x):  
    x = remove_stopwords(clean_text(x))
    x_vec = tfidf_vect.transform([x])
    x_pred = classifier.predict(x_vec)
    return multilabel_bina.inverse_transform(x_pred)

In [None]:
#write classification report to csv
report = classification_report(yval, y_pred, output_dict=True)
df_report = pd.DataFrame(report).transpose()
df_report.to_csv('Results\Metrics\classification_report_balanced_SVM.csv', index=True, sep=';')

In [None]:
for i in range(5): 
    k = xval.sample(1).index[0] 
    print("Movie: ", df['title'][k], "\nPredicted genre: ", new_val(xval[k]))
    print("Actual genre: ",df['genre'][k], "\n")