## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from nltk.tokenize import TweetTokenizer # doesn't split at apostrophes
import nltk
from nltk import Text
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import word_tokenize  
from nltk.tokenize import sent_tokenize 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

## Uploading the data
Data used in for training the model can be found at this [link](https://www.kaggle.com/jrobischon/wikipedia-movie-plots)

In [2]:
train_df2 = pd.read_csv("wiki_movie_plots_deduped.csv", delimiter=',')
train_df2.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [3]:
train_df2.columns

Index(['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast',
       'Genre', 'Wiki Page', 'Plot'],
      dtype='object')

In [4]:
train_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34886 entries, 0 to 34885
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      34886 non-null  int64 
 1   Title             34886 non-null  object
 2   Origin/Ethnicity  34886 non-null  object
 3   Director          34886 non-null  object
 4   Cast              33464 non-null  object
 5   Genre             34886 non-null  object
 6   Wiki Page         34886 non-null  object
 7   Plot              34886 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.1+ MB


In [5]:
train_df2.describe()

Unnamed: 0,Release Year
count,34886.0
mean,1981.314252
std,27.815174
min,1901.0
25%,1957.0
50%,1988.0
75%,2007.0
max,2017.0


In [6]:
train_df = pd.DataFrame(columns=['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast',
       'Genre', 'Wiki Page', 'Plot'])
train_df

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot


In [7]:
rand = np.random.randint(0,34500)
dat = train_df2.iloc[rand].copy()
print(type(dat))

<class 'pandas.core.series.Series'>


In [8]:
## Selection of 3000 random movies in the dataset
## Higher number won't be supported due to the limitation of cpu
for i in range(3000):
    rand = np.random.randint(0,34500)
    dat = train_df2.iloc[rand].copy()
    df = pd.DataFrame([dat],index = [i], columns=['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast',
       'Genre', 'Wiki Page', 'Plot']) 
    train_df = pd.concat([train_df,df])
train_df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1948,Bhaktha Jana,Tamil,P. Pullaiah,"C. Honnappa Bhagavathar, Chittor V. Nagaiah, P...",unknown,https://en.wikipedia.org/wiki/Bhaktha_Jana,Janaka (Santhakumari) has been a staunch devot...
1,1974,Paul and Michelle,British,Lewis Gilbert,"Anicée Alvina, Sean Bury",drama,https://en.wikipedia.org/wiki/Paul_and_Michelle,Taking place approximately three years after t...
2,2016,Storks,American,Doug Sweetland,,unknown,https://en.wikipedia.org/wiki/Storks_(film),"For generations, the storks of Stork Mountain ..."
3,1987,Watch the Shadows Dance,Australian,Unknown,,unknown,https://en.wikipedia.org/wiki/Watch_the_Shadow...,A tight-knit group of high school students stu...
4,1963,Murder at the Gallop,British,George Pollock,"Margaret Rutherford, Stringer Davis",mystery,https://en.wikipedia.org/wiki/Murder_at_the_Ga...,While Miss Marple (Margaret Rutherford) and Mr...


In [9]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 0 to 2999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      3000 non-null   object
 1   Title             3000 non-null   object
 2   Origin/Ethnicity  3000 non-null   object
 3   Director          3000 non-null   object
 4   Cast              2885 non-null   object
 5   Genre             3000 non-null   object
 6   Wiki Page         3000 non-null   object
 7   Plot              3000 non-null   object
dtypes: object(8)
memory usage: 210.9+ KB


In [10]:
train_df.describe()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
count,3000,3000,3000,3000,2885,3000,3000,3000
unique,108,2859,21,2084,2736,396,2871,2870
top,2014,Warrant,American,Unknown,Tom and Jerry,drama,https://en.wikipedia.org/wiki/The_Perils_of_Gw...,"Bobby ""Carlo"" Powers and Detective Salvatore ""..."
freq,87,3,1509,101,7,541,3,3


In [11]:
test_df = pd.read_csv('test_data_final.csv')

In [12]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    999 non-null    int64 
 1   Movie_Title   999 non-null    object
 2   Release_date  999 non-null    object
 3   Movie_cast    999 non-null    object
 4   Movie_plot    985 non-null    object
dtypes: int64(1), object(4)
memory usage: 39.1+ KB


In [13]:
test_df.describe()

Unnamed: 0.1,Unnamed: 0
count,999.0
mean,499.0
std,288.530761
min,0.0
25%,249.5
50%,499.0
75%,748.5
max,998.0


In [14]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,Movie_Title,Release_date,Movie_cast,Movie_plot
0,0,Citizen Kane,"May 1, 1941Palace Theatre) September 5, 1941",Orson Welles Joseph Cotten Dorothy Comingore E...,"In a mansion called Xanadu, part of a vast pal..."
1,1,2001: A Space Odyssey (film),2 April 1968Uptown Theater) 3 April 1968 15 Ma...,Keir Dullea Gary Lockwood,"In the prehistoric African veldt, a tribe of h..."
2,2,The Rules of the Game,7 July 1939 (Paris),Nora GregorPaulette DubostMarcel DalioRoland T...,Aviator André Jurieux (Roland Toutain) lands a...
3,3,Bicycle Thieves,24 November 1948,Enzo Staiola Lamberto Maggiorani,In the post-World War II Val Melaina neighbour...
4,4,Vertigo (film),"May 9, 1958",James Stewart Kim Novak Barbara Bel Geddes Tom...,"After a rooftop chase, where a fellow policema..."


In [15]:
test_df.drop(['Release_date','Movie_cast','Unnamed: 0'],axis = 1,inplace = True)
test_df.head()

Unnamed: 0,Movie_Title,Movie_plot
0,Citizen Kane,"In a mansion called Xanadu, part of a vast pal..."
1,2001: A Space Odyssey (film),"In the prehistoric African veldt, a tribe of h..."
2,The Rules of the Game,Aviator André Jurieux (Roland Toutain) lands a...
3,Bicycle Thieves,In the post-World War II Val Melaina neighbour...
4,Vertigo (film),"After a rooftop chase, where a fellow policema..."


In [16]:
train_df['Genre'].nunique()

396

In [17]:
train_df['Genre'].value_counts()

drama                                               541
unknown                                             481
comedy                                              385
thriller                                             96
horror                                               90
                                                   ... 
supernatural                                          1
supernatural horror                                   1
drama, sports (aquatics, swimming), comedy-drama      1
romance / fantasy                                     1
romance / drama                                       1
Name: Genre, Length: 396, dtype: int64

In [18]:
import string
alpha = string.ascii_lowercase
alpha_up = string.ascii_uppercase
print(alpha)
print(alpha_up)

def remove_punc(data):
    '''
    This function will remove the punctuations from the given string
    '''
    string = ''
    for i in data:
        if i in alpha:
            string = string + i
        elif i in alpha_up:
            string = string + i
        elif i == ' ':
            string = string + i
        else:
            continue
    return string

abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ


In [19]:
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
       
    # Now just remove any stopwords
    return [word for word in mess.split() if word.lower() not in stopwords.words('english')]

In [20]:
def genre_corr(data):
    '''
    This function will classify all the given genres into the desired 6 categories
    '''
    data = data.split()
    final_gen = ''
    list_rom = ['romance','love','love story','musical b', 'romantic','rom-coms','music','musical','actionlove','romanceaction',
               'romancecomedy','romancehorror','romcom','rom\|com','rom',' \(artistic\)',"drama|romance|adult|children"
               ]
    list_act = ['act','action','adventures','kung fu','martial arts','world war ii','world war i','spy film','biker film',
               'buddy cop','buddy film','bruceploitation','drama about child soldiers',"war-time","wartime","ww1","wwii",
               'true crime','crime','\|007','gun fu','afghan war drama','actionadventure','actioncomedy','actiondrama',
                'actionlove','actionmasala','actionchildren','adventurecomedy','actionthriller','martialarts',' \(volleyball\)',
                ' \(aquatics|swimming\)',' \(aquatics|swimming\)',' \(shogi|chess\)',' (road bicycle racing)','american football',
                'dev\|nusrat jahan',' \(road bicycle racing\)','liveaction','heistcomedy','heist','historydisaster','warcomedy',
                'samurai','martial_arts','adventure','spy','superhero',"drama|romance|adult|children",'actionner',
               ]
    list_sus = ['ttriller','coming of age','coming-of-age','slice of life','psycho thriller,',"ero",'actionadventure','dramathriller',
                'dramathriller','thriler','crimethriller','actionthriller','comedysocial','erotica','erotic','comedythriller',
                  'colour\|yellow\|productions\|eros\|international',       'melodrama', 'gangsterthriller',  'ancientcostume', 
                'dramatic','biodrama','bio-drama','comedy-drama adaptation of the mordecai richler novel','drama about child soldiers',
                'drama loosely','slice of life',"comedy–drama"'actionlove','actiondrama','fantasycomedy','dramacomedy',
                'dramacomedysocial','dramathriller','comedydrama','comedyhorror','adventurecomedy','animationdrama','comedysocial',
                'erotica','erotic','biblical','biblical','colour\|yellow\|productions\|eros\|international','liveaction','superheroes',
                'heistcomedy','heist','warcomedy','dramatic','familya','familya','dramedy','dramaa','famil\|','superheroe',
                'devotionalbiography','familydrama','espionage','romancefiction','horrorthriller','suspensethriller','triller',
                'satirical','homosexual','sexual','mockumentary','periodic','politics','tv_miniseries','serial',"musical–comedy",
                "roman|porno","action—masala","horror–thriller",'family','martial_arts','horror','war','adventure','noir',
                'superhero','social','suspense',"drama|romance|adult|children",'actionner',
                ]
    list_sci = ['animated','anime','children\'s','3-d','3d','sci-fi','sci fi','science fiction','avant-garde','animationchildren',
               'computer animation',   ' in animation',   'actionchildren',  'fantasychildren\|','fantasycomedy','fantasyperiod',
                'sciencefiction','animationdrama','fantay','\|\(children\|poker\|karuta\)','superheroes','computeranimation',
                '\|\(fiction\)','science_fictionchildren','science_fiction','superhero',"drama|romance|adult|children"
                ]
    list_hor = ['psychological','j-horror','psycho thriller,',"comedy–horror",'actionadventure','comedyhorror','horror',
                ]
    for i in data:
        if i.lower() in list_rom:
            final_gen = final_gen + ' Romance'
        elif i.lower() in list_act:
            final_gen = final_gen + ' Action'
        elif i.lower() in list_sus:
            final_gen = final_gen + ' Suspense'
        elif i.lower() in list_sci:    
            final_gen = final_gen + ' Science Fiction'
        elif i.lower() in list_hor:
            final_gen = final_gen + ' Horror'
        else:
            final_gen = final_gen + ' Others'
    final_gen = set(final_gen.split())
    genre = ''
    for i in final_gen:
        genre = genre + ' {}'.format(i)
    return genre
            

In [21]:
def stem(name):
    name_list = []
    ps = PorterStemmer()
    for i in name:
        words = word_tokenize(i)
        name_list.append(words)
    return name_list

## Clearing the data
1. Genre
2. Train plot
3. Test Plot

In [22]:
train_df['Genre String'] = train_df['Genre'].apply(remove_punc)

In [23]:
train_df['Genre Corrected'] = train_df['Genre String'].apply(genre_corr)

In [24]:
train_df.head(10)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,Genre String,Genre Corrected
0,1948,Bhaktha Jana,Tamil,P. Pullaiah,"C. Honnappa Bhagavathar, Chittor V. Nagaiah, P...",unknown,https://en.wikipedia.org/wiki/Bhaktha_Jana,Janaka (Santhakumari) has been a staunch devot...,unknown,Others
1,1974,Paul and Michelle,British,Lewis Gilbert,"Anicée Alvina, Sean Bury",drama,https://en.wikipedia.org/wiki/Paul_and_Michelle,Taking place approximately three years after t...,drama,Others
2,2016,Storks,American,Doug Sweetland,,unknown,https://en.wikipedia.org/wiki/Storks_(film),"For generations, the storks of Stork Mountain ...",unknown,Others
3,1987,Watch the Shadows Dance,Australian,Unknown,,unknown,https://en.wikipedia.org/wiki/Watch_the_Shadow...,A tight-knit group of high school students stu...,unknown,Others
4,1963,Murder at the Gallop,British,George Pollock,"Margaret Rutherford, Stringer Davis",mystery,https://en.wikipedia.org/wiki/Murder_at_the_Ga...,While Miss Marple (Margaret Rutherford) and Mr...,mystery,Others
5,2010,Paathshaala,Bollywood,Milind Ukey,"Shahid Kapoor, Nana Patekar, Ayesha Takia, Swi...",social,https://en.wikipedia.org/wiki/Paathshaala,The story begins with a new English teacher Ra...,social,Suspense
6,2011,Vellaripravinte Changathi (വെള്ളരിപ്രാവിന്റെ ച...,Malayalam,Akku Akbar,"Dileep, Indrajith, Kavya Madhavan, Manoj K. Jayan",drama,https://en.wikipedia.org/wiki/Vellaripravinte_...,Vellaripravinte Changathi is based on the stor...,drama,Others
7,1980,Prom Night,American,Paul Lynch,"Jamie Lee Curtis, Leslie Nielsen",horror,https://en.wikipedia.org/wiki/Prom_Night_(1980...,"In 1974, 11-year-olds Wendy Richards, Jude Cun...",horror,Suspense
8,2003,Pauly Shore Is Dead,American,Pauly Shore,Pauly Shore,mockumentary,https://en.wikipedia.org/wiki/Pauly_Shore_Is_Dead,The film begins as an autobiographical look at...,mockumentary,Suspense
9,1939,The Big Guy,American,Arthur Lubin,"Victor McLaglen, Jackie Cooper, Ona Munson",crime,https://en.wikipedia.org/wiki/The_Big_Guy,A prison warden (Victor McLaglen) can either k...,crime,Action


In [25]:
train_df['GenreSplit']=train_df['Genre Corrected'].str.split()

In [26]:
train_df.head(10)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,Genre String,Genre Corrected,GenreSplit
0,1948,Bhaktha Jana,Tamil,P. Pullaiah,"C. Honnappa Bhagavathar, Chittor V. Nagaiah, P...",unknown,https://en.wikipedia.org/wiki/Bhaktha_Jana,Janaka (Santhakumari) has been a staunch devot...,unknown,Others,[Others]
1,1974,Paul and Michelle,British,Lewis Gilbert,"Anicée Alvina, Sean Bury",drama,https://en.wikipedia.org/wiki/Paul_and_Michelle,Taking place approximately three years after t...,drama,Others,[Others]
2,2016,Storks,American,Doug Sweetland,,unknown,https://en.wikipedia.org/wiki/Storks_(film),"For generations, the storks of Stork Mountain ...",unknown,Others,[Others]
3,1987,Watch the Shadows Dance,Australian,Unknown,,unknown,https://en.wikipedia.org/wiki/Watch_the_Shadow...,A tight-knit group of high school students stu...,unknown,Others,[Others]
4,1963,Murder at the Gallop,British,George Pollock,"Margaret Rutherford, Stringer Davis",mystery,https://en.wikipedia.org/wiki/Murder_at_the_Ga...,While Miss Marple (Margaret Rutherford) and Mr...,mystery,Others,[Others]
5,2010,Paathshaala,Bollywood,Milind Ukey,"Shahid Kapoor, Nana Patekar, Ayesha Takia, Swi...",social,https://en.wikipedia.org/wiki/Paathshaala,The story begins with a new English teacher Ra...,social,Suspense,[Suspense]
6,2011,Vellaripravinte Changathi (വെള്ളരിപ്രാവിന്റെ ച...,Malayalam,Akku Akbar,"Dileep, Indrajith, Kavya Madhavan, Manoj K. Jayan",drama,https://en.wikipedia.org/wiki/Vellaripravinte_...,Vellaripravinte Changathi is based on the stor...,drama,Others,[Others]
7,1980,Prom Night,American,Paul Lynch,"Jamie Lee Curtis, Leslie Nielsen",horror,https://en.wikipedia.org/wiki/Prom_Night_(1980...,"In 1974, 11-year-olds Wendy Richards, Jude Cun...",horror,Suspense,[Suspense]
8,2003,Pauly Shore Is Dead,American,Pauly Shore,Pauly Shore,mockumentary,https://en.wikipedia.org/wiki/Pauly_Shore_Is_Dead,The film begins as an autobiographical look at...,mockumentary,Suspense,[Suspense]
9,1939,The Big Guy,American,Arthur Lubin,"Victor McLaglen, Jackie Cooper, Ona Munson",crime,https://en.wikipedia.org/wiki/The_Big_Guy,A prison warden (Victor McLaglen) can either k...,crime,Action,[Action]


In [27]:
 train_df['GenreSplit len']= train_df['GenreSplit'].apply(len)

In [28]:
train_df.head(10)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,Genre String,Genre Corrected,GenreSplit,GenreSplit len
0,1948,Bhaktha Jana,Tamil,P. Pullaiah,"C. Honnappa Bhagavathar, Chittor V. Nagaiah, P...",unknown,https://en.wikipedia.org/wiki/Bhaktha_Jana,Janaka (Santhakumari) has been a staunch devot...,unknown,Others,[Others],1
1,1974,Paul and Michelle,British,Lewis Gilbert,"Anicée Alvina, Sean Bury",drama,https://en.wikipedia.org/wiki/Paul_and_Michelle,Taking place approximately three years after t...,drama,Others,[Others],1
2,2016,Storks,American,Doug Sweetland,,unknown,https://en.wikipedia.org/wiki/Storks_(film),"For generations, the storks of Stork Mountain ...",unknown,Others,[Others],1
3,1987,Watch the Shadows Dance,Australian,Unknown,,unknown,https://en.wikipedia.org/wiki/Watch_the_Shadow...,A tight-knit group of high school students stu...,unknown,Others,[Others],1
4,1963,Murder at the Gallop,British,George Pollock,"Margaret Rutherford, Stringer Davis",mystery,https://en.wikipedia.org/wiki/Murder_at_the_Ga...,While Miss Marple (Margaret Rutherford) and Mr...,mystery,Others,[Others],1
5,2010,Paathshaala,Bollywood,Milind Ukey,"Shahid Kapoor, Nana Patekar, Ayesha Takia, Swi...",social,https://en.wikipedia.org/wiki/Paathshaala,The story begins with a new English teacher Ra...,social,Suspense,[Suspense],1
6,2011,Vellaripravinte Changathi (വെള്ളരിപ്രാവിന്റെ ച...,Malayalam,Akku Akbar,"Dileep, Indrajith, Kavya Madhavan, Manoj K. Jayan",drama,https://en.wikipedia.org/wiki/Vellaripravinte_...,Vellaripravinte Changathi is based on the stor...,drama,Others,[Others],1
7,1980,Prom Night,American,Paul Lynch,"Jamie Lee Curtis, Leslie Nielsen",horror,https://en.wikipedia.org/wiki/Prom_Night_(1980...,"In 1974, 11-year-olds Wendy Richards, Jude Cun...",horror,Suspense,[Suspense],1
8,2003,Pauly Shore Is Dead,American,Pauly Shore,Pauly Shore,mockumentary,https://en.wikipedia.org/wiki/Pauly_Shore_Is_Dead,The film begins as an autobiographical look at...,mockumentary,Suspense,[Suspense],1
9,1939,The Big Guy,American,Arthur Lubin,"Victor McLaglen, Jackie Cooper, Ona Munson",crime,https://en.wikipedia.org/wiki/The_Big_Guy,A prison warden (Victor McLaglen) can either k...,crime,Action,[Action],1


In [29]:
train_df['GenreSplit len'].describe()

count    3000.000000
mean        1.179333
std         0.424146
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         4.000000
Name: GenreSplit len, dtype: float64

In [30]:
def pop_other(name):
    for i in range(len(name)):
        if len(name) > 1:
            if 'Others' in name:
                ind = name.index('Others')
                name.pop(ind)
        elif len(name) == 0:
            name.append('Others')
    return name

In [31]:
train_df['GenreSplit'] =  train_df['GenreSplit'].apply(pop_other)

In [32]:
train_df.head(10)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,Genre String,Genre Corrected,GenreSplit,GenreSplit len
0,1948,Bhaktha Jana,Tamil,P. Pullaiah,"C. Honnappa Bhagavathar, Chittor V. Nagaiah, P...",unknown,https://en.wikipedia.org/wiki/Bhaktha_Jana,Janaka (Santhakumari) has been a staunch devot...,unknown,Others,[Others],1
1,1974,Paul and Michelle,British,Lewis Gilbert,"Anicée Alvina, Sean Bury",drama,https://en.wikipedia.org/wiki/Paul_and_Michelle,Taking place approximately three years after t...,drama,Others,[Others],1
2,2016,Storks,American,Doug Sweetland,,unknown,https://en.wikipedia.org/wiki/Storks_(film),"For generations, the storks of Stork Mountain ...",unknown,Others,[Others],1
3,1987,Watch the Shadows Dance,Australian,Unknown,,unknown,https://en.wikipedia.org/wiki/Watch_the_Shadow...,A tight-knit group of high school students stu...,unknown,Others,[Others],1
4,1963,Murder at the Gallop,British,George Pollock,"Margaret Rutherford, Stringer Davis",mystery,https://en.wikipedia.org/wiki/Murder_at_the_Ga...,While Miss Marple (Margaret Rutherford) and Mr...,mystery,Others,[Others],1
5,2010,Paathshaala,Bollywood,Milind Ukey,"Shahid Kapoor, Nana Patekar, Ayesha Takia, Swi...",social,https://en.wikipedia.org/wiki/Paathshaala,The story begins with a new English teacher Ra...,social,Suspense,[Suspense],1
6,2011,Vellaripravinte Changathi (വെള്ളരിപ്രാവിന്റെ ച...,Malayalam,Akku Akbar,"Dileep, Indrajith, Kavya Madhavan, Manoj K. Jayan",drama,https://en.wikipedia.org/wiki/Vellaripravinte_...,Vellaripravinte Changathi is based on the stor...,drama,Others,[Others],1
7,1980,Prom Night,American,Paul Lynch,"Jamie Lee Curtis, Leslie Nielsen",horror,https://en.wikipedia.org/wiki/Prom_Night_(1980...,"In 1974, 11-year-olds Wendy Richards, Jude Cun...",horror,Suspense,[Suspense],1
8,2003,Pauly Shore Is Dead,American,Pauly Shore,Pauly Shore,mockumentary,https://en.wikipedia.org/wiki/Pauly_Shore_Is_Dead,The film begins as an autobiographical look at...,mockumentary,Suspense,[Suspense],1
9,1939,The Big Guy,American,Arthur Lubin,"Victor McLaglen, Jackie Cooper, Ona Munson",crime,https://en.wikipedia.org/wiki/The_Big_Guy,A prison warden (Victor McLaglen) can either k...,crime,Action,[Action],1


In [33]:
def lst_str (name):
    return ' '.join(name)

def str_lst (name):
    lst = []
    for i in name.split():
        lst.append(i)
    return lst

In [34]:
train_df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,Genre String,Genre Corrected,GenreSplit,GenreSplit len
0,1948,Bhaktha Jana,Tamil,P. Pullaiah,"C. Honnappa Bhagavathar, Chittor V. Nagaiah, P...",unknown,https://en.wikipedia.org/wiki/Bhaktha_Jana,Janaka (Santhakumari) has been a staunch devot...,unknown,Others,[Others],1
1,1974,Paul and Michelle,British,Lewis Gilbert,"Anicée Alvina, Sean Bury",drama,https://en.wikipedia.org/wiki/Paul_and_Michelle,Taking place approximately three years after t...,drama,Others,[Others],1
2,2016,Storks,American,Doug Sweetland,,unknown,https://en.wikipedia.org/wiki/Storks_(film),"For generations, the storks of Stork Mountain ...",unknown,Others,[Others],1
3,1987,Watch the Shadows Dance,Australian,Unknown,,unknown,https://en.wikipedia.org/wiki/Watch_the_Shadow...,A tight-knit group of high school students stu...,unknown,Others,[Others],1
4,1963,Murder at the Gallop,British,George Pollock,"Margaret Rutherford, Stringer Davis",mystery,https://en.wikipedia.org/wiki/Murder_at_the_Ga...,While Miss Marple (Margaret Rutherford) and Mr...,mystery,Others,[Others],1


In [35]:
train_df['Genre_final'] = train_df['GenreSplit']

In [36]:
train_df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,Genre String,Genre Corrected,GenreSplit,GenreSplit len,Genre_final
0,1948,Bhaktha Jana,Tamil,P. Pullaiah,"C. Honnappa Bhagavathar, Chittor V. Nagaiah, P...",unknown,https://en.wikipedia.org/wiki/Bhaktha_Jana,Janaka (Santhakumari) has been a staunch devot...,unknown,Others,[Others],1,[Others]
1,1974,Paul and Michelle,British,Lewis Gilbert,"Anicée Alvina, Sean Bury",drama,https://en.wikipedia.org/wiki/Paul_and_Michelle,Taking place approximately three years after t...,drama,Others,[Others],1,[Others]
2,2016,Storks,American,Doug Sweetland,,unknown,https://en.wikipedia.org/wiki/Storks_(film),"For generations, the storks of Stork Mountain ...",unknown,Others,[Others],1,[Others]
3,1987,Watch the Shadows Dance,Australian,Unknown,,unknown,https://en.wikipedia.org/wiki/Watch_the_Shadow...,A tight-knit group of high school students stu...,unknown,Others,[Others],1,[Others]
4,1963,Murder at the Gallop,British,George Pollock,"Margaret Rutherford, Stringer Davis",mystery,https://en.wikipedia.org/wiki/Murder_at_the_Ga...,While Miss Marple (Margaret Rutherford) and Mr...,mystery,Others,[Others],1,[Others]


In [37]:
train_df.columns

Index(['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast',
       'Genre', 'Wiki Page', 'Plot', 'Genre String', 'Genre Corrected',
       'GenreSplit', 'GenreSplit len', 'Genre_final'],
      dtype='object')

In [38]:
train_df.drop(['Release Year', 'Origin/Ethnicity', 'Director', 'Cast',
       'Genre', 'Wiki Page', 'Genre String', 'Genre Corrected',
       'GenreSplit', 'GenreSplit len'],axis = 1, inplace = True)
train_df.head()

Unnamed: 0,Title,Plot,Genre_final
0,Bhaktha Jana,Janaka (Santhakumari) has been a staunch devot...,[Others]
1,Paul and Michelle,Taking place approximately three years after t...,[Others]
2,Storks,"For generations, the storks of Stork Mountain ...",[Others]
3,Watch the Shadows Dance,A tight-knit group of high school students stu...,[Others]
4,Murder at the Gallop,While Miss Marple (Margaret Rutherford) and Mr...,[Others]


<b>Train Plot Cleaning <b>

In [39]:
train_df['Plot without punc'] = train_df['Plot'].apply(remove_punc)
train_df['Plot final clean'] = train_df['Plot without punc'].apply(text_process)
train_df.head()

Unnamed: 0,Title,Plot,Genre_final,Plot without punc,Plot final clean
0,Bhaktha Jana,Janaka (Santhakumari) has been a staunch devot...,[Others],Janaka Santhakumari has been a staunch devotee...,"[Janaka, Santhakumari, staunch, devotee, Pandu..."
1,Paul and Michelle,Taking place approximately three years after t...,[Others],Taking place approximately three years after t...,"[Taking, place, approximately, three, years, e..."
2,Storks,"For generations, the storks of Stork Mountain ...",[Others],For generations the storks of Stork Mountain d...,"[generations, storks, Stork, Mountain, deliver..."
3,Watch the Shadows Dance,A tight-knit group of high school students stu...,[Others],A tightknit group of high school students stud...,"[tightknit, group, high, school, students, stu..."
4,Murder at the Gallop,While Miss Marple (Margaret Rutherford) and Mr...,[Others],While Miss Marple Margaret Rutherford and Mr S...,"[Miss, Marple, Margaret, Rutherford, Mr, Strin..."


In [40]:
train_df['Plot Stem'] = train_df['Plot final clean'].apply(stem)
train_df.head()

Unnamed: 0,Title,Plot,Genre_final,Plot without punc,Plot final clean,Plot Stem
0,Bhaktha Jana,Janaka (Santhakumari) has been a staunch devot...,[Others],Janaka Santhakumari has been a staunch devotee...,"[Janaka, Santhakumari, staunch, devotee, Pandu...","[[Janaka], [Santhakumari], [staunch], [devotee..."
1,Paul and Michelle,Taking place approximately three years after t...,[Others],Taking place approximately three years after t...,"[Taking, place, approximately, three, years, e...","[[Taking], [place], [approximately], [three], ..."
2,Storks,"For generations, the storks of Stork Mountain ...",[Others],For generations the storks of Stork Mountain d...,"[generations, storks, Stork, Mountain, deliver...","[[generations], [storks], [Stork], [Mountain],..."
3,Watch the Shadows Dance,A tight-knit group of high school students stu...,[Others],A tightknit group of high school students stud...,"[tightknit, group, high, school, students, stu...","[[tightknit], [group], [high], [school], [stud..."
4,Murder at the Gallop,While Miss Marple (Margaret Rutherford) and Mr...,[Others],While Miss Marple Margaret Rutherford and Mr S...,"[Miss, Marple, Margaret, Rutherford, Mr, Strin...","[[Miss], [Marple], [Margaret], [Rutherford], [..."


In [41]:
def lst_str_plot (name):
    char = ' '
    for i in name:
        char = char + ' ' +str(i[0])
    return char

In [42]:
train_df['Plot final'] = train_df['Plot Stem'].apply(lst_str_plot)
train_df.head()

Unnamed: 0,Title,Plot,Genre_final,Plot without punc,Plot final clean,Plot Stem,Plot final
0,Bhaktha Jana,Janaka (Santhakumari) has been a staunch devot...,[Others],Janaka Santhakumari has been a staunch devotee...,"[Janaka, Santhakumari, staunch, devotee, Pandu...","[[Janaka], [Santhakumari], [staunch], [devotee...",Janaka Santhakumari staunch devotee Panduran...
1,Paul and Michelle,Taking place approximately three years after t...,[Others],Taking place approximately three years after t...,"[Taking, place, approximately, three, years, e...","[[Taking], [place], [approximately], [three], ...",Taking place approximately three years event...
2,Storks,"For generations, the storks of Stork Mountain ...",[Others],For generations the storks of Stork Mountain d...,"[generations, storks, Stork, Mountain, deliver...","[[generations], [storks], [Stork], [Mountain],...",generations storks Stork Mountain delivered ...
3,Watch the Shadows Dance,A tight-knit group of high school students stu...,[Others],A tightknit group of high school students stud...,"[tightknit, group, high, school, students, stu...","[[tightknit], [group], [high], [school], [stud...",tightknit group high school students studies...
4,Murder at the Gallop,While Miss Marple (Margaret Rutherford) and Mr...,[Others],While Miss Marple Margaret Rutherford and Mr S...,"[Miss, Marple, Margaret, Rutherford, Mr, Strin...","[[Miss], [Marple], [Margaret], [Rutherford], [...",Miss Marple Margaret Rutherford Mr Stringer ...


In [43]:
train_df.columns

Index(['Title', 'Plot', 'Genre_final', 'Plot without punc', 'Plot final clean',
       'Plot Stem', 'Plot final'],
      dtype='object')

In [44]:
train_df.drop([ 'Plot', 'Plot without punc', 'Plot final clean',
       'Plot Stem'], axis = 1, inplace = True)
train_df.head()

Unnamed: 0,Title,Genre_final,Plot final
0,Bhaktha Jana,[Others],Janaka Santhakumari staunch devotee Panduran...
1,Paul and Michelle,[Others],Taking place approximately three years event...
2,Storks,[Others],generations storks Stork Mountain delivered ...
3,Watch the Shadows Dance,[Others],tightknit group high school students studies...
4,Murder at the Gallop,[Others],Miss Marple Margaret Rutherford Mr Stringer ...


In [45]:
train_df['Plot len'] = train_df['Plot final'].apply(len)
train_df['Plot len'].value_counts()

513     8
101     7
336     6
219     6
502     6
       ..
257     1
2310    1
2314    1
2320    1
2053    1
Name: Plot len, Length: 1906, dtype: int64

In [46]:
train_df['Plot len'].describe()

count    3000.000000
mean     1457.624333
std      1155.491332
min        18.000000
25%       498.000000
50%      1129.500000
75%      2280.250000
max      9459.000000
Name: Plot len, dtype: float64

<b> Removing those entries where plot length is less than 50 words <b>

In [47]:
train_df = train_df[train_df['Plot len'] > 50 ]
train_df.head()

Unnamed: 0,Title,Genre_final,Plot final,Plot len
0,Bhaktha Jana,[Others],Janaka Santhakumari staunch devotee Panduran...,767
1,Paul and Michelle,[Others],Taking place approximately three years event...,520
2,Storks,[Others],generations storks Stork Mountain delivered ...,2493
3,Watch the Shadows Dance,[Others],tightknit group high school students studies...,1367
4,Murder at the Gallop,[Others],Miss Marple Margaret Rutherford Mr Stringer ...,1924


In [48]:
train_df['Plot len'].describe()

count    2988.000000
mean     1463.319946
std      1154.301114
min        51.000000
25%       502.000000
50%      1135.000000
75%      2284.500000
max      9459.000000
Name: Plot len, dtype: float64

In [49]:
train_df.drop([ 'Plot len'], axis = 1, inplace = True)
train_df.head()

Unnamed: 0,Title,Genre_final,Plot final
0,Bhaktha Jana,[Others],Janaka Santhakumari staunch devotee Panduran...
1,Paul and Michelle,[Others],Taking place approximately three years event...
2,Storks,[Others],generations storks Stork Mountain delivered ...
3,Watch the Shadows Dance,[Others],tightknit group high school students studies...
4,Murder at the Gallop,[Others],Miss Marple Margaret Rutherford Mr Stringer ...


In [50]:
train_df.to_csv('train_final_data.csv')

<b>Test plot data cleaning<b>

In [51]:
test_df.head()

Unnamed: 0,Movie_Title,Movie_plot
0,Citizen Kane,"In a mansion called Xanadu, part of a vast pal..."
1,2001: A Space Odyssey (film),"In the prehistoric African veldt, a tribe of h..."
2,The Rules of the Game,Aviator André Jurieux (Roland Toutain) lands a...
3,Bicycle Thieves,In the post-World War II Val Melaina neighbour...
4,Vertigo (film),"After a rooftop chase, where a fellow policema..."


In [52]:
def test_text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    mess = str(mess)
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation and type(char)== str]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [53]:
test_df['Plot without punc and stopwords'] = test_df['Movie_plot'].apply(test_text_process)
test_df['Plot Stem'] = test_df['Plot without punc and stopwords'].apply(stem)
test_df.head()

Unnamed: 0,Movie_Title,Movie_plot,Plot without punc and stopwords,Plot Stem
0,Citizen Kane,"In a mansion called Xanadu, part of a vast pal...","[mansion, called, Xanadu, part, vast, palatial...","[[mansion], [called], [Xanadu], [part], [vast]..."
1,2001: A Space Odyssey (film),"In the prehistoric African veldt, a tribe of h...","[prehistoric, African, veldt, tribe, hominids,...","[[prehistoric], [African], [veldt], [tribe], [..."
2,The Rules of the Game,Aviator André Jurieux (Roland Toutain) lands a...,"[Aviator, André, Jurieux, Roland, Toutain, lan...","[[Aviator], [André], [Jurieux], [Roland], [Tou..."
3,Bicycle Thieves,In the post-World War II Val Melaina neighbour...,"[postWorld, War, II, Val, Melaina, neighbourho...","[[postWorld], [War], [II], [Val], [Melaina], [..."
4,Vertigo (film),"After a rooftop chase, where a fellow policema...","[rooftop, chase, fellow, policeman, falls, dea...","[[rooftop], [chase], [fellow], [policeman], [f..."


In [54]:
test_df['Plot final'] = test_df['Plot Stem'].apply(lst_str_plot)
test_df.head()

Unnamed: 0,Movie_Title,Movie_plot,Plot without punc and stopwords,Plot Stem,Plot final
0,Citizen Kane,"In a mansion called Xanadu, part of a vast pal...","[mansion, called, Xanadu, part, vast, palatial...","[[mansion], [called], [Xanadu], [part], [vast]...",mansion called Xanadu part vast palatial est...
1,2001: A Space Odyssey (film),"In the prehistoric African veldt, a tribe of h...","[prehistoric, African, veldt, tribe, hominids,...","[[prehistoric], [African], [veldt], [tribe], [...",prehistoric African veldt tribe hominids dri...
2,The Rules of the Game,Aviator André Jurieux (Roland Toutain) lands a...,"[Aviator, André, Jurieux, Roland, Toutain, lan...","[[Aviator], [André], [Jurieux], [Roland], [Tou...",Aviator André Jurieux Roland Toutain lands L...
3,Bicycle Thieves,In the post-World War II Val Melaina neighbour...,"[postWorld, War, II, Val, Melaina, neighbourho...","[[postWorld], [War], [II], [Val], [Melaina], [...",postWorld War II Val Melaina neighbourhood R...
4,Vertigo (film),"After a rooftop chase, where a fellow policema...","[rooftop, chase, fellow, policeman, falls, dea...","[[rooftop], [chase], [fellow], [policeman], [f...",rooftop chase fellow policeman falls death S...


In [55]:
test_df.columns

Index(['Movie_Title', 'Movie_plot', 'Plot without punc and stopwords',
       'Plot Stem', 'Plot final'],
      dtype='object')

In [56]:
test_df.drop(['Movie_plot', 'Plot without punc and stopwords',
       'Plot Stem'],axis = 1, inplace = True)
test_df.head()

Unnamed: 0,Movie_Title,Plot final
0,Citizen Kane,mansion called Xanadu part vast palatial est...
1,2001: A Space Odyssey (film),prehistoric African veldt tribe hominids dri...
2,The Rules of the Game,Aviator André Jurieux Roland Toutain lands L...
3,Bicycle Thieves,postWorld War II Val Melaina neighbourhood R...
4,Vertigo (film),rooftop chase fellow policeman falls death S...


In [57]:
type(test_df['Plot final'])

pandas.core.series.Series

In [58]:
print(test_df['Plot final'])

0        mansion called Xanadu part vast palatial est...
1        prehistoric African veldt tribe hominids dri...
2        Aviator André Jurieux Roland Toutain lands L...
3        postWorld War II Val Melaina neighbourhood R...
4        rooftop chase fellow policeman falls death S...
                             ...                        
994      perfect life wealthy New York City wife Eric...
995      Manhattan teaching hospital life Dr Bock Geo...
996      early 1930s Chuck Glover Montgomery Clift ar...
997      brawling card game wharf area New Orleans ma...
998      Mac Sledge Robert Duvall washed alcoholic co...
Name: Plot final, Length: 999, dtype: object


In [59]:
print(type(test_df['Plot final'][0]))

<class 'str'>


In [60]:
df_dict = {'Plot' : test_df['Plot final']}
df = pd.DataFrame(df_dict)
print(df)

                                                  Plot
0      mansion called Xanadu part vast palatial est...
1      prehistoric African veldt tribe hominids dri...
2      Aviator André Jurieux Roland Toutain lands L...
3      postWorld War II Val Melaina neighbourhood R...
4      rooftop chase fellow policeman falls death S...
..                                                 ...
994    perfect life wealthy New York City wife Eric...
995    Manhattan teaching hospital life Dr Bock Geo...
996    early 1930s Chuck Glover Montgomery Clift ar...
997    brawling card game wharf area New Orleans ma...
998    Mac Sledge Robert Duvall washed alcoholic co...

[999 rows x 1 columns]


In [61]:
print(type(df['Plot'][0]))

<class 'str'>


In [62]:
test_df = pd.concat([test_df,df],axis = 1)

In [63]:
test_df.head()

Unnamed: 0,Movie_Title,Plot final,Plot
0,Citizen Kane,mansion called Xanadu part vast palatial est...,mansion called Xanadu part vast palatial est...
1,2001: A Space Odyssey (film),prehistoric African veldt tribe hominids dri...,prehistoric African veldt tribe hominids dri...
2,The Rules of the Game,Aviator André Jurieux Roland Toutain lands L...,Aviator André Jurieux Roland Toutain lands L...
3,Bicycle Thieves,postWorld War II Val Melaina neighbourhood R...,postWorld War II Val Melaina neighbourhood R...
4,Vertigo (film),rooftop chase fellow policeman falls death S...,rooftop chase fellow policeman falls death S...


In [64]:
print(type(test_df['Plot'][0]))
print(type(test_df['Movie_Title'][0]))
print(type(test_df['Plot final'][0]))

<class 'str'>
<class 'str'>
<class 'str'>


In [65]:
def check(name):
    string = ''
    for i in name:
        string = string + i
    return string

In [66]:
test_df.drop(['Plot final'],axis = 1, inplace = True)

In [67]:
test_df.head()

Unnamed: 0,Movie_Title,Plot
0,Citizen Kane,mansion called Xanadu part vast palatial est...
1,2001: A Space Odyssey (film),prehistoric African veldt tribe hominids dri...
2,The Rules of the Game,Aviator André Jurieux Roland Toutain lands L...
3,Bicycle Thieves,postWorld War II Val Melaina neighbourhood R...
4,Vertigo (film),rooftop chase fellow policeman falls death S...


In [68]:
test_df['Plot len'] = test_df['Plot'].apply(len)

In [69]:
test_df.head()

Unnamed: 0,Movie_Title,Plot,Plot len
0,Citizen Kane,mansion called Xanadu part vast palatial est...,2711
1,2001: A Space Odyssey (film),prehistoric African veldt tribe hominids dri...,2867
2,The Rules of the Game,Aviator André Jurieux Roland Toutain lands L...,2615
3,Bicycle Thieves,postWorld War II Val Melaina neighbourhood R...,1745
4,Vertigo (film),rooftop chase fellow policeman falls death S...,2712


In [70]:
test_df.describe()

Unnamed: 0,Plot len
count,999.0
mean,2343.305305
std,967.192617
min,5.0
25%,1787.0
50%,2463.0
75%,2955.0
max,8599.0


In [71]:
test_df = test_df[test_df['Plot len'] > 50 ]
test_df.head()

Unnamed: 0,Movie_Title,Plot,Plot len
0,Citizen Kane,mansion called Xanadu part vast palatial est...,2711
1,2001: A Space Odyssey (film),prehistoric African veldt tribe hominids dri...,2867
2,The Rules of the Game,Aviator André Jurieux Roland Toutain lands L...,2615
3,Bicycle Thieves,postWorld War II Val Melaina neighbourhood R...,1745
4,Vertigo (film),rooftop chase fellow policeman falls death S...,2712


In [72]:
test_df['Plot len'].describe()

count     984.000000
mean     2378.907520
std       930.178434
min        96.000000
25%      1811.750000
50%      2497.000000
75%      2969.250000
max      8599.000000
Name: Plot len, dtype: float64

In [73]:
test_df.drop([ 'Plot len'], axis = 1, inplace = True)
test_df.head()

Unnamed: 0,Movie_Title,Plot
0,Citizen Kane,mansion called Xanadu part vast palatial est...
1,2001: A Space Odyssey (film),prehistoric African veldt tribe hominids dri...
2,The Rules of the Game,Aviator André Jurieux Roland Toutain lands L...
3,Bicycle Thieves,postWorld War II Val Melaina neighbourhood R...
4,Vertigo (film),rooftop chase fellow policeman falls death S...


In [74]:
test_df.to_csv('test_final_data.csv')

In [75]:
test_df.head()

Unnamed: 0,Movie_Title,Plot
0,Citizen Kane,mansion called Xanadu part vast palatial est...
1,2001: A Space Odyssey (film),prehistoric African veldt tribe hominids dri...
2,The Rules of the Game,Aviator André Jurieux Roland Toutain lands L...
3,Bicycle Thieves,postWorld War II Val Melaina neighbourhood R...
4,Vertigo (film),rooftop chase fellow policeman falls death S...


## Model Training
For the purpose of training Logistic Regression Classifier is used.

In [76]:
train_df.columns

Index(['Title', 'Genre_final', 'Plot final'], dtype='object')

In [77]:
test_df.columns

Index(['Movie_Title', 'Plot'], dtype='object')

In [78]:
test_df.head()

Unnamed: 0,Movie_Title,Plot
0,Citizen Kane,mansion called Xanadu part vast palatial est...
1,2001: A Space Odyssey (film),prehistoric African veldt tribe hominids dri...
2,The Rules of the Game,Aviator André Jurieux Roland Toutain lands L...
3,Bicycle Thieves,postWorld War II Val Melaina neighbourhood R...
4,Vertigo (film),rooftop chase fellow policeman falls death S...


In [79]:
train_df.head()

Unnamed: 0,Title,Genre_final,Plot final
0,Bhaktha Jana,[Others],Janaka Santhakumari staunch devotee Panduran...
1,Paul and Michelle,[Others],Taking place approximately three years event...
2,Storks,[Others],generations storks Stork Mountain delivered ...
3,Watch the Shadows Dance,[Others],tightknit group high school students studies...
4,Murder at the Gallop,[Others],Miss Marple Margaret Rutherford Mr Stringer ...


In [80]:
train_df['Genre len'] = train_df['Genre_final'].apply(len)
train_df['Genre_final'].describe()

count         2988
unique          15
top       [Others]
freq          1966
Name: Genre_final, dtype: object

In [81]:
## for y_train

from sklearn.preprocessing import MultiLabelBinarizer

multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(train_df['Genre_final'])
# transform target variable
y_train = multilabel_binarizer.transform(train_df['Genre_final'])

In [82]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000)

In [83]:
x_tr, x_val, y_tr, y_val = train_test_split(train_df['Plot final'],y_train, test_size=0.2, random_state=9)

In [84]:
x_tr_tfidf = tfidf_vectorizer.fit_transform(x_tr)
x_val_tfidf = tfidf_vectorizer.transform(x_val)

In [85]:
from sklearn.linear_model import LogisticRegression

# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier

# Performance metric
from sklearn.metrics import f1_score

In [86]:
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)
# fit model on train data
clf.fit(x_tr_tfidf, y_tr)
# make predictions for validation set
pred = clf.predict(x_val_tfidf)
pred[100]

array([0, 0, 0, 1, 0, 0, 0])

In [87]:
multilabel_binarizer.inverse_transform(pred)[100]

('Others',)

In [88]:
# evaluate performance
f1_score(y_val, pred, average="micro")

0.6304347826086957

In [89]:
print(classification_report(y_val,pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        86
           1       0.00      0.00      0.00        10
           2       0.00      0.00      0.00         0
           3       0.65      0.98      0.79       383
           4       0.00      0.00      0.00        62
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00        68

   micro avg       0.65      0.61      0.63       619
   macro avg       0.09      0.14      0.11       619
weighted avg       0.40      0.61      0.49       619
 samples avg       0.63      0.63      0.63       619



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [90]:
x_te_tfidf = tfidf_vectorizer.transform(test_df['Plot'])
pred_test = clf.predict(x_te_tfidf)

In [91]:
len(pred_test)

984

In [92]:
len(test_df)

984

In [93]:
print(pred_test[2])

[0 0 0 1 0 0 0]


In [94]:
pred_list = []
for i in range(len(pred_test)):
    pred_list.append(multilabel_binarizer.inverse_transform(pred_test)[i])
pred_list

[('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 (),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 (),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 (),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Others',),
 ('Ot

In [95]:
type(pred_list[0])

tuple

In [96]:
pred_list_str = []
for i in range(len(pred_list)):
    list_cache = []
    length = len(pred_list[i])
    for j in range(length):
        list_cache.append(pred_list[i][j])
    pred_list_str.append(list_cache)

In [97]:
test_df.head()

Unnamed: 0,Movie_Title,Plot
0,Citizen Kane,mansion called Xanadu part vast palatial est...
1,2001: A Space Odyssey (film),prehistoric African veldt tribe hominids dri...
2,The Rules of the Game,Aviator André Jurieux Roland Toutain lands L...
3,Bicycle Thieves,postWorld War II Val Melaina neighbourhood R...
4,Vertigo (film),rooftop chase fellow policeman falls death S...


In [98]:
def list_str_2(char):
    return ','.join(char)
pred_list_final = []
for i in range(len(pred_list_str)):
    val = list_str_2(pred_list_str[i])
    pred_list_final.append(val)
pred_list_final

['Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 '',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 '',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 '',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Others',
 'Othe

In [99]:
val = [x for x in range(len(pred_list_final))]
df = pd.DataFrame(pred_list_final,index = val ,columns = ['Genre'])
df.head()

Unnamed: 0,Genre
0,Others
1,Others
2,Others
3,Others
4,Others


In [100]:
test_df = pd.concat([test_df,df],axis = 1)

In [101]:
test_df.head()

Unnamed: 0,Movie_Title,Plot,Genre
0,Citizen Kane,mansion called Xanadu part vast palatial est...,Others
1,2001: A Space Odyssey (film),prehistoric African veldt tribe hominids dri...,Others
2,The Rules of the Game,Aviator André Jurieux Roland Toutain lands L...,Others
3,Bicycle Thieves,postWorld War II Val Melaina neighbourhood R...,Others
4,Vertigo (film),rooftop chase fellow policeman falls death S...,Others


In [103]:
test_df.to_csv('Final Predictions.csv')

## References:
1. https://www.analyticsvidhya.com/blog/2019/04/predicting-movie-genres-nlp-multi-label-classification/
2. https://www.kaggle.com/aminejallouli/genre-classification-based-on-wiki-movies-plots
3. https://stackoverflow.com/ for various queries.
4. https://www.udemy.com/course/python-for-data-science-and-machine-learning-bootcamp/
5. https://medium.com