In [2]:
import pandas as pd
import numpy as np
import json
import nltk
import re
import csv
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
%matplotlib inline
pd.set_option('display.max_colwidth', 300)

In [3]:
meta = pd.read_csv("MovieSummaries/movie.metadata.tsv", sep = '\t', header = None)
meta.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science Fiction"", ""/m/03npn"": ""Horror"", ""/m/03k9fj"": ""Adventure"", ""/m/0fdjb"": ""Supernatural"", ""/m/02kdv5l"": ""Action"", ""/m/09zvmj"": ""Space western""}"
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey Mystery,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0"": ""Drama"", ""/m/0hj3n01"": ""Crime Drama""}"
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""Drama""}"
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic thriller"", ""/m/09blyk"": ""Psychological thriller""}"
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [4]:
meta.columns = ["movie_id",1,"movie_name",3,4,5,6,7,"genre"]

In [5]:
plots = []

with open("MovieSummaries/plot_summaries.txt", 'r') as f:
       reader = csv.reader(f, dialect='excel-tab') 
       for row in tqdm(reader):
            plots.append(row)

42303it [00:04, 8636.05it/s] 


In [6]:
movie_id = []
plot = []

# extract movie Ids and plot summaries
for i in tqdm(plots):
  movie_id.append(i[0])
  plot.append(i[1])

# create dataframe
movies = pd.DataFrame({'movie_id': movie_id, 'plot': plot})

100%|██████████| 42303/42303 [00:00<00:00, 1510737.97it/s]


In [7]:
# change datatype of 'movie_id'
meta['movie_id'] = meta['movie_id'].astype(str)

# merge meta with movies
movies = pd.merge(movies, meta[['movie_id', 'movie_name', 'genre']], on = 'movie_id')

movies.head()

Unnamed: 0,movie_id,plot,movie_name,genre
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.",Taxi Blues,"{""/m/07s9rl0"": ""Drama"", ""/m/03q4nz"": ""World cinema""}"
1,31186339,"The nation of Panem consists of a wealthy Capitol and twelve poorer districts. As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games. The tributes must fight to the death in an arena; the sole...",The Hunger Games,"{""/m/03btsm8"": ""Action/Adventure"", ""/m/06n90"": ""Science Fiction"", ""/m/02kdv5l"": ""Action"", ""/m/07s9rl0"": ""Drama""}"
2,20663735,"Poovalli Induchoodan is sentenced for six years prison life for murdering his classmate. Induchoodan, the only son of Justice Maranchery Karunakara Menon was framed in the case by Manapally Madhavan Nambiar and his crony DYSP Sankaranarayanan to take revenge on idealist judge Menon who had e...",Narasimham,"{""/m/04t36"": ""Musical"", ""/m/02kdv5l"": ""Action"", ""/m/07s9rl0"": ""Drama"", ""/m/01chg"": ""Bollywood""}"
3,2231378,"The Lemon Drop Kid , a New York City swindler, is illegally touting horses at a Florida racetrack. After several successful hustles, the Kid comes across a beautiful, but gullible, woman intending to bet a lot of money. The Kid convinces her to switch her bet, employing a prefabricated con. Unfo...",The Lemon Drop Kid,"{""/m/06qm3"": ""Screwball comedy"", ""/m/01z4y"": ""Comedy""}"
4,595909,"Seventh-day Adventist Church pastor Michael Chamberlain, his wife Lindy, their two sons, and their nine-week-old daughter Azaria are on a camping holiday in the Outback. With the baby sleeping in their tent, the family is enjoying a barbecue with their fellow campers when a cry is heard. Lindy r...",A Cry in the Dark,"{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""Drama"", ""/m/01f9r0"": ""Docudrama"", ""/m/03q4nz"": ""World cinema"", ""/m/05bh16v"": ""Courtroom Drama""}"


In [8]:
# an empty list
genres = [] 

# extract genres
for i in movies['genre']: 
  genres.append(list(json.loads(i).values())) 

# add to 'movies' dataframe  
movies['genre_new'] = genres

In [9]:
from tqdm import tqdm # I love this handy tool! 
print(">> Generating Count Based And Demographical Features")
movies['length'] = movies['plot'].apply(lambda x : len(x))
movies['capitals'] = movies['plot'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
movies['caps_vs_length'] = movies.apply(lambda row: float(row['capitals'])/float(row['length']),axis=1)
movies['num_exclamation_marks'] = movies['plot'].apply(lambda comment: comment.count('!'))
movies['num_question_marks'] = movies['plot'].apply(lambda comment: comment.count('?'))
movies['num_punctuation'] = movies['plot'].apply(lambda comment: sum(comment.count(w) for w in '.,;:'))
movies['num_symbols'] = movies['plot'].apply(lambda comment: sum(comment.count(w) for w in '*&$%'))
movies['num_words'] = movies['plot'].apply(lambda comment: len(comment.split()))
movies['num_unique_words'] = movies['plot'].apply(lambda comment: len(set(w for w in comment.split())))
movies['words_vs_unique'] = movies['num_unique_words'] / movies['num_words']
movies['num_smilies'] = movies['plot'].apply(lambda comment: sum(comment.count(w) for w in (':-)', ':)', ';-)', ';)')))
movies['num_sad'] = movies['plot'].apply(lambda comment: sum(comment.count(w) for w in (':-<', ':()', ';-()', ';(')))

>> Generating Count Based And Demographical Features


In [11]:
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

def tag_part_of_speech(text):
    text_splited = text.split(' ')
    text_splited = [''.join(c for c in s if c not in string.punctuation) for s in text_splited]
    text_splited = [s for s in text_splited if s]
    pos_list = pos_tag(text_splited)
    noun_count = len([w for w in pos_list if w[1] in ('NN','NNP','NNPS','NNS')])
    adjective_count = len([w for w in pos_list if w[1] in ('JJ','JJR','JJS')])
    verb_count = len([w for w in pos_list if w[1] in ('VB','VBD','VBG','VBN','VBP','VBZ')])
    return[noun_count, adjective_count, verb_count]

In [12]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/quantiphi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [13]:
print(">> Generating POS Features")
movies['nouns'], movies['adjectives'], movies['verbs'] = zip(*movies['plot'].apply(lambda comment: tag_part_of_speech(comment)))
movies['nouns_vs_length'] = movies['nouns'] / movies['length']
movies['adjectives_vs_length'] = movies['adjectives'] / movies['length']
movies['verbs_vs_length'] = movies['verbs'] /movies['length']
movies['nouns_vs_words'] = movies['nouns'] / movies['num_words']
movies['adjectives_vs_words'] = movies['adjectives'] / movies['num_words']
movies['verbs_vs_words'] = movies['verbs'] / movies['num_words']

>> Generating POS Features


In [16]:
movies.head()

Unnamed: 0,movie_id,plot,movie_name,genre,genre_new,length,capitals,caps_vs_length,num_exclamation_marks,num_question_marks,...,num_sad,nouns,adjectives,verbs,nouns_vs_length,adjectives_vs_length,verbs_vs_length,nouns_vs_words,adjectives_vs_words,verbs_vs_words
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.",Taxi Blues,"{""/m/07s9rl0"": ""Drama"", ""/m/03q4nz"": ""World cinema""}","[Drama, World cinema]",178,2,0.011236,0,0,...,0,9,2,3,0.050562,0.011236,0.016854,0.36,0.08,0.12
1,31186339,"The nation of Panem consists of a wealthy Capitol and twelve poorer districts. As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games. The tributes must fight to the death in an arena; the sole...",The Hunger Games,"{""/m/03btsm8"": ""Action/Adventure"", ""/m/06n90"": ""Science Fiction"", ""/m/02kdv5l"": ""Action"", ""/m/07s9rl0"": ""Drama""}","[Action/Adventure, Science Fiction, Action, Drama]",4559,141,0.030928,0,0,...,0,244,33,154,0.053521,0.007238,0.033779,0.313625,0.042416,0.197943
2,20663735,"Poovalli Induchoodan is sentenced for six years prison life for murdering his classmate. Induchoodan, the only son of Justice Maranchery Karunakara Menon was framed in the case by Manapally Madhavan Nambiar and his crony DYSP Sankaranarayanan to take revenge on idealist judge Menon who had e...",Narasimham,"{""/m/04t36"": ""Musical"", ""/m/02kdv5l"": ""Action"", ""/m/07s9rl0"": ""Drama"", ""/m/01chg"": ""Bollywood""}","[Musical, Action, Drama, Bollywood]",3099,102,0.032914,0,0,...,0,184,28,78,0.059374,0.009035,0.025169,0.370968,0.056452,0.157258
3,2231378,"The Lemon Drop Kid , a New York City swindler, is illegally touting horses at a Florida racetrack. After several successful hustles, the Kid comes across a beautiful, but gullible, woman intending to bet a lot of money. The Kid convinces her to switch her bet, employing a prefabricated con. Unfo...",The Lemon Drop Kid,"{""/m/06qm3"": ""Screwball comedy"", ""/m/01z4y"": ""Comedy""}","[Screwball comedy, Comedy]",4917,160,0.03254,0,0,...,0,269,53,151,0.054708,0.010779,0.03071,0.316843,0.062426,0.177856
4,595909,"Seventh-day Adventist Church pastor Michael Chamberlain, his wife Lindy, their two sons, and their nine-week-old daughter Azaria are on a camping holiday in the Outback. With the baby sleeping in their tent, the family is enjoying a barbecue with their fellow campers when a cry is heard. Lindy r...",A Cry in the Dark,"{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""Drama"", ""/m/01f9r0"": ""Docudrama"", ""/m/03q4nz"": ""World cinema"", ""/m/05bh16v"": ""Courtroom Drama""}","[Crime Fiction, Drama, Docudrama, World cinema, Courtroom Drama]",2425,38,0.01567,0,0,...,0,118,27,77,0.04866,0.011134,0.031753,0.297229,0.06801,0.193955


In [18]:
# remove samples with 0 genre tags
movies_new = movies[~(movies['genre_new'].str.len() == 0)]
movies_new.shape, movies.shape

((41793, 26), (42204, 26))

In [21]:
from sklearn.preprocessing import MultiLabelBinarizer

multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(movies_new['genre_new'])

# transform target variable
y = multilabel_binarizer.transform(movies_new['genre_new'])

In [37]:
xtrain, xval, ytrain, yval = train_test_split(movies_new.iloc[:,5:-1], y, test_size=0.2, random_state=9)

In [38]:
# tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000)
# # create TF-IDF features
# xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain['clean_plot'])
# xval_tfidf = tfidf_vectorizer.transform(xval['clean_plot'])

In [41]:
matrix = pd.DataFrame(xtrain_tfidf.toarray())

In [34]:
from sklearn.linear_model import LogisticRegression

# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier

# Performance metric
from sklearn.metrics import f1_score

In [36]:
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)
# fit model on train data
clf.fit(xtrain, ytrain)
# make predictions for validation set
y_pred = clf.predict(xval)
# evaluate performance
f1_score(yval, y_pred, average="micro")



  str(classes[c]))












  str(classes[c]))




  str(classes[c]))


  str(classes[c]))












0.06496992133271633

In [None]:
# # More Handy Features
# movies["count_words_title"] = movies["plot"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
# movies["mean_word_len"] = movies["plot"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
# movies['punct_percent']= movies['plot']*100/movies['num_words']

In [19]:
# function for text cleaning 
def clean_text(text):
    # remove backslash-apostrophe 
    text = re.sub("\'", "", text) 
    # remove everything except alphabets 
    text = re.sub("[^a-zA-Z]"," ",text) 
    # remove whitespaces 
    text = ' '.join(text.split()) 
    # convert text to lowercase 
    text = text.lower() 
#     Removing stop words
    text = text.split(' ')
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    words = [w for w in text if not w in stop_words]
    
    # lemmatizing of words
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    lemmatize = ' '.join(lemmatizer.lemmatize(word) for word in words)
    
    return lemmatize

In [20]:
movies_new['clean_plot'] = movies_new['plot'].apply(lambda x: clean_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
