In [145]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, platform, sys
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn.preprocessing  import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.linear_model import SGDClassifier,LogisticRegression
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,LabelEncoder,MinMaxScaler
from sklearn.linear_model import LinearRegression
import string
import re
from nltk.corpus import sentiwordnet as swn
import time
import spacy
from sklearn.neighbors import KNeighborsClassifier
import en_core_web_sm

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\Anaconda3\lib\site-packages\nltk...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\Anaconda3\lib\site-packages\nltk...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\Anaconda3\lib\site-packages\nltk...
[nltk_data]   Package wordnet is already up-to-date!


True

In [118]:
omdb_data = pd.read_json('data/omdb-data.json.gz', orient='records', lines=True)
rotten_tomatoes = pd.read_json('data/rotten-tomatoes.json.gz', orient='records', lines=True)

def change_pdFrame(row, new_df):
   temp_genres = row['omdb_genres']
   counter = 0
   while counter < len(temp_genres):
       movies_row = row
       movies_row['genres'] = temp_genres[counter]
       new_df.append(movies_row)
       print(new_df)
       counter = counter + 1

movies_data = pd.merge(omdb_data, rotten_tomatoes, on=['imdb_id'])
# c = movies_data.columns
# new_df = pd.DataFrame(columns=c)
refined_movies = movies_data.omdb_genres.apply(pd.Series).stack().reset_index(level=1, drop=True).to_frame('omdb_genres')
refined_movies = refined_movies.reset_index()
movies_data = movies_data.reset_index()
#movies_data = pd.merge(refined_movies, movies_data, on=['index'])

movies_data.head()

Unnamed: 0,index,imdb_id,omdb_awards,omdb_genres,omdb_plot,audience_average,audience_percent,audience_ratings,critic_average,critic_percent,rotten_tomatoes_id
0,0,tt0060814,Nominated for 2 Oscars. Another 2 nominations.,"[Drama, History, War]","In this sprawling, star-laden film, we see the...",3.5,72.0,1311.0,6.4,80.0,m/is_paris_burning
1,1,tt2379713,Won 1 Oscar. Another 7 wins & 32 nominations.,"[Action, Adventure, Thriller]",A cryptic message from the past sends James Bo...,3.5,61.0,105323.0,6.4,63.0,m/spectre_2015
2,2,tt0230575,,"[Comedy, Horror]","The makers of this parody of ""Night of the Liv...",3.3,57.0,109.0,,,m/night_of_the_day_of_the_dawn_of_the_son_of_t...
3,3,tt0065988,Nominated for 1 Oscar. Another 5 wins & 8 nomi...,"[Adventure, Comedy, Drama]",Jack Crabb is 121 years old as the film begins...,3.8,87.0,17368.0,7.9,96.0,m/little_big_man
4,4,tt1995390,2 nominations.,"[Crime, Drama, Thriller]","When Perry and his girlfriend, Gail, cross pat...",3.2,50.0,7206.0,6.0,72.0,m/our_kind_of_traitor_2016


In [127]:
STOPWORDS = stopwords.words('english')
nlp = en_core_web_sm.load()

def tokenize_text(row,col) : 
    ar = nltk.word_tokenize(row[col])
    return [i.lower() for i in ar]

def remove_stopwords(row,col) : 
    token_set = set(row[col])
    sw_set = set(STOPWORDS)
    return list(token_set.difference(sw_set))

def remove_characters_after_tokenization(tokens):
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
    return filtered_tokens
    
def remove_punct(row,col):
    return list(remove_characters_after_tokenization(row[col]))

def text_purity(row,col_original,col_new) : 
    original_text = len(row[col_original])
    new_text = len(row[col_new])
    return int(new_text)/int(original_text)
    
def stemming(row,col) : 
    stemmer = nltk.stem.WordNetLemmatizer()
    #stemmer = nltk.stem.snowball.PorterStemmer()
    return [stemmer.lemmatize(w) for w in row[col]]
    
def stem_word_count(row,col) : 
    return len(row[col].values.tolist())

def tag_tokens(row,col,tokens=False) : 
    doc = nlp(row[col])
    return [(t.text,t.tag_) for t in doc]

def extract_keyword_weights(row,col,NUM_TOPICS) : 
    sents=row[col]
    print(sents)
    tokens = list(map(lambda x:nltk.word_tokenize(x),sents))
    lsi = train_lsi_model_gensim(tokens,total_topics=NUM_TOPICS)
    parse_weighted_lsi_model(lsi)
    return parse_weighted_lsi_model

def feature_matrix_fit(data,vect_type,data_col) : 
    
    input_data = data[data_col].values
    if vect_type == 'count' : vectorizer = CountVectorizer(min_df=0.0,max_df=1.0,analyzer='word',ngram_range=(1,1))
    elif vect_type == 'tfidf' :   vectorizer = TfidfVectorizer(min_df=0.0,max_df=1.0,analyzer='word',ngram_range=(1,1))   
    features = vectorizer.fit_transform(input_data)
    return vectorizer,features

dataset = movies_data[['audience_percent', 'omdb_plot']]



dataset['tokens'] = dataset.apply(tokenize_text, args=('omdb_plot', ), axis=1)
dataset['tokens'] = dataset.apply(remove_punct,args=('tokens',),axis=1)
dataset['token_string'] = dataset.apply(lambda x:' '.join(x['tokens']),axis=1)
dataset['no_sw'] = dataset.apply(remove_stopwords,args=('tokens',),axis=1)
dataset['purity'] = dataset.apply(text_purity,args=('tokens','no_sw',),axis=1)
dataset['no_sw_string'] = dataset.apply(lambda x:' '.join(x['no_sw']),axis=1)
dataset['stemmed'] = dataset.apply(stemming,args=('no_sw',),axis=1)
dataset['wc_stemmed'] = dataset.apply(lambda x:len(x['stemmed']),axis=1)
dataset['stemmed_string'] = dataset.apply(lambda x:' '.join(x['stemmed']),axis=1)





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [128]:

scaler = MinMaxScaler(feature_range=(0,1))
wc_stemmed_feature = scaler.fit_transform(dataset['wc_stemmed'].values.reshape(-1,1))
dataset['wc_stemmed_scaled'] = wc_stemmed_feature




In [131]:
dataset = dataset.dropna()
dataset

Unnamed: 0,audience_percent,omdb_plot,tokens,token_string,no_sw,purity,no_sw_string,stemmed,wc_stemmed,stemmed_string,wc_stemmed_scaled
0,72.0,"In this sprawling, star-laden film, we see the...","[in, this, sprawling, starladen, film, we, see...",in this sprawling starladen film we see the st...,"[swedish, centers, resistance, city, charge, l...",0.505882,swedish centers resistance city charge leaders...,"[swedish, center, resistance, city, charge, le...",43,swedish center resistance city charge leader v...,0.103704
1,61.0,A cryptic message from the past sends James Bo...,"[a, cryptic, message, from, the, past, sends, ...",a cryptic message from the past sends james bo...,"[assassin, meets, relevance, james, covertly, ...",0.513889,assassin meets relevance james covertly eventu...,"[assassin, meet, relevance, james, covertly, e...",74,assassin meet relevance james covertly eventua...,0.180247
2,57.0,"The makers of this parody of ""Night of the Liv...","[the, makers, of, this, parody, of, night, of,...",the makers of this parody of night of the livi...,"[living, george, took, dialogue, night, redubb...",0.555556,living george took dialogue night redubbed par...,"[living, george, took, dialogue, night, redubb...",15,living george took dialogue night redubbed par...,0.034568
3,87.0,Jack Crabb is 121 years old as the film begins...,"[jack, crabb, is, 121, years, old, as, the, fi...",jack crabb is 121 years old as the film begins...,"[recounts, oral, asks, old, armstrong, jack, c...",0.576923,recounts oral asks old armstrong jack collecto...,"[recount, oral, asks, old, armstrong, jack, co...",30,recount oral asks old armstrong jack collector...,0.071605
4,50.0,"When Perry and his girlfriend, Gail, cross pat...","[when, perry, and, his, girlfriend, gail, cros...",when perry and his girlfriend gail cross paths...,"[begin, london, mafia, whose, safe, city, inte...",0.463235,begin london mafia whose safe city intelligenc...,"[begin, london, mafia, whose, safe, city, inte...",63,begin london mafia whose safe city intelligenc...,0.153086
5,93.0,"Protagonist Alex DeLarge is an ""ultraviolent"" ...","[protagonist, alex, delarge, is, an, ultraviol...",protagonist alex delarge is an ultraviolent yo...,"[creating, luck, prison, eventually, convicts,...",0.388889,creating luck prison eventually convicts delar...,"[creating, luck, prison, eventually, convict, ...",35,creating luck prison eventually convict delarg...,0.083951
6,54.0,Captain Jack Sparrow (Depp) crosses paths with...,"[captain, jack, sparrow, depp, crosses, paths,...",captain jack sparrow depp crosses paths with a...,"[using, cruz, depp, know, finds, jack, adventu...",0.421687,using cruz depp know finds jack adventure love...,"[using, cruz, depp, know, find, jack, adventur...",35,using cruz depp know find jack adventure love ...,0.083951
7,88.0,It is the early 60s in France. The remaining s...,"[it, is, the, early, 60s, in, france, the, rem...",it is the early 60s in france the remaining su...,"[kill, degaulle, million, watch, attempts, fai...",0.477876,kill degaulle million watch attempts fail pres...,"[kill, degaulle, million, watch, attempt, fail...",54,kill degaulle million watch attempt fail presi...,0.130864
8,75.0,"On July 2nd, communications systems worldwide ...","[on, july, 2nd, communications, systems, world...",on july 2nd communications systems worldwide a...,"[aliens, attempts, number, 2nd, well, survivor...",0.491124,aliens attempts number 2nd well survivors inte...,"[alien, attempt, number, 2nd, well, survivor, ...",83,alien attempt number 2nd well survivor interfe...,0.202469
9,94.0,Paranoid Brigadier General Jack D. Ripper of B...,"[paranoid, brigadier, general, jack, d, ripper...",paranoid brigadier general jack d ripper of bu...,"[held, chair, work, jack, dismayed, american, ...",0.448980,held chair work jack dismayed american codes m...,"[held, chair, work, jack, dismayed, american, ...",132,held chair work jack dismayed american code me...,0.323457


In [133]:
## REMOVING OUTLIERS
train_data = dataset
vect,train_features = feature_matrix_fit(train_data,vect_type='count',data_col='stemmed_string')

## MANUALLY ADDING FEATURES

print(train_features.shape)
print(train_data['wc_stemmed_scaled'].shape)

train_features = np.append(train_features.toarray(),train_data['wc_stemmed_scaled'].values.reshape(-1,1),axis=1)
train_features = np.append(train_features,train_data['purity'].values.reshape(-1,1),axis=1)

train_labels = train_data['audience_percent']
xtrain, xtest, ytrain, ytest = train_test_split(train_features,train_labels)

(8755, 37907)
(8755,)


In [139]:
print(xtrain.shape)
print(ytrain.shape)

(6566, 37909)
(6566,)


In [146]:
model = make_pipeline(PolynomialFeatures(3), Ridge())
model.fit(xtrain, ytrain)

NameError: name 'degree' is not defined

In [140]:
model = LinearRegression()
model.fit(xtrain, ytrain)
model.score(xtest, ytest)

-3.2175598831046557e+25

In [103]:
a=pd.DataFrame(np.array(encoder.inverse_transform(dataset['omdb_genres_x'])))
a[0].unique()

  if diff:


array(['Drama', 'History', 'War', 'Action', 'Adventure', 'Thriller',
       'Comedy', 'Horror', 'Crime', 'Sci-Fi', 'Fantasy', 'Mystery',
       'Biography', 'Music', 'Family', 'Musical', 'Romance', 'Animation',
       'Documentary', 'Short', 'Sport', 'N/A', 'Western', 'News',
       'Film-Noir', 'Adult'], dtype=object)

In [111]:
movies_data['omdb_genres_y'].apply(lambda x:str(x)).value_counts()

['Comedy', 'Drama', 'Romance']          1032
['Drama', 'Romance']                     672
['Comedy', 'Drama']                      584
['Action', 'Crime', 'Drama']             576
['Drama']                                492
['Animation', 'Adventure', 'Comedy']     450
['Crime', 'Drama', 'Thriller']           432
['Action', 'Adventure', 'Comedy']        381
['Action', 'Adventure', 'Sci-Fi']        366
['Comedy', 'Romance']                    364
['Crime', 'Drama', 'Mystery']            336
['Action', 'Comedy', 'Crime']            315
['Comedy']                               313
['Action', 'Adventure', 'Fantasy']       309
['Documentary']                          308
['Biography', 'Drama', 'History']        285
['Action', 'Adventure', 'Drama']         267
['Action', 'Crime', 'Thriller']          267
['Crime', 'Drama']                       250
['Comedy', 'Crime', 'Drama']             240
['Horror', 'Thriller']                   226
['Animation', 'Action', 'Adventure']     219
['Drama', 