In [93]:
# Libraries

import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')
import datetime
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, KFold
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
import os
from sklearn import model_selection
from sklearn.metrics import accuracy_score
import json
import ast
from urllib.request import urlopen
from PIL import Image
from sklearn.preprocessing import LabelEncoder
import time
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

In [94]:
#credit for data cleaning code to: https://www.kaggle.com/artgor/eda-feature-engineering-and-model-interpretation

In [95]:
train = pd.read_csv('~/Netflix/TMDB/tmdbtrain.csv')
# from this kernel: https://www.kaggle.com/gravix/gradient-in-a-box
dict_columns = ['belongs_to_collection', 'genres', 'production_companies',
                'production_countries', 'spoken_languages', 'Keywords', 'cast', 'crew']

def text_to_dict(df):
    for column in dict_columns:
        df[column] = df[column].apply(lambda x: {} if pd.isna(x) else ast.literal_eval(x) )
    return df
        
train = text_to_dict(train)

In [96]:
train.head()

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,/tQtWuwvMf0hCc2QR2tkolwl7c3c.jpg,"[{'name': 'Paramount Pictures', 'id': 4}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,/w9Z7A0GHEhIp7etpj0vyKOeU1Wx.jpg,"[{'name': 'Walt Disney Pictures', 'id': 2}]","[{'iso_3166_1': 'US', 'name': 'United States o...",8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
2,3,{},3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,/lIv1QinFqz4dlp5U4lQ6HaiskOZ.jpg,"[{'name': 'Bold Films', 'id': 2266}, {'name': ...","[{'iso_3166_1': 'US', 'name': 'United States o...",10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
3,4,{},1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,/aTXRaPrWSinhcmCrcfJK17urp3F.jpg,{},"[{'iso_3166_1': 'IN', 'name': 'India'}]",3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000
4,5,{},0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,/m22s7zvkVFDU9ir56PiiqIEWFdT.jpg,{},"[{'iso_3166_1': 'KR', 'name': 'South Korea'}]",2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,{},"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970


In [97]:
# DATA CLEANING

In [98]:
#drop belongs to collection information
#genre
for i, e in enumerate(train['genres'][:5]):
    print(i, e)

0 [{'id': 35, 'name': 'Comedy'}]
1 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10751, 'name': 'Family'}, {'id': 10749, 'name': 'Romance'}]
2 [{'id': 18, 'name': 'Drama'}]
3 [{'id': 53, 'name': 'Thriller'}, {'id': 18, 'name': 'Drama'}]
4 [{'id': 28, 'name': 'Action'}, {'id': 53, 'name': 'Thriller'}]


In [99]:
print('Number of genres in films')
train['genres'].apply(lambda x: len(x) if x != {} else 0).value_counts()

Number of genres in films


2    972
3    900
1    593
4    393
5    111
6     21
0      7
7      3
Name: genres, dtype: int64

In [100]:
train['belongs_to_collection'].apply(lambda x: len(x) if x != {} else 0).value_counts()

0    2396
1     604
Name: belongs_to_collection, dtype: int64

In [101]:
list_of_genres = list(train['genres'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)

In [102]:
Counter([i for j in list_of_genres for i in j]).most_common()

[('Drama', 1531),
 ('Comedy', 1028),
 ('Thriller', 789),
 ('Action', 741),
 ('Romance', 571),
 ('Crime', 469),
 ('Adventure', 439),
 ('Horror', 301),
 ('Science Fiction', 290),
 ('Family', 260),
 ('Fantasy', 232),
 ('Mystery', 225),
 ('Animation', 141),
 ('History', 132),
 ('Music', 100),
 ('War', 100),
 ('Documentary', 87),
 ('Western', 43),
 ('Foreign', 31),
 ('TV Movie', 1)]

In [103]:
#columns created for top 15 genres
train['num_genres'] = train['genres'].apply(lambda x: len(x) if x != {} else 0)
train['all_genres'] = train['genres'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
top_genres = [m[0] for m in Counter([i for j in list_of_genres for i in j]).most_common(15)]
for g in top_genres:
    train['genre_' + g] = train['all_genres'].apply(lambda x: 1 if g in x else 0)

In [104]:
train = train.drop(['genres'], axis=1)

In [105]:
# production companies

for i, e in enumerate(train['production_companies'][:5]):
    print(i, e)

0 [{'name': 'Paramount Pictures', 'id': 4}, {'name': 'United Artists', 'id': 60}, {'name': 'Metro-Goldwyn-Mayer (MGM)', 'id': 8411}]
1 [{'name': 'Walt Disney Pictures', 'id': 2}]
2 [{'name': 'Bold Films', 'id': 2266}, {'name': 'Blumhouse Productions', 'id': 3172}, {'name': 'Right of Way Films', 'id': 32157}]
3 {}
4 {}


In [106]:
print('Number of production companies in films')
train['production_companies'].apply(lambda x: len(x) if x != {} else 0).value_counts()

Number of production companies in films


1     775
2     734
3     582
4     312
5     166
0     156
6     118
7      62
8      42
9      29
11      7
10      7
12      3
16      2
15      2
14      1
13      1
17      1
Name: production_companies, dtype: int64

In [107]:
train[train['production_companies'].apply(lambda x: len(x) if x != {} else 0) > 11]

Unnamed: 0,id,belongs_to_collection,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue,num_genres,all_genres,genre_Drama,genre_Comedy,genre_Thriller,genre_Action,genre_Romance,genre_Crime,genre_Adventure,genre_Horror,genre_Science Fiction,genre_Family,genre_Fantasy,genre_Mystery,genre_Animation,genre_History,genre_Music
31,32,{},0,http://www.cache-derfilm.at,tt0387898,fr,Caché,A married couple is terrorized by a series of ...,5.69586,/i1Zl8S4DgM3IDLW5dhZzBnIdCOe.jpg,"[{'name': 'Les Films du Losange', 'id': 223}, ...","[{'iso_3166_1': 'AT', 'name': 'Austria'}, {'is...",5/2/05,117.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,,Caché,"[{'id': 90, 'name': 'paris'}, {'id': 213, 'nam...","[{'cast_id': 2, 'character': 'Georges Laurent'...","[{'credit_id': '52fe4244c3a36847f8011073', 'de...",36000000,3,Drama Mystery Thriller,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0
116,117,{},0,,tt2113822,zh,一九四二,"In 1942, Henan Province was devastated by the ...",1.678013,/xxz2gi8vijqqJySGO3kQy2i8mv.jpg,"[{'name': 'Emperor Motion Pictures', 'id': 272...","[{'iso_3166_1': 'CN', 'name': 'China'}]",11/1/12,145.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Back to 1942,"[{'id': 478, 'name': 'china'}, {'id': 180999, ...","[{'cast_id': 3, 'character': 'Theodore Harold ...","[{'credit_id': '5761375dc3a36808aa001554', 'de...",311,1,Drama,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
363,364,{},15400000,,tt2053425,fr,De rouille et d'os,"Put in charge of his young son, Ali leaves Bel...",8.400049,/cHCwW8xPl8yPKQwpNzKVinwvirT.jpg,"[{'name': 'France 2 Cinéma', 'id': 83}, {'name...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",5/17/12,123.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Rust and Bone,"[{'id': 494, 'name': 'father son relationship'...","[{'cast_id': 2, 'character': 'St√©phanie', 'cr...","[{'credit_id': '52fe49dd9251416c750d5e05', 'de...",25762027,2,Drama Romance,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
392,393,{},0,,tt5072406,fr,Moka,Diane Kramer is led by one obsession: to find ...,2.404466,/5VKVaTJJsyDeOzY6fLcyTo1RA9g.jpg,"[{'name': 'Canal+', 'id': 5358}, {'name': 'Cin...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",8/17/16,89.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,,Moka,{},"[{'cast_id': 1, 'character': 'Diane', 'credit_...","[{'credit_id': '59619ddd9251410bfa0cf8e5', 'de...",126463,1,Drama,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
449,450,{},80000000,http://asoundofthunder.warnerbros.com/,tt0318081,en,A Sound of Thunder,When a hunter sent back to the prehistoric era...,4.980191,/gsqOX1ReJ5lcmTuDdkhOXLug8Ug.jpg,"[{'name': 'Epsilon Motion Pictures', 'id': 117...","[{'iso_3166_1': 'CZ', 'name': 'Czech Republic'...",5/15/05,110.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Some Rules Should Never Be Broken.,A Sound of Thunder,"[{'id': 3737, 'name': 'dying and death'}, {'id...","[{'cast_id': 34, 'character': 'Alicia Wallenbe...","[{'credit_id': '52fe431e9251416c7500438b', 'de...",5989640,4,Action Adventure Science Fiction Thriller,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0
554,555,{},14500000,,tt0293416,ja,Metropolis,Duke Red has overseen the construction of a ma...,9.298092,/1EK7mGCpRKYmSg25FaojvnS9opm.jpg,"[{'name': 'Bandai Visual Company', 'id': 528},...","[{'iso_3166_1': 'JP', 'name': 'Japan'}]",5/26/01,108.0,"[{'iso_639_1': 'ru', 'name': 'Pусский'}, {'iso...",Released,Welcome to Metropolis,Metropolis,"[{'id': 931, 'name': 'jealousy'}, {'id': 1761,...","[{'cast_id': 1, 'character': 'Tima (voice)', '...","[{'credit_id': '52fe4510c3a36847f80ba41d', 'de...",95789342,2,Animation Science Fiction,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
1079,1080,{},7000000,,tt2737050,fr,"Deux jours, une nuit",Sandra is a young woman who has only one weeke...,6.868022,/1mYAejpMskvskGr0J0SaBvdjmrH.jpg,"[{'name': 'BIM Distribuzione', 'id': 225}, {'n...","[{'iso_3166_1': 'IT', 'name': 'Italy'}, {'iso_...",5/21/14,95.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,A quest for self-improvement,"Two Days, One Night","[{'id': 894, 'name': 'depression'}, {'id': 125...","[{'cast_id': 1, 'character': 'Sandra', 'credit...","[{'credit_id': '563e3c839251414c7000b23b', 'de...",6860853,1,Drama,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2170,2171,{},4000000,http://tickets.picturehouseentertainment.co.uk...,tt3464902,en,The Lobster,"In a dystopian near future, single people, acc...",11.223033,/yR60EqMGS9hHq9I5Pkq2hG984TP.jpg,"[{'name': 'Haut et Court', 'id': 726}, {'name'...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",10/8/15,118.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,An unconventional love story...,The Lobster,"[{'id': 4565, 'name': 'dystopia'}]","[{'cast_id': 5, 'character': 'David', 'credit_...","[{'credit_id': '55253dd0925141720c001aef', 'de...",15656193,5,Comedy Drama Romance Science Fiction Thriller,1,1,1,0,1,0,0,0,1,0,0,0,0,0,0
2395,2396,{},0,,tt4082068,fr,Dheepan,Dheepan is a Sri Lankan Tamil warrior who flee...,5.752416,/lgONuekbRlM0eMvMHJBEDL6MsBP.jpg,"[{'name': 'France 2 Cinéma', 'id': 83}, {'name...","[{'iso_3166_1': 'FR', 'name': 'France'}]",8/26/15,115.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,When the war continues to rage in your heart a...,Dheepan,"[{'id': 90, 'name': 'paris'}, {'id': 254, 'nam...","[{'cast_id': 5, 'character': 'Dheepan', 'credi...","[{'credit_id': '5562176d92514171ab002620', 'de...",248392,2,Crime Drama,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2517,2518,"[{'id': 2396, 'name': 'Asterix and Obelix Coll...",97250400,http://www.asterixauxjeuxolympiques.com/index.php,tt0463872,fr,Astérix aux Jeux Olympiques,Ast√©rix and Ob√©lix have to win the Olympic G...,9.671944,/tKL0RJOeuccc1rrpcDKg8qhedIz.jpg,"[{'name': 'Constantin Film', 'id': 47}, {'name...","[{'iso_3166_1': 'BE', 'name': 'Belgium'}, {'is...",1/13/08,116.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,,Asterix at the Olympic Games,"[{'id': 271, 'name': 'competition'}, {'id': 12...","[{'cast_id': 15, 'character': 'Asterix', 'cred...","[{'credit_id': '52fe4354c3a36847f804c0b1', 'de...",132900000,4,Adventure Comedy Family Fantasy,0,1,0,0,0,0,1,0,0,1,1,0,0,0,0


In [108]:
list_of_companies = list(train['production_companies'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)

In [109]:
Counter([i for j in list_of_companies for i in j]).most_common(30)

[('Warner Bros.', 202),
 ('Universal Pictures', 188),
 ('Paramount Pictures', 161),
 ('Twentieth Century Fox Film Corporation', 138),
 ('Columbia Pictures', 91),
 ('Metro-Goldwyn-Mayer (MGM)', 84),
 ('New Line Cinema', 75),
 ('Touchstone Pictures', 63),
 ('Walt Disney Pictures', 62),
 ('Columbia Pictures Corporation', 61),
 ('TriStar Pictures', 53),
 ('Relativity Media', 48),
 ('Canal+', 46),
 ('United Artists', 44),
 ('Miramax Films', 40),
 ('Village Roadshow Pictures', 36),
 ('Regency Enterprises', 31),
 ('BBC Films', 30),
 ('Dune Entertainment', 30),
 ('Working Title Films', 30),
 ('Fox Searchlight Pictures', 29),
 ('StudioCanal', 28),
 ('Lionsgate', 28),
 ('DreamWorks SKG', 27),
 ('Fox 2000 Pictures', 25),
 ('Summit Entertainment', 24),
 ('Hollywood Pictures', 24),
 ('Orion Pictures', 24),
 ('Amblin Entertainment', 23),
 ('Dimension Films', 23)]

In [110]:
#create columns for top 30 production companies

train['num_companies'] = train['production_companies'].apply(lambda x: len(x) if x != {} else 0)
train['all_production_companies'] = train['production_companies'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
top_companies = [m[0] for m in Counter([i for j in list_of_companies for i in j]).most_common(30)]
for g in top_companies:
    train['production_company_' + g] = train['all_production_companies'].apply(lambda x: 1 if g in x else 0)
    
train = train.drop(['production_companies', 'all_production_companies'], axis=1)

In [111]:
#production countries

for i, e in enumerate(train['production_countries'][:5]):
    print(i, e)

0 [{'iso_3166_1': 'US', 'name': 'United States of America'}]
1 [{'iso_3166_1': 'US', 'name': 'United States of America'}]
2 [{'iso_3166_1': 'US', 'name': 'United States of America'}]
3 [{'iso_3166_1': 'IN', 'name': 'India'}]
4 [{'iso_3166_1': 'KR', 'name': 'South Korea'}]


In [112]:
print('Number of production countries in films')
train['production_countries'].apply(lambda x: len(x) if x != {} else 0).value_counts()

Number of production countries in films


1    2222
2     525
3     116
4      57
0      55
5      21
6       3
8       1
Name: production_countries, dtype: int64

In [113]:
list_of_countries = list(train['production_countries'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
Counter([i for j in list_of_countries for i in j]).most_common(25)

[('United States of America', 2282),
 ('United Kingdom', 380),
 ('France', 222),
 ('Germany', 167),
 ('Canada', 120),
 ('India', 81),
 ('Italy', 64),
 ('Japan', 61),
 ('Australia', 61),
 ('Russia', 58),
 ('Spain', 54),
 ('China', 42),
 ('Hong Kong', 42),
 ('Ireland', 23),
 ('Belgium', 23),
 ('South Korea', 22),
 ('Mexico', 19),
 ('Sweden', 18),
 ('New Zealand', 17),
 ('Netherlands', 15),
 ('Czech Republic', 14),
 ('Denmark', 13),
 ('Brazil', 12),
 ('Luxembourg', 10),
 ('South Africa', 10)]

In [114]:
train['num_countries'] = train['production_countries'].apply(lambda x: len(x) if x != {} else 0)
train['all_countries'] = train['production_countries'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
top_countries = [m[0] for m in Counter([i for j in list_of_countries for i in j]).most_common(25)]
for g in top_countries:
    train['production_country_' + g] = train['all_countries'].apply(lambda x: 1 if g in x else 0)
    
train = train.drop(['production_countries', 'all_countries'], axis=1)

In [115]:
#spoken languages

for i, e in enumerate(train['spoken_languages'][:5]):
    print(i, e)

0 [{'iso_639_1': 'en', 'name': 'English'}]
1 [{'iso_639_1': 'en', 'name': 'English'}]
2 [{'iso_639_1': 'en', 'name': 'English'}]
3 [{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'hi', 'name': 'हिन्दी'}]
4 [{'iso_639_1': 'ko', 'name': '한국어/조선말'}]


In [116]:
print('Number of spoken languages in films')
train['spoken_languages'].apply(lambda x: len(x) if x != {} else 0).value_counts()

Number of spoken languages in films


1    2105
2     549
3     216
4      72
5      23
0      20
7       6
6       6
8       2
9       1
Name: spoken_languages, dtype: int64

In [117]:
list_of_languages = list(train['spoken_languages'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
Counter([i for j in list_of_languages for i in j]).most_common(15)

[('English', 2618),
 ('Français', 288),
 ('Español', 239),
 ('Deutsch', 169),
 ('Pусский', 152),
 ('Italiano', 124),
 ('日本語', 89),
 ('普通话', 68),
 ('हिन्दी', 56),
 ('', 47),
 ('Português', 43),
 ('العربية', 40),
 ('한국어/조선말', 37),
 ('广州话 / 廣州話', 36),
 ('தமிழ்', 27)]

In [118]:
train['num_languages'] = train['spoken_languages'].apply(lambda x: len(x) if x != {} else 0)
train['all_languages'] = train['spoken_languages'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
top_languages = [m[0] for m in Counter([i for j in list_of_languages for i in j]).most_common(30)]
for g in top_languages:
    train['language_' + g] = train['all_languages'].apply(lambda x: 1 if g in x else 0)
    
train = train.drop(['spoken_languages', 'all_languages'], axis=1)

In [119]:
#keywords

for i, e in enumerate(train['Keywords'][:5]):
    print(i, e)

0 [{'id': 4379, 'name': 'time travel'}, {'id': 9663, 'name': 'sequel'}, {'id': 11830, 'name': 'hot tub'}, {'id': 179431, 'name': 'duringcreditsstinger'}]
1 [{'id': 2505, 'name': 'coronation'}, {'id': 4263, 'name': 'duty'}, {'id': 6038, 'name': 'marriage'}, {'id': 13072, 'name': 'falling in love'}]
2 [{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'name': 'obsession'}, {'id': 1640, 'name': 'conservatory'}, {'id': 2176, 'name': 'music teacher'}, {'id': 14512, 'name': 'new york city'}, {'id': 14819, 'name': 'violence'}, {'id': 33896, 'name': 'montage'}, {'id': 156823, 'name': 'drummer'}, {'id': 170418, 'name': 'public humiliation'}, {'id': 176095, 'name': 'jazz band'}, {'id': 206298, 'name': 'young adult'}, {'id': 207739, 'name': 'music school'}]
3 [{'id': 10092, 'name': 'mystery'}, {'id': 10540, 'name': 'bollywood'}, {'id': 11734, 'name': 'police corruption'}, {'id': 14536, 'name': 'crime'}, {'id': 14636, 'name': 'india'}, {'id': 208364, 'name': 'missing husband'}, {'id': 220935, 'name': 'ne

In [120]:
print('Number of Keywords in films')
train['Keywords'].apply(lambda x: len(x) if x != {} else 0).value_counts().head(10)

Number of Keywords in films


5    293
0    276
4    248
3    228
6    227
2    207
7    192
1    187
8    161
9    134
Name: Keywords, dtype: int64

In [121]:
list_of_keywords = list(train['Keywords'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)

In [122]:
train['num_Keywords'] = train['Keywords'].apply(lambda x: len(x) if x != {} else 0)
train['all_Keywords'] = train['Keywords'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
top_keywords = [m[0] for m in Counter([i for j in list_of_keywords for i in j]).most_common(30)]
for g in top_keywords:
    train['keyword_' + g] = train['all_Keywords'].apply(lambda x: 1 if g in x else 0)
    
train = train.drop(['Keywords', 'all_Keywords'], axis=1)

In [86]:
#cast

for i, e in enumerate(train['cast'][:1]):
    print(i, e)

0 [{'cast_id': 4, 'character': 'Lou', 'credit_id': '52fe4ee7c3a36847f82afae7', 'gender': 2, 'id': 52997, 'name': 'Rob Corddry', 'order': 0, 'profile_path': '/k2zJL0V1nEZuFT08xUdOd3ucfXz.jpg'}, {'cast_id': 5, 'character': 'Nick', 'credit_id': '52fe4ee7c3a36847f82afaeb', 'gender': 2, 'id': 64342, 'name': 'Craig Robinson', 'order': 1, 'profile_path': '/tVaRMkJXOEVhYxtnnFuhqW0Rjzz.jpg'}, {'cast_id': 6, 'character': 'Jacob', 'credit_id': '52fe4ee7c3a36847f82afaef', 'gender': 2, 'id': 54729, 'name': 'Clark Duke', 'order': 2, 'profile_path': '/oNzK0umwm5Wn0wyEbOy6TVJCSBn.jpg'}, {'cast_id': 7, 'character': 'Adam Jr.', 'credit_id': '52fe4ee7c3a36847f82afaf3', 'gender': 2, 'id': 36801, 'name': 'Adam Scott', 'order': 3, 'profile_path': '/5gb65xz8bzd42yjMAl4zwo4cvKw.jpg'}, {'cast_id': 8, 'character': 'Hot Tub Repairman', 'credit_id': '52fe4ee7c3a36847f82afaf7', 'gender': 2, 'id': 54812, 'name': 'Chevy Chase', 'order': 4, 'profile_path': '/svjpyYtPwtjvRxX9IZnOmOkhDOt.jpg'}, {'cast_id': 9, 'characte

In [123]:
print('Number of casted persons in films')
train['cast'].apply(lambda x: len(x) if x != {} else 0).value_counts().head(10)

Number of casted persons in films


15    212
16    165
10    135
13    129
12    124
11    122
9     118
17    118
18    115
14    110
Name: cast, dtype: int64

In [124]:
list_of_cast_names = list(train['cast'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
Counter([i for j in list_of_cast_names for i in j]).most_common(15)

[('Samuel L. Jackson', 30),
 ('Robert De Niro', 30),
 ('Morgan Freeman', 27),
 ('J.K. Simmons', 25),
 ('Bruce Willis', 25),
 ('Liam Neeson', 25),
 ('Susan Sarandon', 25),
 ('Bruce McGill', 24),
 ('John Turturro', 24),
 ('Forest Whitaker', 23),
 ('Willem Dafoe', 23),
 ('Bill Murray', 22),
 ('Owen Wilson', 22),
 ('Nicolas Cage', 22),
 ('Sylvester Stallone', 21)]

In [125]:
list_of_cast_names_url = list(train['cast'].apply(lambda x: [(i['name'], i['profile_path']) for i in x] if x != {} else []).values)
d = Counter([i for j in list_of_cast_names_url for i in j]).most_common(16)
d

[(('Samuel L. Jackson', '/AvCReLikjzYEf9XjTQxbv3JWgKT.jpg'), 30),
 (('Robert De Niro', '/lvTSwUcvJRLAJ2FB5qFaukel516.jpg'), 30),
 (('Morgan Freeman', '/oGJQhOpT8S1M56tvSsbEBePV5O1.jpg'), 27),
 (('J.K. Simmons', '/jPoNW5fugs5h8AbcE7H5OBm04Tm.jpg'), 25),
 (('Bruce Willis', '/2B7RySy2WMVJKKEFN2XA3IFb8w0.jpg'), 25),
 (('Liam Neeson', '/9mdAohLsDu36WaXV2N3SQ388bvz.jpg'), 25),
 (('Susan Sarandon', '/giqZPokZi2nKLtYw8hrMVf8Vita.jpg'), 25),
 (('Bruce McGill', '/r9UNvqwTYB9C3AcGyBpcjzkCnVD.jpg'), 24),
 (('John Turturro', '/70V4hwvWN0J3aX2LzQg7eKCeq29.jpg'), 24),
 (('Forest Whitaker', '/4pMQkelS5lK661m9Kz3oIxLYiyS.jpg'), 23),
 (('Willem Dafoe', '/xM5lhOR5tWWdIlFpBDeZJx9opIP.jpg'), 23),
 (('Bill Murray', '/7BOoOAIA1CnSzFSVSJP7saniQaB.jpg'), 22),
 (('Owen Wilson', '/j7oYgvfDiO34VcFdSB7GhM2CSle.jpg'), 22),
 (('Nicolas Cage', '/ti2h1OS1n1VwoJHWFaJD8dMZuEE.jpg'), 22),
 (('Sylvester Stallone', '/gnmwOa46C2TP35N7ARSzboTdx2u.jpg'), 21),
 (('Jason Statham', '/PhWiWgasncGWD9LdbsGcmxkV4r.jpg'), 21)]

In [126]:
list_of_cast_genders = list(train['cast'].apply(lambda x: [i['gender'] for i in x] if x != {} else []).values)
Counter([i for j in list_of_cast_genders for i in j]).most_common()

[(2, 27949), (0, 20329), (1, 13533)]

In [None]:
#0 is unspecified, 1 is female, and 2 is male

In [127]:
list_of_cast_characters = list(train['cast'].apply(lambda x: [i['character'] for i in x] if x != {} else []).values)
Counter([i for j in list_of_cast_characters for i in j]).most_common(15)

[('', 818),
 ('Himself', 610),
 ('Herself', 155),
 ('Dancer', 144),
 ('Additional Voices (voice)', 100),
 ('Doctor', 77),
 ('Reporter', 70),
 ('Waitress', 69),
 ('Nurse', 65),
 ('Bartender', 55),
 ('Jack', 54),
 ('Debutante', 54),
 ('Security Guard', 50),
 ('Paul', 48),
 ('Frank', 44)]

In [130]:
train['num_cast'] = train['cast'].apply(lambda x: len(x) if x != {} else 0)
top_cast_names = [m[0] for m in Counter([i for j in list_of_cast_names for i in j]).most_common(15)]
for g in top_cast_names:
    train['cast_name_' + g] = train['cast'].apply(lambda x: 1 if g in str(x) else 0)
train['genders_0_cast'] = train['cast'].apply(lambda x: sum([1 for i in x if i['gender'] == 0]))
train['genders_1_cast'] = train['cast'].apply(lambda x: sum([1 for i in x if i['gender'] == 1]))
train['genders_2_cast'] = train['cast'].apply(lambda x: sum([1 for i in x if i['gender'] == 2]))
top_cast_characters = [m[0] for m in Counter([i for j in list_of_cast_characters for i in j]).most_common(15)]
for g in top_cast_characters:
    train['cast_character_' + g] = train['cast'].apply(lambda x: 1 if g in str(x) else 0)

In [131]:
train = train.drop(['cast'], axis=1)

In [132]:
# crew

for i, e in enumerate(train['crew'][:1]):
    print(i, e[:10])

0 [{'credit_id': '59ac067c92514107af02c8c8', 'department': 'Directing', 'gender': 0, 'id': 1449071, 'job': 'First Assistant Director', 'name': 'Kelly Cantley', 'profile_path': None}, {'credit_id': '52fe4ee7c3a36847f82afad7', 'department': 'Directing', 'gender': 2, 'id': 3227, 'job': 'Director', 'name': 'Steve Pink', 'profile_path': '/myHOgo8mQSCiCAZNGMRdHVr03jr.jpg'}, {'credit_id': '5524ed25c3a3687ded000d88', 'department': 'Writing', 'gender': 2, 'id': 347335, 'job': 'Writer', 'name': 'Josh Heald', 'profile_path': '/pwXJIenrDMrG7t3zNfLvr8w1RGU.jpg'}, {'credit_id': '5524ed2d925141720c001128', 'department': 'Writing', 'gender': 2, 'id': 347335, 'job': 'Characters', 'name': 'Josh Heald', 'profile_path': '/pwXJIenrDMrG7t3zNfLvr8w1RGU.jpg'}, {'credit_id': '5524ed3d92514166c1004a5d', 'department': 'Production', 'gender': 2, 'id': 57822, 'job': 'Producer', 'name': 'Andrew Panay', 'profile_path': None}, {'credit_id': '5524ed4bc3a3687df3000dd2', 'department': 'Production', 'gender': 0, 'id': 14

In [133]:
print('Number of casted persons in films')
train['crew'].apply(lambda x: len(x) if x != {} else 0).value_counts().head(10)

Number of casted persons in films


2     179
11    127
10    126
3     126
12    110
9     109
8     109
14    104
4     101
7      94
Name: crew, dtype: int64

In [134]:
list_of_crew_names = list(train['crew'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
Counter([i for j in list_of_crew_names for i in j]).most_common(15)

[('Avy Kaufman', 50),
 ('Robert Rodriguez', 44),
 ('Deborah Aquila', 40),
 ('James Newton Howard', 39),
 ('Mary Vernieu', 38),
 ('Steven Spielberg', 37),
 ('Luc Besson', 37),
 ('Jerry Goldsmith', 37),
 ('Francine Maisler', 35),
 ('Tricia Wood', 35),
 ('James Horner', 33),
 ('Kerry Barden', 32),
 ('Bob Weinstein', 30),
 ('Harvey Weinstein', 30),
 ('Janet Hirshenson', 30)]

In [135]:
list_of_crew_names_url = list(train['crew'].apply(lambda x: [(i['name'], i['profile_path'], i['job']) for i in x] if x != {} else []).values)
d = Counter([i for j in list_of_crew_names_url for i in j]).most_common(16)

In [136]:
list_of_crew_jobs = list(train['crew'].apply(lambda x: [i['job'] for i in x] if x != {} else []).values)
Counter([i for j in list_of_crew_jobs for i in j]).most_common(15)

[('Producer', 6011),
 ('Executive Producer', 3459),
 ('Director', 3225),
 ('Screenplay', 2996),
 ('Editor', 2824),
 ('Casting', 2483),
 ('Director of Photography', 2288),
 ('Original Music Composer', 1947),
 ('Art Direction', 1821),
 ('Production Design', 1650),
 ('Costume Design', 1573),
 ('Writer', 1523),
 ('Set Decoration', 1345),
 ('Makeup Artist', 1108),
 ('Sound Re-Recording Mixer', 970)]

In [137]:
list_of_crew_genders = list(train['crew'].apply(lambda x: [i['gender'] for i in x] if x != {} else []).values)
Counter([i for j in list_of_crew_genders for i in j]).most_common(15)

[(0, 41787), (2, 24898), (1, 6412)]

In [138]:
list_of_crew_departments = list(train['crew'].apply(lambda x: [i['department'] for i in x] if x != {} else []).values)
Counter([i for j in list_of_crew_departments for i in j]).most_common(14)

[('Production', 15887),
 ('Sound', 9319),
 ('Art', 8069),
 ('Crew', 7315),
 ('Writing', 6567),
 ('Costume & Make-Up', 6156),
 ('Camera', 5424),
 ('Directing', 4954),
 ('Editing', 4508),
 ('Visual Effects', 3591),
 ('Lighting', 1303),
 ('Actors', 4)]

In [139]:
list_of_crew_names = train['crew'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values
Counter([i for j in list_of_crew_names for i in j]).most_common(15)

[('Avy Kaufman', 50),
 ('Robert Rodriguez', 44),
 ('Deborah Aquila', 40),
 ('James Newton Howard', 39),
 ('Mary Vernieu', 38),
 ('Steven Spielberg', 37),
 ('Luc Besson', 37),
 ('Jerry Goldsmith', 37),
 ('Francine Maisler', 35),
 ('Tricia Wood', 35),
 ('James Horner', 33),
 ('Kerry Barden', 32),
 ('Bob Weinstein', 30),
 ('Harvey Weinstein', 30),
 ('Janet Hirshenson', 30)]

In [142]:
train['num_crew'] = train['crew'].apply(lambda x: len(x) if x != {} else 0)
top_crew_names = [m[0] for m in Counter([i for j in list_of_crew_names for i in j]).most_common(15)]
for g in top_crew_names:
    train['crew_name_' + g] = train['crew'].apply(lambda x: 1 if g in str(x) else 0)
train['genders_0_crew'] = train['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 0]))
train['genders_1_crew'] = train['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 1]))
train['genders_2_crew'] = train['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 2]))
top_crew_jobs = [m[0] for m in Counter([i for j in list_of_crew_jobs for i in j]).most_common(15)]
for j in top_crew_jobs:
    train['jobs_' + j] = train['crew'].apply(lambda x: sum([1 for i in x if i['job'] == j]))
top_crew_departments = [m[0] for m in Counter([i for j in list_of_crew_departments for i in j]).most_common(15)]
for j in top_crew_departments:
    train['departments_' + j] = train['crew'].apply(lambda x: sum([1 for i in x if i['department'] == j])) 
    
train = train.drop(['crew'], axis=1)

In [None]:
train.to_csv('~/Netflix/TMDB/tmdbtrain2a.csv', index = False)

In [4]:
#joined tmbdtrain2 with imbd ratings.csv dataset
tmdb = pd.read_csv('~/Netflix/TMDB/tmdbtrain2imdbratings.csv')

In [5]:
tmdb.head()

Unnamed: 0,id,budget,imdb_id,original_language,original_title,overview,popularity,release_date,runtime,title,revenue,has_collection,num_genres,all_genres,genre_Drama,genre_Comedy,genre_Thriller,genre_Action,genre_Romance,genre_Crime,genre_Adventure,genre_Horror,genre_Science Fiction,genre_Family,genre_Fantasy,genre_Mystery,genre_Animation,genre_History,genre_Music,num_companies,production_company_Warner Bros.,production_company_Universal Pictures,production_company_Paramount Pictures,production_company_Twentieth Century Fox Film Corporation,production_company_Columbia Pictures,production_company_Metro-Goldwyn-Mayer (MGM),production_company_New Line Cinema,production_company_Touchstone Pictures,production_company_Walt Disney Pictures,production_company_Columbia Pictures Corporation,production_company_TriStar Pictures,production_company_Relativity Media,production_company_Canal+,production_company_United Artists,production_company_Miramax Films,production_company_Village Roadshow Pictures,production_company_Regency Enterprises,production_company_BBC Films,production_company_Dune Entertainment,production_company_Working Title Films,production_company_Fox Searchlight Pictures,production_company_StudioCanal,production_company_Lionsgate,production_company_DreamWorks SKG,production_company_Fox 2000 Pictures,production_company_Summit Entertainment,production_company_Hollywood Pictures,production_company_Orion Pictures,production_company_Amblin Entertainment,production_company_Dimension Films,num_countries,production_country_United States of America,production_country_United Kingdom,production_country_France,production_country_Germany,production_country_Canada,production_country_India,production_country_Italy,production_country_Japan,production_country_Australia,production_country_Russia,production_country_Spain,production_country_China,production_country_Hong Kong,production_country_Ireland,production_country_Belgium,production_country_South Korea,production_country_Mexico,production_country_Sweden,production_country_New Zealand,production_country_Netherlands,production_country_Czech Republic,production_country_Denmark,production_country_Brazil,production_country_Luxembourg,production_country_South Africa,num_languages,language_English,language_Français,language_Español,language_Deutsch,language_Pусский,language_Italiano,language_日本語,language_普通话,language_हिन्दी,language_,language_Português,language_العربية,language_한국어/조선말,language_广州话 / 廣州話,language_தமிழ்,language_Polski,language_Magyar,language_Latin,language_svenska,language_ภาษาไทย,language_Český,language_עִבְרִית,language_ελληνικά,language_Türkçe,language_Dansk,language_Nederlands,language_فارسی,language_Tiếng Việt,language_اردو,language_Română,num_Keywords,keyword_woman director,keyword_independent film,keyword_duringcreditsstinger,keyword_murder,keyword_based on novel,keyword_violence,keyword_sport,keyword_biography,keyword_aftercreditsstinger,keyword_dystopia,keyword_revenge,keyword_friendship,keyword_sex,keyword_suspense,keyword_sequel,keyword_love,keyword_police,keyword_teenager,keyword_nudity,keyword_female nudity,keyword_drug,keyword_prison,keyword_musical,keyword_high school,keyword_los angeles,keyword_new york,keyword_family,keyword_father son relationship,keyword_kidnapping,keyword_investigation,num_cast,cast_name_Samuel L. Jackson,cast_name_Robert De Niro,cast_name_Morgan Freeman,cast_name_J.K. Simmons,cast_name_Bruce Willis,cast_name_Liam Neeson,cast_name_Susan Sarandon,cast_name_Bruce McGill,cast_name_John Turturro,cast_name_Forest Whitaker,cast_name_Willem Dafoe,cast_name_Bill Murray,cast_name_Owen Wilson,cast_name_Nicolas Cage,cast_name_Sylvester Stallone,genders_0_cast,genders_1_cast,genders_2_cast,cast_character_,cast_character_Himself,cast_character_Herself,cast_character_Dancer,cast_character_Additional Voices (voice),cast_character_Doctor,cast_character_Reporter,cast_character_Waitress,cast_character_Nurse,cast_character_Bartender,cast_character_Jack,cast_character_Debutante,cast_character_Security Guard,cast_character_Paul,cast_character_Frank,num_crew,crew_name_Avy Kaufman,crew_name_Robert Rodriguez,crew_name_Deborah Aquila,crew_name_James Newton Howard,crew_name_Mary Vernieu,crew_name_Steven Spielberg,crew_name_Luc Besson,crew_name_Jerry Goldsmith,crew_name_Francine Maisler,crew_name_Tricia Wood,crew_name_James Horner,crew_name_Kerry Barden,crew_name_Bob Weinstein,crew_name_Harvey Weinstein,crew_name_Janet Hirshenson,genders_0_crew,genders_1_crew,genders_2_crew,jobs_Producer,jobs_Executive Producer,jobs_Director,jobs_Screenplay,jobs_Editor,jobs_Casting,jobs_Director of Photography,jobs_Original Music Composer,jobs_Art Direction,jobs_Production Design,jobs_Costume Design,jobs_Writer,jobs_Set Decoration,jobs_Makeup Artist,jobs_Sound Re-Recording Mixer,departments_Production,departments_Sound,departments_Art,departments_Crew,departments_Writing,departments_Costume & Make-Up,departments_Camera,departments_Directing,departments_Editing,departments_Visual Effects,departments_Lighting,departments_Actors,weighted_average_vote,total_votes,mean_vote,median_vote,votes_10,votes_9,votes_8,votes_7,votes_6,votes_5,votes_4,votes_3,votes_2,votes_1,allgenders_0age_avg_vote,allgenders_0age_votes,allgenders_18age_avg_vote,allgenders_18age_votes,allgenders_30age_avg_vote,allgenders_30age_votes,allgenders_45age_avg_vote,allgenders_45age_votes,males_allages_avg_vote,males_allages_votes,males_0age_avg_vote,males_0age_votes,males_18age_avg_vote,males_18age_votes,males_30age_avg_vote,males_30age_votes,males_45age_avg_vote,males_45age_votes,females_allages_avg_vote,females_allages_votes,females_0age_avg_vote,females_0age_votes,females_18age_avg_vote,females_18age_votes,females_30age_avg_vote,females_30age_votes,females_45age_avg_vote,females_45age_votes,top1000_voters_rating,top1000_voters_votes,us_voters_rating,us_voters_votes,non_us_voters_rating,non_us_voters_votes
0,1,14000000,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,2/20/15,93.0,Hot Tub Time Machine 2,12314651,1,1,Comedy,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,8,10,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,72,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,59,0,13,1,3,1,0,1,1,1,1,1,1,1,1,1,4,2,9,10,12,4,2,13,8,4,2,4,4,0,5.1,36333.0,5.2,5.0,1291.0,591.0,1650.0,4353.0,8070.0,8490.0,5127.0,3000.0,1903.0,1858.0,4.6,12.0,5.3,7860.0,5.0,16795.0,4.7,3684.0,5.0,26622.0,4.5,10.0,5.3,6814.0,5.0,14967.0,4.7,3247.0,5.0,3181.0,5.0,2.0,5.3,940.0,4.9,1639.0,4.7,379.0,4.6,305.0,5.1,6774.0,5.0,16197.0
1,2,40000000,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,8/6/04,113.0,The Princess Diaries 2: Royal Engagement,95149435,1,4,Comedy Drama Family Romance,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,10,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4,4,3,1,1,1,1,0,1,1,0,0,0,0,0,0,0,4,1,0,0,1,0,1,1,1,0,0,0,5.8,72487.0,6.2,6.0,6773.0,3124.0,6953.0,11965.0,17133.0,13661.0,6353.0,3055.0,1703.0,1767.0,6.6,54.0,6.0,23474.0,5.5,24253.0,5.7,5200.0,5.2,20179.0,5.6,9.0,5.2,6123.0,5.1,9911.0,5.3,3209.0,6.2,35021.0,7.0,38.0,6.4,16840.0,5.9,14050.0,6.3,1911.0,5.0,382.0,5.8,13534.0,5.6,30354.0
2,3,3300000,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,10/10/14,105.0,Whiplash,13092000,0,1,Drama,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,51,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,31,7,13,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,49,4,11,4,4,1,1,1,2,1,1,1,1,1,0,1,1,2,18,9,5,9,1,5,4,3,6,3,1,0,8.5,690732.0,8.4,9.0,150263.0,222670.0,190135.0,81256.0,24384.0,9146.0,4671.0,2552.0,1834.0,3821.0,8.8,672.0,8.7,193118.0,8.4,218920.0,8.1,42168.0,8.5,399942.0,8.9,486.0,8.7,148527.0,8.4,178434.0,8.1,34769.0,8.3,92224.0,8.0,109.0,8.5,40301.0,8.2,36635.0,8.2,6409.0,7.8,696.0,8.6,67101.0,8.4,273970.0
3,4,1200000,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,3/9/12,122.0,Kahaani,16000000,0,2,Drama Thriller,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,8.1,56468.0,8.3,9.0,12523.0,16285.0,15570.0,6879.0,2174.0,876.0,409.0,262.0,236.0,1254.0,8.2,26.0,8.3,16818.0,8.0,21378.0,7.1,1860.0,8.1,38664.0,7.9,20.0,8.3,15291.0,8.0,19274.0,7.1,1531.0,8.1,4112.0,8.2,5.0,8.2,1430.0,8.0,1963.0,7.2,308.0,5.8,208.0,7.7,3221.0,8.0,23351.0
4,5,0,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,2/5/09,118.0,Marine Boy,3923970,0,2,Action Thriller,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,5.7,291.0,6.1,6.0,26.0,8.0,15.0,52.0,88.0,54.0,23.0,14.0,2.0,9.0,,,7.0,25.0,5.8,173.0,5.4,51.0,5.8,216.0,,,7.7,14.0,5.8,152.0,5.4,45.0,5.8,35.0,,,5.7,10.0,5.8,20.0,5.8,5.0,5.3,13.0,6.3,36.0,5.7,202.0


In [7]:
tmdb.shape

(3000, 276)

In [None]:
#LINEAR MODELS

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.formula.api as smf
#version 3
def statsmodels_train_test_split(df, stratify=None, **kwargs):

    if stratify is None:
        y, X = df.iloc[:,0], df.drop(columns=df.columns[0])
        X_train, X_test, y_train, y_test = train_test_split(X,y, **kwargs)
    else:
        y, X = stratify, df.drop(columns = stratify.name)
        X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y, **kwargs)
    
    return pd.concat([X_train, y_train], axis=1), pd.concat([X_test, y_test], axis=1)

In [39]:
tmdb.columns = [item.replace(" ","_") for item in tmdb.columns]
tmdb

Unnamed: 0,id,budget,imdb_id,original_language,original_title,overview,popularity,release_date,runtime,title,revenue,has_collection,num_genres,all_genres,genre_Drama,genre_Comedy,genre_Thriller,genre_Action,genre_Romance,genre_Crime,genre_Adventure,genre_Horror,genre_Science_Fiction,genre_Family,genre_Fantasy,genre_Mystery,genre_Animation,genre_History,genre_Music,num_companies,production_company_Warner_Bros.,production_company_Universal_Pictures,production_company_Paramount_Pictures,production_company_Twentieth_Century_Fox_Film_Corporation,production_company_Columbia_Pictures,production_company_Metro-Goldwyn-Mayer_(MGM),production_company_New_Line_Cinema,production_company_Touchstone_Pictures,production_company_Walt_Disney_Pictures,production_company_Columbia_Pictures_Corporation,production_company_TriStar_Pictures,production_company_Relativity_Media,production_company_Canal+,production_company_United_Artists,production_company_Miramax_Films,production_company_Village_Roadshow_Pictures,production_company_Regency_Enterprises,production_company_BBC_Films,production_company_Dune_Entertainment,production_company_Working_Title_Films,production_company_Fox_Searchlight_Pictures,production_company_StudioCanal,production_company_Lionsgate,production_company_DreamWorks_SKG,production_company_Fox_2000_Pictures,production_company_Summit_Entertainment,production_company_Hollywood_Pictures,production_company_Orion_Pictures,production_company_Amblin_Entertainment,production_company_Dimension_Films,num_countries,production_country_United_States_of_America,production_country_United_Kingdom,production_country_France,production_country_Germany,production_country_Canada,production_country_India,production_country_Italy,production_country_Japan,production_country_Australia,production_country_Russia,production_country_Spain,production_country_China,production_country_Hong_Kong,production_country_Ireland,production_country_Belgium,production_country_South_Korea,production_country_Mexico,production_country_Sweden,production_country_New_Zealand,production_country_Netherlands,production_country_Czech_Republic,production_country_Denmark,production_country_Brazil,production_country_Luxembourg,production_country_South_Africa,num_languages,language_English,language_Français,language_Español,language_Deutsch,language_Pусский,language_Italiano,language_日本語,language_普通话,language_हिन्दी,language_,language_Português,language_العربية,language_한국어/조선말,language_广州话_/_廣州話,language_தமிழ்,language_Polski,language_Magyar,language_Latin,language_svenska,language_ภาษาไทย,language_Český,language_עִבְרִית,language_ελληνικά,language_Türkçe,language_Dansk,language_Nederlands,language_فارسی,language_Tiếng_Việt,language_اردو,language_Română,num_Keywords,keyword_woman_director,keyword_independent_film,keyword_duringcreditsstinger,keyword_murder,keyword_based_on_novel,keyword_violence,keyword_sport,keyword_biography,keyword_aftercreditsstinger,keyword_dystopia,keyword_revenge,keyword_friendship,keyword_sex,keyword_suspense,keyword_sequel,keyword_love,keyword_police,keyword_teenager,keyword_nudity,keyword_female_nudity,keyword_drug,keyword_prison,keyword_musical,keyword_high_school,keyword_los_angeles,keyword_new_york,keyword_family,keyword_father_son_relationship,keyword_kidnapping,keyword_investigation,num_cast,cast_name_Samuel_L._Jackson,cast_name_Robert_De_Niro,cast_name_Morgan_Freeman,cast_name_J.K._Simmons,cast_name_Bruce_Willis,cast_name_Liam_Neeson,cast_name_Susan_Sarandon,cast_name_Bruce_McGill,cast_name_John_Turturro,cast_name_Forest_Whitaker,cast_name_Willem_Dafoe,cast_name_Bill_Murray,cast_name_Owen_Wilson,cast_name_Nicolas_Cage,cast_name_Sylvester_Stallone,genders_0_cast,genders_1_cast,genders_2_cast,cast_character_,cast_character_Himself,cast_character_Herself,cast_character_Dancer,cast_character_Additional_Voices_(voice),cast_character_Doctor,cast_character_Reporter,cast_character_Waitress,cast_character_Nurse,cast_character_Bartender,cast_character_Jack,cast_character_Debutante,cast_character_Security_Guard,cast_character_Paul,cast_character_Frank,num_crew,crew_name_Avy_Kaufman,crew_name_Robert_Rodriguez,crew_name_Deborah_Aquila,crew_name_James_Newton_Howard,crew_name_Mary_Vernieu,crew_name_Steven_Spielberg,crew_name_Luc_Besson,crew_name_Jerry_Goldsmith,crew_name_Francine_Maisler,crew_name_Tricia_Wood,crew_name_James_Horner,crew_name_Kerry_Barden,crew_name_Bob_Weinstein,crew_name_Harvey_Weinstein,crew_name_Janet_Hirshenson,genders_0_crew,genders_1_crew,genders_2_crew,jobs_Producer,jobs_Executive_Producer,jobs_Director,jobs_Screenplay,jobs_Editor,jobs_Casting,jobs_Director_of_Photography,jobs_Original_Music_Composer,jobs_Art_Direction,jobs_Production_Design,jobs_Costume_Design,jobs_Writer,jobs_Set_Decoration,jobs_Makeup_Artist,jobs_Sound_Re-Recording_Mixer,departments_Production,departments_Sound,departments_Art,departments_Crew,departments_Writing,departments_Costume_&_Make-Up,departments_Camera,departments_Directing,departments_Editing,departments_Visual_Effects,departments_Lighting,departments_Actors,weighted_average_vote,total_votes,mean_vote,median_vote,votes_10,votes_9,votes_8,votes_7,votes_6,votes_5,votes_4,votes_3,votes_2,votes_1,allgenders_0age_avg_vote,allgenders_0age_votes,allgenders_18age_avg_vote,allgenders_18age_votes,allgenders_30age_avg_vote,allgenders_30age_votes,allgenders_45age_avg_vote,allgenders_45age_votes,males_allages_avg_vote,males_allages_votes,males_0age_avg_vote,males_0age_votes,males_18age_avg_vote,males_18age_votes,males_30age_avg_vote,males_30age_votes,males_45age_avg_vote,males_45age_votes,females_allages_avg_vote,females_allages_votes,females_0age_avg_vote,females_0age_votes,females_18age_avg_vote,females_18age_votes,females_30age_avg_vote,females_30age_votes,females_45age_avg_vote,females_45age_votes,top1000_voters_rating,top1000_voters_votes,us_voters_rating,us_voters_votes,non_us_voters_rating,non_us_voters_votes
0,1,14000000,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,2/20/15,93.0,Hot Tub Time Machine 2,12314651,1,1,Comedy,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,8,10,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,72,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,59,0,13,1,3,1,0,1,1,1,1,1,1,1,1,1,4,2,9,10,12,4,2,13,8,4,2,4,4,0,5.1,36333.0,5.2,5.0,1291.0,591.0,1650.0,4353.0,8070.0,8490.0,5127.0,3000.0,1903.0,1858.0,4.6,12.0,5.3,7860.0,5.0,16795.0,4.7,3684.0,5.0,26622.0,4.5,10.0,5.3,6814.0,5.0,14967.0,4.7,3247.0,5.0,3181.0,5.0,2.0,5.3,940.0,4.9,1639.0,4.7,379.0,4.6,305.0,5.1,6774.0,5.0,16197.0
1,2,40000000,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,8/6/04,113.0,The Princess Diaries 2: Royal Engagement,95149435,1,4,Comedy Drama Family Romance,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,10,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4,4,3,1,1,1,1,0,1,1,0,0,0,0,0,0,0,4,1,0,0,1,0,1,1,1,0,0,0,5.8,72487.0,6.2,6.0,6773.0,3124.0,6953.0,11965.0,17133.0,13661.0,6353.0,3055.0,1703.0,1767.0,6.6,54.0,6.0,23474.0,5.5,24253.0,5.7,5200.0,5.2,20179.0,5.6,9.0,5.2,6123.0,5.1,9911.0,5.3,3209.0,6.2,35021.0,7.0,38.0,6.4,16840.0,5.9,14050.0,6.3,1911.0,5.0,382.0,5.8,13534.0,5.6,30354.0
2,3,3300000,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.299990,10/10/14,105.0,Whiplash,13092000,0,1,Drama,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,51,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,31,7,13,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,49,4,11,4,4,1,1,1,2,1,1,1,1,1,0,1,1,2,18,9,5,9,1,5,4,3,6,3,1,0,8.5,690732.0,8.4,9.0,150263.0,222670.0,190135.0,81256.0,24384.0,9146.0,4671.0,2552.0,1834.0,3821.0,8.8,672.0,8.7,193118.0,8.4,218920.0,8.1,42168.0,8.5,399942.0,8.9,486.0,8.7,148527.0,8.4,178434.0,8.1,34769.0,8.3,92224.0,8.0,109.0,8.5,40301.0,8.2,36635.0,8.2,6409.0,7.8,696.0,8.6,67101.0,8.4,273970.0
3,4,1200000,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,3/9/12,122.0,Kahaani,16000000,0,2,Drama Thriller,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,8.1,56468.0,8.3,9.0,12523.0,16285.0,15570.0,6879.0,2174.0,876.0,409.0,262.0,236.0,1254.0,8.2,26.0,8.3,16818.0,8.0,21378.0,7.1,1860.0,8.1,38664.0,7.9,20.0,8.3,15291.0,8.0,19274.0,7.1,1531.0,8.1,4112.0,8.2,5.0,8.2,1430.0,8.0,1963.0,7.2,308.0,5.8,208.0,7.7,3221.0,8.0,23351.0
4,5,0,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.148070,2/5/09,118.0,Marine Boy,3923970,0,2,Action Thriller,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,5.7,291.0,6.1,6.0,26.0,8.0,15.0,52.0,88.0,54.0,23.0,14.0,2.0,9.0,,,7.0,25.0,5.8,173.0,5.4,51.0,5.8,216.0,,,7.7,14.0,5.8,152.0,5.4,45.0,5.8,35.0,,,5.7,10.0,5.8,20.0,5.8,5.0,5.3,13.0,6.3,36.0,5.7,202.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,0,tt0109403,en,Chasers,Military men Rock Reilly and Eddie Devane are ...,9.853270,4/22/94,102.0,Chasers,1596687,0,2,Comedy Romance,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,19,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,11,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,14,1,1,1,3,1,1,1,0,1,0,0,0,0,0,0,6,2,1,0,5,0,1,1,1,0,0,0,5.2,5675.0,5.2,5.0,239.0,101.0,226.0,658.0,1151.0,1452.0,895.0,489.0,239.0,225.0,5.0,2.0,5.7,189.0,5.1,2628.0,5.1,1571.0,5.1,4075.0,6.0,1.0,5.5,145.0,5.1,2382.0,5.1,1432.0,5.3,395.0,4.0,1.0,6.3,43.0,5.0,221.0,5.4,122.0,5.0,233.0,5.1,1352.0,5.1,2776.0
2996,2997,0,tt2364975,sv,Vi är bäst!,Three girls in 1980s Stockholm decide to form ...,3.727996,3/28/13,102.0,We Are the Best!,180590,0,2,Drama Music,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,1,4,1,0,1,1,1,2,1,0,0,0,0,0,0,0,0,8,1,0,0,2,0,1,2,1,0,0,0,7.2,9875.0,7.2,7.0,642.0,1112.0,2597.0,2851.0,1549.0,588.0,211.0,132.0,82.0,111.0,6.5,2.0,7.3,2127.0,7.2,4480.0,6.9,1535.0,7.2,6181.0,,,7.4,1228.0,7.2,3434.0,6.9,1316.0,7.2,2096.0,6.5,2.0,7.3,845.0,7.2,964.0,6.9,188.0,6.2,145.0,7.5,1381.0,7.1,5669.0
2997,2998,65000000,tt0116908,en,The Long Kiss Goodnight,"Samantha Caine, suburban homemaker, is the ide...",14.482345,10/11/96,120.0,The Long Kiss Goodnight,89456761,0,4,Action Crime Mystery Thriller,0,0,1,1,0,1,0,0,0,0,0,1,0,0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,4,9,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,6,4,0,1,1,1,0,1,1,0,0,0,0,0,0,0,4,1,0,1,1,0,1,1,1,0,0,0,6.8,71497.0,6.9,7.0,5301.0,5335.0,13684.0,21446.0,14274.0,6030.0,2584.0,1271.0,709.0,863.0,6.5,4.0,6.6,3789.0,6.7,34724.0,6.9,16020.0,6.7,47814.0,7.0,3.0,6.6,2983.0,6.7,29827.0,6.9,13521.0,7.0,7950.0,5.0,1.0,6.7,769.0,7.0,4557.0,7.2,2274.0,6.4,578.0,6.8,15311.0,6.7,34584.0
2998,2999,42000000,tt0343135,en,Along Came Polly,Reuben Feffer is a guy who's spent his entire ...,15.725542,1/16/04,90.0,Along Came Polly,171963386,0,2,Comedy Romance,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,9,16,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,89,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,61,10,18,3,2,1,0,2,1,1,1,1,1,1,1,1,2,0,14,13,12,10,1,14,9,4,9,0,3,0,6.0,124480.0,6.0,6.0,4027.0,3294.0,10535.0,27701.0,39413.0,23081.0,8929.0,3857.0,1894.0,1749.0,6.4,18.0,6.0,20213.0,5.9,64373.0,6.0,14238.0,6.0,73970.0,6.6,12.0,6.0,12149.0,6.0,47855.0,6.0,11373.0,5.8,27561.0,5.8,6.0,5.9,7794.0,5.8,15884.0,5.9,2657.0,5.6,518.0,5.9,26267.0,5.9,61434.0


In [None]:
#PREDICTING REVENUE 

In [41]:
train, test = statsmodels_train_test_split(tmdb)

In [42]:
formula = 'revenue ~ C(has_collection) + budget'
lm = smf.ols(formula=formula, data=train).fit()
lm.summary()

0,1,2,3
Dep. Variable:,revenue,R-squared:,0.583
Model:,OLS,Adj. R-squared:,0.582
Method:,Least Squares,F-statistic:,1568.0
Date:,"Mon, 21 Sep 2020",Prob (F-statistic):,0.0
Time:,16:48:53,Log-Likelihood:,-44315.0
No. Observations:,2250,AIC:,88640.0
Df Residuals:,2247,BIC:,88650.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-3.836e+06,2.26e+06,-1.694,0.090,-8.28e+06,6.04e+05
C(has_collection)[T.1],6.587e+07,4.65e+06,14.150,0.000,5.67e+07,7.5e+07
budget,2.5521,0.051,49.998,0.000,2.452,2.652

0,1,2,3
Omnibus:,1603.107,Durbin-Watson:,1.985
Prob(Omnibus):,0.0,Jarque-Bera (JB):,49484.329
Skew:,2.971,Prob(JB):,0.0
Kurtosis:,25.193,Cond. No.,112000000.0


In [43]:
y_test_pred = lm.predict(test)

In [44]:
y_test_pred = lm.predict(test)
r2_score(test['revenue'], y_test_pred)
#mean_squared_error(test['revenue'], y_test_pred)

0.6331998044628573

In [45]:
mean_squared_error(test['revenue'], y_test_pred)

7975991747965862.0

In [143]:
formula = 'revenue ~ C(has_collection) + budget + C(production_company_Walt_Disney_Pictures) + C(genre_Adventure) + C(keyword_suspense) + C(num_languages) + C(production_company_Miramax_Films)'
lm = smf.ols(formula=formula, data=train).fit()
lm.summary()

PatsyError: Error evaluating factor: NameError: name 'has_collection' is not defined
    revenue ~ C(has_collection) + budget + C(production_company_Walt_Disney_Pictures) + C(genre_Adventure) + C(keyword_suspense) + C(num_languages) + C(production_company_Miramax_Films)
              ^^^^^^^^^^^^^^^^^

In [46]:
#PREDICTING RATING (WEIGHTED AVG VOTE)

In [52]:
formula = 'weighted_average_vote ~ C(has_collection) + budget + C(production_company_Walt_Disney_Pictures) + C(genre_Adventure) + C(keyword_suspense) + C(num_languages) + C(production_company_Miramax_Films)'
lm2 = smf.ols(formula=formula, data=train).fit()
lm2.summary()

0,1,2,3
Dep. Variable:,weighted_average_vote,R-squared:,0.033
Model:,OLS,Adj. R-squared:,0.027
Method:,Least Squares,F-statistic:,5.634
Date:,"Mon, 21 Sep 2020",Prob (F-statistic):,3.16e-10
Time:,16:51:41,Log-Likelihood:,-2981.4
No. Observations:,2145,AIC:,5991.0
Df Residuals:,2131,BIC:,6070.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.6860,0.282,20.166,0.000,5.133,6.239
C(has_collection)[T.1],-0.1927,0.054,-3.558,0.000,-0.299,-0.087
C(production_company_Walt_Disney_Pictures)[T.1],0.2159,0.158,1.366,0.172,-0.094,0.526
C(genre_Adventure)[T.1],-0.1558,0.065,-2.406,0.016,-0.283,-0.029
C(keyword_suspense)[T.1],0.1012,0.131,0.773,0.440,-0.156,0.358
C(num_languages)[T.1],0.7364,0.283,2.600,0.009,0.181,1.292
C(num_languages)[T.2],0.8631,0.287,3.012,0.003,0.301,1.425
C(num_languages)[T.3],1.0848,0.293,3.701,0.000,0.510,1.660
C(num_languages)[T.4],1.3159,0.313,4.203,0.000,0.702,1.930

0,1,2,3
Omnibus:,180.499,Durbin-Watson:,1.939
Prob(Omnibus):,0.0,Jarque-Bera (JB):,282.217
Skew:,-0.634,Prob(JB):,5.22e-62
Kurtosis:,4.244,Cond. No.,8.800000000000001e+27


In [30]:
tmdb.columns

Index(['id', 'budget', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'release_date', 'runtime', 'title',
       ...
       'females_30age_avg_vote', 'females_30age_votes',
       'females_45age_avg_vote', 'females_45age_votes',
       'top1000_voters_rating', 'top1000_voters_votes', 'us_voters_rating',
       'us_voters_votes', 'non_us_voters_rating', 'non_us_voters_votes'],
      dtype='object', length=276)