In [1]:
import pandas as pd
import numpy as np
import random as random

# plotly standard imports
import plotly.graph_objs as go
import plotly.plotly as py



In [2]:
# Cufflinks wrapper on plotly
import cufflinks
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from plotly.offline import iplot
cufflinks.go_offline()

# Set global theme
cufflinks.set_config_file(world_readable=True, theme='pearl')
import plotly.figure_factory as ff


# Data input

In [3]:
input_df=pd.read_csv("train.csv", sep=",")
X=input_df.drop("revenue", axis=1) # drop labels for training set
y=input_df["revenue"]

In [4]:
input_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 23 columns):
id                       3000 non-null int64
belongs_to_collection    604 non-null object
budget                   3000 non-null int64
genres                   2993 non-null object
homepage                 946 non-null object
imdb_id                  3000 non-null object
original_language        3000 non-null object
original_title           3000 non-null object
overview                 2992 non-null object
popularity               3000 non-null float64
poster_path              2999 non-null object
production_companies     2844 non-null object
production_countries     2945 non-null object
release_date             3000 non-null object
runtime                  2998 non-null float64
spoken_languages         2980 non-null object
status                   3000 non-null object
tagline                  2403 non-null object
title                    3000 non-null object
Keywords             

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [6]:
type(y_train)

pandas.core.series.Series

In [7]:
X_train.reset_index(inplace=True);
X_test.reset_index(inplace=True);
y_train=pd.DataFrame(y_train)
y_test=pd.DataFrame(y_test)

In [8]:
y_train.reset_index(inplace=True);
y_test.reset_index(inplace=True);

In [9]:
y_train=y_train.drop("index", axis=1) # drop labels for training set
y_test=y_test.drop("index", axis=1) # drop labels for training set

In [10]:
df=pd.concat([X_train, y_train], axis=1, sort=False)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2100 entries, 0 to 2099
Data columns (total 24 columns):
index                    2100 non-null int64
id                       2100 non-null int64
belongs_to_collection    425 non-null object
budget                   2100 non-null int64
genres                   2095 non-null object
homepage                 673 non-null object
imdb_id                  2100 non-null object
original_language        2100 non-null object
original_title           2100 non-null object
overview                 2092 non-null object
popularity               2100 non-null float64
poster_path              2099 non-null object
production_companies     1978 non-null object
production_countries     2059 non-null object
release_date             2100 non-null object
runtime                  2098 non-null float64
spoken_languages         2085 non-null object
status                   2100 non-null object
tagline                  1659 non-null object
title                 

# Removing Nulls and Data exploration

In [12]:
X_train.isna().sum()

index                       0
id                          0
belongs_to_collection    1675
budget                      0
genres                      5
homepage                 1427
imdb_id                     0
original_language           0
original_title              0
overview                    8
popularity                  0
poster_path                 1
production_companies      122
production_countries       41
release_date                0
runtime                     2
spoken_languages           15
status                      0
tagline                   441
title                       0
Keywords                  194
cast                        9
crew                       12
dtype: int64

In [13]:
from sklearn.base import TransformerMixin

In [14]:
class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)


In [15]:
X_train_t=DataFrameImputer().fit_transform(X_train)

In [16]:
df_t=DataFrameImputer().fit_transform(df)

In [17]:
X_train_t.isna().sum()

index                    0
id                       0
belongs_to_collection    0
budget                   0
genres                   0
homepage                 0
imdb_id                  0
original_language        0
original_title           0
overview                 0
popularity               0
poster_path              0
production_companies     0
production_countries     0
release_date             0
runtime                  0
spoken_languages         0
status                   0
tagline                  0
title                    0
Keywords                 0
cast                     0
crew                     0
dtype: int64

In [18]:
#Comparing correlatons between Consumption_per_NIA/log_NIA/Property Type 


figure = ff.create_scatterplotmatrix(
    df[['budget', 'popularity',"runtime","revenue"]],
    height=1000,
    width=1000,
    diag='histogram',
    title="Data Exploration")
iplot(figure)

In [19]:
corrs = df[["revenue","popularity","runtime","budget"]].corr()
figure = ff.create_annotated_heatmap(
    z=corrs.values,
    x=list(corrs.columns),
    y=list(corrs.index),
    annotation_text=corrs.round(2).values,
    showscale=True)

iplot(figure)

# Feature engineering

In [20]:
import ast


def cat_list(line):
    c_list=[]
    line_eval=ast.literal_eval(line)
    for d in line_eval:
        c_list.append(d["name"])
    return(c_list)

In [21]:
def get_left(string):
    try:
        if len(string)==8:
            return(int(string[:2]))
        else:
            return(int(string[:1]))
    except: return 0

In [22]:

def get_year(string):
    try:
        if int(string[-2:])< 20:
            return int(string[-2:])+2000
        else:
            return int(string[-2:])+1900
    except:
        return 0

In [23]:
X_train_t.head()

Unnamed: 0,index,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,...,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew
0,611,612,"[{'id': 645, 'name': 'James Bond Collection', ...",8575000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://www.transformersmovie.com/,tt4016934,ko,아가씨,"1930s Korea, in the period of Japanese occupat...",...,"[{'iso_3166_1': 'KR', 'name': 'South Korea'}]",6/1/16,145.0,"[{'iso_639_1': 'ja', 'name': '日本語'}, {'iso_639...",Released,Never did they expect to get into a controvers...,The Handmaiden,"[{'id': 293, 'name': 'female nudity'}, {'id': ...","[{'cast_id': 3, 'character': 'Lady Hideko', 'c...","[{'credit_id': '54056d0b0e0a2658f100c167', 'de..."
1,530,531,"[{'id': 366444, 'name': 'Demetrius Filmreihe',...",4100000,"[{'id': 18, 'name': 'Drama'}]",http://www.transformersmovie.com/,tt0046247,en,The Robe,Marcellus is a tribune in the time of Christ. ...,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",9/16/53,135.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The first motion picture in CinemaScope--the m...,The Robe,"[{'id': 3571, 'name': 'crucifixion'}, {'id': 5...","[{'cast_id': 1, 'character': 'Marcellus Gallio...","[{'credit_id': '52fe4603c3a368484e07be21', 'de..."
2,2787,2788,"[{'id': 645, 'name': 'James Bond Collection', ...",80000000,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",http://www.transformersmovie.com/,tt1037705,en,The Book of Eli,"A post-apocalyptic tale, in which a lone man f...",...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1/14/10,118.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Some will kill to have it. He will kill to pro...,The Book of Eli,"[{'id': 3096, 'name': 'book'}, {'id': 4458, 'n...","[{'cast_id': 1, 'character': 'Eli', 'credit_id...","[{'credit_id': '52fe43e9c3a368484e0058ad', 'de..."
3,49,50,"[{'id': 115570, 'name': 'Star Trek: The Next G...",38000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",http://www.transformersmovie.com/,tt0111280,en,Star Trek: Generations,Captain Jean-Luc Picard and the crew of the En...,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",11/17/94,118.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Boldly go.,Star Trek: Generations,"[{'id': 10988, 'name': 'based on tv series'}, ...","[{'cast_id': 5, 'character': 'Captain Jean-Luc...","[{'credit_id': '52fe4225c3a36847f80076c3', 'de..."
4,1883,1884,"[{'id': 454520, 'name': 'Captain Harlock Colle...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 878, ...",http://www.transformersmovie.com/,tt2668134,ja,キャプテンハーロック,Space Pirate Captain Harlock and his fearless ...,...,"[{'iso_3166_1': 'JP', 'name': 'Japan'}]",9/7/13,115.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,This is the heroic story of the men on the U.S...,Space Pirate Captain Harlock,"[{'id': 10183, 'name': 'independent film'}]","[{'cast_id': 1, 'character': 'Captain Harlock ...","[{'credit_id': '52fe4cac9251416c910fc711', 'de..."


In [24]:
from sklearn.base import BaseEstimator, TransformerMixin

In [25]:
df.columns

Index(['index', 'id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue'],
      dtype='object')

In [26]:

class DateAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        """

        init
        """
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
       
        year=X['release_date'].apply(lambda x: get_year(x))
        df_year=pd.DataFrame(year.values,columns=["year"])
        month=X['release_date'].apply(lambda x:get_left(x))
        df_month=pd.DataFrame(month.values,columns=["month"])
       
        processed_X=pd.concat([X, df_year, df_month], axis=1, sort=False)
      
        return processed_X

In [27]:
attr_adder=DateAttributesAdder()
X_train_t2=attr_adder.fit_transform(X_train_t)

In [28]:
X_train_t2.columns

Index(['index', 'id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'year',
       'month'],
      dtype='object')

# Unpacking categories and spoken languages

In [239]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder(sparse=False)

class CatAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        genres_list=X["genres"].apply(lambda x: cat_list(x))
        genres_label_data=mlb.fit_transform(genres_list)
        genres_labelClasses = mlb.classes_
        genresLabels=pd.DataFrame(genres_label_data, columns=genres_labelClasses)
        
        language_list=X['spoken_languages'].apply(lambda x: cat_list(x))
        language_label_data=mlb.fit_transform(language_list)
        language_labelClasses =  "spoken_lan_" + mlb.classes_
        languageLabels=pd.DataFrame(language_label_data, columns=language_labelClasses)
        languageLabels=languageLabels[[ 'spoken_lan_Deutsch', 'spoken_lan_English', 'spoken_lan_Español', 'spoken_lan_Français', 'spoken_lan_Italiano', 'spoken_lan_Pусский']]
        
        
        prod_country=X['production_countries'].apply(lambda x: cat_list(x))
        prod_country_label_data=mlb.fit_transform(prod_country)
        prod_country_labelClasses =  "prod_country_" + mlb.classes_
        prod_countryLabels=pd.DataFrame(prod_country_label_data, columns=prod_country_labelClasses)
        prod_countryLabels=prod_countryLabels[prod_countryLabels.columns[prod_countryLabels.sum()>len(X)/20]]
        
        
        prod_company=X['production_companies'].apply(lambda x: cat_list(x))
        prod_company_label_data=mlb.fit_transform(prod_company)
        prod_company_labelClasses = mlb.classes_
        prod_companyLabels=pd.DataFrame(prod_company_label_data, columns=prod_company_labelClasses)
        prod_companyLabels=prod_companyLabels[prod_companyLabels.columns[prod_companyLabels.sum()>len(X)/50]]
        
        org_lan_cat_1hot = cat_encoder.fit_transform(X[["original_language"]])
        df_org_lan=pd.DataFrame(org_lan_cat_1hot, columns=cat_encoder.categories_[0])
        df_org_lan=df_org_lan[df_org_lan.columns[df_org_lan.sum()>len(X)/100]]
        
        processed_X=pd.concat([X, genresLabels,languageLabels, prod_countryLabels, df_org_lan, prod_companyLabels], axis=1, sort=False)
        
         
        processed_X=processed_X[['budget',
                                   'popularity',
                                      'runtime',
                                       'Action',
                                    'Adventure',
                                    'Animation',
                                       'Comedy',
                                        'Crime',
                                  'Documentary',
                                        'Drama',
                                       'Family',
                                      'Fantasy',
                                      'Foreign',
                                      'History',
                                       'Horror',
                                        'Music',
                                      'Mystery',
                                      'Romance',
                              'Science Fiction',
                                     'Thriller',
                                          'War',
                                      'Western',
                           'spoken_lan_Deutsch',
                           'spoken_lan_English',
                           'spoken_lan_Español',
                          'spoken_lan_Français',
                          'spoken_lan_Italiano',
                           'spoken_lan_Pусский',
                          'prod_country_France',
                         'prod_country_Germany',
                  'prod_country_United Kingdom',
        'prod_country_United States of America',
                                        'en',
                                        'es',
                                        'fr',
                                        'hi',
                            'Columbia Pictures',
                    'Metro-Goldwyn-Mayer (MGM)',
                              'New Line Cinema',
                           'Paramount Pictures',
                          'Touchstone Pictures',
       'Twentieth Century Fox Film Corporation',
                           'Universal Pictures',
                                 'Warner Bros.', 
            "year", 
            "month"]]
        
        X["decade"]=(X["year"]-X["year"]%10)
        

        decade_cat_1hot = cat_encoder.fit_transform(X[["decade"]])
        df_year=pd.DataFrame(decade_cat_1hot, columns=cat_encoder.categories_[0])
        
        month_cat_1hot = cat_encoder.fit_transform(X[["month"]])
        df_month=pd.DataFrame(month_cat_1hot, columns=list(cat_encoder.categories_[0]))
        df_month.rename(columns={1: "Jan", 2: "Feb", 3:"Mar", 4:"Apr", 5:"May", 6:"Jun", 7:"Jul", 8:"Aug", 9:"Sep", 10:"Nov", 11:"Oct", 12:"Dec"}, inplace = True)


        
        processed_X=pd.concat([processed_X, df_year, df_month], axis=1, sort=False)
        
        processed_X=processed_X.drop(columns=["year","month"])
        return(processed_X)
        
        
     

In [240]:

attr_adder=CatAttributesAdder()
X_train_t3=attr_adder.fit_transform(X_train_t2)

In [241]:
X_train_t3.columns

Index([                                'budget',
                                   'popularity',
                                      'runtime',
                                       'Action',
                                    'Adventure',
                                    'Animation',
                                       'Comedy',
                                        'Crime',
                                  'Documentary',
                                        'Drama',
                                       'Family',
                                      'Fantasy',
                                      'Foreign',
                                      'History',
                                       'Horror',
                                        'Music',
                                      'Mystery',
                                      'Romance',
                              'Science Fiction',
                                     'Thriller',
                    

In [242]:
X=X_train_t2
X["decade"]=(X["year"]-X["year"]%10)
        

year_cat_1hot = cat_encoder.fit_transform(X[["decade"]])
df_year=pd.DataFrame(year_cat_1hot, columns=cat_encoder.categories_)

In [243]:
df_year.columns

MultiIndex(levels=[[1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010]],
           codes=[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])

In [182]:
month_cat_1hot = cat_encoder.fit_transform(X[["month"]])
df_month=pd.DataFrame(month_cat_1hot, columns=list(cat_encoder.categories_[0]))
df_month.rename(columns={1: "Jan", 2: "Feb", 3:"Mar", 4:"Apr", 5:"May", 6:"Jun", 7:"Jul", 8:"Aug", 9:"Sep", 10:"Nov", 11:"Oct", 12:"Dec"}, inplace=True)

In [183]:
df_month.columns

Index(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Nov',
       'Oct', 'Dec'],
      dtype='object')

In [184]:
list(cat_encoder.categories_[0])

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

# Preparation Pipeline

In [479]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

preparation_pipeline = Pipeline([
    ("imputer", DataFrameImputer()),
    ("date_adder", DateAttributesAdder()),
    ("cat_adder", CatAttributesAdder()),
    ])

In [480]:
X_train_prep=preparation_pipeline.fit_transform(X_train)

In [481]:
X_train_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2100 entries, 0 to 2099
Data columns (total 66 columns):
budget                                    2100 non-null int64
popularity                                2100 non-null float64
runtime                                   2100 non-null float64
Action                                    2100 non-null int32
Adventure                                 2100 non-null int32
Animation                                 2100 non-null int32
Comedy                                    2100 non-null int32
Crime                                     2100 non-null int32
Documentary                               2100 non-null int32
Drama                                     2100 non-null int32
Family                                    2100 non-null int32
Fantasy                                   2100 non-null int32
Foreign                                   2100 non-null int32
History                                   2100 non-null int32
Horror               

In [482]:
X_train_prep.head()

Unnamed: 0,budget,popularity,runtime,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,...,Mar,Apr,May,Jun,Jul,Aug,Sep,Nov,Oct,Dec
0,8575000,16.727405,145.0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4100000,3.826281,135.0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,80000000,14.39853,118.0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,38000000,8.105708,118.0,1,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,30000000,7.857666,115.0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [483]:
len(X_train_prep)

2100

In [484]:
X_train_prep.isna().sum()

budget                                    0
popularity                                0
runtime                                   0
Action                                    0
Adventure                                 0
Animation                                 0
Comedy                                    0
Crime                                     0
Documentary                               0
Drama                                     0
Family                                    0
Fantasy                                   0
Foreign                                   0
History                                   0
Horror                                    0
Music                                     0
Mystery                                   0
Romance                                   0
Science Fiction                           0
Thriller                                  0
War                                       0
Western                                   0
spoken_lan_Deutsch              

In [485]:
X_test_prep=preparation_pipeline.fit_transform(X_test)

In [486]:
X_test_prep.columns

Index([                                'budget',
                                   'popularity',
                                      'runtime',
                                       'Action',
                                    'Adventure',
                                    'Animation',
                                       'Comedy',
                                        'Crime',
                                  'Documentary',
                                        'Drama',
                                       'Family',
                                      'Fantasy',
                                      'Foreign',
                                      'History',
                                       'Horror',
                                        'Music',
                                      'Mystery',
                                      'Romance',
                              'Science Fiction',
                                     'Thriller',
                    

In [487]:
X_test_prep.shape

(900, 66)

In [488]:
X_test_prep.tail(100)

Unnamed: 0,budget,popularity,runtime,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,...,Mar,Apr,May,Jun,Jul,Aug,Sep,Nov,Oct,Dec
800,3500000,5.732016,93.0,0,0,0,1,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
801,0,7.696023,85.0,0,0,0,1,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
802,0,4.071187,122.0,1,0,0,0,1,0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
803,10000000,2.849779,99.0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
804,62000000,10.618141,137.0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
805,8376800,12.237488,213.0,1,1,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
806,48000000,8.338222,147.0,0,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
807,7000000,2.483032,93.0,1,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
808,800000,7.974726,91.0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
809,0,8.096719,96.0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [489]:
X_test_prep.isna().sum()

budget                                    0
popularity                                0
runtime                                   0
Action                                    0
Adventure                                 0
Animation                                 0
Comedy                                    0
Crime                                     0
Documentary                               0
Drama                                     0
Family                                    0
Fantasy                                   0
Foreign                                   0
History                                   0
Horror                                    0
Music                                     0
Mystery                                   0
Romance                                   0
Science Fiction                           0
Thriller                                  0
War                                       0
Western                                   0
spoken_lan_Deutsch              

# Linear regression

In [490]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train_prep, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [491]:
X_test_prep.shape

(900, 66)

In [492]:
X_train_prep.shape

(2100, 66)

In [493]:
X_test_prep.shape

(900, 66)

In [494]:
X_test_prep.columns

Index([                                'budget',
                                   'popularity',
                                      'runtime',
                                       'Action',
                                    'Adventure',
                                    'Animation',
                                       'Comedy',
                                        'Crime',
                                  'Documentary',
                                        'Drama',
                                       'Family',
                                      'Fantasy',
                                      'Foreign',
                                      'History',
                                       'Horror',
                                        'Music',
                                      'Mystery',
                                      'Romance',
                              'Science Fiction',
                                     'Thriller',
                    

In [495]:
from sklearn.metrics import mean_squared_error

reve_pred=lin_reg.predict(X_test_prep)
lin_mse = mean_squared_error(reve_pred, y_test.values)
lin_rmse=np.sqrt(lin_mse)
lin_rmse

81033925.43998726

In [496]:
lin_rmse/y_test.mean()

revenue    1.186986
dtype: float64

In [497]:
lin_reg.score(X_test_prep, y_test.values)

0.5855572640248142

In [498]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
forest_reg.fit(X_train_prep, y_train.values)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [499]:
reve_pred=forest_reg.predict(X_test_prep)
rfr_mse = mean_squared_error(reve_pred, y_test)
rfr_rmse=np.sqrt(rfr_mse)
rfr_rmse

75402263.20115954

In [500]:
rfr_rmse/y_test.mean()

revenue    1.104494
dtype: float64

In [501]:
forest_reg.score(X_test_prep, y_test.values)

0.6411610807813619

In [502]:
from sklearn.model_selection import GridSearchCV

In [503]:
param_grid = [
    {'n_estimators':[10,30,60], "max_features":[4,6,8,12]},
    
    {"bootstrap":[False], "n_estimators":[3,10],"max_features":[2,3,4]},
    
]

In [504]:
forest_reg=RandomForestRegressor (random_state=10)

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                          scoring='neg_mean_squared_error',return_train_score=True)

grid_search.fit(X_train_prep, y_train.values)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=10, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [10, 30, 60], 'max_features': [4, 6, 8, 12]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [505]:
grid_search.best_params_

{'max_features': 12, 'n_estimators': 60}

In [506]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([3.26114461e-01, 2.26943046e-01, 8.87902892e-02, 1.58275699e-02,
       4.49542650e-02, 1.07221204e-02, 1.17697828e-02, 4.69009316e-03,
       4.44420435e-04, 1.39051356e-02, 1.22583096e-02, 1.16714474e-02,
       3.10397362e-05, 1.60933058e-03, 5.35747659e-03, 1.69112971e-03,
       2.98672247e-03, 6.16439043e-03, 1.04621306e-02, 7.12884718e-03,
       1.99293161e-03, 6.47373786e-04, 3.64123880e-03, 3.70879771e-03,
       2.97385775e-03, 4.87542888e-03, 2.27437440e-03, 2.57672261e-03,
       2.19898275e-03, 3.95367799e-03, 6.45339815e-03, 1.11560730e-02,
       3.11357697e-03, 2.43341752e-05, 1.35424658e-04, 4.99596080e-04,
       3.12302277e-03, 1.58646327e-03, 6.02007650e-03, 7.56466469e-03,
       2.07894800e-03, 6.50314129e-03, 6.31887018e-03, 7.96919104e-03,
       3.66532044e-04, 2.43243743e-05, 1.67309215e-04, 2.21745272e-04,
       8.17565494e-04, 1.76765500e-03, 2.65862311e-03, 5.70250269e-03,
       8.86565156e-03, 1.14128611e-02, 3.96994726e-03, 5.04690590e-03,
      

In [507]:
attributes = X_train_prep.columns

In [508]:
sorted(zip(feature_importances, attributes), reverse=True)

[(0.32611446097354774, 'budget'),
 (0.22694304587074043, 'popularity'),
 (0.08879028919047521, 'runtime'),
 (0.04495426496161972, 'Adventure'),
 (0.01582756992472342, 'Action'),
 (0.013905135556012438, 'Drama'),
 (0.012989913900826027, 'Apr'),
 (0.012870558339148982, 'Jun'),
 (0.01225830959118299, 'Family'),
 (0.01176978280424943, 'Comedy'),
 (0.011671447430682262, 'Fantasy'),
 (0.011412861138137378, 2010),
 (0.011156072962519525, 'prod_country_United States of America'),
 (0.01072212037526982, 'Animation'),
 (0.010462130570345802, 'Science Fiction'),
 (0.008865651556335192, 2000),
 (0.007969191035489466, 'Warner Bros.'),
 (0.00760724908954799, 'May'),
 (0.007564664694627239, 'Paramount Pictures'),
 (0.007128847180091997, 'Thriller'),
 (0.006766749129042371, 'Mar'),
 (0.006503141288683326, 'Twentieth Century Fox Film Corporation'),
 (0.006453398145750313, 'prod_country_United Kingdom'),
 (0.00640037482568766, 'Dec'),
 (0.0063188701820130625, 'Universal Pictures'),
 (0.00616439042645941

In [509]:
final_model = grid_search.best_estimator_

In [510]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=12, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=60, n_jobs=None, oob_score=False, random_state=10,
           verbose=0, warm_start=False)

In [511]:
reve_pred_fin=final_model.predict(X_test_prep)
fin_mse = mean_squared_error(reve_pred_fin, y_test)
fin_rmse=np.sqrt(fin_mse)
fin_rmse

70312417.33060166

In [512]:
fin_rmse/y_test.mean()

revenue    1.029937
dtype: float64

In [513]:
final_pred=pd.Series(final_model.predict(X_test_prep))

In [514]:
final_model.score(X_test_prep, y_test)

0.6879710889512463

In [515]:
final_pred=pd.DataFrame(final_pred, columns=["prediction"])

In [516]:
final_pred.shape

(900, 1)

In [517]:
df_final_pred=pd.concat([final_pred, y_test], axis=1, sort=False)
df_final_pred.sort_values(by="prediction", inplace=True);
df_final_pred.reset_index(inplace=True);

In [518]:
df_final_pred.head()

Unnamed: 0,index,prediction,revenue
0,745,633582.8,115605
1,493,711063.466667,105656
2,405,795815.716667,22321
3,308,888171.083333,11000
4,571,919354.866667,2586511


In [519]:
trace0=go.Scatter(
        y=df_final_pred.prediction,
        x=df_final_pred.index,
        name="prediction",
        mode='lines',
        marker=dict(
        color="blue",
        size=10,
        opacity=0.2
        )
    )



trace1=go.Scatter(
        y=df_final_pred.revenue,
        x=df_final_pred.index,
        name="revenue",
        mode='markers',
        marker=dict(
        color="red",
        size=10,
        opacity=0.2,
    
        )
    )



data=[trace0, trace1]
figure=go.Figure(
    data=data,
    layout=go.Layout(
            title="Basic features revenue prediction",
        xaxis=dict(title="Index"),
        yaxis=dict(title="Revenue"),
    
    ))
iplot(figure)

# Polynomial features

In [520]:
from sklearn.preprocessing import PolynomialFeatures
Poly_Feat=PolynomialFeatures(degree=3, include_bias=False)

class PolyAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        """

        init
        """
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_num=X[["budget","runtime","popularity"]]
        X_poly=Poly_Feat.fit_transform(X_num)
        poly_columns=[]
        for col in Poly_Feat.get_feature_names():
            col=str(col).replace("x0","budget")
            col=str(col).replace("x1","runtime")
            col=str(col).replace("x2","popularity")
            poly_columns.append(col)

        
        df_poly=pd.DataFrame(X_poly, columns=poly_columns)
        df_poly.drop(["budget","runtime","popularity"], axis=1, inplace=True)
        processed_X=pd.concat([X, df_poly], axis=1, sort=False)
    
        return processed_X
  

In [521]:
X_num=X_train_t[["budget","runtime","popularity"]]

In [522]:
 X_poly=Poly_Feat.fit_transform(X_num)
   

In [523]:
pol_col=Poly_Feat.get_feature_names()

In [524]:
d_col=dict([("x0","budget"),("x1", "runtime"), ("x2","popularity")])

In [525]:
poly_columns=[]
for col in Poly_Feat.get_feature_names():
    col=str(col).replace("x0","budget")
    col=str(col).replace("x1","runtime")
    col=str(col).replace("x2","popularity")
    poly_columns.append(col)


In [526]:
poly_att=PolyAttributesAdder()
X_train_poly=poly_att.fit_transform(X_train_t3)

In [527]:
X_train_poly.head()

Unnamed: 0,budget,popularity,runtime,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,...,budget^3,budget^2 runtime,budget^2 popularity,budget runtime^2,budget runtime popularity,budget popularity^2,runtime^3,runtime^2 popularity,runtime popularity^2,popularity^3
0,8575000,16.727405,145.0,0,0,0,0,0,0,1,...,6.305251e+20,1.066194e+16,1229977000000000.0,180289400000.0,20798440000.0,2399337000.0,3048625.0,351693.690125,40571.881315,4680.429589
1,4100000,3.826281,135.0,0,0,0,0,0,0,1,...,6.8921e+19,2269350000000000.0,64319780000000.0,74722500000.0,2117847000.0,60025750.0,2460375.0,69733.971225,1976.457549,56.018385
2,80000000,14.39853,118.0,1,0,0,0,0,0,0,...,5.12e+23,7.552e+17,9.215059e+16,1113920000000.0,135922100000.0,16585410000.0,1643032.0,200485.13172,24463.484607,2985.069636
3,38000000,8.105708,118.0,1,1,0,0,0,0,0,...,5.4872e+22,1.70392e+17,1.170464e+16,529112000000.0,36345990000.0,2496695000.0,1643032.0,112863.878192,7752.895257,532.565298
4,30000000,7.857666,115.0,0,0,1,0,0,0,0,...,2.7e+22,1.035e+17,7071899000000000.0,396750000000.0,27108950000.0,1852287000.0,1520875.0,103917.63285,7100.435221,485.155204


In [528]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_atrribs=[["budget", "runtime","popularity"]]

polynomial_pipeline = Pipeline([
    ("imputer", DataFrameImputer()),
    ("date_adder", DateAttributesAdder()),
    ("cat_adder", CatAttributesAdder()),
    ("polynomial_adder", PolyAttributesAdder(),)
    ])

In [529]:
X_train_poly.columns

Index([                                'budget',
                                   'popularity',
                                      'runtime',
                                       'Action',
                                    'Adventure',
                                    'Animation',
                                       'Comedy',
                                        'Crime',
                                  'Documentary',
                                        'Drama',
                                       'Family',
                                      'Fantasy',
                                      'Foreign',
                                      'History',
                                       'Horror',
                                        'Music',
                                      'Mystery',
                                      'Romance',
                              'Science Fiction',
                                     'Thriller',
                    

In [530]:
X_test_poly=polynomial_pipeline.fit_transform(X_test)

In [531]:
forest_reg_poly = RandomForestRegressor(n_estimators=10, random_state=42)
forest_reg_poly.fit(X_train_poly, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [532]:
reve_pred_poly=forest_reg_poly.predict(X_test_poly)
rfr_poly_mse = mean_squared_error(reve_pred_poly, y_test)
rfr_poly_rmse=np.sqrt(rfr_mse)
rfr_poly_rmse

75402263.20115954

In [533]:
rfr_poly_rmse/y_test.mean()

revenue    1.104494
dtype: float64

In [534]:
forest_reg_poly.score(X_test_poly, y_test)

0.6246038171693433

In [535]:
forest_reg=RandomForestRegressor (random_state=10)

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                          scoring='neg_mean_squared_error',return_train_score=True)

grid_search.fit(X_train_poly, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=10, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [10, 30, 60], 'max_features': [4, 6, 8, 12]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [536]:
grid_search.best_params_

{'bootstrap': False, 'max_features': 3, 'n_estimators': 10}

In [537]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances;

In [538]:
attributes = X_train_poly.columns

In [539]:
sorted(zip(feature_importances, attributes), reverse=True)

[(0.13406593012534593, 'budget popularity'),
 (0.09292264694482608, 'budget'),
 (0.08852175835560729, 'budget popularity^2'),
 (0.08004574726378114, 'budget runtime'),
 (0.07722800556726281, 'budget^2'),
 (0.058682455431872074, 'runtime^2 popularity'),
 (0.0493085155427748, 'budget^2 popularity'),
 (0.04462926771925785, 'budget^2 runtime'),
 (0.03137829907948025, 'budget runtime popularity'),
 (0.029351946696103505, 'popularity^2'),
 (0.026481961335355214, 'runtime popularity^2'),
 (0.024599431636504278, 'popularity'),
 (0.023635087909860814, 'budget runtime^2'),
 (0.020626892185156325, 'runtime popularity'),
 (0.02052972976737074, 'popularity^3'),
 (0.01926414786674997, 'runtime^3'),
 (0.019059894800845304, 'runtime'),
 (0.013949653610922883, 'runtime^2'),
 (0.013342995286182038, 'budget^3'),
 (0.008875122728487138, 'prod_country_United States of America'),
 (0.007717448309692274, 'Action'),
 (0.007416410178716568, 'Family'),
 (0.007205578780858477, 'Jun'),
 (0.005870711335733707, 'Dr

In [540]:
final_model = grid_search.best_estimator_

In [541]:
reve_pred_poly=final_model.predict(X_test_poly)
rfr_poly_mse = mean_squared_error(reve_pred_poly, y_test)
rfr_poly_rmse=np.sqrt(rfr_mse)
rfr_poly_rmse



75402263.20115954

In [542]:
rfr_poly_rmse/y_test.mean()

revenue    1.104494
dtype: float64

In [543]:
forest_reg_poly.score(X_test_poly, y_test)

0.6246038171693433

In [544]:
final_pred=pd.Series(final_model.predict(X_test_poly))

In [545]:
final_pred=pd.DataFrame(final_pred, columns=["prediction"])

In [546]:
df_final_pred=pd.concat([final_pred, y_test], axis=1, sort=False)
df_final_pred=df_final_pred.sort_values(by="prediction")

In [547]:
df_final_pred.reset_index(inplace=True)

In [548]:
trace0=go.Scatter(
        y=df_final_pred.prediction,
        x=df_final_pred.index,
        name="prediction",
        mode='lines',
        marker=dict(
        color="blue",
        size=10,
        opacity=0.2
        )
    )



trace1=go.Scatter(
        y=df_final_pred.revenue,
        x=df_final_pred.index,
        name="revenue",
        mode='markers',
        marker=dict(
        color="red",
        size=10,
        opacity=0.2,
    
        )
    )



data=[trace0, trace1]
figure=go.Figure(
    data=data,
    layout=go.Layout(
        title="Polynomial features revenue prediction",
        xaxis=dict(title="Index"),
        yaxis=dict(title="Revenue"),

    
    ))
iplot(figure)

In [549]:
from sklearn import ensemble


In [550]:
ens_reg=ensemble.GradientBoostingRegressor(n_estimators = 400, max_depth=5, min_samples_split=2,
                                         learning_rate=0.1, loss="ls")

In [552]:
ens_reg.fit(X_train_prep, y_train.values)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=400, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [553]:
ens_reg.score(X_test_prep, y_test.values)

0.6434047179792654