In [1557]:
import pandas as pd
import numpy as np
import random as random

# plotly standard imports
import plotly.graph_objs as go
import plotly.plotly as py



In [1558]:
# Cufflinks wrapper on plotly
import cufflinks
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from plotly.offline import iplot
cufflinks.go_offline()

# Set global theme
cufflinks.set_config_file(world_readable=True, theme='pearl')
import plotly.figure_factory as ff


# Data input

In [1669]:
input_df=pd.read_csv("train.csv", sep=",")
X=input_df.drop("revenue", axis=1) # drop labels for training set
y=input_df["revenue"]

In [1670]:
input_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 23 columns):
id                       3000 non-null int64
belongs_to_collection    604 non-null object
budget                   3000 non-null int64
genres                   2993 non-null object
homepage                 946 non-null object
imdb_id                  3000 non-null object
original_language        3000 non-null object
original_title           3000 non-null object
overview                 2992 non-null object
popularity               3000 non-null float64
poster_path              2999 non-null object
production_companies     2844 non-null object
production_countries     2945 non-null object
release_date             3000 non-null object
runtime                  2998 non-null float64
spoken_languages         2980 non-null object
status                   3000 non-null object
tagline                  2403 non-null object
title                    3000 non-null object
Keywords             

In [1750]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [1751]:
type(y_train)

pandas.core.series.Series

In [1752]:
X_train.reset_index(inplace=True);
X_test.reset_index(inplace=True);
y_train=pd.DataFrame(y_train)
y_test=pd.DataFrame(y_test)

In [1753]:
y_train.reset_index(inplace=True);
y_test.reset_index(inplace=True);

In [1755]:
y_train=y_train.drop("index", axis=1) # drop labels for training set
y_test=y_test.drop("index", axis=1) # drop labels for training set

In [1712]:
df=pd.concat([X_train, y_train], axis=1, sort=False)

In [1713]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2100 entries, 0 to 2099
Data columns (total 25 columns):
index                    2100 non-null int64
id                       2100 non-null int64
belongs_to_collection    425 non-null object
budget                   2100 non-null int64
genres                   2095 non-null object
homepage                 673 non-null object
imdb_id                  2100 non-null object
original_language        2100 non-null object
original_title           2100 non-null object
overview                 2092 non-null object
popularity               2100 non-null float64
poster_path              2099 non-null object
production_companies     1978 non-null object
production_countries     2059 non-null object
release_date             2100 non-null object
runtime                  2098 non-null float64
spoken_languages         2085 non-null object
status                   2100 non-null object
tagline                  1659 non-null object
title                 

# Removing Nulls and Data exploration

In [1565]:
X_train.isna().sum()

index                       0
id                          0
belongs_to_collection    1675
budget                      0
genres                      5
homepage                 1427
imdb_id                     0
original_language           0
original_title              0
overview                    8
popularity                  0
poster_path                 1
production_companies      122
production_countries       41
release_date                0
runtime                     2
spoken_languages           15
status                      0
tagline                   441
title                       0
Keywords                  194
cast                        9
crew                       12
dtype: int64

In [1566]:
from sklearn.base import TransformerMixin

In [1567]:
class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)


In [1568]:
X_train_t=DataFrameImputer().fit_transform(X_train)

In [1569]:
df_t=DataFrameImputer().fit_transform(df)

In [1570]:
X_train_t.isna().sum()

index                    0
id                       0
belongs_to_collection    0
budget                   0
genres                   0
homepage                 0
imdb_id                  0
original_language        0
original_title           0
overview                 0
popularity               0
poster_path              0
production_companies     0
production_countries     0
release_date             0
runtime                  0
spoken_languages         0
status                   0
tagline                  0
title                    0
Keywords                 0
cast                     0
crew                     0
dtype: int64

In [1571]:
#Comparing correlatons between Consumption_per_NIA/log_NIA/Property Type 


figure = ff.create_scatterplotmatrix(
    df[['budget', 'popularity',"runtime","revenue"]],
    height=1000,
    width=1000,
    diag='histogram',
    title="Data Exploration")
iplot(figure)

In [1572]:
corrs = df[["revenue","popularity","runtime","budget"]].corr()
figure = ff.create_annotated_heatmap(
    z=corrs.values,
    x=list(corrs.columns),
    y=list(corrs.index),
    annotation_text=corrs.round(2).values,
    showscale=True)

iplot(figure)

# Feature engineering

In [1573]:
import ast


def cat_list(line):
    c_list=[]
    line_eval=ast.literal_eval(line)
    for d in line_eval:
        c_list.append(d["name"])
    return(c_list)

In [1574]:
def get_left(string):
    try:
        if len(string)==8:
            return(int(string[:2]))
        else:
            return(int(string[:1]))
    except: return 0

In [1575]:

def get_year(string):
    try:
        if int(string[-2:])< 20:
            return int(string[-2:])+2000
        else:
            return int(string[-2:])+1900
    except:
        return 0

In [1576]:
X_train_t.head()

Unnamed: 0,index,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,...,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew
0,611,612,"[{'id': 645, 'name': 'James Bond Collection', ...",8575000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://www.transformersmovie.com/,tt4016934,ko,아가씨,"1930s Korea, in the period of Japanese occupat...",...,"[{'iso_3166_1': 'KR', 'name': 'South Korea'}]",6/1/16,145.0,"[{'iso_639_1': 'ja', 'name': '日本語'}, {'iso_639...",Released,Never did they expect to get into a controvers...,The Handmaiden,"[{'id': 293, 'name': 'female nudity'}, {'id': ...","[{'cast_id': 3, 'character': 'Lady Hideko', 'c...","[{'credit_id': '54056d0b0e0a2658f100c167', 'de..."
1,530,531,"[{'id': 366444, 'name': 'Demetrius Filmreihe',...",4100000,"[{'id': 18, 'name': 'Drama'}]",http://www.transformersmovie.com/,tt0046247,en,The Robe,Marcellus is a tribune in the time of Christ. ...,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",9/16/53,135.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The first motion picture in CinemaScope--the m...,The Robe,"[{'id': 3571, 'name': 'crucifixion'}, {'id': 5...","[{'cast_id': 1, 'character': 'Marcellus Gallio...","[{'credit_id': '52fe4603c3a368484e07be21', 'de..."
2,2787,2788,"[{'id': 645, 'name': 'James Bond Collection', ...",80000000,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",http://www.transformersmovie.com/,tt1037705,en,The Book of Eli,"A post-apocalyptic tale, in which a lone man f...",...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1/14/10,118.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Some will kill to have it. He will kill to pro...,The Book of Eli,"[{'id': 3096, 'name': 'book'}, {'id': 4458, 'n...","[{'cast_id': 1, 'character': 'Eli', 'credit_id...","[{'credit_id': '52fe43e9c3a368484e0058ad', 'de..."
3,49,50,"[{'id': 115570, 'name': 'Star Trek: The Next G...",38000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",http://www.transformersmovie.com/,tt0111280,en,Star Trek: Generations,Captain Jean-Luc Picard and the crew of the En...,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",11/17/94,118.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Boldly go.,Star Trek: Generations,"[{'id': 10988, 'name': 'based on tv series'}, ...","[{'cast_id': 5, 'character': 'Captain Jean-Luc...","[{'credit_id': '52fe4225c3a36847f80076c3', 'de..."
4,1883,1884,"[{'id': 454520, 'name': 'Captain Harlock Colle...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 878, ...",http://www.transformersmovie.com/,tt2668134,ja,キャプテンハーロック,Space Pirate Captain Harlock and his fearless ...,...,"[{'iso_3166_1': 'JP', 'name': 'Japan'}]",9/7/13,115.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,Just when you thought it was safe to go back t...,Space Pirate Captain Harlock,"[{'id': 10183, 'name': 'independent film'}]","[{'cast_id': 1, 'character': 'Captain Harlock ...","[{'credit_id': '52fe4cac9251416c910fc711', 'de..."


In [1577]:
from sklearn.base import BaseEstimator, TransformerMixin

In [1578]:
df.columns

Index(['index', 'id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue'],
      dtype='object')

In [1579]:

class DateAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        """

        init
        """
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
       
        year=X['release_date'].apply(lambda x: get_year(x))
        df_year=pd.DataFrame(year.values,columns=["year"])
        month=X['release_date'].apply(lambda x:get_left(x))
        df_month=pd.DataFrame(month.values,columns=["month"])
       
        processed_X=pd.concat([X, df_year, df_month], axis=1, sort=False)
      
        return processed_X

In [1580]:
attr_adder=DateAttributesAdder()
X_train_t2=attr_adder.fit_transform(X_train_t)

In [1584]:
X_train_t2.columns

Index(['index', 'id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'year',
       'month'],
      dtype='object')

# Unpacking categories and spoken languages

In [1591]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder(sparse=False)

class CatAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        genres_list=X["genres"].apply(lambda x: cat_list(x))
        genres_label_data=mlb.fit_transform(genres_list)
        genres_labelClasses = mlb.classes_
        genresLabels=pd.DataFrame(genres_label_data, columns=genres_labelClasses)
        
        language_list=X['spoken_languages'].apply(lambda x: cat_list(x))
        language_label_data=mlb.fit_transform(language_list)
        language_labelClasses =  "spoken_lan_" + mlb.classes_
        languageLabels=pd.DataFrame(language_label_data, columns=language_labelClasses)
        languageLabels=languageLabels[[ 'spoken_lan_Deutsch', 'spoken_lan_English', 'spoken_lan_Español', 'spoken_lan_Français', 'spoken_lan_Italiano', 'spoken_lan_Pусский']]
        
        
        prod_country=X['production_countries'].apply(lambda x: cat_list(x))
        prod_country_label_data=mlb.fit_transform(prod_country)
        prod_country_labelClasses =  "prod_country_" + mlb.classes_
        prod_countryLabels=pd.DataFrame(prod_country_label_data, columns=prod_country_labelClasses)
        prod_countryLabels=prod_countryLabels[prod_countryLabels.columns[prod_countryLabels.sum()>len(X)/20]]
        
        
        prod_company=X['production_companies'].apply(lambda x: cat_list(x))
        prod_company_label_data=mlb.fit_transform(prod_company)
        prod_company_labelClasses = mlb.classes_
        prod_companyLabels=pd.DataFrame(prod_company_label_data, columns=prod_company_labelClasses)
        prod_companyLabels=prod_companyLabels[prod_companyLabels.columns[prod_companyLabels.sum()>len(X)/50]]
        
        org_lan_cat_1hot = cat_encoder.fit_transform(X[["original_language"]])
        df_org_lan=pd.DataFrame(org_lan_cat_1hot, columns=cat_encoder.categories_)
        df_org_lan=df_org_lan[df_org_lan.columns[df_org_lan.sum()>len(X)/100]]
        
        processed_X=pd.concat([X, genresLabels,languageLabels, prod_countryLabels, df_org_lan, prod_companyLabels], axis=1, sort=False)
        
         
        processed_X=processed_X[['budget',
                                   'popularity',
                                      'runtime',
                                       'Action',
                                    'Adventure',
                                    'Animation',
                                       'Comedy',
                                        'Crime',
                                  'Documentary',
                                        'Drama',
                                       'Family',
                                      'Fantasy',
                                      'Foreign',
                                      'History',
                                       'Horror',
                                        'Music',
                                      'Mystery',
                                      'Romance',
                              'Science Fiction',
                                     'Thriller',
                                          'War',
                                      'Western',
                           'spoken_lan_Deutsch',
                           'spoken_lan_English',
                           'spoken_lan_Español',
                          'spoken_lan_Français',
                          'spoken_lan_Italiano',
                           'spoken_lan_Pусский',
                          'prod_country_France',
                         'prod_country_Germany',
                  'prod_country_United Kingdom',
        'prod_country_United States of America',
                                        ('en',),
                                        ('es',),
                                        ('fr',),
                                        ('hi',),
                            'Columbia Pictures',
                    'Metro-Goldwyn-Mayer (MGM)',
                              'New Line Cinema',
                           'Paramount Pictures',
                          'Touchstone Pictures',
       'Twentieth Century Fox Film Corporation',
                           'Universal Pictures',
                                 'Warner Bros.', 
            "year", 
            "month"]]
        
        X["year_t"]=round((X["year"]-1920)/10)
        

        year_cat_1hot = cat_encoder.fit_transform(X[["year_t"]])
        df_year=pd.DataFrame(year_cat_1hot, columns=cat_encoder.categories_)

        
        month_cat_1hot = cat_encoder.fit_transform(X[["month"]])
        df_month=pd.DataFrame(month_cat_1hot, columns=cat_encoder.categories_)


        
        processed_X=pd.concat([processed_X, df_year, df_month], axis=1, sort=False)
        
        processed_X=processed_X.drop(columns=["year","month"])
        return(processed_X)
        
        
     

In [1593]:
attr_adder=CatAttributesAdder()
X_train_t3=attr_adder.fit_transform(X_train_t2)

In [1594]:
X_train_t3.columns

Index([                                'budget',
                                   'popularity',
                                      'runtime',
                                       'Action',
                                    'Adventure',
                                    'Animation',
                                       'Comedy',
                                        'Crime',
                                  'Documentary',
                                        'Drama',
                                       'Family',
                                      'Fantasy',
                                      'Foreign',
                                      'History',
                                       'Horror',
                                        'Music',
                                      'Mystery',
                                      'Romance',
                              'Science Fiction',
                                     'Thriller',
                    

# Preparation Pipeline

In [1714]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

preparation_pipeline = Pipeline([
    ("imputer", DataFrameImputer()),
    ("date_adder", DateAttributesAdder()),
    ("cat_adder", CatAttributesAdder()),
    ])

In [1715]:
X_train_prep=preparation_pipeline.fit_transform(X_train)

In [1716]:
X_train_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2100 entries, 0 to 2099
Data columns (total 67 columns):
budget                                    2100 non-null int64
popularity                                2100 non-null float64
runtime                                   2100 non-null float64
Action                                    2100 non-null int32
Adventure                                 2100 non-null int32
Animation                                 2100 non-null int32
Comedy                                    2100 non-null int32
Crime                                     2100 non-null int32
Documentary                               2100 non-null int32
Drama                                     2100 non-null int32
Family                                    2100 non-null int32
Fantasy                                   2100 non-null int32
Foreign                                   2100 non-null int32
History                                   2100 non-null int32
Horror               

In [1717]:
X_train_prep.head()

Unnamed: 0,budget,popularity,runtime,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,...,"(3,)","(4,)","(5,)","(6,)","(7,)","(8,)","(9,)","(10,)","(11,)","(12,)"
0,8575000,16.727405,145.0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4100000,3.826281,135.0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,80000000,14.39853,118.0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,38000000,8.105708,118.0,1,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,30000000,7.857666,115.0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [1718]:
len(X_train_prep)

2100

In [1719]:
X_train_prep.isna().sum()

budget                                    0
popularity                                0
runtime                                   0
Action                                    0
Adventure                                 0
Animation                                 0
Comedy                                    0
Crime                                     0
Documentary                               0
Drama                                     0
Family                                    0
Fantasy                                   0
Foreign                                   0
History                                   0
Horror                                    0
Music                                     0
Mystery                                   0
Romance                                   0
Science Fiction                           0
Thriller                                  0
War                                       0
Western                                   0
spoken_lan_Deutsch              

In [1720]:
X_test_prep=preparation_pipeline.fit_transform(X_test)

In [1721]:
X_test_prep.columns

Index([                                'budget',
                                   'popularity',
                                      'runtime',
                                       'Action',
                                    'Adventure',
                                    'Animation',
                                       'Comedy',
                                        'Crime',
                                  'Documentary',
                                        'Drama',
                                       'Family',
                                      'Fantasy',
                                      'Foreign',
                                      'History',
                                       'Horror',
                                        'Music',
                                      'Mystery',
                                      'Romance',
                              'Science Fiction',
                                     'Thriller',
                    

In [1722]:
len(X_test_prep)

900

In [1723]:
X_test_prep.isna().sum()

budget                                    0
popularity                                0
runtime                                   0
Action                                    0
Adventure                                 0
Animation                                 0
Comedy                                    0
Crime                                     0
Documentary                               0
Drama                                     0
Family                                    0
Fantasy                                   0
Foreign                                   0
History                                   0
Horror                                    0
Music                                     0
Mystery                                   0
Romance                                   0
Science Fiction                           0
Thriller                                  0
War                                       0
Western                                   0
spoken_lan_Deutsch              

# Linear regression

In [1757]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train_prep, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [1758]:
X_test_prep.shape

(900, 67)

In [1759]:
X_train_prep.shape

(2100, 67)

In [1760]:
X_test_prep.columns

Index([                                'budget',
                                   'popularity',
                                      'runtime',
                                       'Action',
                                    'Adventure',
                                    'Animation',
                                       'Comedy',
                                        'Crime',
                                  'Documentary',
                                        'Drama',
                                       'Family',
                                      'Fantasy',
                                      'Foreign',
                                      'History',
                                       'Horror',
                                        'Music',
                                      'Mystery',
                                      'Romance',
                              'Science Fiction',
                                     'Thriller',
                    

In [1761]:
from sklearn.metrics import mean_squared_error

reve_pred=lin_reg.predict(X_test_prep)
lin_mse = mean_squared_error(reve_pred, y_test)
lin_rmse=np.sqrt(lin_mse)
lin_rmse

81084037.39779738

In [1762]:
lin_rmse/y_test.mean()

revenue    1.18772
dtype: float64

In [1763]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
forest_reg.fit(X_train_prep, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [1764]:
reve_pred=forest_reg.predict(X_test_prep)
rfr_mse = mean_squared_error(reve_pred, y_test)
rfr_rmse=np.sqrt(rfr_mse)
rfr_rmse

74779250.3379695

In [1765]:
rfr_rmse/y_test.mean()

revenue    1.095368
dtype: float64

In [1766]:
from sklearn.model_selection import GridSearchCV

In [1767]:
param_grid = [
    {'n_estimators':[10,30,60], "max_features":[4,6,8,12]},
    
    {"bootstrap":[False], "n_estimators":[3,10],"max_features":[2,3,4]},
    
]

In [1768]:
forest_reg=RandomForestRegressor (random_state=10)

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                          scoring='neg_mean_squared_error',return_train_score=True)

grid_search.fit(X_train_prep, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=10, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [10, 30, 60], 'max_features': [4, 6, 8, 12]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [1769]:
grid_search.best_params_

{'max_features': 12, 'n_estimators': 60}

In [1770]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([3.37100814e-01, 2.15290440e-01, 8.41994431e-02, 1.47672194e-02,
       3.84322208e-02, 1.10865320e-02, 7.48730378e-03, 3.93988813e-03,
       1.07348843e-03, 1.20297453e-02, 1.28492080e-02, 1.32614368e-02,
       3.32214476e-06, 2.08949531e-03, 5.63921679e-03, 2.15277171e-03,
       2.61831493e-03, 7.26366482e-03, 1.15477585e-02, 7.62538072e-03,
       1.56738391e-03, 6.56047593e-04, 5.22345965e-03, 4.12120012e-03,
       2.67980954e-03, 4.46524752e-03, 1.80454335e-03, 3.48210148e-03,
       2.29022185e-03, 3.42176204e-03, 6.87826519e-03, 8.92750968e-03,
       4.47789522e-03, 2.76971748e-05, 7.79355315e-05, 4.04562680e-04,
       3.52779946e-03, 1.52710132e-03, 5.40911602e-03, 8.42593773e-03,
       1.97352250e-03, 3.63290875e-03, 6.60050398e-03, 6.60548150e-03,
       1.95888733e-06, 2.88553853e-05, 5.28317912e-05, 7.81134269e-05,
       4.88902640e-04, 9.92728955e-04, 2.19636979e-03, 4.05632220e-03,
       7.23443517e-03, 1.48959062e-02, 1.21254203e-02, 3.60960520e-03,
      

In [1771]:
attributes = X_train_prep.columns

In [1772]:
sorted(zip(feature_importances, attributes), reverse=True)

[(0.3371008138895798, 'budget'),
 (0.2152904395440364, 'popularity'),
 (0.08419944313915938, 'runtime'),
 (0.03843222080823664, 'Adventure'),
 (0.01811861626432065, (4,)),
 (0.01489590624907009, (9.0,)),
 (0.014847541489631329, (6,)),
 (0.014767219387739316, 'Action'),
 (0.013261436762886828, 'Fantasy'),
 (0.012849208014850597, 'Family'),
 (0.012125420251226546, (10.0,)),
 (0.012029745292131641, 'Drama'),
 (0.011547758536083077, 'Science Fiction'),
 (0.011086531988002303, 'Animation'),
 (0.00892750967765898, 'prod_country_United States of America'),
 (0.00842593772547563, 'Paramount Pictures'),
 (0.007625380715502162, 'Thriller'),
 (0.007487303781689389, 'Comedy'),
 (0.0073646731159588185, (5,)),
 (0.007263664820745557, 'Romance'),
 (0.007234435171976603, (8.0,)),
 (0.006878265191519985, 'prod_country_United Kingdom'),
 (0.006605481499122046, 'Warner Bros.'),
 (0.006600503976076826, 'Universal Pictures'),
 (0.005639216785352655, 'Horror'),
 (0.005409116022987616, 'New Line Cinema'),
 (

In [1773]:
final_model = grid_search.best_estimator_

In [1774]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=12, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=60, n_jobs=None, oob_score=False, random_state=10,
           verbose=0, warm_start=False)

In [1775]:
final_pred=pd.Series(final_model.predict(X_test_prep))

In [1782]:
final_pred=pd.DataFrame(final_pred, columns=["prediction"])

In [1783]:
final_pred.shape

(900, 1)

In [1813]:
df_final_pred=pd.concat([final_pred, y_test], axis=1, sort=False)
df_final_pred.sort_values(by="prediction")

Unnamed: 0,prediction,revenue
394,1.860210e+05,10068
745,3.391445e+05,115605
308,3.459632e+05,11000
250,3.759255e+05,254190
83,4.864900e+05,220151
775,5.155374e+05,11276
493,8.780335e+05,105656
392,8.828636e+05,121
571,9.571204e+05,2586511
105,1.068752e+06,26488


In [1818]:
df_final_pred.reset_index(inplace=True)

In [1836]:
df_final_pred.head(10)

Unnamed: 0,level_0,index,prediction,revenue
0,394,394,186021.0,10068
1,745,745,339144.5,115605
2,308,308,345963.2,11000
3,250,250,375925.5,254190
4,83,83,486490.0,220151
5,775,775,515537.4,11276
6,493,493,878033.5,105656
7,392,392,882863.6,121
8,571,571,957120.4,2586511
9,105,105,1068752.0,26488


In [1820]:
trace0=go.Scatter(
        y=df_final_pred.prediction,
        x=df_final_pred.index,
        name="prediction",
        mode='lines',
        marker=dict(
        color="blue",
        size=10,
        opacity=0.2
        )
    )



trace1=go.Scatter(
        y=df_final_pred.revenue,
        x=df_final_pred.index,
        name="revenue",
        mode='markers',
        marker=dict(
        color="red",
        size=10,
        opacity=0.2,
    
        )
    )



data=[trace0, trace1]
figure=go.Figure(
    data=data,
    layout=go.Layout(
        title="MNIST dimension reduction analysis",
        xaxis=dict(title="No. of dimensions"),
        yaxis=dict(title="Retainde Variance"),

    
    ))
iplot(figure)

In [1821]:
final_mse = mean_squared_error(y_test, final_pred)
final_rmse=np.sqrt(final_mse_enchanced)

In [1823]:
final_rmse

69546536.91296713

In [1824]:
final_rmse/y_test.mean()

revenue    1.018719
dtype: float64

# Polynomial features

In [1827]:
Poly_Feat=PolynomialFeatures(degree=3, include_bias=False)

class PolyAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        """

        init
        """
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_num=X[["budget","runtime","popularity"]]
        X_poly=Poly_Feat.fit_transform(X_num)
        df_poly=pd.DataFrame(X_poly, columns=Poly_Feat.get_feature_names())
        
        processed_X=pd.concat([X, df_poly], axis=1, sort=False)
        processed_X=processed_X.drop(["x0","x1","x2"], axis=1)
        return processed_X
  

In [1828]:
poly_att=PolyAttributesAdder()
X_train_poly=poly_att.fit_transform(X_train_t3)

In [1829]:
X_train_poly.head()

Unnamed: 0,budget,popularity,runtime,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,...,x0^3,x0^2 x1,x0^2 x2,x0 x1^2,x0 x1 x2,x0 x2^2,x1^3,x1^2 x2,x1 x2^2,x2^3
0,8575000,16.727405,145.0,0,0,0,0,0,0,1,...,6.305251e+20,1.066194e+16,1229977000000000.0,180289400000.0,20798440000.0,2399337000.0,3048625.0,351693.690125,40571.881315,4680.429589
1,4100000,3.826281,135.0,0,0,0,0,0,0,1,...,6.8921e+19,2269350000000000.0,64319780000000.0,74722500000.0,2117847000.0,60025750.0,2460375.0,69733.971225,1976.457549,56.018385
2,80000000,14.39853,118.0,1,0,0,0,0,0,0,...,5.12e+23,7.552e+17,9.215059e+16,1113920000000.0,135922100000.0,16585410000.0,1643032.0,200485.13172,24463.484607,2985.069636
3,38000000,8.105708,118.0,1,1,0,0,0,0,0,...,5.4872e+22,1.70392e+17,1.170464e+16,529112000000.0,36345990000.0,2496695000.0,1643032.0,112863.878192,7752.895257,532.565298
4,30000000,7.857666,115.0,0,0,1,0,0,0,0,...,2.7e+22,1.035e+17,7071899000000000.0,396750000000.0,27108950000.0,1852287000.0,1520875.0,103917.63285,7100.435221,485.155204


In [1830]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_atrribs=[["budget", "runtime","popularity"]]

polynomial_pipeline = Pipeline([
    ("imputer", DataFrameImputer()),
    ("date_adder", DateAttributesAdder()),
    ("cat_adder", CatAttributesAdder()),
    ("polynomial_adder", PolyAttributesAdder(),)
    ])

In [1831]:
X_train_poly.columns

Index([                                'budget',
                                   'popularity',
                                      'runtime',
                                       'Action',
                                    'Adventure',
                                    'Animation',
                                       'Comedy',
                                        'Crime',
                                  'Documentary',
                                        'Drama',
                                       'Family',
                                      'Fantasy',
                                      'Foreign',
                                      'History',
                                       'Horror',
                                        'Music',
                                      'Mystery',
                                      'Romance',
                              'Science Fiction',
                                     'Thriller',
                    

In [1838]:
X_test_poly=polynomial_pipeline.fit_transform(X_test)

In [1839]:
forest_reg_poly = RandomForestRegressor(n_estimators=10, random_state=42)
forest_reg_poly.fit(X_train_poly, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [1840]:
reve_pred_poly=forest_reg_poly.predict(X_test_poly)
rfr_poly_mse = mean_squared_error(reve_pred_poly, y_test)
rfr_poly_rmse=np.sqrt(rfr_mse)
rfr_poly_rmse

74779250.3379695

In [1841]:
rfr_poly_rmse/y_test.mean()

revenue    1.095368
dtype: float64

In [1844]:
forest_reg=RandomForestRegressor (random_state=10)

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                          scoring='neg_mean_squared_error',return_train_score=True)

grid_search.fit(X_train_poly, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=10, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [10, 30, 60], 'max_features': [4, 6, 8, 12]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [1845]:
grid_search.best_params_

{'max_features': 4, 'n_estimators': 60}

In [1846]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([4.25738065e-02, 3.03484983e-02, 1.78216204e-02, 5.35363767e-03,
       5.60551328e-03, 4.63969100e-03, 2.53277796e-03, 3.55334555e-03,
       1.95015717e-04, 6.95874398e-03, 3.09927565e-03, 2.86874106e-03,
       7.60657949e-06, 8.97980554e-04, 1.12403844e-03, 7.19175497e-04,
       1.47838715e-03, 3.07385933e-03, 4.06040348e-03, 3.79814494e-03,
       1.25217698e-03, 2.17338018e-04, 1.66619365e-03, 2.02117221e-03,
       1.41136721e-03, 1.65920634e-03, 6.80850675e-04, 9.70050439e-04,
       1.07005872e-03, 1.01588825e-03, 1.55916403e-03, 2.78581020e-03,
       1.89495210e-03, 1.14975578e-05, 2.58872549e-04, 3.95456516e-04,
       9.68820571e-04, 2.37144555e-04, 1.70407043e-03, 2.75386916e-03,
       8.74118106e-04, 1.99768500e-03, 2.63827633e-03, 2.74959060e-03,
       1.28923419e-07, 1.38515521e-05, 8.02541417e-06, 3.10630726e-05,
       1.94130737e-04, 5.12260434e-04, 6.72572001e-04, 1.76016001e-03,
       2.80595607e-03, 4.06455640e-03, 4.99444926e-03, 1.67344681e-03,
      

In [1851]:
attributes = X_train_poly.columns

In [1852]:
sorted(zip(feature_importances, attributes), reverse=True)

[(0.09218739270499067, 'x0^2'),
 (0.07950204679144202, 'x0 x2'),
 (0.07564879762538687, 'x0^3'),
 (0.07335947574550697, 'x0 x1 x2'),
 (0.07161721849172333, 'x0^2 x2'),
 (0.05979056193534227, 'x0 x1'),
 (0.05343652178358581, 'x0^2 x1'),
 (0.04896321942369531, 'x0 x2^2'),
 (0.042573806510401185, 'budget'),
 (0.042275292029286024, 'x0 x1^2'),
 (0.03355346981479477, 'x1^2 x2'),
 (0.032700616264064404, 'x2^2'),
 (0.03234482134082266, 'x1 x2^2'),
 (0.030348498261713455, 'popularity'),
 (0.029532811774732114, 'x2^3'),
 (0.02890799794543301, 'x1 x2'),
 (0.01993214796836383, 'x1^2'),
 (0.017821620405634796, 'runtime'),
 (0.014430420956066321, 'x1^3'),
 (0.00695874397848478, 'Drama'),
 (0.005605513275147648, 'Adventure'),
 (0.005353637670334241, 'Action'),
 (0.00499444926013705, (10.0,)),
 (0.004639691002964369, 'Animation'),
 (0.0040645563959369035, (9.0,)),
 (0.00406040347836292, 'Science Fiction'),
 (0.0038742048166786635, (4,)),
 (0.0037981449444462784, 'Thriller'),
 (0.0035533455537306597, 

In [1853]:
final_model = grid_search.best_estimator_

In [1855]:
final_pred=pd.Series(final_model.predict(X_test_poly))

In [1856]:
final_pred=pd.DataFrame(final_pred, columns=["prediction"])

In [1860]:
df_final_pred=pd.concat([final_pred, y_test], axis=1, sort=False)
df_final_pred=df_final_pred.sort_values(by="prediction")

In [1861]:
df_final_pred.reset_index(inplace=True)

In [1862]:
trace0=go.Scatter(
        y=df_final_pred.prediction,
        x=df_final_pred.index,
        name="prediction",
        mode='lines',
        marker=dict(
        color="blue",
        size=10,
        opacity=0.2
        )
    )



trace1=go.Scatter(
        y=df_final_pred.revenue,
        x=df_final_pred.index,
        name="revenue",
        mode='markers',
        marker=dict(
        color="red",
        size=10,
        opacity=0.2,
    
        )
    )



data=[trace0, trace1]
figure=go.Figure(
    data=data,
    layout=go.Layout(
        title="MNIST dimension reduction analysis",
        xaxis=dict(title="No. of dimensions"),
        yaxis=dict(title="Retainde Variance"),

    
    ))
iplot(figure)