In [37]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import numpy as np

In [38]:
# Import Cleaned Movie Data
url = 'https://raw.githubusercontent.com/GriffinLane/BrownGoresenLane/cleanup/movieData.csv'
Movies = pd.read_csv(url)

In [39]:
# Preview Data
Movies.head(5)

Unnamed: 0.1,Unnamed: 0,revenue,vote_average,vote_count,title,original_language,release_year,release_month,production_company1,production_company2,...,producer1,producer2,producer3,Cast1,Cast2,Cast3,runtimes,writer1,writer2,writer3
0,1,4300000.0,6.6,714,Four Rooms,en,1995,12,Other,Other,...,Other,Other,Other,Other,Other,Other,98,Other,Other,Other
1,3,775398007.0,8.1,8550,Star Wars,en,1977,5,Other,20th Century Fox,...,Other,Other,Other,Other,Other,Other,121,Other,,
2,4,940335536.0,7.7,8085,Finding Nemo,en,2003,5,Other,,...,Other,Other,Other,Other,Other,Other,100,Other,Other,Other
3,5,677945399.0,8.3,10346,Forrest Gump,en,1994,7,Paramount,,...,Other,Other,Other,Tom Hanks,Other,Other,142,Other,Eric Roth,
4,6,356296601.0,8.0,4571,American Beauty,en,1999,9,Other,Other,...,Other,Other,Other,Other,Other,Other,122,Other,,


In [40]:
# Check data types
Movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3579 entries, 0 to 3578
Data columns (total 24 columns):
Unnamed: 0             3579 non-null int64
revenue                3579 non-null float64
vote_average           3579 non-null float64
vote_count             3579 non-null int64
title                  3579 non-null object
original_language      3579 non-null object
release_year           3579 non-null int64
release_month          3579 non-null int64
production_company1    3579 non-null object
production_company2    3579 non-null object
production_country1    3579 non-null object
genre1                 3579 non-null object
genre2                 3579 non-null object
director1              3579 non-null object
producer1              3579 non-null object
producer2              3579 non-null object
producer3              3579 non-null object
Cast1                  3579 non-null object
Cast2                  3579 non-null object
Cast3                  3579 non-null object
runtimes       

In [41]:
# Deal with categorical variables 
original_language_dmyfull = pd.get_dummies(Movies['original_language'], drop_first=True)
production_company1_dmyfull = pd.get_dummies(Movies['production_company1'], drop_first=True)
production_company2_dmyfull = pd.get_dummies(Movies['production_company2'], drop_first=True)
production_country1_dmyfull = pd.get_dummies(Movies['production_country1'], drop_first=True)
genre1_dmyfull = pd.get_dummies(Movies['genre1'], drop_first=True)
genre2_dmyfull = pd.get_dummies(Movies['genre2'], drop_first=True)
director1_dmyfull = pd.get_dummies(Movies['director1'], drop_first=True)
producer1_dmyfull = pd.get_dummies(Movies['producer1'], drop_first=True)
producer2_dmyfull = pd.get_dummies(Movies['producer2'], drop_first=True)
producer3_dmyfull = pd.get_dummies(Movies['producer3'], drop_first=True)
Cast1_dmyfull = pd.get_dummies(Movies['Cast1'], drop_first=True)
Cast2_dmyfull = pd.get_dummies(Movies['Cast2'], drop_first=True)
Cast3_dmyfull = pd.get_dummies(Movies['Cast3'], drop_first=True)
writer1_dmyfull = pd.get_dummies(Movies['writer1'], drop_first=True)
writer2_dmyfull = pd.get_dummies(Movies['writer2'], drop_first=True)
writer3_dmyfull = pd.get_dummies(Movies['writer3'], drop_first=True)

In [42]:
# Concatenate dummy variables to data set 
Movies_full = pd.concat([Movies,original_language_dmyfull,production_company1_dmyfull,production_company2_dmyfull,
                         production_country1_dmyfull,genre1_dmyfull,genre2_dmyfull,director1_dmyfull,producer1_dmyfull,
                         producer2_dmyfull,producer3_dmyfull,Cast1_dmyfull,Cast2_dmyfull,Cast3_dmyfull,writer1_dmyfull,
                         writer2_dmyfull,writer3_dmyfull], axis = 1 )

In [43]:
# Drop unimportant/object columns
Movies_full = Movies_full.drop(['title','original_language','production_company1','production_company2',
                                'production_country1','genre1','genre2','director1','producer1','producer2',
                                'producer3','Cast1','Cast2','Cast3','writer1','writer2','writer3'], axis = 1)

In [44]:
# Check new info
Movies_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3579 entries, 0 to 3578
Columns: 196 entries, Unnamed: 0 to Other
dtypes: float64(2), int64(5), uint8(189)
memory usage: 856.4 KB


In [49]:
# Split into x and y for prediction
X = Movies_full.drop(['revenue'], axis = 1)
y = Movies_full['revenue']

In [50]:
# Separate into test and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [51]:
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [52]:
rf = RandomForestRegressor(criterion='mse')
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [54]:
modelPred = rf.predict(X_test)

In [55]:
print(modelPred)

[  1.44606572e+08   6.98440139e+07   7.78601913e+07 ...,   1.11744494e+08
   8.22682460e+06   1.89704173e+08]


In [56]:
meanSquaredError = mean_squared_error(y_test, modelPred)

In [57]:
print("MSE:", meanSquaredError)

MSE: 9.01990808099e+15
