In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [2]:
# Import Cleaned Movie Data
url = 'https://raw.githubusercontent.com/GriffinLane/BrownGoresenLane/cleanup/movieData.csv'
Movies = pd.read_csv(url)

In [3]:
# Preview Data
Movies.head(5)

Unnamed: 0.1,Unnamed: 0,revenue,vote_average,vote_count,title,original_language,release_year,release_month,production_company1,production_company2,...,producer1,producer2,producer3,Cast1,Cast2,Cast3,runtimes,writer1,writer2,writer3
0,1,4300000.0,6.6,714,Four Rooms,en,1995,12,Other,Other,...,Other,Other,Other,Other,Other,Other,98,Other,Other,Other
1,3,775398007.0,8.1,8550,Star Wars,en,1977,5,Other,20th Century Fox,...,Other,Other,Other,Other,Other,Other,121,Other,,
2,4,940335536.0,7.7,8085,Finding Nemo,en,2003,5,Other,,...,Other,Other,Other,Other,Other,Other,100,Other,Other,Other
3,5,677945399.0,8.3,10346,Forrest Gump,en,1994,7,Paramount,,...,Other,Other,Other,Tom Hanks,Other,Other,142,Other,Eric Roth,
4,6,356296601.0,8.0,4571,American Beauty,en,1999,9,Other,Other,...,Other,Other,Other,Other,Other,Other,122,Other,,


In [4]:
# Check data types
Movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3579 entries, 0 to 3578
Data columns (total 24 columns):
Unnamed: 0             3579 non-null int64
revenue                3579 non-null float64
vote_average           3579 non-null float64
vote_count             3579 non-null int64
title                  3579 non-null object
original_language      3579 non-null object
release_year           3579 non-null int64
release_month          3579 non-null int64
production_company1    3579 non-null object
production_company2    3579 non-null object
production_country1    3579 non-null object
genre1                 3579 non-null object
genre2                 3579 non-null object
director1              3579 non-null object
producer1              3579 non-null object
producer2              3579 non-null object
producer3              3579 non-null object
Cast1                  3579 non-null object
Cast2                  3579 non-null object
Cast3                  3579 non-null object
runtimes       

In [5]:
# Deal with categorical variables 
original_language_dmyfull = pd.get_dummies(Movies['original_language'], drop_first=True)
production_company1_dmyfull = pd.get_dummies(Movies['production_company1'], drop_first=True)
production_company2_dmyfull = pd.get_dummies(Movies['production_company2'], drop_first=True)
production_country1_dmyfull = pd.get_dummies(Movies['production_country1'], drop_first=True)
genre1_dmyfull = pd.get_dummies(Movies['genre1'], drop_first=True)
genre2_dmyfull = pd.get_dummies(Movies['genre2'], drop_first=True)
director1_dmyfull = pd.get_dummies(Movies['director1'], drop_first=True)
producer1_dmyfull = pd.get_dummies(Movies['producer1'], drop_first=True)
producer2_dmyfull = pd.get_dummies(Movies['producer2'], drop_first=True)
producer3_dmyfull = pd.get_dummies(Movies['producer3'], drop_first=True)
Cast1_dmyfull = pd.get_dummies(Movies['Cast1'], drop_first=True)
Cast2_dmyfull = pd.get_dummies(Movies['Cast2'], drop_first=True)
Cast3_dmyfull = pd.get_dummies(Movies['Cast3'], drop_first=True)
writer1_dmyfull = pd.get_dummies(Movies['writer1'], drop_first=True)
writer2_dmyfull = pd.get_dummies(Movies['writer2'], drop_first=True)
writer3_dmyfull = pd.get_dummies(Movies['writer3'], drop_first=True)

In [6]:
# Concatenate dummy variables to data set 
Movies_full = pd.concat([Movies,original_language_dmyfull,production_company1_dmyfull,production_company2_dmyfull,
                         production_country1_dmyfull,genre1_dmyfull,genre2_dmyfull,director1_dmyfull,producer1_dmyfull,
                         producer2_dmyfull,producer3_dmyfull,Cast1_dmyfull,Cast2_dmyfull,Cast3_dmyfull,writer1_dmyfull,
                         writer2_dmyfull,writer3_dmyfull], axis = 1 )

In [7]:
# Drop unimportant/object columns
Movies_full = Movies_full.drop(['title','original_language','production_company1','production_company2',
                                'production_country1','genre1','genre2','director1','producer1','producer2',
                                'producer3','Cast1','Cast2','Cast3','writer1','writer2','writer3'], axis = 1)

In [8]:
# Check new info
Movies_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3579 entries, 0 to 3578
Columns: 196 entries, Unnamed: 0 to Other
dtypes: float64(2), int64(5), uint8(189)
memory usage: 856.4 KB


In [9]:
# Split into x and y for prediction
X = Movies_full.drop(['revenue'], axis = 1)
y = Movies_full['revenue']

In [10]:
# Separate into test and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [11]:
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

# Random Forest

In [12]:
rf = RandomForestRegressor(criterion='mse')
RFmodel1 = rf.fit(X_train, y_train)

In [13]:
RFmodelPred1 = RFmodel1.predict(X_test)

In [14]:
print(RFmodelPred1)

[  1.76995418e+08   4.05527553e+07   8.20269426e+07 ...,   8.64052154e+07
   1.47991041e+07   2.21320386e+08]


In [15]:
RF1meanSquaredError = mean_squared_error(y_test, RFmodelPred1)

In [16]:
print("MSE:", RF1meanSquaredError)

MSE: 8.54225718814e+15


2nd Try

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import model_selection

In [18]:
rf2 = RandomForestClassifier(n_estimators=300, random_state=1)
RFmodel2 = rf2.fit(X_train, y_train)
RFmodelPred2 = RFmodel2.predict(X_test)

In [19]:
print(RFmodelPred2)

[  3.70956570e+07   3.37881610e+07   2.52054600e+07 ...,   1.52368585e+08
   1.15854830e+07   4.80811000e+05]


In [20]:
RF2meanSquaredError = mean_squared_error(y_test, RFmodelPred2)
print("MSE:", RF2meanSquaredError)

MSE: 2.34828596889e+16


3rd Try

In [22]:
seed = 7
num_trees = 100
max_features = 3
kfold = model_selection.KFold(n_splits=10, random_state=seed)
RFModel3 = RandomForestClassifier(n_estimators=num_trees, max_features=max_features)
RFmodelPred3 = model_selection.cross_val_score(RFModel3, X, y, cv=kfold)
print(RFmodelPred3.mean())

0.000558659217877


# Multiple Linear Regression

In [72]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()

In [73]:
linearmodel1 = lm.fit(X_train,y_train)

In [74]:
linearmodelPred1 = linearmodel1.predict(X_test)

In [75]:
LM1meanSquaredError = mean_squared_error(y_test, linearmodelPred1)

In [76]:
print("MSE:", LM1meanSquaredError)

MSE: 9.04951890273e+15
