In [1]:
%%capture
import sys

# If you're on Colab:
if 'google.colab' in sys.modules:
    DATA_PATH = 'https://raw.githubusercontent.com/MaxTechniche/DS-Unit-2-Applied-Modeling/master/data/'
    !pip install category_encoders==2.*

# If you're working locally:
else:
    DATA_PATH = 'data/'
    
top_1000 = 'top_1000.csv'

In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from category_encoders import OrdinalEncoder, OneHotEncoder

In [3]:
df = pd.read_csv(DATA_PATH+top_1000, index_col='Title')

In [5]:
def genres(df):
    df = df.copy()
    
    genres = set()
    for item in df['Genres']:
        item = item.lstrip("['").rstrip("']").replace("'","").split(', ')
        for genre in item:
            genres.add(genre)

    df[list(genres)] = 0
        
    for i, item in zip(df.index, df['Genres']):
        item = item.lstrip("['").rstrip("']").replace("'","").split(', ')
        for genre in item:
            df.at[i, genre] = 1
        
    return df

df = genres(df)

# df['Year'] = df['Year'].astype('object')
df.drop(columns=['Placement', 'Genres', 'Description', 'Directors', 'stars'], inplace=True)

In [6]:
def merge_ratings(df):
    df = df.copy()

    mapping = {
        'TV-PG':'PG',
        'TV-MA':'R',
        'TV-14':'PG-13',
        'M':'PG-13',
        'GP':'PG',
        'Unrated|Not Rated|Passed|Approved': np.NaN
    }

    df['Certification'] = df['Certification'].replace(mapping, regex=True)

    return df

df = merge_ratings(df)

In [7]:
target = 'Runtime'
X = df.drop(columns=target)
y = df[target]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2)

In [8]:
%%capture
pipeline = make_pipeline(
    OrdinalEncoder(),
    StandardScaler(),
    SimpleImputer(),
    RandomForestRegressor()
)
pipeline.fit(X_train, y_train)

In [10]:
params = {
    'randomforestregressor__n_estimators': range(100, 5001, 100),
    'randomforestregressor__max_depth': range(8, 31, 2),
    'randomforestregressor__max_samples': np.arange(.2, 1.0, .05)
}


model = RandomizedSearchCV(
    pipeline,
    param_distributions=params,
    n_jobs=6,
    n_iter=30
)
model.fit(X, y)

RandomizedSearchCV(estimator=Pipeline(steps=[('ordinalencoder',
                                              OrdinalEncoder(cols=['Certification'],
                                                             mapping=[{'col': 'Certification',
                                                                       'data_type': dtype('O'),
                                                                       'mapping': R        1
PG-13    2
NaN      3
PG       4
G        5
NC-17    6
dtype: int64}])),
                                             ('standardscaler',
                                              StandardScaler()),
                                             ('simpleimputer', SimpleImputer()),
                                             ('randomforestregressor',
                                              RandomForestRegressor())]),
                   n_iter=30, n_jobs=6,
                   param_distributions={'randomforestregressor__max_depth': range(8, 31, 2),
      

In [11]:
baseline = mean_absolute_error(y, [y.mean()]*len(y))

In [12]:
print('Baseline MAE:', baseline)
print('Model MAE:    ', mean_absolute_error(y, model.predict(X)))
print('Model R2:     ', model.score(X, y))


Baseline MAE: 21.13167
Model MAE:     10.902070302532993
Model R2:      0.7280029061594206


In [13]:
important = pd.DataFrame(list(zip(X_train.columns, pipeline.named_steps['randomforestregressor'].feature_importances_))).sort_values(1, ascending=False)

In [17]:
important

Unnamed: 0,0,1
0,Year,0.205841
4,Votes,0.138541
5,USA Box Office,0.133472
2,Rating,0.098262
3,Metascore,0.09569
16,Animation,0.055434
25,Comedy,0.038771
1,Certification,0.038206
20,History,0.029528
21,Biography,0.018467
