In [73]:
%%capture
import sys

# If you're on Colab:
if 'google.colab' in sys.modules:
    DATA_PATH = 'https://raw.githubusercontent.com/MaxTechniche/DS-Unit-2-Applied-Modeling/master/data/'
    !pip install category_encoders==2.*

# If you're working locally:
else:
    DATA_PATH = 'data/'
    
top_1000 = 'top_1000.csv'

In [74]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from category_encoders import OrdinalEncoder, OneHotEncoder

In [75]:
df = pd.read_csv(DATA_PATH+top_1000)

In [76]:
def genres(df):
    df = df.copy()
    
    genres = set()
    for item in df['Genres']:
        item = item.lstrip("['").rstrip("']").replace("'","").split(', ')
        for genre in item:
            genres.add(genre)

    df[list(genres)] = 0
        
    for i, item in zip(df.index, df['Genres']):
        item = item.lstrip("['").rstrip("']").replace("'","").split(', ')
        for genre in item:
            df.at[i, genre] = 1
        
    return df

df = genres(df)

In [77]:
def wrangle(df):
    df = df.copy()

    # df['Year'] = df['Year'].astype('object') # 1.4 increase

    # df['Title Length'] = df['Title'].apply(lambda x: len(x)) # .1 increase

    df.drop(columns=['Placement', 'Genres', 'Description', 'Directors', 'stars', 'Title'], inplace=True)

    return df

df = wrangle(df)

In [78]:
def merge_ratings(df):
    df = df.copy()

    mapping = {
        'TV-PG':'PG',
        'TV-MA':'R',
        'TV-14':'PG-13',
        'M':'PG-13',
        'GP':'PG',
        'Unrated|Not Rated|Passed|Approved': np.NaN
    }

    df['Certification'] = df['Certification'].replace(mapping, regex=True)

    return df

df = merge_ratings(df)

In [79]:
target = 'Runtime'
X = df.drop(columns=target)
y = df[target]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=42)

In [80]:
%%capture
pipeline = make_pipeline(
    OrdinalEncoder(),
    StandardScaler(),
    SimpleImputer(),
    RandomForestRegressor(random_state=42)
)
pipeline.fit(X_train, y_train)

In [81]:
params = {
    'randomforestregressor__n_estimators': range(100, 5001, 100),
    'randomforestregressor__max_depth': range(8, 31, 2),
    'randomforestregressor__max_samples': np.arange(.2, .81, .05)
}


model = RandomizedSearchCV(
    pipeline,
    param_distributions=params,
    n_jobs=6,
    verbose=1,
    random_state=42
)
model.fit(X, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   57.1s
[Parallel(n_jobs=6)]: Done  50 out of  50 | elapsed:  1.2min finished


RandomizedSearchCV(estimator=Pipeline(steps=[('ordinalencoder',
                                              OrdinalEncoder(cols=['Certification'],
                                                             mapping=[{'col': 'Certification',
                                                                       'data_type': dtype('O'),
                                                                       'mapping': NaN      1
PG       2
R        3
G        4
PG-13    5
NC-17    6
dtype: int64}])),
                                             ('standardscaler',
                                              StandardScaler()),
                                             ('simpleimputer', SimpleImputer()),
                                             ('randomforestregressor',
                                              RandomForestRegressor(random_state=42))]),
                   n_jobs=6,
                   param_distributions={'randomforestregressor__max_depth': range(8, 31, 2),
  

In [82]:
baseline = mean_absolute_error(y, [y.mean()]*len(y))
print('Baseline MAE:', baseline)
print('Model MAE:', mean_absolute_error(y, model.predict(X))) # 8.9
print('Model R2/Score:', model.score(X, y))


Baseline MAE: 21.13167
Model MAE: 9.036084972763346
Model R2/Score: 0.8073177882453013


In [83]:
model.predict(X_val)[:5]

array([118.1   , 129.4275, 122.075 , 117.11  , 124.4575])

In [84]:
y_val

521    105
737    131
740    122
660    129
411    130
      ... 
408    107
332    113
208    117
613    123
78      88
Name: Runtime, Length: 200, dtype: int64