In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.svm import SVR, LinearSVR
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import train_test_split

In [3]:
import functions

In [4]:
# Load the IMDb dataset from a path relative to the notebook's working directory.
imdb_df = pd.read_csv('data/final_imdb.csv')

## Data Preparation

In [5]:
# List the available columns before choosing the modelling features.
imdb_df.columns

Index(['actors', 'director', 'duration', 'genre', 'imdb_rating', 'link',
       'synopsis', 'title', 'votes', 'page_url', 'page_url_cleaned',
       'release_start', 'release_month', 'action', 'adult', 'adventure',
       'animation', 'biography', 'comedy', 'crime', 'documentary', 'drama',
       'family', 'fantasy', 'film-noir', 'game-show', 'history', 'horror',
       'music', 'musical', 'mystery', 'news', 'reality-tv', 'romance',
       'sci-fi', 'short', 'sport', 'talk-show', 'thriller', 'unknown', 'war',
       'western', 'tv_series'],
      dtype='object')

In [6]:
# Per-column dtypes and non-null counts, to see which columns have missing values.
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312521 entries, 0 to 312520
Data columns (total 43 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   actors            312521 non-null  object 
 1   director          308472 non-null  object 
 2   duration          253937 non-null  float64
 3   genre             291314 non-null  object 
 4   imdb_rating       227843 non-null  float64
 5   link              312521 non-null  object 
 6   synopsis          312514 non-null  object 
 7   title             312520 non-null  object 
 8   votes             227843 non-null  float64
 9   page_url          312521 non-null  object 
 10  page_url_cleaned  312521 non-null  object 
 11  release_start     312388 non-null  float64
 12  release_month     312521 non-null  int64  
 13  action            312521 non-null  int64  
 14  adult             312521 non-null  int64  
 15  adventure         312521 non-null  int64  
 16  animation         31

In [7]:
# Modelling frame: the target (imdb_rating), the numeric columns, the
# per-genre columns and the tv_series column. Any row with a missing value
# in one of the selected columns is discarded.
model_df = imdb_df.loc[:, [
    # target
    'imdb_rating',
    # numeric predictors
    'duration', 'votes', 'release_start',
    # genre columns
    'action', 'adult', 'adventure', 'animation', 'biography', 'comedy',
    'crime', 'documentary', 'drama', 'family', 'fantasy', 'film-noir',
    'game-show', 'history', 'horror', 'music', 'musical', 'mystery',
    'news', 'reality-tv', 'romance', 'sci-fi', 'short', 'sport',
    'talk-show', 'thriller', 'unknown', 'war', 'western',
    # series flag
    'tv_series',
]].dropna()

In [8]:
# Rows and columns remaining after dropping rows with missing values.
model_df.shape

(200365, 34)

In [9]:
# Target column, and the predictor columns (every other column of model_df).
# The target name is written once and reused so the two cannot drift apart.
dep_var = 'imdb_rating'
indep_vars = model_df.columns.drop([dep_var])

In [10]:
# Feature matrix (predictor columns) and target vector taken from model_df.
X, y = model_df[indep_vars], model_df[dep_var]

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Check the sizes of the train and test feature matrices.
X_train.shape, X_test.shape

((160292, 33), (40073, 33))

## Model Training

In [13]:
# Plain linear regression with no extra constructor arguments.
# NOTE(review): regression_scale_fit_mse lives in the local `functions` module;
# it appears to return (mse, predictions, fitted pipeline) — confirm there.
lr_params = dict()
lr_mse, lr_pred, lr = functions.regression_scale_fit_mse(
    LinearRegression, lr_params, X_train, y_train, X_test, y_test
)
functions.draw_actual_vs_predicted(y_test, lr_pred)

In [14]:
# RidgeCV: choose alpha from the candidate list with 5-fold CV, scored by
# negative mean squared error.
ridge_params = {
    'alphas': [0.01, 0.1, 1, 4, 5, 10, 15],
    'scoring': 'neg_mean_squared_error',
    'cv': 5,
}
ridge_mse, ridge_pred, ridge = functions.regression_scale_fit_mse(
    RidgeCV, ridge_params, X_train, y_train, X_test, y_test
)
functions.draw_actual_vs_predicted(y_test, ridge_pred)

In [15]:
# LassoCV with 5-fold CV; the alpha grid is left to the estimator's defaults.
lasso_params = {
    'cv': 5,
}
lasso_mse, lasso_pred, lasso = functions.regression_scale_fit_mse(
    LassoCV, lasso_params, X_train, y_train, X_test, y_test
)
functions.draw_actual_vs_predicted(y_test, lasso_pred)


In [16]:
# ElasticNetCV with 5-fold CV over ten l1_ratio values evenly spaced from 0.1 to 1.
elastic_params = {
    'l1_ratio': np.linspace(0.1, 1, 10),
    'cv': 5,
}
elastic_mse, elastic_pred, elastic = functions.regression_scale_fit_mse(
    ElasticNetCV, elastic_params, X_train, y_train, X_test, y_test
)
functions.draw_actual_vs_predicted(y_test, elastic_pred)

In [17]:
# Comparison table: one row per fitted pipeline, holding the class name of its
# 'model' step and the MSE it obtained.
# The frame is built once from a list of records. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and growing a frame row by row
# copies it on every iteration.
models_and_mses = pd.DataFrame(
    [
        {'model': type(pipeline.named_steps['model']).__name__, 'mse': mse}
        for pipeline, mse in zip(
            [lr, ridge, lasso, elastic],
            [lr_mse, ridge_mse, lasso_mse, elastic_mse],
        )
    ],
    columns=['model', 'mse'],
)

In [18]:
# Rank the models from lowest to highest MSE (ascending is the default order).
models_and_mses.sort_values('mse')

Unnamed: 0,model,mse
1,RidgeCV,1.390395
0,LinearRegression,1.390412
2,LassoCV,1.390453
3,ElasticNetCV,1.390453


As the table shows, the MSEs barely differ across the linear models (all ≈ 1.3904). RidgeCV has the lowest MSE, with plain Linear Regression a very close second; the differences only appear in the fifth decimal place and are practically negligible.