In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.svm import SVR, LinearSVR
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import train_test_split

In [3]:
import functions

In [4]:
# Load the dataset CSV; the path is relative to the notebook's working directory.
imdb_df = pd.read_csv('data/final_imdb.csv')

## Data Preparation

In [5]:
# List the available columns before choosing which ones feed the models.
imdb_df.columns

Index(['actors', 'director', 'duration', 'genre', 'imdb_rating', 'link',
       'synopsis', 'title', 'votes', 'page_url', 'page_url_cleaned',
       'release_start', 'action', 'adult', 'adventure', 'animation',
       'biography', 'comedy', 'crime', 'documentary', 'drama', 'family',
       'fantasy', 'film-noir', 'game-show', 'history', 'horror', 'music',
       'musical', 'mystery', 'news', 'reality-tv', 'romance', 'sci-fi',
       'short', 'sport', 'talk-show', 'thriller', 'unknown', 'war', 'western',
       'tv_series'],
      dtype='object')

In [6]:
# Inspect dtypes and non-null counts to see which columns contain missing values.
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131790 entries, 0 to 131789
Data columns (total 42 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   actors            131790 non-null  object 
 1   director          130906 non-null  object 
 2   duration          115615 non-null  float64
 3   genre             129129 non-null  object 
 4   imdb_rating       131790 non-null  float64
 5   link              131790 non-null  object 
 6   synopsis          131785 non-null  object 
 7   title             131789 non-null  object 
 8   votes             131790 non-null  float64
 9   page_url          131790 non-null  object 
 10  page_url_cleaned  131790 non-null  object 
 11  release_start     131790 non-null  float64
 12  action            131790 non-null  int64  
 13  adult             131790 non-null  int64  
 14  adventure         131790 non-null  int64  
 15  animation         131790 non-null  int64  
 16  biography         13

In [7]:
# Build the modelling frame: the target, the numeric predictors, and the
# per-genre columns plus tv_series. Free-text / identifier columns (actors,
# director, synopsis, title, link, ...) are left out.
model_df = imdb_df.loc[
    :,
    [
        # target
        'imdb_rating',
        # numeric predictors
        'duration', 'votes', 'release_start',
        # per-genre columns
        'action', 'adult', 'adventure', 'animation', 'biography',
        'comedy', 'crime', 'documentary', 'drama', 'family',
        'fantasy', 'film-noir', 'game-show', 'history', 'horror',
        'music', 'musical', 'mystery', 'news', 'reality-tv',
        'romance', 'sci-fi', 'short', 'sport', 'talk-show',
        'thriller', 'unknown', 'war', 'western',
        # series flag
        'tv_series',
    ],
].dropna()  # keep only rows with no missing value in any selected column

In [8]:
# Rows/columns remaining after the column selection and dropna().
model_df.shape

(115615, 34)

In [9]:
# Name of the target column. A plain string is enough here: the previous
# f-string had no placeholders.
dep_var = 'imdb_rating'
# Every remaining column of model_df is a predictor. Deriving the list from
# dep_var keeps the target name defined in exactly one place.
indep_vars = model_df.columns.drop(dep_var)

In [10]:
# Separate the modelling frame into the feature matrix (X) and the target (y).
X, y = model_df[indep_vars], model_df[dep_var]

In [11]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Sanity-check the sizes of the train and test partitions.
X_train.shape, X_test.shape

((92492, 33), (23123, 33))

## Model Training

In [13]:
# Baseline: LinearRegression with its default constructor arguments.
# The helper from the local `functions` module hands back three values,
# unpacked here as (mse, predictions, fitted object).
lr_params = dict()
lr_mse, lr_pred, lr = functions.regression_scale_fit_mse(
    LinearRegression, lr_params,
    X_train, y_train,
    X_test, y_test,
)
functions.draw_actual_vs_predicted(y_test, lr_pred)

In [14]:
# RidgeCV: choose the regularisation strength from a fixed alpha grid using
# 5-fold cross-validation scored by negative MSE.
ridge_params = dict(
    alphas=[0.01, 0.1, 1, 4, 5, 10, 15],
    scoring='neg_mean_squared_error',
    cv=5,
)
ridge_mse, ridge_pred, ridge = functions.regression_scale_fit_mse(
    RidgeCV, ridge_params,
    X_train, y_train,
    X_test, y_test,
)
functions.draw_actual_vs_predicted(y_test, ridge_pred)

In [15]:
# LassoCV: 5-fold cross-validation, all other settings left at their defaults.
lasso_params = dict(cv=5)
lasso_mse, lasso_pred, lasso = functions.regression_scale_fit_mse(
    LassoCV, lasso_params,
    X_train, y_train,
    X_test, y_test,
)
functions.draw_actual_vs_predicted(y_test, lasso_pred)


In [16]:
# ElasticNetCV: search ten l1_ratio values evenly spaced from 0.1 to 1.0
# with 5-fold cross-validation.
elastic_params = dict(
    l1_ratio=np.linspace(0.1, 1, 10),
    cv=5,
)
elastic_mse, elastic_pred, elastic = functions.regression_scale_fit_mse(
    ElasticNetCV, elastic_params,
    X_train, y_train,
    X_test, y_test,
)
functions.draw_actual_vs_predicted(y_test, elastic_pred)

In [17]:
# Collect one (model name, MSE) record per fitted pipeline and build the
# comparison table in a single DataFrame call. DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row copies it on every iteration.
# Each fitted object exposes its estimator under the 'model' step; the
# estimator's class name is used as the row label.
models_and_mses = pd.DataFrame(
    [
        {'model': type(pipeline.named_steps['model']).__name__, 'mse': mse_value}
        for pipeline, mse_value in zip(
            [lr, ridge, lasso, elastic],
            [lr_mse, ridge_mse, lasso_mse, elastic_mse],
        )
    ],
    columns=['model', 'mse'],
)

In [18]:
# Rank the models from lowest to highest MSE (lower is better).
models_and_mses.sort_values(by='mse', ascending=True)

Unnamed: 0,model,mse
0,LinearRegression,1.392262
1,RidgeCV,1.392474
2,LassoCV,1.392507
3,ElasticNetCV,1.392507


As the table shows, the MSEs on the held-out test set barely differ across the linear models (about 1.392 for all four). Plain Linear Regression is marginally the best of them.