In [4]:
import pandas as pd
import numpy as np

import plotly.express as px

In [5]:
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVC


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error

In [8]:
imdb_df = pd.read_csv('data/final_imdb.csv')

In [9]:
imdb_df.columns

Index(['actors', 'director', 'duration', 'genre', 'imdb_rating', 'link',
       'synopsis', 'title', 'votes', 'page_url', 'page_url_cleaned',
       'release_start', 'action', 'adult', 'adventure', 'animation',
       'biography', 'comedy', 'crime', 'documentary', 'drama', 'family',
       'fantasy', 'film-noir', 'game-show', 'history', 'horror', 'music',
       'musical', 'mystery', 'news', 'reality-tv', 'romance', 'sci-fi',
       'short', 'sport', 'talk-show', 'thriller', 'unknown', 'war', 'western',
       'tv_series'],
      dtype='object')

In [10]:
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89674 entries, 0 to 89673
Data columns (total 42 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   actors            89674 non-null  object 
 1   director          89359 non-null  object 
 2   duration          81430 non-null  float64
 3   genre             88365 non-null  object 
 4   imdb_rating       89674 non-null  float64
 5   link              89674 non-null  object 
 6   synopsis          89672 non-null  object 
 7   title             89673 non-null  object 
 8   votes             89674 non-null  float64
 9   page_url          89674 non-null  object 
 10  page_url_cleaned  89674 non-null  object 
 11  release_start     89674 non-null  float64
 12  action            89674 non-null  int64  
 13  adult             89674 non-null  int64  
 14  adventure         89674 non-null  int64  
 15  animation         89674 non-null  int64  
 16  biography         89674 non-null  int64 

In [11]:
imdb_df.columns

Index(['actors', 'director', 'duration', 'genre', 'imdb_rating', 'link',
       'synopsis', 'title', 'votes', 'page_url', 'page_url_cleaned',
       'release_start', 'action', 'adult', 'adventure', 'animation',
       'biography', 'comedy', 'crime', 'documentary', 'drama', 'family',
       'fantasy', 'film-noir', 'game-show', 'history', 'horror', 'music',
       'musical', 'mystery', 'news', 'reality-tv', 'romance', 'sci-fi',
       'short', 'sport', 'talk-show', 'thriller', 'unknown', 'war', 'western',
       'tv_series'],
      dtype='object')

In [14]:
model_df = imdb_df[[
    'imdb_rating', 'duration',  'votes',
    'release_start', 'action', 'adult', 'adventure', 'animation',
    'biography', 'comedy', 'crime', 'documentary', 'drama', 'family',
    'fantasy', 'film-noir', 'game-show', 'history', 'horror', 'music',
    'musical', 'mystery', 'news', 'reality-tv', 'romance', 'sci-fi',
    'short', 'sport', 'talk-show', 'thriller', 'unknown', 'war', 'western',
    'tv_series'
]].dropna()

In [15]:
model_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81430 entries, 0 to 89673
Data columns (total 34 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   imdb_rating    81430 non-null  float64
 1   duration       81430 non-null  float64
 2   votes          81430 non-null  float64
 3   release_start  81430 non-null  float64
 4   action         81430 non-null  int64  
 5   adult          81430 non-null  int64  
 6   adventure      81430 non-null  int64  
 7   animation      81430 non-null  int64  
 8   biography      81430 non-null  int64  
 9   comedy         81430 non-null  int64  
 10  crime          81430 non-null  int64  
 11  documentary    81430 non-null  int64  
 12  drama          81430 non-null  int64  
 13  family         81430 non-null  int64  
 14  fantasy        81430 non-null  int64  
 15  film-noir      81430 non-null  int64  
 16  game-show      81430 non-null  int64  
 17  history        81430 non-null  int64  
 18  horror

In [16]:
# creating categories for imdb_rating
px.histogram(model_df, 'imdb_rating')

In [21]:
model_df['imdb_rating_cat'] = pd.qcut(model_df['imdb_rating'], 4)
model_df['imdb_rating_num'] = model_df['imdb_rating']

In [22]:
px.histogram(model_df, 'imdb_rating_num', color='imdb_rating_cat')

In [29]:
model_type = 'cat' #'cat'
dep_var = f'imdb_rating_{model_type}'
indep_vars = model_df.columns.drop(['imdb_rating', 'imdb_rating_cat', 'imdb_rating_num'])

In [30]:
X = model_df[indep_vars]
X = StandardScaler().fit_transform(X)
y = model_df[dep_var]

if model_type == 'num':
    y = LabelEncoder().fit_transform(y)

In [34]:
X.shape,y.shape

((81430, 33), (81430,))

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model.score(X_test, y_test)

0.4115190961562078

In [35]:
model = SVC().fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
model = LinearRegression().fit(X_train, y_train)
np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

0.6473103384488575