In [184]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.preprocessing import MultiLabelBinarizer

from pandas_profiling import ProfileReport

In [103]:
df_paths = os.listdir(path='data')

In [104]:
df_paths

['imdb_df_2000-01-01_2010-01-02.csv',
 'imdb_df_1950-01-01_1960-01-01.csv',
 'imdb_df_1970-01-01_1980-01-02.csv',
 'imdb_df_2010-01-01_2022-05-01.csv',
 'imdb_df_1980-01-01_1990-01-02.csv',
 'imdb_df_1960-01-01_1970-01-02.csv',
 'imdb_df_1990-01-01_2000-01-01.csv']

In [105]:
imdb_df = pd.DataFrame()

for i in df_paths:
    tmp_df = pd.read_csv('data/'+i)
    imdb_df = imdb_df.append(tmp_df, ignore_index=True)

In [106]:
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108500 entries, 0 to 108499
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    108500 non-null  int64  
 1   title         108500 non-null  object 
 2   link          108500 non-null  object 
 3   release_date  108489 non-null  float64
 4   duration      92806 non-null   float64
 5   genre         103762 non-null  object 
 6   imdb_rating   88629 non-null   float64
 7   metascore     6835 non-null    float64
 8   synopsis      108497 non-null  object 
 9   director      0 non-null       float64
 10  actors        108500 non-null  object 
 11  votes         88629 non-null   float64
 12  page_url      108500 non-null  object 
dtypes: float64(6), int64(1), object(6)
memory usage: 10.8+ MB


In [107]:
imdb_df = imdb_df.drop_duplicates(subset=['link'], ignore_index=True)
imdb_df = imdb_df.drop(['Unnamed: 0'], axis=1)
imdb_df = imdb_df.dropna(subset=['imdb_rating', 'release_date']).reset_index(drop=True)


In [108]:
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88350 entries, 0 to 88349
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         88350 non-null  object 
 1   link          88350 non-null  object 
 2   release_date  88350 non-null  float64
 3   duration      80090 non-null  float64
 4   genre         87081 non-null  object 
 5   imdb_rating   88350 non-null  float64
 6   metascore     6835 non-null   float64
 7   synopsis      88347 non-null  object 
 8   director      0 non-null      float64
 9   actors        88350 non-null  object 
 10  votes         88350 non-null  float64
 11  page_url      88350 non-null  object 
dtypes: float64(6), object(6)
memory usage: 8.1+ MB


In [100]:
imdb_df['release_start'] = [float(str(i)[0:4]) if i>3000 else float(str(i)[0:4]) for i in imdb_df.release_date]
imdb_df['release_end'] = [float(str(i)[4:8]) if i>3000 else float(str(i)[0:4]) for i in imdb_df.release_date]

In [142]:
k = [i.split(', ') if i == i else ['Unknown'] for i in imdb_df['genre']]

In [145]:
mlb = MultiLabelBinarizer()
imdb_df[mlb.classes_] = pd.DataFrame(mlb.fit_transform(k), columns = mlb.classes_)


In [146]:
imdb_df

Unnamed: 0,title,link,release_date,duration,genre,imdb_rating,metascore,synopsis,director,actors,...,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,Unknown,War,Western
0,The Beach,https://www.imdb.com/title/tt0163978/,2000.0,119.0,"Adventure, Drama, Romance",6.6,43.0,"On vacation in Thailand, Richard sets out for ...",,[],...,0,1,0,0,0,0,0,0,0,0
1,Scream 3,https://www.imdb.com/title/tt0134084/,2000.0,116.0,"Horror, Mystery",5.6,56.0,While Sidney and her friends visit the Hollywo...,,[],...,0,0,0,0,0,0,0,0,0,0
2,The Chronicles of Riddick: Pitch Black,https://www.imdb.com/title/tt0134847/,2000.0,109.0,"Action, Horror, Sci-Fi",7.1,49.0,A commercial transport ship and its crew are m...,,[],...,0,0,1,0,0,0,0,0,0,0
3,28 Days,https://www.imdb.com/title/tt0191754/,2000.0,103.0,"Comedy, Drama",6.0,46.0,A big-city newspaper columnist is forced to en...,,[],...,0,0,0,0,0,0,0,0,0,0
4,The Whole Nine Yards,https://www.imdb.com/title/tt0190138/,2000.0,98.0,"Comedy, Crime",6.7,47.0,A struggling dentist's life is turned upside d...,,[],...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88345,Suraag: The Clue,https://www.imdb.com/title/tt0456800/,1999.0,60.0,Crime,8.8,,'SURAAG the clue' is a popular crime TV serial...,,[],...,0,0,0,0,0,0,0,0,0,0
88346,Omertà - Le dernier des hommes d'honneur,https://www.imdb.com/title/tt0149503/,1999.0,,"Crime, Drama",8.3,,Add a Plot,,[],...,0,0,0,0,0,0,0,0,0,0
88347,Ladies Room,https://www.imdb.com/title/tt0151281/,1999.0,90.0,"Comedy, Drama",4.6,,"Women wait in an ethereal room, perhaps dead i...",,[],...,0,0,0,0,0,0,0,0,0,0
88348,If You Believe,https://www.imdb.com/title/tt0222029/,1999.0,91.0,"Comedy, Drama, Fantasy",6.4,,A jaded book editor has nearly given up on hap...,,[],...,0,0,0,0,0,0,0,0,0,0


In [147]:
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88350 entries, 0 to 88349
Data columns (total 41 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         88350 non-null  object 
 1   link          88350 non-null  object 
 2   release_date  88350 non-null  float64
 3   duration      80090 non-null  float64
 4   genre         87081 non-null  object 
 5   imdb_rating   88350 non-null  float64
 6   metascore     6835 non-null   float64
 7   synopsis      88347 non-null  object 
 8   director      0 non-null      float64
 9   actors        88350 non-null  object 
 10  votes         88350 non-null  float64
 11  page_url      88350 non-null  object 
 12  Action        88350 non-null  int64  
 13  Adult         88350 non-null  int64  
 14  Adventure     88350 non-null  int64  
 15  Animation     88350 non-null  int64  
 16  Biography     88350 non-null  int64  
 17  Comedy        88350 non-null  int64  
 18  Crime         88350 non-nu

In [148]:
ProfileReport(imdb_df)

Summarize dataset: 100%|██████████| 81/81 [02:32<00:00,  1.88s/it, Completed]                         
Generate report structure: 100%|██████████| 1/1 [00:29<00:00, 29.28s/it]
Render HTML: 100%|██████████| 1/1 [00:04<00:00,  4.14s/it]




In [210]:
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVC


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error

In [171]:
imdb_df.columns

Index(['title', 'link', 'release_date', 'duration', 'genre', 'imdb_rating',
       'metascore', 'synopsis', 'director', 'actors', 'votes', 'page_url',
       'Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
       'Game-Show', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News',
       'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show',
       'Thriller', 'Unknown', 'War', 'Western'],
      dtype='object')

In [191]:
imdb_df

Unnamed: 0,title,link,release_date,duration,genre,imdb_rating,metascore,synopsis,director,actors,...,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,Unknown,War,Western
0,The Beach,https://www.imdb.com/title/tt0163978/,2000.0,119.0,"Adventure, Drama, Romance",6.6,43.0,"On vacation in Thailand, Richard sets out for ...",,[],...,0,1,0,0,0,0,0,0,0,0
1,Scream 3,https://www.imdb.com/title/tt0134084/,2000.0,116.0,"Horror, Mystery",5.6,56.0,While Sidney and her friends visit the Hollywo...,,[],...,0,0,0,0,0,0,0,0,0,0
2,The Chronicles of Riddick: Pitch Black,https://www.imdb.com/title/tt0134847/,2000.0,109.0,"Action, Horror, Sci-Fi",7.1,49.0,A commercial transport ship and its crew are m...,,[],...,0,0,1,0,0,0,0,0,0,0
3,28 Days,https://www.imdb.com/title/tt0191754/,2000.0,103.0,"Comedy, Drama",6.0,46.0,A big-city newspaper columnist is forced to en...,,[],...,0,0,0,0,0,0,0,0,0,0
4,The Whole Nine Yards,https://www.imdb.com/title/tt0190138/,2000.0,98.0,"Comedy, Crime",6.7,47.0,A struggling dentist's life is turned upside d...,,[],...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88345,Suraag: The Clue,https://www.imdb.com/title/tt0456800/,1999.0,60.0,Crime,8.8,,'SURAAG the clue' is a popular crime TV serial...,,[],...,0,0,0,0,0,0,0,0,0,0
88346,Omertà - Le dernier des hommes d'honneur,https://www.imdb.com/title/tt0149503/,1999.0,,"Crime, Drama",8.3,,Add a Plot,,[],...,0,0,0,0,0,0,0,0,0,0
88347,Ladies Room,https://www.imdb.com/title/tt0151281/,1999.0,90.0,"Comedy, Drama",4.6,,"Women wait in an ethereal room, perhaps dead i...",,[],...,0,0,0,0,0,0,0,0,0,0
88348,If You Believe,https://www.imdb.com/title/tt0222029/,1999.0,91.0,"Comedy, Drama, Fantasy",6.4,,A jaded book editor has nearly given up on hap...,,[],...,0,0,0,0,0,0,0,0,0,0


In [213]:
model_df = imdb_df.drop(['title', 'link', 'genre', 'page_url', 'synopsis', 'director', 'actors', 'metascore'], axis=1).dropna()
X = model_df.drop(['imdb_rating'], axis=1)
X = StandardScaler().fit_transform(X)
print(X.shape)
y = model_df['imdb_rating']

bins = [0, 3, 5, 7, 8, 9, 10]
group_names = [0, 3, 5, 7, 8, 9]
y = pd.cut(model_df['imdb_rating'], bins)
y = LabelEncoder().fit_transform(y)
# y = model_df['imdb_rating']

(80090, 32)


In [214]:
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88350 entries, 0 to 88349
Data columns (total 41 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         88350 non-null  object 
 1   link          88350 non-null  object 
 2   release_date  88350 non-null  float64
 3   duration      80090 non-null  float64
 4   genre         87081 non-null  object 
 5   imdb_rating   88350 non-null  float64
 6   metascore     6835 non-null   float64
 7   synopsis      88347 non-null  object 
 8   director      0 non-null      float64
 9   actors        88350 non-null  object 
 10  votes         88350 non-null  float64
 11  page_url      88350 non-null  object 
 12  Action        88350 non-null  int64  
 13  Adult         88350 non-null  int64  
 14  Adventure     88350 non-null  int64  
 15  Animation     88350 non-null  int64  
 16  Biography     88350 non-null  int64  
 17  Comedy        88350 non-null  int64  
 18  Crime         88350 non-nu

In [215]:
px.histogram(model_df['imdb_rating'])

In [216]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [217]:
y

array([2, 2, 3, ..., 1, 2, 4])

In [218]:
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model.score(X_test, y_test)

0.5678611561992758

In [219]:
model = SVC().fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
model = LinearRegression().fit(X_train, y_train)
np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

0.6473103384488575