In [9]:
import pandas as pd
df_data = pd.read_csv("/kaggle/input/imdb-title-with-person-name/final_rating_table/final_rating_table.tsv",sep='\t')
df_data.head()

Unnamed: 0,tconst,averageRating,language,titleType,primaryTitle,isAdult,startYear,runtimeMinutes,genres,director,editor,writer,actor_actress
0,tt0000005,6.2,en,short,Blacksmith Scene,0,1893,1,"Comedy,Short",william k.l. dickson,Unknown,Unknown,"charles kayser,john ott"
1,tt0000010,6.9,sv,short,Leaving the Factory,0,1895,1,"Documentary,Short",louis lumière,Unknown,Unknown,Unknown
2,tt0000012,7.4,tr,short,The Arrival of a Train,0,1896,1,"Documentary,Short","auguste lumière,louis lumière",Unknown,Unknown,Unknown
3,tt0000013,5.7,en,short,The Photographical Congress Arrives in Lyon,0,1895,1,"Documentary,Short",louis lumière,Unknown,Unknown,Unknown
4,tt0000014,7.1,sv,short,The Waterer Watered,0,1895,1,"Comedy,Short",louis lumière,Unknown,Unknown,"françois clerc,benoît duval"


In [10]:
df_data.replace('Unknown', 'other', inplace=True)
df_data.drop('editor', axis=1, inplace=True)
df_data["genres"] = df_data["genres"].str.lower()
df_data['runtimeMinutes'] = pd.to_numeric(df_data['runtimeMinutes'], errors='coerce')
df_data['startYear'] = pd.to_numeric(df_data['startYear'], errors='coerce')
df_data.head()

Unnamed: 0,tconst,averageRating,language,titleType,primaryTitle,isAdult,startYear,runtimeMinutes,genres,director,writer,actor_actress
0,tt0000005,6.2,en,short,Blacksmith Scene,0,1893.0,1.0,"comedy,short",william k.l. dickson,other,"charles kayser,john ott"
1,tt0000010,6.9,sv,short,Leaving the Factory,0,1895.0,1.0,"documentary,short",louis lumière,other,other
2,tt0000012,7.4,tr,short,The Arrival of a Train,0,1896.0,1.0,"documentary,short","auguste lumière,louis lumière",other,other
3,tt0000013,5.7,en,short,The Photographical Congress Arrives in Lyon,0,1895.0,1.0,"documentary,short",louis lumière,other,other
4,tt0000014,7.1,sv,short,The Waterer Watered,0,1895.0,1.0,"comedy,short",louis lumière,other,"françois clerc,benoît duval"


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [31]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258027 entries, 0 to 258026
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          258027 non-null  object 
 1   averageRating   258027 non-null  float64
 2   language        258027 non-null  object 
 3   titleType       258027 non-null  object 
 4   primaryTitle    258027 non-null  object 
 5   isAdult         258027 non-null  int64  
 6   startYear       258003 non-null  float64
 7   runtimeMinutes  195847 non-null  float64
 8   genres          258027 non-null  object 
 9   director        258027 non-null  object 
 10  writer          258027 non-null  object 
 11  actor_actress   258027 non-null  object 
dtypes: float64(3), int64(1), object(8)
memory usage: 23.6+ MB


In [32]:
X = df_data[['language', 'titleType', 'isAdult','startYear', 'runtimeMinutes', 'genres', 'director', 'writer', 'actor_actress']]
y = df_data['averageRating']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
categorical_features = ['language', 'titleType', 'genres', 'director', 'writer', 'actor_actress']
numeric_features = ['isAdult', 'runtimeMinutes','startYear']

In [48]:
# Create transformers for numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values with median
#     ('imputer', SimpleImputer(strategy='median')),  
    ('scaler', StandardScaler())  # Standardize numeric features
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [49]:
# Combine transformers for all features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [50]:
# Step 3: Model Selection (Linear Regression)
# Step 4: Create a pipeline for preprocessing and modeling
model = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', LinearRegression())])

In [51]:
# Step 5: Model Training
model.fit(X_train, y_train)

In [52]:
# Step 6: Model Evaluation
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'Mean Squared Error: {mse:.2f}')
print(f'Mean Absolute Error: {mae:.2f}')

Mean Squared Error: 1.54
Mean Absolute Error: 0.90


In [53]:
from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2:.2f}')

R-squared: 0.24


In [54]:
import joblib
joblib.dump(model, 'imdb_LR_model_v1.joblib')

['imdb_LR_model_v1.joblib']

# **Improve**

In [56]:
df_exploded = df_data.assign(genres=df['genres'].str.split(',')).explode('genres')
df_exploded = df_exploded.assign(director=df['director'].str.split(',')).explode('director')
df_exploded = df_exploded.assign(writer=df['writer'].str.split(',')).explode('writer')
df_exploded = df_exploded.assign(actor_actress=df['actor_actress'].str.split(',')).explode('actor_actress')
df_exploded = df_exploded.assign(titleType=df['titleType'].str.split(',')).explode('titleType')
df_exploded

Unnamed: 0,tconst,averageRating,language,titleType,primaryTitle,isAdult,startYear,runtimeMinutes,genres,director,writer,actor_actress
0,tt0000005,6.2,en,short,Blacksmith Scene,0,1893.0,1.0,comedy,william k.l. dickson,other,charles kayser
0,tt0000005,6.2,en,short,Blacksmith Scene,0,1893.0,1.0,comedy,william k.l. dickson,other,john ott
0,tt0000005,6.2,en,short,Blacksmith Scene,0,1893.0,1.0,short,william k.l. dickson,other,charles kayser
0,tt0000005,6.2,en,short,Blacksmith Scene,0,1893.0,1.0,short,william k.l. dickson,other,john ott
1,tt0000010,6.9,sv,short,Leaving the Factory,0,1895.0,1.0,documentary,louis lumière,other,other
...,...,...,...,...,...,...,...,...,...,...,...,...
258025,tt9916720,5.1,bg,short,The Nun 2,0,2019.0,10.0,horror,niclas schmidt,other,teddington
258025,tt9916720,5.1,bg,short,The Nun 2,0,2019.0,10.0,mystery,niclas schmidt,other,teddington
258026,tt9916766,6.7,es,tvEpisode,Episode #10.15,0,2019.0,43.0,family,other,other,other
258026,tt9916766,6.7,es,tvEpisode,Episode #10.15,0,2019.0,43.0,game-show,other,other,other


In [57]:
X = df_exploded[['language', 'titleType', 'isAdult','startYear', 'runtimeMinutes', 'genres', 'director', 'writer', 'actor_actress']]
y = df_exploded['averageRating']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [58]:
# Step 5: Model Training
model.fit(X_train, y_train)

In [59]:
# Step 6: Model Evaluation
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'Mean Squared Error: {mse:.2f}')
print(f'Mean Absolute Error: {mae:.2f}')

Mean Squared Error: 0.43
Mean Absolute Error: 0.42


In [60]:
from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2:.2f}')

R-squared: 0.78


In [61]:
import joblib
joblib.dump(model, 'imdb_LR_model_v2.joblib')

['imdb_LR_model_v2.joblib']