## Setup

In [269]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
import movecolumn as mc
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae


## Prepare data

In [270]:
# Load the processed dataset and preview five randomly chosen rows.
data_path = '../../data/processed/data.csv'
df = pd.read_csv(data_path)
df.sample(5)

Unnamed: 0,name,genre,tomatometer_score,tomatometer_count,audience_score,audience_count,classification,runtime,release_year,original_language
128,The Hitman's Bodyguard,Action,0.43,228.0,0.67,25000.0,R,118.0,2017,English
630,Ad Astra,Sci-fi,0.83,398.0,0.4,10000.0,PG-13,124.0,2019,English
447,Elemental,Kids & family,0.74,257.0,0.93,2500.0,PG,103.0,2023,English
91,"Are You There God? It's Me, Margaret.",Comedy,0.99,212.0,0.95,250.0,PG-13,105.0,2023,English
87,Anna and the Apocalypse,Holiday,0.77,128.0,0.63,1000.0,R,92.0,2017,English


Now, we will drop the 'name' column, as it is not useful for the model.


In [271]:
# Remove the 'name' column; every other column is kept.
df = df.drop('name', axis=1)

In the 'original_language' column, we only need the main language, so we keep just the first word of each entry.

In [272]:
# Split each entry on spaces and keep only the first token.
language_tokens = df['original_language'].str.split(' ')
df['original_language'] = language_tokens.str[0]

Now, we perform one-hot encoding on the categorical columns.

In [273]:
# Replace each categorical column with one indicator column per category.
cols_to_onehot = ['original_language', 'genre', 'classification']
df = pd.get_dummies(data=df, columns=cols_to_onehot)
df

Unnamed: 0,tomatometer_score,tomatometer_count,audience_score,audience_count,runtime,release_year,original_language_Arabic,original_language_Bosnian,original_language_Chinese,original_language_Danish,...,genre_Western,classification_G,classification_NC-17,classification_Not Rated,classification_PG,classification_PG-13,classification_R,classification_TV14,classification_TVMA,classification_TVPG
0,0.74,23.0,0.40,100.0,98.0,2021,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1,1.00,61.0,0.73,250000.0,87.0,1940,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
2,0.64,58.0,0.60,250.0,113.0,2019,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,0.95,129.0,1.00,25.0,122.0,2023,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,0.89,472.0,0.88,25000.0,119.0,2019,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1210,0.98,202.0,0.89,1000.0,160.0,2020,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1211,1.00,75.0,0.95,100000.0,102.0,1952,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
1212,0.89,106.0,0.75,250000.0,88.0,1999,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
1213,0.92,48.0,0.87,25000.0,124.0,1972,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


We will move the target column ('audience_score') to the end of the dataframe and then fill the missing values in each column with that column's mean.

In [274]:
# Put the target column last, then replace every missing value with the mean
# of its column.
# NOTE(review): the means are computed over all rows, before the train/test
# split below, and the target column is imputed as well -- confirm that this
# is intended.
df = mc.MoveToLast(df, 'audience_score')
column_means = df.mean()
df = df.fillna(column_means)

## Data splitting

In [275]:
# Hold out 10% of the rows as the test set. A fixed random_state makes the
# split -- and every score computed from it -- reproducible across kernel
# restarts.
features = df.drop(columns='audience_score')
target = df['audience_score']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.1, random_state=42
)

## Data modeling

In [276]:
# Compare three candidate feature sets with 10-fold cross-validation on the
# training split and rank them by their average mean squared error.
kfold = KFold(n_splits=10, shuffle=False)

# NOTE(review): this estimator is not used in this cell (the cross-validation
# below builds its own LinearRegression(fit_intercept=False)); it is kept in
# case other cells rely on the name.
LinearRegressionModel = LinearRegression()

# Model using all features
first_model = X_train
first_model_test = X_test
first_model_name = 'All Features'

# Model without "tomatometer_count" and "tomatometer_score"
second_model = X_train.drop(columns=['tomatometer_count', 'tomatometer_score'])
second_model_test = X_test.drop(columns=['tomatometer_count', 'tomatometer_score'])
second_model_name = 'Without tomatometer_count and tomatometer_score'

# Model without "tomatometer_score"
third_model = X_train.drop(columns=['tomatometer_score'])
third_model_test = X_test.drop(columns=['tomatometer_score'])
third_model_name = 'Without tomatometer_score'

models_train = [first_model, second_model, third_model]
models_test = [first_model_test, second_model_test, third_model_test]
models_name = [first_model_name, second_model_name, third_model_name]

# The 'neg_mean_squared_error' scorer yields negated MSE values, so the
# absolute value of the fold average gives a positive MSE per feature set.
average_mse = [
    np.abs(
        cross_val_score(
            LinearRegression(fit_intercept=False),
            model,
            y_train,
            cv=kfold,
            scoring='neg_mean_squared_error',
        ).mean()
    )
    for model in models_train
]

# Keep the ranking in its own variable so that the prepared dataset `df` is
# not overwritten and the earlier cells that read `df` remain re-runnable.
data = [[name, score] for name, score in zip(models_name, average_mse)]
results_df = pd.DataFrame(data, columns=['Mô hình', 'MSE']).sort_values(['MSE'])
print(results_df)

                                           Mô hình       MSE
0                                     All Features  0.022629
1  Without tomatometer_count and tomatometer_score  0.032931
2                        Without tomatometer_score  0.035994


In [277]:
# Refit my_best_model on the whole training set, using the feature set with
# the lowest average cross-validation MSE.
best_model_index = np.argmin(average_mse)
best_train_features = models_train[best_model_index]
my_best_model = LinearRegression(fit_intercept=False)
my_best_model.fit(best_train_features, y_train)

In [278]:
# Predict on the test set with my_best_model and report the mean squared
# error (`mse` is the sklearn.metrics.mean_squared_error alias).
best_test_features = models_test[best_model_index]
y_hat_best = my_best_model.predict(best_test_features)
test_mse = mse(y_test, y_hat_best)
print("MSE trên tập kiểm tra: ", test_mse)

MSE trên tập kiểm tra:  0.02848013041570658
