## Setup

In [51]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
import movecolumn as mc
from sklearn.metrics import mean_squared_error as mse


## Prepare data

In [52]:
# Load the processed dataset from disk and preview five randomly chosen rows.
data_path = '../../data/processed/data.csv'
df = pd.read_csv(data_path)
df.sample(n=5)

Unnamed: 0,name,genre,tomatometer_score,tomatometer_count,audience_score,audience_count,classification,runtime,release_year,original_language
160,Turning Red,Kids & family,0.95,287.0,0.68,5000.0,PG,100.0,2022,English
666,Till,Drama,0.96,192.0,0.97,500.0,PG-13,130.0,2022,English
645,Spider-Man,Action,0.9,249.0,0.67,250000.0,PG-13,116.0,2002,English
1105,Bullet Train,Action,0.54,340.0,0.76,5000.0,R,126.0,2022,English
133,Beau Is Afraid,Adventure,0.67,260.0,0.71,250.0,R,179.0,2023,English


Now, we will drop the 'name' column, as it is not useful for the model.


In [53]:
# Remove the 'name' column so it is not carried into the feature matrix.
df = df.drop('name', axis=1)

In the 'original_language' column, we keep only the first word, which is the main language.

In [54]:
# Split each entry on spaces and keep only the first token.
language_tokens = df['original_language'].str.split(' ')
df['original_language'] = language_tokens.str.get(0)

Now, we perform one-hot encoding on the categorical columns.

In [55]:
# Categorical columns that are expanded into one indicator column per category.
cols_to_onehot = [
    'original_language',
    'genre',
    'classification',
]
df = pd.get_dummies(data=df, columns=cols_to_onehot)


We will move the target column to the end of the dataframe and then fill the missing values with the mean of each column.

In [56]:
# Put the target column last so features and target are easy to tell apart.
df = mc.MoveToLast(df, 'audience_score')

# A row without a target value cannot be used for supervised training or
# evaluation; imputing the label with the mean would fabricate ground truth,
# so such rows are dropped instead of filled.
df = df.dropna(subset=['audience_score'])

# Fill the remaining gaps in the feature columns with each column's mean.
# Reassigning (rather than inplace=True) keeps the cell free of hidden
# in-place mutation.
# NOTE(review): the means are computed before the train/test split, so test
# rows influence the imputed values; consider imputing after the split using
# training-set statistics only.
feature_means = df.drop(columns='audience_score').mean()
df = df.fillna(feature_means)

## Data splitting

In [57]:
# Separate the features from the target, then hold out 10% of the rows for
# the final test. The seed is fixed so the same rows land in the test set on
# every run (Restart Kernel -> Run All reproduces the reported metrics).
X = df.drop(columns='audience_score')
y = df['audience_score']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

## Data modeling
**Model 1**: Linear Regression
- Reason: We want to predict the audience score, which is a continuous variable. Therefore, we will use linear regression.

In [58]:
# 10-fold cross-validation with shuffling. The seed is fixed so that every
# candidate feature set below is scored on exactly the same folds (without
# it, each cross_val_score call would reshuffle and the comparison would be
# made on different splits) and so results are reproducible across runs.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Not used in this cell; kept so that any other cell referring to this name
# keeps working.
LinearRegressionModel = LinearRegression()

# Candidate 1: use every feature
first_model = X_train
first_model_test = X_test
first_model_name = 'All Features'

# Candidate 2: leave out "tomatometer_count" and "tomatometer_score"
second_model = X_train.drop(columns=['tomatometer_count', 'tomatometer_score'])
second_model_test = X_test.drop(columns=['tomatometer_count', 'tomatometer_score'])
second_model_name = 'Without tomatometer_count and tomatometer_score'

# Candidate 3: leave out "tomatometer_score" only
third_model = X_train.drop(columns=['tomatometer_score'])
third_model_test = X_test.drop(columns=['tomatometer_score'])
third_model_name = 'Without tomatometer_score'

# Parallel lists: position i in each list describes the same candidate.
models_train = [first_model, second_model, third_model]
models_test = [first_model_test, second_model_test, third_model_test]
models_name = [first_model_name, second_model_name, third_model_name]

# Mean cross-validated MSE per candidate. The 'neg_mean_squared_error' scorer
# reports the negated MSE, so the sign is flipped back to a positive error.
average_mse = [
    -cross_val_score(
        LinearRegression(fit_intercept=False),
        features,
        y_train,
        cv=kfold,
        scoring='neg_mean_squared_error',
    ).mean()
    for features in models_train
]

# Summary table, best (lowest MSE) candidate first.
data = [[name, score] for name, score in zip(models_name, average_mse)]
res_df = pd.DataFrame(data, columns=['Mô hình', 'MSE']).sort_values('MSE')
res_df

In [59]:
# Huấn luyện lại mô hình my_best_model trên toàn bộ tập huấn luyện
best_model_index = np.argmin(average_mse)
my_best_model = LinearRegression(fit_intercept=False).fit(models_train[best_model_index], y_train)

In [60]:
# Gọi hàm MAE (tự cài đặt hoặc từ thư viện) trên tập kiểm tra với mô hình my_best_model
y_hat_best = my_best_model.predict(models_test[best_model_index])
print("MSE trên tập kiểm tra: ", mse(y_test, y_hat_best))

MSE trên tập kiểm tra:  0.026695296081372874
