In [1]:
import pandas as pd #Đọc xử lí dữ liệu từ dataframe 
from sklearn.model_selection import train_test_split #chia dữ liệu train_test
from sklearn.linear_model import LinearRegression #Xây dựng mô hình hồi quy tuyến tính 
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score #Đánh giá mô hình 
import numpy as np #Tính toán 

In [2]:
# Load the dataset from the CSV file "WineQT.csv" (path is relative to the
# notebook's working directory) into a DataFrame.
data = pd.read_csv("WineQT.csv")  

In [3]:
# Separate the dependent variable from the independent variables.
# Target: the 'quality' column.
y = data['quality']
# Features: every remaining column except 'quality' (the target) and 'Id'.
x = data.drop(columns=['quality', 'Id'])

In [4]:
# Split the data into a training set and a test set: 20% of the rows are held
# out for testing, and the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Build the linear regression model (default settings).
model = LinearRegression()
# Train the model on the training set. This call is the cell's last
# expression, so its return value is what the cell displays.
model.fit(x_train, y_train)


In [6]:
# Predict wine quality for the rows of the test set.
y_pred = model.predict(x_test)

# Round the continuous predictions to the nearest integer so they can be
# compared directly with the 'quality' labels.
y_pred_rounded = np.round(y_pred).astype(int)

# Wrap the rounded predictions in a one-column DataFrame that reuses the test
# set's index, so the rows line up when the frames are concatenated below.
y_pred_df = pd.DataFrame(
    {'Quality_predict': y_pred_rounded},
    index=x_test.index,
)

# Put features, actual quality and predicted quality side by side, then add
# the signed error (predicted - actual) as the 'Deviation' column.
result_compare = pd.concat(
    [x_test, y_test, y_pred_df],
    axis='columns',
).assign(
    Deviation=lambda frame: frame['Quality_predict'] - frame['quality'],
)

# Display the comparison table.
result_compare


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Quality_predict,Deviation
158,6.8,0.610,0.04,1.5,0.057,5.0,10.0,0.99525,3.42,0.60,9.500000,5,5,0
1081,6.9,0.840,0.21,4.1,0.074,16.0,65.0,0.99842,3.53,0.72,9.233333,6,5,-1
291,7.0,0.580,0.12,1.9,0.091,34.0,124.0,0.99560,3.44,0.48,10.500000,5,5,0
538,7.8,0.480,0.68,1.7,0.415,14.0,32.0,0.99656,3.09,1.06,9.100000,6,5,-1
367,12.5,0.600,0.49,4.3,0.100,5.0,14.0,1.00100,3.25,0.74,11.900000,6,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,5.0,1.020,0.04,1.4,0.045,41.0,85.0,0.99380,3.75,0.48,10.500000,4,5,1
328,10.3,0.500,0.42,2.0,0.069,21.0,51.0,0.99820,3.16,0.72,11.500000,6,6,0
67,6.8,0.775,0.00,3.0,0.102,8.0,23.0,0.99650,3.45,0.56,10.700000,5,5,0
231,10.0,0.490,0.20,11.0,0.071,13.0,50.0,1.00150,3.16,0.69,9.200000,6,5,-1


In [7]:
# Evaluate the model on the test set.
# NOTE(review): every metric below scores the *rounded* predictions
# (y_pred_rounded), not the raw regression output y_pred -- confirm this is
# the intended evaluation target.

# Mean squared error, and its square root (same units as 'quality').
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_rounded)
rmse = np.sqrt(mse)

# Mean absolute error.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_rounded)

# Mean absolute percentage error (scikit-learn returns it as a fraction,
# not multiplied by 100).
mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred_rounded)

# Coefficient of determination (R-squared).
r2 = r2_score(y_true=y_test, y_pred=y_pred_rounded)

# Report all metrics, one per line.
print('Mean Squared Error (MSE):', mse)
print('Root Mean Squared Error (RMSE):', rmse)
print('Mean Absolute Error (MAE):', mae)
print('Mean Absolute Percentage Error (MAPE):', mape)
print('R-squared (R2):', r2)


Mean Squared Error (MSE): 0.4759825327510917
Root Mean Squared Error (RMSE): 0.6899148735540435
Mean Absolute Error (MAE): 0.39737991266375544
Mean Absolute Percentage Error (MAPE): 0.07235911831981702
R-squared (R2): 0.14464395860461976


In [8]:
# Show the fitted regression coefficient (weight) of each independent
# variable, one row per feature column of the training set.
pd.DataFrame(data=model.coef_, index=x_train.columns)

Unnamed: 0,0
fixed acidity,0.051343
volatile acidity,-1.336257
citric acid,-0.339101
residual sugar,0.004121
chlorides,-1.80653
free sulfur dioxide,0.001881
total sulfur dioxide,-0.002283
density,-30.653256
pH,-0.249236
sulphates,0.97304
