In [25]:
import pandas as pd
import numpy as np
import matplotlib
from sklearn.linear_model import LinearRegression, HuberRegressor, Ridge, Lasso
from sklearn.metrics import mean_absolute_error
import pickle

In [2]:
data_path = 'data/train.csv'

columns = ['Time', 'OZONE', 'NO2', 'temp', 'humidity', 'no2op1', 'no2op2', 'o3op1', 'o3op2']

raw = pd.read_csv(data_path, header=0, names=columns)

In [3]:
print(raw.head())

                  Time  OZONE     NO2  temp  ...  no2op1  no2op2  o3op1  o3op2
0  2019-03-27 13:01:00  77.59   6.881  36.2  ...   199.0   200.0  240.0  197.0
1  2019-03-27 13:03:00  78.71  11.057  36.3  ...   196.0   200.0  237.0  196.0
2  2019-03-27 13:04:00  78.85   8.596  36.7  ...   195.0   199.0  235.0  196.0
3  2019-03-27 13:07:00  79.27   7.248  37.0  ...   193.0   198.0  233.0  195.0
4  2019-03-27 13:08:00  80.01   8.638  36.8  ...   191.0   198.0  231.0  195.0

[5 rows x 9 columns]


In [4]:
X_train = raw[['no2op1', 'no2op2', 'o3op1', 'o3op2']]
print(X_train.head())   

   no2op1  no2op2  o3op1  o3op2
0   199.0   200.0  240.0  197.0
1   196.0   200.0  237.0  196.0
2   195.0   199.0  235.0  196.0
3   193.0   198.0  233.0  195.0
4   191.0   198.0  231.0  195.0


In [5]:
Y_train_ozone = raw[['OZONE']]
print(Y_train_ozone.head())


   OZONE
0  77.59
1  78.71
2  78.85
3  79.27
4  80.01


In [6]:
Y_train_no2 = raw[['NO2']]
print(Y_train_no2.head())

      NO2
0   6.881
1  11.057
2   8.596
3   7.248
4   8.638


In [7]:
#Khởi tạo linear model mặc định
lr_ozone = LinearRegression()

lr_ozone.fit(X_train, Y_train_ozone)


In [8]:
lr_no2 = LinearRegression()

lr_no2.fit(X_train, Y_train_no2)

In [None]:
y_pred_ozone = lr_ozone.predict(X_train)
mae_ozone_train = mean_absolute_error(Y_train_ozone, y_pred_ozone)
print(f'MAE train ozone using default Linear Regression: {mae_ozone_train:.2f}')

MAE train ozone using Linear Regression: 5.63


In [None]:
y_pred_no2 = lr_no2.predict(X_train)
mae_no2_train = mean_absolute_error(Y_train_no2, y_pred_no2)
print(f'MAE train no2 using default Linear Regression: {mae_no2_train:.2f}')

MAE train no2 using Linear Regression: 6.54


In [19]:
# Khởi tạo SDGRegressor để thử qua các hàm mất mát khác nhau
from sklearn.linear_model import SGDRegressor
losses = ['huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
results = {}

for loss in losses:
    model = SGDRegressor(loss=loss, max_iter=10000, tol=1e-3, random_state=42)
    model.fit(X_train, Y_train_ozone.values.ravel())
    y_pred = model.predict(X_train)
    mae = mean_absolute_error(Y_train_ozone, y_pred)
    results[loss] = mae

print("MAE cho các loss function đối với Ozone:")
for loss, mae in results.items():
    print(f"{loss:<25}: {mae:.4f}")

print("\n\n")    
    
for loss in losses:
    model = SGDRegressor(loss=loss, epsilon=0.1, max_iter=10000, tol=1e-3, random_state=42)
    model.fit(X_train, Y_train_no2.values.ravel())
    y_pred = model.predict(X_train)
    mae = mean_absolute_error(Y_train_no2, y_pred)
    results[loss] = mae
    
print("MAE cho các loss function đối với NO2:")
for loss, mae in results.items():
    print(f"{loss:<25}: {mae:.4f}")

MAE cho các loss function đối với Ozone:
huber                    : 6.1749
epsilon_insensitive      : 28.8156
squared_epsilon_insensitive: 22112138439618.3633



MAE cho các loss function đối với NO2:
huber                    : 6.5398
epsilon_insensitive      : 6.7073
squared_epsilon_insensitive: 7531384844444.8174


In [None]:
linear_models = {
    'ozone_model': lr_ozone,
    'no2_model': lr_no2
}

# Lưu dictionary vào file pkl
model_save_path = 'models/linear_model.pkl'
try:
    with open(model_save_path, 'wb') as f:
        pickle.dump(linear_models, f)
    print(f"\nĐã lưu các mô hình Linear Regression vào file: {model_save_path}")
except Exception as e:
    print(f"Lỗi khi lưu mô hình: {e}")



Đã lưu các mô hình Linear Regression vào file: models/linear_model.pkl


: 

In [None]:
# Sử dụng kỹ thuật Ridge
ridge_o3 = Ridge(alpha=0.1)
ridge_o3.fit(X_train, Y_train_ozone)
y_pred_ridge_o3 = ridge_o3.predict(X_train)
mae_ridge_train = mean_absolute_error(Y_train_ozone, y_pred_ridge_o3)
print(f'MAE train ozone using Ridge Regression: {mae_ridge_train:.2f}')

print("\n")
ridge_no2 = Ridge(alpha=1.0)
ridge_no2.fit(X_train, Y_train_no2)
y_pred_ridge_no2 = ridge_no2.predict(X_train)
mae_ridge_train_no2 = mean_absolute_error(Y_train_no2, y_pred_ridge_no2)
print(f'MAE train no2 using Ridge Regression: {mae_ridge_train_no2:.2f}')


MAE train ozone using Ridge Regression: 5.63


MAE train no2 using Ridge Regression: 6.54


: 

In [None]:
# Sử dụng kỹ thuật Lasso
lasso_o3 = Lasso(alpha=0.1)
lasso_o3.fit(X_train, Y_train_ozone)
y_pred_lasso_o3 = lasso_o3.predict(X_train)
mae_lasso_train = mean_absolute_error(Y_train_ozone, y_pred_lasso_o3)
print(f'MAE train ozone using Lasso Regression: {mae_lasso_train:.2f}')

print("\n")

lasso_no2 = Lasso(alpha=0.1)
lasso_no2.fit(X_train, Y_train_no2)
y_pred_lasso_no2 = lasso_no2.predict(X_train)
mae_lasso_train_no2 = mean_absolute_error(Y_train_no2, y_pred_lasso_no2)
print(f'MAE train no2 using Lasso Regression: {mae_lasso_train_no2:.2f}')


  model = cd_fast.enet_coordinate_descent(


MAE train ozone using Lasso Regression: 5.63


MAE train no2 using Lasso Regression: 6.53


  model = cd_fast.enet_coordinate_descent(
