In [None]:
import pickle
from pathlib import Path

import joblib
import matplotlib
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [None]:
# Location of the training CSV, relative to the notebook's working directory.
data_path = 'data/train.csv'

# Column labels applied to the loaded frame, in file order.
columns = ['Time', 'OZONE', 'NO2', 'temp', 'humidity', 'no2op1', 'no2op2', 'o3op1', 'o3op2']

# pandas rejects a boolean `header` (header=True raises TypeError), so the row
# index of the header line is given explicitly. With header=0 the first line of
# the file is treated as the header and replaced by `names`.
# NOTE(review): assumes the CSV starts with a header row; if it does not,
# use header=None instead so the first data row is not dropped.
raw = pd.read_csv(data_path, header=0, names=columns)

In [None]:
# Quick sanity check: show the first five rows of the loaded frame as plain text.
print(raw.head(n=5))

                  Time  OZONE     NO2  temp  humidity  no2op1  no2op2  o3op1  \
0  2019-03-27 13:01:00  77.59   6.881  36.2      38.9   199.0   200.0  240.0   
1  2019-03-27 13:03:00  78.71  11.057  36.3      37.7   196.0   200.0  237.0   
2  2019-03-27 13:04:00  78.85   8.596  36.7      38.0   195.0   199.0  235.0   
3  2019-03-27 13:07:00  79.27   7.248  37.0      37.5   193.0   198.0  233.0   
4  2019-03-27 13:08:00  80.01   8.638  36.8      36.8   191.0   198.0  231.0   

   o3op2  
0  197.0  
1  196.0  
2  196.0  
3  195.0  
4  195.0  


In [10]:
# Feature matrix: the four columns fed to both regressions below.
X_train = raw.loc[:, ['no2op1', 'no2op2', 'o3op1', 'o3op2']]
print(X_train.head(n=5))

   no2op1  no2op2  o3op1  o3op2
0   199.0   200.0  240.0  197.0
1   196.0   200.0  237.0  196.0
2   195.0   199.0  235.0  196.0
3   193.0   198.0  233.0  195.0
4   191.0   198.0  231.0  195.0


In [11]:
# Ozone target, kept as a single-column DataFrame (2-D) rather than a Series;
# the later `intercept_[0]` / `coef_[0]` indexing relies on this shape.
Y_train_ozone = raw.loc[:, ['OZONE']]
print(Y_train_ozone.head(n=5))


   OZONE
0  77.59
1  78.71
2  78.85
3  79.27
4  80.01


In [None]:
# NO2 target, kept as a single-column DataFrame (2-D) rather than a Series;
# the later `intercept_[0]` / `coef_[0]` indexing relies on this shape.
Y_train_no2 = raw.loc[:, ['NO2']]
print(Y_train_no2.head(n=5))

      NO2
0   6.881
1  11.057
2   8.596
3   7.248
4   8.638


In [33]:
# Initialise the linear model for ozone and fit it on the training features.
lr_ozone = LinearRegression()
lr_ozone.fit(X_train, Y_train_ozone)

# Render the fitted model as a readable equation: the intercept first, then one
# signed term per feature. The target is 2-D, so the intercept and coefficient
# arrays carry a leading axis that is indexed with [0].
o3_terms = [f"O3 ≈ {lr_ozone.intercept_[0]:.4f}"]
for feature, coeff in zip(X_train.columns, lr_ozone.coef_[0]):
    o3_terms.append(
        f" + {coeff:.4f} * {feature}"
        if coeff >= 0
        else f" - {abs(coeff):.4f} * {feature}"
    )
formulate_o3 = "".join(o3_terms)
print(formulate_o3)


O3 ≈ 10.9386 - 1.7559 * no2op1 + 1.1150 * no2op2 + 1.5643 * o3op1 - 0.9241 * o3op2


In [34]:
# Initialise the linear model for NO2 and fit it on the training features.
lr_no2 = LinearRegression()
lr_no2.fit(X_train, Y_train_no2)

# Render the fitted model as a readable equation: the intercept first, then one
# signed term per feature. The target is 2-D, so the intercept and coefficient
# arrays carry a leading axis that is indexed with [0].
no2_terms = [f"NO2 ≈ {lr_no2.intercept_[0]:.4f}"]
for feature, coeff in zip(X_train.columns, lr_no2.coef_[0]):
    no2_terms.append(
        f" + {coeff:.4f} * {feature}"
        if coeff >= 0
        else f" - {abs(coeff):.4f} * {feature}"
    )
formulate_no2 = "".join(no2_terms)
print(formulate_no2)
    

NO2 ≈ 43.4556 + 0.8547 * no2op1 - 2.4247 * no2op2 - 0.0119 * o3op1 + 1.4628 * o3op2


In [24]:
# Mean absolute error of the ozone model on the rows it was fitted on.
# This is a training-set score, not a held-out estimate.
y_pred_ozone = lr_ozone.predict(X_train)
mae_ozone_train = mean_absolute_error(y_true=Y_train_ozone, y_pred=y_pred_ozone)
print('MAE train ozone: {}'.format(mae_ozone_train))

MAE train ozone: 5.6259377355149365


In [35]:
# Mean absolute error of the NO2 model on the rows it was fitted on.
# This is a training-set score, not a held-out estimate.
y_pred_no2 = lr_no2.predict(X_train)
mae_no2_train = mean_absolute_error(y_true=Y_train_no2, y_pred=y_pred_no2)
print('MAE train no2: {}'.format(mae_no2_train))

MAE train no2: 6.5401000938427885


In [37]:
# Bundle both fitted regressors so they are persisted and reloaded together.
linear_models = {
    'ozone_model': lr_ozone,
    'no2_model': lr_no2
}

# Save the dictionary to a .pkl file.
model_save_path = 'models/linear_model.pkl'
try:
    # joblib.dump does not create missing folders, so make sure the target
    # directory exists first; otherwise the save fails on a fresh checkout.
    Path(model_save_path).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(linear_models, model_save_path)
    print(f"\nĐã lưu các mô hình Linear Regression vào file: {model_save_path}")
except Exception as e:
    # Best-effort save: report the failure rather than aborting the notebook run.
    print(f"Lỗi khi lưu mô hình: {e}")



Đã lưu các mô hình Linear Regression vào file: models/linear_model.pkl


In [39]:
# Print the NO2 estimator's text representation as a final sanity check.
print(str(lr_no2))

LinearRegression()
