In [1]:
import pandas as pd
import numpy as np
import sklearn


In [2]:
df = pd.read_csv('../res/AirQualityUCI/AirQualityUCI.csv', sep=';', na_values=-200, decimal = ',')

names = ['date', 'time', 'co_gt', 'pt08_s1_co', 
         'nmhc_gt','c6h6_gt','pt08_s2_nmhc','nox_gt',
         'pt08_s3_nox','no2_gt','pt08_s4_no2','pt08_s5_o3',
         't','rh','ah','Unnamed_1','Unnamed_2']

df = df.rename(columns= {last_key: new_key for last_key, new_key in zip(df.keys(), names)})
df = df.drop(columns=['Unnamed_1', 'Unnamed_2', 'nmhc_gt'])
df = df.dropna(thresh=len(df.keys()))


In [3]:
df.info()
df.shape[0] - df.dropna().count()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6941 entries, 0 to 9356
Data columns (total 14 columns):
date            6941 non-null object
time            6941 non-null object
co_gt           6941 non-null float64
pt08_s1_co      6941 non-null float64
c6h6_gt         6941 non-null float64
pt08_s2_nmhc    6941 non-null float64
nox_gt          6941 non-null float64
pt08_s3_nox     6941 non-null float64
no2_gt          6941 non-null float64
pt08_s4_no2     6941 non-null float64
pt08_s5_o3      6941 non-null float64
t               6941 non-null float64
rh              6941 non-null float64
ah              6941 non-null float64
dtypes: float64(12), object(2)
memory usage: 813.4+ KB


date            0
time            0
co_gt           0
pt08_s1_co      0
c6h6_gt         0
pt08_s2_nmhc    0
nox_gt          0
pt08_s3_nox     0
no2_gt          0
pt08_s4_no2     0
pt08_s5_o3      0
t               0
rh              0
ah              0
dtype: int64

стоит убрать столбец `nmhc_gt` так как он не несет информативности и имеет пропуски в 90% случаях 

In [4]:
df['time'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%d/%m/%Y %H.%M.%S')
df = df.set_index('time').sort_index().drop(columns=['date'])
df.head()

Unnamed: 0_level_0,co_gt,pt08_s1_co,c6h6_gt,pt08_s2_nmhc,nox_gt,pt08_s3_nox,no2_gt,pt08_s4_no2,pt08_s5_o3,t,rh,ah
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2004-03-10 18:00:00,2.6,1360.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
2004-03-10 19:00:00,2.0,1292.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2004-03-10 20:00:00,2.2,1402.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
2004-03-10 21:00:00,2.2,1376.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
2004-03-10 22:00:00,1.6,1272.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888


## Base statistic

In [5]:
corr = pd.DataFrame(df.corr()['c6h6_gt'].drop('c6h6_gt'))
corr = corr[(np.abs(corr.c6h6_gt) > 0.5)]
corr

Unnamed: 0,c6h6_gt
co_gt,0.930008
pt08_s1_co,0.87743
pt08_s2_nmhc,0.982705
nox_gt,0.718344
pt08_s3_nox,-0.725722
no2_gt,0.603241
pt08_s4_no2,0.761805
pt08_s5_o3,0.861154


### Baseline model - linear regression without regularization

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [7]:
x_train, x_test, y_train, y_test=train_test_split(df.drop(columns=['c6h6_gt']),
                                                  np.array(df['c6h6_gt']).reshape(-1, 1),
                                                  test_size=0.3, random_state=42, shuffle= False)
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)
y_train = ss.fit_transform(y_train)
y_test = ss.transform(y_test)



In [8]:
lr=LinearRegression()
lr_model=lr.fit(x_train,y_train)

y_pred=lr_model.predict(x_test)

rmse=np.sqrt(mean_squared_error(y_test,y_pred))
r2 = r2_score(y_test, y_pred)
r2_adjusted = 1 - (1 - r2)* ((x_test.shape[0] - 1) / (x_test.shape[0] - x_test.shape[1] - 1))

print('RMSE of Linear model:',rmse)
print('R^2 of Linear model:', r2)
print('adjusted R^2:', r2_adjusted)

RMSE of Linear model: 0.26512895351433285
R^2 of Linear model: 0.9074794915869183
adjusted R^2: 0.9069880741110401


In [9]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

model = DecisionTreeRegressor()
model=model.fit(x_train,y_train)

y_pred = model.predict(x_test)

rmse=np.sqrt(mean_squared_error(y_test,y_pred))
r2 = r2_score(y_test, y_pred)
r2_adjusted = 1 - (1 - r2)* ((x_test.shape[0] - 1) / (x_test.shape[0] - x_test.shape[1] - 1))

print('RMSE of Linear model:',rmse)
print('R^2 of Linear model:', r2)
print('adjusted R^2:', r2_adjusted)

RMSE of Linear model: 0.005237299703477898
R^2 of Linear model: 0.9999638973941962
adjusted R^2: 0.9999637056372364


In [10]:
feature_importances = pd.Series(data=list(lr_model.coef_.reshape(11)), 
                             index=list(df.drop(columns=['c6h6_gt']).keys()))
feature_importances.nlargest(len(df) - 1).plot(kind='barh', xlim = (-1), figsize=(15,8))

<matplotlib.axes._subplots.AxesSubplot at 0x11e843b90>

### Feature engineering -  basic transformations (nonlinear) 

In [11]:
from sklearn.preprocessing import PolynomialFeatures

x_train, x_test, y_train, y_test=train_test_split(df.drop(columns=['c6h6_gt']),
                                                  np.array(df['c6h6_gt']).reshape(-1, 1),
                                                  test_size=0.3, random_state=42, shuffle= False)

ss = StandardScaler()
p_f = PolynomialFeatures(degree=2)


x_train = p_f.fit_transform(x_train).T[1::].T
x_train = ss.fit_transform(x_train)

x_test = p_f.transform(x_test).T[1::].T
x_test = ss.transform(x_test)

y_train = ss.fit_transform(y_train)
y_test = ss.transform(y_test)

In [12]:
lr=LinearRegression()
lr_model=lr.fit(x_train,y_train)

y_pred=lr_model.predict(x_test)

rmse=np.sqrt(mean_squared_error(y_test,y_pred))
r2 = r2_score(y_test, y_pred)
r2_adjusted = 1 - (1 - r2)* ((x_test.shape[0] - 1) / (x_test.shape[0] - x_test.shape[1] - 1))

print('RMSE of Poly model:',rmse)
print('R^2 of Poly model:', r2)
print('adjusted R^2:', r2_adjusted)

RMSE of Poly model: 0.012804511308873766
R^2 of Poly model: 0.9997842008590925
adjusted R^2: 0.9997759133110377


### Feature importance, hyperparameters tuning

In [13]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

import numpy as np
from sklearn.svm import SVR
import matplotlib.pyplot as plt


x_train, x_test, y_train, y_test=train_test_split(df.drop(columns=['c6h6_gt']),
                                                  np.array(df['c6h6_gt']).reshape(-1, 1),
                                                  test_size=0.3, random_state=42, shuffle= False)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)
y_train = ss.fit_transform(y_train)
y_test = ss.transform(y_test)

parameters = {
    'kernel': ('linear', 'rbf','poly'),
    'C':[10**(x) for x in range(-2, 2, 1)],
    'gamma': [10**(x) for x in range(-4, 0, 1)],
    'epsilon':[10**x for x in range(-2, 2, 1)]
}

svr = SVR(gamma='scale')
clf = GridSearchCV(svr, parameters, cv = TimeSeriesSplit(n_splits=3), verbose=2)
clf.fit(x_train,y_train.ravel())

Fitting 3 folds for each of 192 candidates, totalling 576 fits
[CV] C=0.01, epsilon=0.01, gamma=0.0001, kernel=linear ...............
[CV]  C=0.01, epsilon=0.01, gamma=0.0001, kernel=linear, total=   0.1s
[CV] C=0.01, epsilon=0.01, gamma=0.0001, kernel=linear ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  C=0.01, epsilon=0.01, gamma=0.0001, kernel=linear, total=   0.3s
[CV] C=0.01, epsilon=0.01, gamma=0.0001, kernel=linear ...............
[CV]  C=0.01, epsilon=0.01, gamma=0.0001, kernel=linear, total=   0.5s
[CV] C=0.01, epsilon=0.01, gamma=0.0001, kernel=rbf ..................
[CV] ... C=0.01, epsilon=0.01, gamma=0.0001, kernel=rbf, total=   0.1s
[CV] C=0.01, epsilon=0.01, gamma=0.0001, kernel=rbf ..................
[CV] ... C=0.01, epsilon=0.01, gamma=0.0001, kernel=rbf, total=   0.5s
[CV] C=0.01, epsilon=0.01, gamma=0.0001, kernel=rbf ..................
[CV] ... C=0.01, epsilon=0.01, gamma=0.0001, kernel=rbf, total=   1.0s
[CV] C=0.01, epsilon=0.01, gamma=0.0001, kernel=poly .................
[CV] .. C=0.01, epsilon=0.01, gamma=0.0001, kernel=poly, total=   0.1s
[CV] C=0.01, epsilon=0.01, gamma=0.0001, kernel=poly .................
[CV] .. C=0.01, epsilon=0.01, gamma=0.0001, kernel=poly, total=   0.4s
[CV] C=0.01, epsilon=0.01, gamma=0.0001, kernel=poly .................
[CV] .

[CV] ...... C=0.01, epsilon=0.1, gamma=0.01, kernel=rbf, total=   0.5s
[CV] C=0.01, epsilon=0.1, gamma=0.01, kernel=poly ....................
[CV] ..... C=0.01, epsilon=0.1, gamma=0.01, kernel=poly, total=   0.2s
[CV] C=0.01, epsilon=0.1, gamma=0.01, kernel=poly ....................
[CV] ..... C=0.01, epsilon=0.1, gamma=0.01, kernel=poly, total=   0.6s
[CV] C=0.01, epsilon=0.1, gamma=0.01, kernel=poly ....................
[CV] ..... C=0.01, epsilon=0.1, gamma=0.01, kernel=poly, total=   1.1s
[CV] C=0.01, epsilon=0.1, gamma=0.1, kernel=linear ...................
[CV] .... C=0.01, epsilon=0.1, gamma=0.1, kernel=linear, total=   0.1s
[CV] C=0.01, epsilon=0.1, gamma=0.1, kernel=linear ...................
[CV] .... C=0.01, epsilon=0.1, gamma=0.1, kernel=linear, total=   0.1s
[CV] C=0.01, epsilon=0.1, gamma=0.1, kernel=linear ...................
[CV] .... C=0.01, epsilon=0.1, gamma=0.1, kernel=linear, total=   0.2s
[CV] C=0.01, epsilon=0.1, gamma=0.1, kernel=rbf ......................
[CV] .

[CV] . C=0.1, epsilon=0.01, gamma=0.0001, kernel=linear, total=   0.2s
[CV] C=0.1, epsilon=0.01, gamma=0.0001, kernel=linear ................
[CV] . C=0.1, epsilon=0.01, gamma=0.0001, kernel=linear, total=   0.6s
[CV] C=0.1, epsilon=0.01, gamma=0.0001, kernel=linear ................
[CV] . C=0.1, epsilon=0.01, gamma=0.0001, kernel=linear, total=   1.4s
[CV] C=0.1, epsilon=0.01, gamma=0.0001, kernel=rbf ...................
[CV] .... C=0.1, epsilon=0.01, gamma=0.0001, kernel=rbf, total=   0.2s
[CV] C=0.1, epsilon=0.01, gamma=0.0001, kernel=rbf ...................
[CV] .... C=0.1, epsilon=0.01, gamma=0.0001, kernel=rbf, total=   0.6s
[CV] C=0.1, epsilon=0.01, gamma=0.0001, kernel=rbf ...................
[CV] .... C=0.1, epsilon=0.01, gamma=0.0001, kernel=rbf, total=   0.9s
[CV] C=0.1, epsilon=0.01, gamma=0.0001, kernel=poly ..................
[CV] ... C=0.1, epsilon=0.01, gamma=0.0001, kernel=poly, total=   0.3s
[CV] C=0.1, epsilon=0.01, gamma=0.0001, kernel=poly ..................
[CV] .

[CV] ....... C=0.1, epsilon=0.1, gamma=0.01, kernel=rbf, total=   0.2s
[CV] C=0.1, epsilon=0.1, gamma=0.01, kernel=poly .....................
[CV] ...... C=0.1, epsilon=0.1, gamma=0.01, kernel=poly, total=   0.2s
[CV] C=0.1, epsilon=0.1, gamma=0.01, kernel=poly .....................
[CV] ...... C=0.1, epsilon=0.1, gamma=0.01, kernel=poly, total=   0.4s
[CV] C=0.1, epsilon=0.1, gamma=0.01, kernel=poly .....................
[CV] ...... C=0.1, epsilon=0.1, gamma=0.01, kernel=poly, total=   0.9s
[CV] C=0.1, epsilon=0.1, gamma=0.1, kernel=linear ....................
[CV] ..... C=0.1, epsilon=0.1, gamma=0.1, kernel=linear, total=   0.0s
[CV] C=0.1, epsilon=0.1, gamma=0.1, kernel=linear ....................
[CV] ..... C=0.1, epsilon=0.1, gamma=0.1, kernel=linear, total=   0.1s
[CV] C=0.1, epsilon=0.1, gamma=0.1, kernel=linear ....................
[CV] ..... C=0.1, epsilon=0.1, gamma=0.1, kernel=linear, total=   0.4s
[CV] C=0.1, epsilon=0.1, gamma=0.1, kernel=rbf .......................
[CV] .

[CV] ... C=1, epsilon=0.01, gamma=0.0001, kernel=linear, total=   0.5s
[CV] C=1, epsilon=0.01, gamma=0.0001, kernel=linear ..................
[CV] ... C=1, epsilon=0.01, gamma=0.0001, kernel=linear, total=   1.8s
[CV] C=1, epsilon=0.01, gamma=0.0001, kernel=linear ..................
[CV] ... C=1, epsilon=0.01, gamma=0.0001, kernel=linear, total=   4.5s
[CV] C=1, epsilon=0.01, gamma=0.0001, kernel=rbf .....................
[CV] ...... C=1, epsilon=0.01, gamma=0.0001, kernel=rbf, total=   0.1s
[CV] C=1, epsilon=0.01, gamma=0.0001, kernel=rbf .....................
[CV] ...... C=1, epsilon=0.01, gamma=0.0001, kernel=rbf, total=   0.4s
[CV] C=1, epsilon=0.01, gamma=0.0001, kernel=rbf .....................
[CV] ...... C=1, epsilon=0.01, gamma=0.0001, kernel=rbf, total=   1.0s
[CV] C=1, epsilon=0.01, gamma=0.0001, kernel=poly ....................
[CV] ..... C=1, epsilon=0.01, gamma=0.0001, kernel=poly, total=   0.1s
[CV] C=1, epsilon=0.01, gamma=0.0001, kernel=poly ....................
[CV] .

[CV] ........ C=1, epsilon=0.1, gamma=0.01, kernel=poly, total=   0.1s
[CV] C=1, epsilon=0.1, gamma=0.01, kernel=poly .......................
[CV] ........ C=1, epsilon=0.1, gamma=0.01, kernel=poly, total=   0.3s
[CV] C=1, epsilon=0.1, gamma=0.01, kernel=poly .......................
[CV] ........ C=1, epsilon=0.1, gamma=0.01, kernel=poly, total=   0.7s
[CV] C=1, epsilon=0.1, gamma=0.1, kernel=linear ......................
[CV] ....... C=1, epsilon=0.1, gamma=0.1, kernel=linear, total=   0.1s
[CV] C=1, epsilon=0.1, gamma=0.1, kernel=linear ......................
[CV] ....... C=1, epsilon=0.1, gamma=0.1, kernel=linear, total=   0.6s
[CV] C=1, epsilon=0.1, gamma=0.1, kernel=linear ......................
[CV] ....... C=1, epsilon=0.1, gamma=0.1, kernel=linear, total=   1.5s
[CV] C=1, epsilon=0.1, gamma=0.1, kernel=rbf .........................
[CV] .......... C=1, epsilon=0.1, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=1, epsilon=0.1, gamma=0.1, kernel=rbf .........................
[CV] .

[CV] .. C=10, epsilon=0.01, gamma=0.0001, kernel=linear, total=   2.6s
[CV] C=10, epsilon=0.01, gamma=0.0001, kernel=linear .................
[CV] .. C=10, epsilon=0.01, gamma=0.0001, kernel=linear, total=  12.9s
[CV] C=10, epsilon=0.01, gamma=0.0001, kernel=linear .................
[CV] .. C=10, epsilon=0.01, gamma=0.0001, kernel=linear, total=  33.2s
[CV] C=10, epsilon=0.01, gamma=0.0001, kernel=rbf ....................
[CV] ..... C=10, epsilon=0.01, gamma=0.0001, kernel=rbf, total=   0.3s
[CV] C=10, epsilon=0.01, gamma=0.0001, kernel=rbf ....................
[CV] ..... C=10, epsilon=0.01, gamma=0.0001, kernel=rbf, total=   0.3s
[CV] C=10, epsilon=0.01, gamma=0.0001, kernel=rbf ....................
[CV] ..... C=10, epsilon=0.01, gamma=0.0001, kernel=rbf, total=   0.8s
[CV] C=10, epsilon=0.01, gamma=0.0001, kernel=poly ...................
[CV] .... C=10, epsilon=0.01, gamma=0.0001, kernel=poly, total=   0.1s
[CV] C=10, epsilon=0.01, gamma=0.0001, kernel=poly ...................
[CV] .

[CV] ....... C=10, epsilon=0.1, gamma=0.01, kernel=poly, total=   0.2s
[CV] C=10, epsilon=0.1, gamma=0.01, kernel=poly ......................
[CV] ....... C=10, epsilon=0.1, gamma=0.01, kernel=poly, total=   0.4s
[CV] C=10, epsilon=0.1, gamma=0.1, kernel=linear .....................
[CV] ...... C=10, epsilon=0.1, gamma=0.1, kernel=linear, total=   0.7s
[CV] C=10, epsilon=0.1, gamma=0.1, kernel=linear .....................
[CV] ...... C=10, epsilon=0.1, gamma=0.1, kernel=linear, total=   3.5s
[CV] C=10, epsilon=0.1, gamma=0.1, kernel=linear .....................
[CV] ...... C=10, epsilon=0.1, gamma=0.1, kernel=linear, total=   7.0s
[CV] C=10, epsilon=0.1, gamma=0.1, kernel=rbf ........................
[CV] ......... C=10, epsilon=0.1, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=10, epsilon=0.1, gamma=0.1, kernel=rbf ........................
[CV] ......... C=10, epsilon=0.1, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=10, epsilon=0.1, gamma=0.1, kernel=rbf ........................
[CV] .

[CV] ........ C=10, epsilon=10, gamma=0.01, kernel=poly, total=   0.0s
[CV] C=10, epsilon=10, gamma=0.1, kernel=linear ......................
[CV] ....... C=10, epsilon=10, gamma=0.1, kernel=linear, total=   0.0s
[CV] C=10, epsilon=10, gamma=0.1, kernel=linear ......................
[CV] ....... C=10, epsilon=10, gamma=0.1, kernel=linear, total=   0.0s
[CV] C=10, epsilon=10, gamma=0.1, kernel=linear ......................
[CV] ....... C=10, epsilon=10, gamma=0.1, kernel=linear, total=   0.0s
[CV] C=10, epsilon=10, gamma=0.1, kernel=rbf .........................
[CV] .......... C=10, epsilon=10, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=10, epsilon=10, gamma=0.1, kernel=rbf .........................
[CV] .......... C=10, epsilon=10, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=10, epsilon=10, gamma=0.1, kernel=rbf .........................
[CV] .......... C=10, epsilon=10, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=10, epsilon=10, gamma=0.1, kernel=poly ........................
[CV] .

[Parallel(n_jobs=1)]: Done 576 out of 576 | elapsed:  6.1min finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
             error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.01, 0.1, 1, 10], 'epsilon': [0.01, 0.1, 1, 10],
                         'gamma': [0.0001, 0.001, 0.01, 0.1],
                         'kernel': ('linear', 'rbf', 'poly')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [16]:
best_model = clf.best_estimator_

y_pred=best_model.predict(x_test)

rmse=np.sqrt(mean_squared_error(y_test,y_pred))
r2 = r2_score(y_test, y_pred)
r2_adjusted = 1 - (1 - r2)* ((x_test.shape[0] - 1) / (x_test.shape[0] - x_test.shape[1] - 1))

print('RMSE of Best model:',rmse)
print('R^2 of Best model:', r2)
print('adjusted R^2:', r2_adjusted)

RMSE of Best model: 0.017458079785411536
R^2 of Best model: 0.9995988409046825
adjusted R^2: 0.9995967101707142
