In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../res/AirQualityUCI/AirQualityUCI.csv', sep=';', na_values=-200, decimal = ',')

names = ['date', 'time', 'co_gt', 'pt08_s1_co', 
         'nmhc_gt','c6h6_gt','pt08_s2_nmhc','nox_gt',
         'pt08_s3_nox','no2_gt','pt08_s4_no2','pt08_s5_o3',
         't','rh','ah','Unnamed_1','Unnamed_2']

df = df.rename(columns= {last_key: new_key for last_key, new_key in zip(df.keys(), names)})
df = df.drop(columns=['Unnamed_1', 'Unnamed_2', 'nmhc_gt'])
df = df.dropna(thresh=len(df.keys()))


In [3]:
df.info()
df.shape[0] - df.dropna().count()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6941 entries, 0 to 9356
Data columns (total 14 columns):
date            6941 non-null object
time            6941 non-null object
co_gt           6941 non-null float64
pt08_s1_co      6941 non-null float64
c6h6_gt         6941 non-null float64
pt08_s2_nmhc    6941 non-null float64
nox_gt          6941 non-null float64
pt08_s3_nox     6941 non-null float64
no2_gt          6941 non-null float64
pt08_s4_no2     6941 non-null float64
pt08_s5_o3      6941 non-null float64
t               6941 non-null float64
rh              6941 non-null float64
ah              6941 non-null float64
dtypes: float64(12), object(2)
memory usage: 813.4+ KB


date            0
time            0
co_gt           0
pt08_s1_co      0
c6h6_gt         0
pt08_s2_nmhc    0
nox_gt          0
pt08_s3_nox     0
no2_gt          0
pt08_s4_no2     0
pt08_s5_o3      0
t               0
rh              0
ah              0
dtype: int64

стоит убрать столбец `nmhc_gt` так как он не несет информативности и имеет пропуски в 90% случаях 

In [4]:
df['time'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%d/%m/%Y %H.%M.%S')
df = df.set_index('time').sort_index().drop(columns=['date'])
df.head()

Unnamed: 0_level_0,co_gt,pt08_s1_co,c6h6_gt,pt08_s2_nmhc,nox_gt,pt08_s3_nox,no2_gt,pt08_s4_no2,pt08_s5_o3,t,rh,ah
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2004-03-10 18:00:00,2.6,1360.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
2004-03-10 19:00:00,2.0,1292.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2004-03-10 20:00:00,2.2,1402.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
2004-03-10 21:00:00,2.2,1376.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
2004-03-10 22:00:00,1.6,1272.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888


## Base statistic

In [5]:
corr = pd.DataFrame(df.corr()['c6h6_gt'].drop('c6h6_gt'))
corr = corr[(np.abs(corr.c6h6_gt) > 0.5)]
corr

Unnamed: 0,c6h6_gt
co_gt,0.930008
pt08_s1_co,0.87743
pt08_s2_nmhc,0.982705
nox_gt,0.718344
pt08_s3_nox,-0.725722
no2_gt,0.603241
pt08_s4_no2,0.761805
pt08_s5_o3,0.861154


### Baseline model - linear regression without regularization

In [31]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [32]:
ss = StandardScaler()
x = ss.fit_transform(df.drop(columns=['c6h6_gt']))
y = ss.fit_transform(np.array(df['c6h6_gt']).reshape(-1, 1))

x_train, x_test, y_train, y_test=train_test_split(x ,y,test_size=0.3, random_state=42, shuffle= False)

In [33]:
lr=LinearRegression()
lr_model=lr.fit(x_train,y_train)

y_pred=lr_model.predict(x_test)
rmse=np.sqrt(mean_squared_error(y_test,y_pred))
print('RMSE of Linear model:',rmse)
print('R^2 of Linear model:', r2_score(y_test, y_pred))

RMSE of Linear model: 0.2719667472193049
R^2 of Linear model: 0.9074794915869189


In [9]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.cross_validation import cross_val_score

model = DecisionTreeRegressor()
model=model.fit(x_train,y_train)

y_pred = model.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_pred,y_test))
print('RMSE of Decision Tree Regression:', rmse)
print('R^2 of Decision Tree Regression:', r2_score(y_test, y_pred))

RMSE of Decision Tree Regression: 0.06102829931416377
R^2 of Decision Tree Regression: 0.9953412550856929




### Feature engineering -  basic transformations (nonlinear) 

In [44]:
from sklearn.preprocessing import PolynomialFeatures

ss = StandardScaler()
p_f = PolynomialFeatures(degree=2)

x = df.drop(columns=['c6h6_gt'])
x = p_f.fit_transform(df.drop(columns=['c6h6_gt']))
x = ss.fit_transform(x)
x = x.T[1::].T
y = np.array(df['c6h6_gt']).reshape(-1, 1)

x_train, x_test, y_train, y_test=train_test_split(x ,y,test_size=0.3, random_state=42,shuffle= False)

In [45]:
lr=LinearRegression()
lr_model=lr.fit(x_train,y_train)

y_pred=lr_model.predict(x_test)

rmse=np.sqrt(mean_squared_error(y_test,y_pred))

print('RMSE of Poly model:',rmse)
print('R^2 of Poly model:', r2_score(y_test, y_pred))

RMSE of Poly model: 0.09804678320095545
R^2 of Poly model: 0.9997842008590925


### Non-classic regression 

In [46]:
import numpy as np
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor as sgd
import matplotlib.pyplot as plt

svr_rbf = SVR(kernel='rbf', gamma=0.1)
svr_lin = SVR(kernel='linear')
svr_poly = SVR(kernel='poly',degree=2)
sgd = sgd()


y_rbf = svr_rbf.fit(x_train, y_train).predict(x_test)
y_lin = svr_lin.fit(x_train, y_train).predict(x_test)
y_poly = svr_poly.fit(x_train, y_train).predict(x_test)
y_sgd = sgd.fit(x_train, y_train).predict(x_test)



print("Mean Squared Error rbf:",mean_squared_error(y_rbf, y_test))
print("Mean Squared Error lin:",mean_squared_error(y_lin, y_test))
print("Mean Squared Error: poly",mean_squared_error(y_poly, y_test))
print("Mean Squared Error: sgd",mean_squared_error(y_sgd, y_test))

  y = column_or_1d(y, warn=True)


Mean Squared Error rbf: 20.892605567707786
Mean Squared Error lin: 0.01840895388824787
Mean Squared Error: poly 28.83003832765449
Mean Squared Error: sgd 0.29970887262861184




### Feature importance, hyperparameters tuning

In [29]:
from sklearn.svm import SVR
import numpy as np
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': 
              (
                  'linear', 
                  'rbf',
                  'poly'
              ), 
              'C':[1.5, 10],
              'gamma': [1e-7, 1e-4],
              'epsilon':[0.1,0.2]}
svr = SVR()
clf = GridSearchCV(svr, parameters)
clf.fit(x,y.ravel())
clf.best_params_

{'C': 10, 'epsilon': 0.2, 'gamma': 0.0001, 'kernel': 'rbf'}