In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet

In [2]:
concrete_frame = pd.read_csv('concrete_preprocessed.csv',index_col=0)
concrete_frame.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Concrete compressive strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


In [3]:
concrete_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Cement                         1030 non-null   float64
 1   Blast Furnace Slag             1030 non-null   float64
 2   Fly Ash                        1030 non-null   float64
 3   Water                          1030 non-null   float64
 4   Superplasticizer               1030 non-null   float64
 5   Coarse Aggregate               1030 non-null   float64
 6   Fine Aggregate                 1030 non-null   float64
 7   Age                            1030 non-null   int64  
 8   Concrete compressive strength  1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 80.5 KB


In [4]:
x = concrete_frame.iloc[:,:-1]
y = concrete_frame.iloc[:,-1]

In [5]:
x.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360


In [6]:
y.head()

0    79.986111
1    61.887366
2    40.269535
3    41.052780
4    44.296075
Name: Concrete compressive strength, dtype: float64

In [7]:
(x_train, x_test, y_train, y_test) = train_test_split(x, y,test_size = 0.2, random_state = 0)

In [8]:
scaler = StandardScaler()
scaler.fit(x_train, y_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

## Линейная регрессия

Осуществляется с помощью sklearn.linear_model.LinearRegression.

Ссылка на документацию: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

Обучаем регрессор LineatRegression

In [9]:
lr = LinearRegression(normalize=True,n_jobs=-1)
lr.fit(x_train, y_train)
y_pred_lr = lr.predict(x_test)
print(r2_score(y_test, y_pred_lr))
print(mean_squared_error(y_test, y_pred_lr))
print(mean_squared_error(y_test, y_pred_lr, squared=False))
print(mean_absolute_error(y_test, y_pred_lr))

0.6368981103411095
95.6353348269099
9.779332023554058
7.865298605808521


Модель показывает не хороший результат. Так же нельзя подобрать гиперпараметры чтобы улучшить её.

## Регрессия дерева решений

Осуществляется с помощью sklearn.tree.DecisionTreeRegressor.

Ссылка на документацию: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html



In [10]:
dtr = DecisionTreeRegressor()
dtr.fit(x_train, y_train)
y_pred_dtr = dtr.predict(x_test)
print(r2_score(y_test, y_pred_dtr))
print(mean_squared_error(y_test, y_pred_dtr))
print(mean_squared_error(y_test, y_pred_dtr, squared=False))
print(mean_absolute_error(y_test, y_pred_dtr))

0.7805046008960914
57.81164071051411
7.603396656134291
4.767819298921417


Дерево решений показало результаты лучше, чем линейнаяы регрессия. Попробуем подобрать гиперпараметры, чтобы улучшить результат.

Воспользуемся sklearn.model_selection.GridSearchCV

In [11]:
dtr.get_params().keys()

dict_keys(['ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'presort', 'random_state', 'splitter'])

In [12]:
dtr_parameters_grid = {'criterion' : ['mse', 'friedman_mse', 'mae', 'poisson'],
                       'splitter' : ['best', 'random'],
                       'max_depth' : range(1, 11),
                       'min_samples_split' : range(1, 10) 
}

In [13]:
dtr_grid = GridSearchCV(dtr, dtr_parameters_grid,
                        cv=5, n_jobs=-1, verbose=True)
dtr_grid.fit(x_train, y_train)
dtr_grid.best_params_

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 584 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed:   19.4s finished


{'criterion': 'mse',
 'max_depth': 8,
 'min_samples_split': 3,
 'splitter': 'best'}

In [14]:
dtr1 = DecisionTreeRegressor(criterion='mse', max_depth=10, splitter='best',min_samples_split=4)
dtr1.fit(x_train, y_train)
y_pred_dtr1 = dtr1.predict(x_test)
print(r2_score(y_test, y_pred_dtr1))
print(mean_squared_error(y_test, y_pred_dtr1))
print(mean_squared_error(y_test, y_pred_dtr1, squared=False))
print(mean_absolute_error(y_test, y_pred_dtr1))

0.7924113963744821
54.67557779065111
7.394293596460118
4.833047910945383


Результаты улучшились, но не значительно.

## LASSO регрессия

Осуществляется с помощью sklearn.linear_model.Lasso.

Ссылка на документацию: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html

In [15]:
lasso = Lasso()
lasso.fit(x_train,y_train)
y_pred_lasso = lasso.predict(x_test)
print(r2_score(y_test, y_pred_lasso))
print(mean_squared_error(y_test, y_pred_lasso))
print(mean_squared_error(y_test, y_pred_lasso, squared=False))
print(mean_absolute_error(y_test, y_pred_lasso))

0.6380351738941825
95.33585014584763
9.7640078935777
7.840308993829385


Результаты схожи с линейной регрессией. Попробуем подобрать параметры с помощью GridSearchCV

In [16]:
lasso.get_params().keys()

dict_keys(['alpha', 'copy_X', 'fit_intercept', 'max_iter', 'normalize', 'positive', 'precompute', 'random_state', 'selection', 'tol', 'warm_start'])

In [17]:
lasso_parameters_grid = {'alpha' : np.arange(0.01, 1.0, 0.05),
                         'normalize' : [True, False],
                         'max_iter' : [10, 50 ,100, 200, 400, 800, 1000, 1500, 2000],
                         'selection' : ['cyclic', 'random'],
                         'warm_start':[True,False]
                         }

In [18]:
lasso_grid = GridSearchCV(lasso, lasso_parameters_grid, cv = 5, n_jobs=-1, verbose=True)
lasso_grid.fit(x_train, y_train)
lasso_grid.best_params_

Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 956 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 5756 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed:   23.3s finished
  positive)


{'alpha': 0.11,
 'max_iter': 100,
 'normalize': False,
 'selection': 'random',
 'warm_start': True}

In [19]:
lasso1 = Lasso(alpha=0.11, max_iter=100, selection='random')
lasso1.fit(x_train,y_train)
y_pred_lasso1 = lasso.predict(x_test)
print(r2_score(y_test, y_pred_lasso1))
print(mean_squared_error(y_test, y_pred_lasso1))
print(mean_squared_error(y_test, y_pred_lasso1, squared=False))
print(mean_absolute_error(y_test, y_pred_lasso1))

0.6380351738941825
95.33585014584763
9.7640078935777
7.840308993829385


  positive)


После подбора параметров результат никак не изменился.

## Гребневая регрессия

Осуществляется с помощью sklearn.linear_model.Ridge.

Ссылка на документацию: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html

---



In [23]:
ridge = Ridge()
ridge.fit(x_train_scaled,y_train)
y_pred_ridge = ridge.predict(x_test_scaled)
print(r2_score(y_test, y_pred_ridge))
print(mean_squared_error(y_test, y_pred_ridge))
print(mean_squared_error(y_test, y_pred_ridge, squared=False))
print(mean_absolute_error(y_test, y_pred_ridge))

0.6371483910782557
95.5694148116762
9.775961068441108
7.860731949198349


Результат схож с линейной и lasso регрессией. Подберём параметры по сетке.

In [37]:
ridge.get_params().keys()

dict_keys(['alpha', 'copy_X', 'fit_intercept', 'max_iter', 'normalize', 'random_state', 'solver', 'tol'])

In [26]:
ridge_parameters_grid = {'alpha' : np.arange(0.01, 1.0, 0.05),
                         'max_iter' : [10, 50 ,100, 200, 400, 800, 1000, 1500, 2000],
                         'solver' : ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
                         'fit_intercept' : [True, False]
                         }

In [27]:
ridge_grid = GridSearchCV(ridge, ridge_parameters_grid, cv = 5, n_jobs=-1, verbose=True)
ridge_grid.fit(x_train_scaled, y_train)
ridge_grid.best_params_

Fitting 5 folds for each of 2520 candidates, totalling 12600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 904 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 10504 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done 12600 out of 12600 | elapsed:   25.9s finished


{'alpha': 0.21000000000000002,
 'fit_intercept': True,
 'max_iter': 10,
 'solver': 'sag'}

In [28]:
ridge1 = Ridge(alpha=0.21, max_iter=10,solver='sag')
ridge1.fit(x_train_scaled,y_train)
y_pred_ridge1 = ridge1.predict(x_test_scaled)
print(r2_score(y_test, y_pred_ridge1))
print(mean_squared_error(y_test, y_pred_ridge1))
print(mean_squared_error(y_test, y_pred_ridge1, squared=False))
print(mean_absolute_error(y_test, y_pred_ridge1))

0.6387008938761193
95.16050995846027
9.755024856885823
7.835917148959343




Результат незначительно вырос.

## Elastic Net регрессия

Осуществляется с помощью sklearn.linear_model.ElasticNet

Ссылка на документацию: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html

In [34]:
en = ElasticNet()
en.fit(x_train, y_train)
y_pred_en = en.predict(x_test)
print(r2_score(y_test, y_pred_en))
print(mean_squared_error(y_test, y_pred_en))
print(mean_squared_error(y_test, y_pred_en, squared=False))
print(mean_absolute_error(y_test, y_pred_en))

0.6376299559291188
95.44257819345746
9.769471745875386
7.8500087161825975


Elastic Net регрессия выдаёт не лучший результат. Попробуем улучшить, подбирая параметры.

In [36]:
en.get_params().keys()

dict_keys(['alpha', 'copy_X', 'fit_intercept', 'l1_ratio', 'max_iter', 'normalize', 'positive', 'precompute', 'random_state', 'selection', 'tol', 'warm_start'])

In [40]:
en_parameters_grid = {'alpha' : np.arange(0.01, 1.0, 0.05),
                         'l1_ratio' : np.arange(0.1, 1.0, 0.1),
                         'max_iter' : [10, 50 ,100, 200, 400, 800, 1000],
                         'fit_intercept' : [True, False],
                         'normalize' : [True, False],
                         'selection' : ['cyclic', 'random'],
                         }

In [41]:
en_grid = GridSearchCV(en, en_parameters_grid, cv = 5, n_jobs=-1, verbose=True)
en_grid.fit(x_train, y_train)
en_grid.best_params_

Fitting 5 folds for each of 10080 candidates, totalling 50400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 584 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 5384 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done 13384 tasks      | elapsed:   50.6s
[Parallel(n_jobs=-1)]: Done 24584 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 38984 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 50400 out of 50400 | elapsed:  3.0min finished
  positive)


{'alpha': 0.51,
 'fit_intercept': False,
 'l1_ratio': 0.5,
 'max_iter': 200,
 'normalize': True,
 'selection': 'random'}

In [42]:
en1 = ElasticNet(alpha=0.51, fit_intercept=False, max_iter=200, normalize=True,selection='random')
en1.fit(x_train, y_train)
y_pred_en1 = en1.predict(x_test)
print(r2_score(y_test, y_pred_en1))
print(mean_squared_error(y_test, y_pred_en1))
print(mean_squared_error(y_test, y_pred_en1, squared=False))
print(mean_absolute_error(y_test, y_pred_en1))

0.640174419015117
94.77240658015975
9.735112047642788
7.810144988182405


  positive)


С подобранными параметрами результат немного улучшился, но подбор был слишком долгим.

Из всех алгоритмов регрессии, регрессия дерева решений лучше всего показала себя на данном датасете.