In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn import ensemble  
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Ridge

# Композиции модели для задачи классификации

## Подготовка данных

In [None]:
abalone = pd.read_csv('../data/abalone_preprocessed.csv', index_col=0)
abalone.head()

Unnamed: 0,Female,Infant,Male,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0.0,0.0,1.0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0.0,0.0,1.0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,1.0,0.0,0.0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0.0,0.0,1.0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,0.0,1.0,0.0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [None]:
X_cat = abalone[['Female','Infant', 'Male']]
y = pd.DataFrame(abalone.iloc[:,-1])
X_real = abalone.iloc[:, 3:-1]

In [None]:
for i in range(len(y)):
  if(y['Rings'][i] < 8):
    y['Rings'][i] = "young"
  elif(y['Rings'][i] < 12):
    y['Rings'][i] = "middle"
  else:
    y['Rings'][i] = "old"

In [None]:
(X_real_train, X_real_test, y_train, y_test) =  train_test_split(X_real, y, test_size = 0.2, random_state = 0)

(X_cat_train, X_cat_test) = train_test_split(X_cat, test_size=0.2, random_state = 0)

In [None]:
scaler = StandardScaler()
scaler.fit(X_real_train, y_train)
X_real_train_scaled = scaler.transform(X_real_train)
X_real_test_scaled = scaler.transform(X_real_test)

In [None]:
X_cat_train = pd.DataFrame(X_cat_train)
X_cat_test = pd.DataFrame(X_cat_test)

X_real_train_scaled = pd.DataFrame(X_real_train_scaled)
X_real_test_scaled = pd.DataFrame(X_real_test_scaled)

X_cat_train.index = X_real_train_scaled.index
X_train = pd.concat([X_real_train_scaled, X_cat_train], axis=1)

X_cat_test.index = X_real_test_scaled.index
X_test = pd.concat([X_real_test_scaled, X_cat_test], axis=1)

## Обучение модели

В качестве базового алгоритма используем метод опорных векторов, потому что он лучше всего показал себя на этой выборке. Используем лучшие параметры полученные в лабораторной работе № 2.



In [None]:
svm = SVC(C = 1000, gamma = 0.01)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred_svm))
print(metrics.classification_report(y_test, y_pred_svm))

  y = column_or_1d(y, warn=True)


[[391  27  24]
 [107  92   1]
 [ 56   0 138]]
              precision    recall  f1-score   support

      middle       0.71      0.88      0.79       442
         old       0.77      0.46      0.58       200
       young       0.85      0.71      0.77       194

    accuracy                           0.74       836
   macro avg       0.78      0.69      0.71       836
weighted avg       0.75      0.74      0.73       836



## Обучение композиций

### Бэггинг

In [None]:
%%time
bagging = ensemble.BaggingClassifier(svm,n_jobs = -1)
bagging.fit(X_train, y_train)
pred_bagging = bagging.predict(X_test)
print(metrics.confusion_matrix(y_test, pred_bagging))
print(metrics.classification_report(y_test, pred_bagging))

  y = column_or_1d(y, warn=True)


[[394  25  23]
 [107  92   1]
 [ 57   0 137]]
              precision    recall  f1-score   support

      middle       0.71      0.89      0.79       442
         old       0.79      0.46      0.58       200
       young       0.85      0.71      0.77       194

    accuracy                           0.75       836
   macro avg       0.78      0.69      0.71       836
weighted avg       0.76      0.75      0.73       836

CPU times: user 59 ms, sys: 2.91 ms, total: 61.9 ms
Wall time: 3.77 s


Подберём параметры бэггинга по сетке с помощью GridSearchCV.

In [None]:
bagging_parameters_grid = { 'n_estimators' : [1, 5, 10, 50, 100],
                           'max_features' : range(1,10,1)
    
}

In [None]:
bagging_grid = GridSearchCV(bagging, bagging_parameters_grid,
                        cv=5, n_jobs=-1, verbose=True)
bagging_grid.fit(X_train, y_train)
bagging_grid.best_params_, bagging_grid.best_score_

Fitting 5 folds for each of 45 candidates, totalling 225 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 33.8min
[Parallel(n_jobs=-1)]: Done 225 out of 225 | elapsed: 38.1min finished
  y = column_or_1d(y, warn=True)


({'max_features': 9, 'n_estimators': 100}, 0.7530691979270159)

Точность немного улучшилась, но на поиск параметров ушло очень много времени.

### Бустинг

In [None]:
%%time
boosting = ensemble.GradientBoostingClassifier()
boosting.fit(X_train, y_train)
pred_boosting = boosting.predict(X_test)
print(metrics.confusion_matrix(y_test, pred_boosting))
print(metrics.classification_report(y_test, pred_boosting))

  y = column_or_1d(y, warn=True)


[[380  36  26]
 [101  98   1]
 [ 55   0 139]]
              precision    recall  f1-score   support

      middle       0.71      0.86      0.78       442
         old       0.73      0.49      0.59       200
       young       0.84      0.72      0.77       194

    accuracy                           0.74       836
   macro avg       0.76      0.69      0.71       836
weighted avg       0.74      0.74      0.73       836

CPU times: user 2.19 s, sys: 0 ns, total: 2.19 s
Wall time: 2.2 s


In [None]:
boosting_parameters_grid = { 'learning_rate' : np.arange(0.1, 0.8, 0.1),
                           'n_estimators' : [10, 50, 100],
                            'max_depth' : range(1,10,1)

}

In [None]:
boosting_grid = GridSearchCV(boosting, boosting_parameters_grid,
                        cv=5, n_jobs=-1, verbose=True)
boosting_grid.fit(X_train, y_train)
boosting_grid.best_params_, boosting_grid.best_score_

Fitting 5 folds for each of 189 candidates, totalling 945 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  54 tasks      | elapsed:   28.3s
[Parallel(n_jobs=-1)]: Done 204 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 454 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 804 tasks      | elapsed: 15.2min
[Parallel(n_jobs=-1)]: Done 945 out of 945 | elapsed: 17.7min finished
  y = column_or_1d(y, warn=True)


({'learning_rate': 0.30000000000000004, 'max_depth': 2, 'n_estimators': 50},
 0.7461825228466834)

Результат не изменился

### Стекинг

In [None]:
estimators = [
('svm', SVC()),
('knn', KNeighborsClassifier(algorithm = 'brute',metric = 'manhattan', n_neighbors = 9))
]

In [None]:
%%time
stacking = ensemble.StackingClassifier(estimators=estimators)
stacking.fit(X_train, y_train)
pred_stacking = stacking.predict(X_test)
print(metrics.confusion_matrix(y_test, pred_stacking))
print(metrics.classification_report(y_test, pred_stacking))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[[387  29  26]
 [110  88   2]
 [ 56   0 138]]
              precision    recall  f1-score   support

      middle       0.70      0.88      0.78       442
         old       0.75      0.44      0.56       200
       young       0.83      0.71      0.77       194

    accuracy                           0.73       836
   macro avg       0.76      0.68      0.70       836
weighted avg       0.74      0.73      0.72       836

CPU times: user 2.08 s, sys: 11.7 ms, total: 2.1 s
Wall time: 2.11 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
stacking_parameters_grid = { 'stack_method' : ['auto', 'pred_proba', 'solution_function', 'predict'],
                            'cv' : [1, 3, 5, 7, 9]
    
}

In [None]:
stacking_grid = GridSearchCV(stacking, stacking_parameters_grid,
                        cv=5, n_jobs=-1, verbose=True)
stacking_grid.fit(X_train, y_train)
stacking_grid.best_params_, stacking_grid.best_score_

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.0min finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


({'cv': 3, 'stack_method': 'auto'}, 0.7405006131235288)

После подбора параметров композиция оказалась немного хуже остальных, но быстрее. 

Лучшей композицией оказался бэггинг.

# Композиции модели для задачи регрессии

## Подготовка данных

In [None]:
concrete = pd.read_csv('../data/concrete_preprocessed.csv',index_col=0)
concrete.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Concrete compressive strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


In [None]:
X_reg = concrete.iloc[:,:-1]
y_reg = concrete.iloc[:,-1]

In [None]:
(X_reg_train, X_reg_test, y_reg_train, y_reg_test) = train_test_split(X_reg, y_reg,test_size = 0.2, random_state = 0)

## Обучение модели

In [None]:
dtr = DecisionTreeRegressor()
dtr.fit(X_reg_train, y_reg_train)
pred_dtr = dtr.predict(X_reg_test)
print(metrics.r2_score(y_reg_test, pred_dtr))
print(metrics.mean_squared_error(y_reg_test, pred_dtr))
print(metrics.mean_squared_error(y_reg_test, pred_dtr, squared=False))
print(metrics.mean_absolute_error(y_reg_test, pred_dtr))

0.7943173365261431
54.17358308956216
7.360270585349574
4.620425395581611


## Обучение композиций

### Бэггинг

In [None]:
%%time
bagging_reg = ensemble.BaggingRegressor(dtr)
bagging_reg.fit(X_reg_train, y_reg_train)
pred_bagging_reg = bagging_reg.predict(X_reg_test)
print(metrics.r2_score(y_reg_test, pred_bagging_reg))
print(metrics.mean_squared_error(y_reg_test, pred_bagging_reg))
print(metrics.mean_squared_error(y_reg_test, pred_bagging_reg, squared=False))
print(metrics.mean_absolute_error(y_reg_test, pred_bagging_reg))

0.9075282330130774
24.355611054897757
4.93514042909599
3.578909946739593
CPU times: user 47.7 ms, sys: 1.01 ms, total: 48.7 ms
Wall time: 50.6 ms


После применения бэггинга со стандартными параметрами точность сильно увеличилась.

In [None]:
bagging_reg_parameters_grid = { 'n_estimators' : [5, 10, 50, 100],
                               'max_samples' : range(1, 5, 1),
                               'max_features' : range(1, 5, 1)
}

In [None]:
bagging_reg_grid = GridSearchCV(bagging_reg, bagging_reg_parameters_grid,
                        cv=5, n_jobs=-1, verbose=True)
bagging_reg_grid.fit(X_reg_train, y_reg_train)
bagging_reg_grid.best_params_, bagging_reg_grid.best_score_

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  92 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:   16.9s finished


({'max_features': 4, 'max_samples': 4, 'n_estimators': 100},
 0.25206120347450545)

После подбора параметров точность очень сильно упала.

### Бустинг

In [None]:
boosting_reg = ensemble.GradientBoostingRegressor()
boosting_reg.fit(X_reg_train, y_reg_train)
pred_boosting_reg = boosting_reg.predict(X_reg_test)
print(metrics.r2_score(y_reg_test, pred_bagging_reg))
print(metrics.mean_squared_error(y_reg_test, pred_boosting_reg))
print(metrics.mean_squared_error(y_reg_test, pred_boosting_reg, squared=False))
print(metrics.mean_absolute_error(y_reg_test, pred_boosting_reg))

0.9075282330130774
24.899702897420063
4.989960210003689
3.6653597825714797


Бустинг без параметров тоже сильно поднял точность.

In [None]:
boosting_reg_parameters_grid = { 'loss' : ['ls', 'lad', 'huber', 'quantile'],
                                'learning_rate' : np.arange(0.1, 0.8, 0.1),
                                'n_estimators' : [10, 50, 100 , 500]
}

In [None]:
boosting_reg_grid = GridSearchCV(boosting_reg, boosting_reg_parameters_grid,
                        cv=5, n_jobs=-1, verbose=True)
boosting_reg_grid.fit(X_reg_train, y_reg_train)
boosting_reg_grid.best_params_, boosting_reg_grid.best_score_

Fitting 5 folds for each of 112 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   50.1s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:  2.5min finished


({'learning_rate': 0.2, 'loss': 'ls', 'n_estimators': 500}, 0.9278365432854425)

После подбора качество только возросло.

### Стекинг

In [None]:
estimators_reg = [
  ('dtr', DecisionTreeRegressor()),
  ('ridge', Ridge())
]

In [None]:
stacking_reg = ensemble.StackingRegressor(estimators=estimators_reg)
stacking_reg.fit(X_reg_train, y_reg_train)
pred_stacking_reg = stacking_reg.predict(X_reg_test)
print(metrics.r2_score(y_reg_test, pred_stacking_reg))
print(metrics.mean_squared_error(y_reg_test, pred_stacking_reg))
print(metrics.mean_squared_error(y_reg_test, pred_stacking_reg, squared=False))
print(metrics.mean_absolute_error(y_reg_test, pred_stacking_reg))

0.8328967106716147
44.01238187058505
6.634182833671759
4.402772531086011


In [None]:
stacking_reg_parameters_grid = { 'cv' : [1, 3, 5, 7, 9]
}

In [None]:
stacking_reg_grid = GridSearchCV(stacking_reg, stacking_reg_parameters_grid,
                        cv=5, n_jobs=-1, verbose=True)
stacking_reg_grid.fit(X_reg_train, y_reg_train)
stacking_reg_grid.best_params_, stacking_reg_grid.best_score_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    1.1s finished


({'cv': 5}, 0.8585205885983207)

Стекинг показывает неплохой результат но отстаёт от остальных композиций.

Лучшей композицией оказался бустинг после подбора параметров.