In [60]:
import warnings
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, StackingClassifier, BaggingRegressor, \
    GradientBoostingRegressor, StackingRegressor
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
warnings.filterwarnings('ignore')

In [61]:
parameters_ensemble = {'n_estimators': np.arange(20,101,20),
                       'max_features': np.arange(3,24,10)}

# Классификация

In [62]:
def print_classification_model_metrics(estimator, y_test, y_pred):
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(estimator.score(X_test, y_test))

In [63]:
phone_df = pd.read_csv("../data/Smartphone_chosse_preprocessed.csv")
phone_df

Unnamed: 0.1,Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,0,842,0,2.2,0,1,0,7,0.6,188,...,20,756,2549,9,7,19,0,0,1,1
1,1,1021,1,0.5,1,0,1,53,0.7,136,...,905,1988,2631,17,3,7,1,1,0,2
2,2,563,1,0.5,1,2,1,41,0.9,145,...,1263,1716,2603,11,2,9,1,1,0,2
3,3,615,1,2.5,0,0,0,10,0.8,131,...,1216,1786,2769,16,8,11,1,0,0,2
4,4,1821,1,1.2,0,13,1,44,0.6,141,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1995,794,1,0.5,1,0,1,2,0.8,106,...,1222,1890,668,13,4,19,1,1,0,0
1996,1996,1965,1,2.6,1,0,0,39,0.2,187,...,915,1965,2032,11,10,16,1,1,1,2
1997,1997,1911,0,0.9,1,1,1,36,0.7,108,...,868,1632,3057,9,1,5,1,1,0,3
1998,1998,1512,0,0.9,0,4,1,46,0.1,145,...,336,670,869,18,10,19,1,1,1,0


In [64]:
y = phone_df["blue"]
y

0       0
1       1
2       1
3       1
4       1
       ..
1995    1
1996    1
1997    0
1998    0
1999    1
Name: blue, Length: 2000, dtype: int64

In [65]:
X = phone_df[phone_df.columns[2:phone_df.shape[1]]]
X

Unnamed: 0,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,0,2.2,0,1,0,7,0.6,188,1,2,20,756,2549,9,7,19,0,0,1,1
1,1,0.5,1,0,1,53,0.7,136,2,6,905,1988,2631,17,3,7,1,1,0,2
2,1,0.5,1,2,1,41,0.9,145,4,6,1263,1716,2603,11,2,9,1,1,0,2
3,1,2.5,0,0,0,10,0.8,131,5,9,1216,1786,2769,16,8,11,1,0,0,2
4,1,1.2,0,13,1,44,0.6,141,1,14,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1,0.5,1,0,1,2,0.8,106,5,14,1222,1890,668,13,4,19,1,1,0,0
1996,1,2.6,1,0,0,39,0.2,187,3,3,915,1965,2032,11,10,16,1,1,1,2
1997,0,0.9,1,1,1,36,0.7,108,7,3,868,1632,3057,9,1,5,1,1,0,3
1998,0,0.9,0,4,1,46,0.1,145,4,5,336,670,869,18,10,19,1,1,1,0


In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y)

In [67]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Базовая модель DTC

In [68]:
%%time
parameters_dtc = {'max_depth': np.arange(5,16,1)}
dtc = DecisionTreeClassifier()
dtc_base = GridSearchCV(dtc, parameters_dtc).fit(X_train, y_train)
dtc_base.best_params_

Wall time: 244 ms


{'max_depth': 5}

In [69]:
print_classification_model_metrics(dtc_base, y_test, dtc_base.predict(X_test))

[[152   0]
 [  0 148]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       152
           1       1.00      1.00      1.00       148

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300

1.0


# Модель BaggingClassifier

In [70]:
%%time
bag = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=dtc_base.best_params_['max_depth']))
model = GridSearchCV(bag, parameters_ensemble).fit(X_train, y_train)
model.best_params_

Wall time: 13.1 s


{'max_features': 13, 'n_estimators': 20}

In [71]:
print_classification_model_metrics(model, y_test, model.predict(X_test))

[[152   0]
 [  0 148]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       152
           1       1.00      1.00      1.00       148

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300

1.0


# Модель GradientBoostingClassifier

In [72]:
%%time
gbc = GradientBoostingClassifier()
model = GridSearchCV(gbc, parameters_ensemble).fit(X_train, y_train)
model.best_params_

Wall time: 8.04 s


{'max_features': 3, 'n_estimators': 40}

In [73]:
print_classification_model_metrics(model, y_test, model.predict(X_test))

[[152   0]
 [  0 148]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       152
           1       1.00      1.00      1.00       148

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300

1.0


# Модель StackingClassifier

In [74]:
%%time
model = StackingClassifier(estimators=[('bag',bag), ('gbc',gbc)],
                           final_estimator=dtc_base).fit(X_train, y_train)

Wall time: 2.41 s


In [75]:
print_classification_model_metrics(model, y_test, model.predict(X_test))

[[152   0]
 [  0 148]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       152
           1       1.00      1.00      1.00       148

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300

1.0


## Изучение модели CatBoostRegressor

In [None]:
%%time
cbr = CatBoostRegressor(learning_rate=0.15).fit(X_train, y_train)
print_regression_model_metrics(cbr, y_test, cbr.predict(X_test))

# Регрессия

In [76]:
def print_regression_model_metrics(estimator, y_test, y_pred):
    print(f"Коэффициент детерминации: {estimator.score(X,y)}")
    print(f'MSE: {mean_squared_error(y_test, y_pred)}')
    print(f'RMSE: {mean_squared_error(y_test, y_pred, squared=False)}')
    print(f'MAE: {mean_absolute_error(y_test, y_pred)}')

In [77]:
phone_df = pd.read_csv("../data/Smartphone_chosse_preprocessed.csv")
phone_df.drop(["Unnamed: 0"], axis=1, inplace=True)
phone_df

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,1,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,2,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,4,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,5,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,1,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,5,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,3,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,7,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,4,...,336,670,869,18,10,19,1,1,1,0


In [78]:
y = phone_df["price_range"]
y

0       1
1       2
2       2
3       2
4       1
       ..
1995    0
1996    2
1997    3
1998    0
1999    3
Name: price_range, Length: 2000, dtype: int64

In [79]:
X = phone_df[phone_df.columns[1:]]
X

Unnamed: 0,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,0,2.2,0,1,0,7,0.6,188,1,2,20,756,2549,9,7,19,0,0,1,1
1,1,0.5,1,0,1,53,0.7,136,2,6,905,1988,2631,17,3,7,1,1,0,2
2,1,0.5,1,2,1,41,0.9,145,4,6,1263,1716,2603,11,2,9,1,1,0,2
3,1,2.5,0,0,0,10,0.8,131,5,9,1216,1786,2769,16,8,11,1,0,0,2
4,1,1.2,0,13,1,44,0.6,141,1,14,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1,0.5,1,0,1,2,0.8,106,5,14,1222,1890,668,13,4,19,1,1,0,0
1996,1,2.6,1,0,0,39,0.2,187,3,3,915,1965,2032,11,10,16,1,1,1,2
1997,0,0.9,1,1,1,36,0.7,108,7,3,868,1632,3057,9,1,5,1,1,0,3
1998,0,0.9,0,4,1,46,0.1,145,4,5,336,670,869,18,10,19,1,1,1,0


In [80]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

## Базовая модель DTR

In [81]:
%%time
parameters_dtr = {'max_depth': np.arange(5,16,1)}
dtr = DecisionTreeRegressor().fit(X_train, y_train)
dtr_base = GridSearchCV(dtr, parameters_dtr).fit(X_train, y_train)
print(dtr_base.best_params_)
print_regression_model_metrics(dtr_base, y_test, dtr_base.predict(X_test))

{'max_depth': 5}
Коэффициент детерминации: 1.0
MSE: 0.0
RMSE: 0.0
MAE: 0.0
Wall time: 653 ms


# Модель BaggingRegressor

In [82]:
%%time
br = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=dtr_base.best_params_['max_depth']))
model = GridSearchCV(br, parameters_ensemble).fit(X_train, y_train)
model.best_params_

Wall time: 12.8 s


{'max_features': 13, 'n_estimators': 80}

In [83]:
print_regression_model_metrics(model, y_test, model.predict(X_test))

Коэффициент детерминации: 0.9593367262002863
MSE: 0.060346278400344414
RMSE: 0.24565479519102495
MAE: 0.21897408412851396


# Модель GradientBoostingRegressor

In [84]:
%%time
gbr = GradientBoostingRegressor()
model = GridSearchCV(gbr, parameters_ensemble).fit(X_train, y_train)
model.best_params_

Wall time: 7.12 s


{'max_features': 13, 'n_estimators': 80}

In [85]:
print_regression_model_metrics(model, y_test, model.predict(X_test))

Коэффициент детерминации: 0.9997816023314263
MSE: 0.0003734941100898772
RMSE: 0.01932599570759233
MAE: 0.01252285808495729


# Модель StackingRegressor

In [86]:
%%time
model = StackingRegressor(estimators=[('br',br), ('gbr',gbr)],
                           final_estimator=dtr_base).fit(X_train, y_train)

Wall time: 2.61 s


In [87]:
print_regression_model_metrics(model, y_test, model.predict(X_test))

Коэффициент детерминации: 1.0
MSE: 0.0
RMSE: 0.0
MAE: 0.0


## Изучение модели CatBoostRegressor

In [None]:
%%time
cbr = CatBoostRegressor(learning_rate=0.15).fit(X_train, y_train)
print_regression_model_metrics(cbr, y_test, cbr.predict(X_test))