In [169]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import \
    classification_report, roc_auc_score, roc_curve, auc

In [170]:
# Read the passenger survey and discard the 'Unnamed: 0' column in the same step,
# so re-running this cell always starts from a fresh copy of the file.
df = pd.read_csv('data/AirPass.csv').drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [171]:
# Column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   id                                 103904 non-null  int64  
 1   Gender                             103904 non-null  object 
 2   Customer Type                      103904 non-null  object 
 3   Age                                103904 non-null  int64  
 4   Type of Travel                     103904 non-null  object 
 5   Class                              103904 non-null  object 
 6   Flight Distance                    103904 non-null  int64  
 7   Inflight wifi service              103904 non-null  int64  
 8   Departure/Arrival time convenient  103904 non-null  int64  
 9   Ease of Online booking             103904 non-null  int64  
 10  Gate location                      103904 non-null  int64  
 11  Food and drink                     1039

In [172]:
# Total number of missing cells: per-column null counts summed over all columns.
df.isnull().sum().sum()

310

In [173]:
# Fill the gaps in the arrival-delay column with that column's median.
delay_col = 'Arrival Delay in Minutes'
delay_median = df[delay_col].median()
df[delay_col] = df[delay_col].fillna(delay_median)

In [174]:
# Mean arrival delay (computed after the median imputation in the previous cell).
df['Arrival Delay in Minutes'].mean()

15.133392362180475

In [175]:
# Share of each (Gender, satisfaction) pair as a percentage of ALL passengers:
# the divisor is the total row count, so the four values add up to 100.
df.groupby('Gender')['satisfaction'].value_counts() * 100 / len(df)

Gender  satisfaction           
Female  neutral or dissatisfied    29.058554
        satisfied                  21.687327
Male    neutral or dissatisfied    27.608177
        satisfied                  21.645942
Name: satisfaction, dtype: float64

In [176]:
# Share of each (Type of Travel, satisfaction) pair as a percentage of ALL passengers:
# the divisor is the total row count, not the size of the travel-type group.
df.groupby('Type of Travel')['satisfaction'].value_counts() * 100 / len(df)

Type of Travel   satisfaction           
Business travel  satisfied                  40.177472
                 neutral or dissatisfied    28.785225
Personal Travel  neutral or dissatisfied    27.881506
                 satisfied                   3.155798
Name: satisfaction, dtype: float64

In [177]:
# Share of each (Class, satisfaction) pair as a percentage of ALL passengers:
# the divisor is the total row count, not the size of the class group.
df.groupby('Class')['satisfaction'].value_counts() * 100 / len(df)

Class     satisfaction           
Business  satisfied                  33.184478
          neutral or dissatisfied    14.614452
Eco       neutral or dissatisfied    36.614567
          satisfied                   8.374076
Eco Plus  neutral or dissatisfied     5.437712
          satisfied                   1.774715
Name: satisfaction, dtype: float64

In [178]:
# Re-inspect dtypes before encoding: the object-dtype columns are the ones that still need it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   id                                 103904 non-null  int64  
 1   Gender                             103904 non-null  object 
 2   Customer Type                      103904 non-null  object 
 3   Age                                103904 non-null  int64  
 4   Type of Travel                     103904 non-null  object 
 5   Class                              103904 non-null  object 
 6   Flight Distance                    103904 non-null  int64  
 7   Inflight wifi service              103904 non-null  int64  
 8   Departure/Arrival time convenient  103904 non-null  int64  
 9   Ease of Online booking             103904 non-null  int64  
 10  Gate location                      103904 non-null  int64  
 11  Food and drink                     1039

In [179]:
# Encode the target and the two-valued categorical columns as 0/1 integers.
binary_encodings = {
    'satisfaction': {'neutral or dissatisfied': 0, 'satisfied': 1},
    'Customer Type': {'Loyal Customer': 1, 'disloyal Customer': 0},
    'Type of Travel': {'Personal Travel': 0, 'Business travel': 1},
    'Gender': {'Male': 0, 'Female': 1},
}

# Build every encoded column first and validate before touching `df`.
# Series.map yields NaN for any value that is not a key of the mapping, which happens
# both for unexpected categories and when this cell is executed a second time
# (the columns then already hold 0/1 instead of the original labels).
encoded_columns = {
    column: df[column].map(mapping) for column, mapping in binary_encodings.items()
}
unmapped = [column for column, encoded in encoded_columns.items() if encoded.isna().any()]
if unmapped:
    raise ValueError(
        f"Columns {unmapped} contain values outside the expected categories "
        "(was this cell already run?); reload the data before encoding."
    )

for column, encoded in encoded_columns.items():
    df[column] = encoded

In [180]:
# One-hot encode the categorical columns that are still left
# (`Class` is object dtype in the df.info() output of the earlier cell).
df = pd.get_dummies(df)

In [181]:
# Frame size after encoding (rows, columns).
df.shape

(103904, 26)

In [182]:
# Separate the target from the feature matrix.
# NOTE(review): `id` is still among the features at this point; it looks like a row
# identifier rather than a real predictor - confirm whether it should be excluded.
y = df['satisfaction']
X = df.drop(columns=['satisfaction'])

In [183]:
# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=26,
)

In [184]:
# Size of the hold-out feature matrix (rows, columns).
X_test.shape

(20781, 25)

In [185]:
from sklearn.preprocessing import StandardScaler

In [186]:
# Learn the scaling statistics from the training split only, then apply the same
# transformation to both splits so no test-set information enters the scaler.
scaler = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = scaler.transform(X_train), scaler.transform(X_test)
# Spot-check a single scaled value (first row, first column of the test split).
X_test_sc[0, 0]

0.9408251379303

In [189]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Baseline: logistic regression with default settings on the standardized features.
model_lr = LogisticRegression()
model_lr.fit(X_train_sc, y_train)
preds_test = model_lr.predict(X_test_sc)
# scikit-learn metrics expect (y_true, y_pred) in that order. Swapping them exchanges
# precision and recall; binary F1 is symmetric in the two, so the value is the same here,
# but the correct order keeps the call safe if the metric is ever changed.
f1_score(y_test, preds_test)

0.8546883773161146

In [190]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

Теперь перейдём к бустингу. Начнём с обучения первой модели — AdaBoost. В качестве базовой модели для неё возьмите решающее дерево с параметром random_state = 26.

Обучите AdaBoost, зафиксировав random_state со значением 26 и задав темп обучения 0.01. В качестве ответа введите значение метрики f1_score. Ответ округлите до трёх знаков после точки-разделителя.

In [191]:
# AdaBoost over decision trees; both the base tree and the ensemble are seeded with 26,
# and the learning rate is set to 0.01 as the task text above requires.
model_ada = AdaBoostClassifier(
    DecisionTreeClassifier(random_state=26),
    random_state=26,
    learning_rate=0.01,
)

model_ada.fit(X_train_sc, y_train)
preds_test = model_ada.predict(X_test_sc)
# Metric arguments go in (y_true, y_pred) order; binary F1 is unchanged by the swap,
# so the recorded score stays valid.
f1_score(y_test, preds_test)

0.9404794558121674

In [192]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer

Перейдём к следующему алгоритму — градиентному бустингу.

Будем настраивать количество деревьев и темп обучения, делая перебор по следующей сетке:

`params = {"n_estimators": 2**np.arange(8), "learning_rate": 0.1**np.arange(3)}`
Используйте для поиска оптимальных параметров GridSearchCV, а для ускорения работы алгоритма задайте параметр кросс-валидации, равный 3.

Какое наибольшее значение метрики f1_score получилось? Ответ округлите до трёх знаков после точки-разделителя.

In [193]:
# Grid search for gradient boosting over the number of trees and the learning rate.
# The estimator is seeded like every other model in this notebook so that repeated
# runs of the search fit the same trees.
model_for_gs = GradientBoostingClassifier(random_state=26)

# 8 tree counts (1, 2, 4, ..., 128) x 3 learning rates (1, 0.1, 0.01) = 24 candidates,
# each evaluated with 3-fold cross-validation (72 fits in total).
params = {"n_estimators": 2**np.arange(8), "learning_rate": 0.1**np.arange(3)}
gs = GridSearchCV(
    model_for_gs,
    params,
    cv=3,
    scoring=make_scorer(f1_score),
    verbose=5,
)

# The search is fitted on the unscaled training features (X_train, not X_train_sc).
gs.fit(X_train, y_train)

print("Лучшие гиперпараметры:", gs.best_params_)
print("Лучшее значение метрики:", gs.best_score_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV 1/3] END .learning_rate=1.0, n_estimators=1;, score=0.873 total time=   0.3s
[CV 2/3] END .learning_rate=1.0, n_estimators=1;, score=0.870 total time=   0.2s
[CV 3/3] END .learning_rate=1.0, n_estimators=1;, score=0.871 total time=   0.3s
[CV 1/3] END .learning_rate=1.0, n_estimators=2;, score=0.880 total time=   0.4s
[CV 2/3] END .learning_rate=1.0, n_estimators=2;, score=0.878 total time=   0.4s
[CV 3/3] END .learning_rate=1.0, n_estimators=2;, score=0.875 total time=   0.4s
[CV 1/3] END .learning_rate=1.0, n_estimators=4;, score=0.901 total time=   0.9s
[CV 2/3] END .learning_rate=1.0, n_estimators=4;, score=0.896 total time=   0.9s
[CV 3/3] END .learning_rate=1.0, n_estimators=4;, score=0.897 total time=   0.8s
[CV 1/3] END .learning_rate=1.0, n_estimators=8;, score=0.920 total time=   1.8s
[CV 2/3] END .learning_rate=1.0, n_estimators=8;, score=0.920 total time=   1.7s
[CV 3/3] END .learning_rate=1.0, n_estimators=8;

Обучите алгоритм XGBoost. Так как он достаточно мощный «из коробки», определите его с параметрами по умолчанию, только задайте random_state = 26. Какое значение метрики f1_score получилось? Ответ округлите до трёх знаков после точки-разделителя.

In [194]:
from xgboost import XGBClassifier

In [195]:
# XGBoost with default hyperparameters; only the seed is fixed.
model_xgb = XGBClassifier(random_state=26)
model_xgb.fit(X_train_sc, y_train)
preds_test = model_xgb.predict(X_test_sc)
# Metric arguments go in (y_true, y_pred) order; binary F1 is unchanged by the swap,
# so the recorded score stays valid.
f1_score(y_test, preds_test)

0.9579785161685312

Обучите алгоритм CatBoost. Как и XGBoost, будем обучать его с настройками по умолчанию и заданным random_state = 26. Какое значение метрики f1_score получилось? Ответ округлите до трёх знаков после точки-разделителя.

In [196]:
from catboost import CatBoostClassifier

In [197]:
# CatBoost with default hyperparameters; only the seed is fixed.
# `model` is reused by the confusion-matrix and feature-importance cells below.
model = CatBoostClassifier(random_state=26)
model.fit(X_train_sc, y_train)
preds_class = model.predict(X_test_sc)
# Metric arguments go in (y_true, y_pred) order; binary F1 is unchanged by the swap,
# so the recorded score stays valid.
f1_score(y_test, preds_class)

Learning rate set to 0.068023
0:	learn: 0.6018089	total: 71.3ms	remaining: 1m 11s
1:	learn: 0.5020769	total: 101ms	remaining: 50.3s
2:	learn: 0.4472481	total: 166ms	remaining: 55.1s
3:	learn: 0.4028675	total: 219ms	remaining: 54.5s
4:	learn: 0.3674724	total: 264ms	remaining: 52.6s
5:	learn: 0.3397844	total: 296ms	remaining: 49s
6:	learn: 0.3121211	total: 331ms	remaining: 46.9s
7:	learn: 0.2917499	total: 352ms	remaining: 43.7s
8:	learn: 0.2749039	total: 370ms	remaining: 40.7s
9:	learn: 0.2575191	total: 394ms	remaining: 39s
10:	learn: 0.2473690	total: 415ms	remaining: 37.3s
11:	learn: 0.2377531	total: 431ms	remaining: 35.5s
12:	learn: 0.2279309	total: 453ms	remaining: 34.4s
13:	learn: 0.2212512	total: 474ms	remaining: 33.4s
14:	learn: 0.2100359	total: 503ms	remaining: 33s
15:	learn: 0.2025733	total: 539ms	remaining: 33.1s
16:	learn: 0.1942303	total: 554ms	remaining: 32s
17:	learn: 0.1877939	total: 568ms	remaining: 31s
18:	learn: 0.1832381	total: 584ms	remaining: 30.2s
19:	learn: 0.179736

0.960182404626849

In [198]:

from catboost.utils import get_confusion_matrix
from catboost import Pool, CatBoostClassifier

In [204]:
# Confusion matrix of the CatBoost model.
# NOTE(review): the pool is built from the TRAINING split, so this shows the fit on data
# the model has already seen; use X_test_sc / y_test if hold-out quality is intended - confirm.
train_pool = Pool(X_train_sc, y_train)
cm = get_confusion_matrix(model, train_pool)
cm


array([[46661.,   545.],
       [ 1274., 34643.]])

In [206]:
# Pair each CatBoost importance score with its column name and rank them, highest first.
# The model was trained on a scaled NumPy array, so the names are taken from the
# encoded frame minus the target, which has the same column order as the features.
feature_names = df.drop(columns="satisfaction").columns
importance_table = pd.DataFrame(
    {
        "feature_importance": model.get_feature_importance(),
        "feature_names": feature_names,
    }
)
importance_table.sort_values(by="feature_importance", ascending=False)

Unnamed: 0,feature_importance,feature_names
6,25.364737,Inflight wifi service
4,18.391876,Type of Travel
11,7.401483,Online boarding
2,7.240564,Customer Type
22,5.420057,Class_Business
17,3.925791,Checkin service
3,3.74235,Age
16,3.640798,Baggage handling
9,3.196286,Gate location
12,3.012455,Seat comfort
