In [131]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.model_selection import train_test_split # разбиение данных на тренировочные и тестовые

from sklearn.compose import ColumnTransformer # преобразование столбцов
from sklearn.preprocessing import OneHotEncoder # кодирование категориальных переменных
from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import StandardScaler # нормализация и масштабирование данных

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.ensemble import StackingClassifier, BaggingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb

from sklearn.model_selection import cross_val_score # кроссвалидация
from sklearn.model_selection import GridSearchCV # подбор гиперпараметров с кроссвалидацией


from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay

import gc

sns.set()
%matplotlib inline

import sys
# np.set_printoptions(suppress=True)
# np.set_printoptions(threshold=sys.maxsize)
# np.set_printoptions(precision=3)

DISPLAY_MAX_ROWS = 20 #20
pd.set_option('display.max_rows', DISPLAY_MAX_ROWS)
pd.set_option('display.max_column', 100) # None)
plt.style.use('seaborn-whitegrid')


# plt.rcParams["figure.figsize"] = (20, 15)

import warnings
warnings.filterwarnings('ignore')

### 1. Использовать датасет `telecom_churn.csv`

In [132]:
# Read the churn data from a CSV file in the working directory and show the
# resulting frame as the cell output.
dataset = pd.read_csv('telecom_churn.csv')
dataset

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.70,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.70,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.30,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.90,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,AZ,192,415,No,Yes,36,156.2,77,26.55,215.5,126,18.32,279.1,83,12.56,9.9,6,2.67,2,False
3329,WV,68,415,No,No,0,231.1,57,39.29,153.4,55,13.04,191.3,123,8.61,9.6,4,2.59,3,False
3330,RI,28,510,No,No,0,180.8,109,30.74,288.8,58,24.55,191.9,91,8.64,14.1,6,3.81,2,False
3331,CT,184,510,Yes,No,0,213.8,105,36.35,159.6,84,13.57,139.2,137,6.26,5.0,10,1.35,2,False


In [133]:
# Print column names, non-null counts and dtypes to check for missing values
# and to see which columns are non-numeric.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State                   3333 non-null   object 
 1   Account length          3333 non-null   int64  
 2   Area code               3333 non-null   int64  
 3   International plan      3333 non-null   object 
 4   Voice mail plan         3333 non-null   object 
 5   Number vmail messages   3333 non-null   int64  
 6   Total day minutes       3333 non-null   float64
 7   Total day calls         3333 non-null   int64  
 8   Total day charge        3333 non-null   float64
 9   Total eve minutes       3333 non-null   float64
 10  Total eve calls         3333 non-null   int64  
 11  Total eve charge        3333 non-null   float64
 12  Total night minutes     3333 non-null   float64
 13  Total night calls       3333 non-null   int64  
 14  Total night charge      3333 non-null   

In [134]:
# Columns treated as categorical; each is replaced by its integer category code.
categorical_features = ['State', 'Area code', 'International plan', 'Voice mail plan', 'Churn']


def _to_category_codes(column):
    """Return the integer category codes for one column."""
    return column.astype('category').cat.codes


# Overwrites the listed columns of `dataset` with their codes.
dataset[categorical_features] = dataset[categorical_features].apply(_to_category_codes)

dataset

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,16,128,1,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.70,1,0
1,35,107,1,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.70,1,0
2,31,137,1,0,0,0,243.4,114,41.38,121.2,110,10.30,162.6,104,7.32,12.2,5,3.29,0,0
3,35,84,0,1,0,0,299.4,71,50.90,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
4,36,75,1,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,3,192,1,0,1,36,156.2,77,26.55,215.5,126,18.32,279.1,83,12.56,9.9,6,2.67,2,0
3329,49,68,1,0,0,0,231.1,57,39.29,153.4,55,13.04,191.3,123,8.61,9.6,4,2.59,3,0
3330,39,28,2,0,0,0,180.8,109,30.74,288.8,58,24.55,191.9,91,8.64,14.1,6,3.81,2,0
3331,6,184,2,1,0,0,213.8,105,36.35,159.6,84,13.57,139.2,137,6.26,5.0,10,1.35,2,0


In [135]:
# Separate the target column from the predictors.
y = dataset['Churn']
X = dataset.drop(columns='Churn')

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (see the confusion matrices below) - consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Learn the scaling statistics on the training part only, then apply them to
# both parts so no test-set information leaks into preprocessing.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [136]:
def print_results(classifier, X_eval=None, y_true=None):
  """Print the confusion matrix and accuracy of a fitted classifier.

  Args:
    classifier: fitted estimator exposing ``predict``.
    X_eval: feature matrix to predict on. Defaults to the notebook-level
      ``X_test_scaled``, which keeps existing one-argument calls working.
    y_true: true labels matching ``X_eval``. Defaults to the notebook-level
      ``y_test``.
  """
  # Resolve the defaults at call time so the function always sees the
  # current notebook variables.
  if X_eval is None:
    X_eval = X_test_scaled
  if y_true is None:
    y_true = y_test

  y_pred = classifier.predict(X_eval)
  cm = confusion_matrix(y_true, y_pred)
  print(f"{type(classifier).__name__} results\n")
  print(f"Confusion matrix:\n{cm}\n")
  print(f"Accuracy score:\n{accuracy_score(y_true, y_pred)}")

### 2. Обучить LogisticRegression. Использовать в качестве `baseline`.

In [137]:
# Baseline: logistic regression with default settings on the scaled features.
# fit() returns the estimator itself, so construction and training are chained.
logreg = LogisticRegression().fit(X_train_scaled, y_train)
print_results(logreg)

LogisticRegression results

Conflusion matrix:
[[553  13]
 [ 85  16]]

Accuracy score:
0.8530734632683659


### 3. Обучить на этом датасете все три вида ансамблей, использованных в этом ноутбуке. Сравнить с baseline

#### Stacking

In [138]:
# Stacking: logistic regression and a decision tree as base learners, with an
# SVC combining their predictions.
# The tree is seeded because an unseeded DecisionTreeClassifier can give a
# different model on every run, which makes the reported scores unrepeatable.
stacking = StackingClassifier(
    estimators=[
        ('lr', LogisticRegression()),
        ('dt', DecisionTreeClassifier(random_state=12)),
    ],
    final_estimator=SVC(),
)
stacking.fit(X_train_scaled, y_train)
print_results(stacking)

StackingClassifier results

Conflusion matrix:
[[558   8]
 [ 33  68]]

Accuracy score:
0.9385307346326837


#### Bagging

In [139]:
# Bagging over logistic regression models.
# The base model is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed, while the first positional slot is the same in both.
# random_state fixes the bootstrap samples so the result is repeatable.
bagging = BaggingClassifier(LogisticRegression(), random_state=12)
bagging.fit(X_train_scaled, y_train)
print_results(bagging)

BaggingClassifier results

Conflusion matrix:
[[551  15]
 [ 82  19]]

Accuracy score:
0.8545727136431784


#### Boosting

In [140]:
# AdaBoost over depth-2 decision trees.
# The weak learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed, while the first positional slot is the same in both.
ada_boost = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=100,
    random_state=12,
)
ada_boost.fit(X_train_scaled, y_train)
print_results(ada_boost)

AdaBoostClassifier results

Conflusion matrix:
[[559   7]
 [ 37  64]]

Accuracy score:
0.9340329835082459


In [141]:
# Gradient boosting: 150 depth-2 trees with a fixed seed and learning rate 1.
# fit() returns the estimator itself, so construction and training are chained.
gradient_boost = GradientBoostingClassifier(
    n_estimators=150,
    max_depth=2,
    learning_rate=1,
    random_state=12,
).fit(X_train_scaled, y_train)
print_results(gradient_boost)

GradientBoostingClassifier results

Conflusion matrix:
[[548  18]
 [ 39  62]]

Accuracy score:
0.9145427286356822


### 4. Использовать PCA для понижения размерности или отбор признаков статистическим методом SelectKBest. Затем обучить LogisticRegression в качестве `baseline` и все три вида ансамблей, рассмотренных в этом ноутбуке.

#### SelectKBest

In [142]:
# Keep the two features with the highest ANOVA F-score against the target.
# The selector is fitted on the *scaled* matrices so that the models below get
# inputs preprocessed the same way as the full-feature models; otherwise the
# comparison would mix the effect of feature selection with the effect of
# dropping the scaling step. The F-score of a feature does not change under
# per-feature standardization, so the same columns are selected.
k_best = SelectKBest(f_classif, k=2)
X_train_kbest = k_best.fit_transform(X_train_scaled, y_train)
X_test_kbest = k_best.transform(X_test_scaled)

In [143]:
def print_results_kbest(classifier, X_eval=None, y_true=None):
  """Print the confusion matrix and accuracy of a classifier trained on the selected features.

  Args:
    classifier: fitted estimator exposing ``predict``.
    X_eval: feature matrix to predict on. Defaults to the notebook-level
      ``X_test_kbest``, which keeps existing one-argument calls working.
    y_true: true labels matching ``X_eval``. Defaults to the notebook-level
      ``y_test``.
  """
  # Resolve the defaults at call time so the function always sees the
  # current notebook variables.
  if X_eval is None:
    X_eval = X_test_kbest
  if y_true is None:
    y_true = y_test

  y_pred = classifier.predict(X_eval)
  cm = confusion_matrix(y_true, y_pred)
  print(f"{type(classifier).__name__} results\n")
  print(f"Confusion matrix:\n{cm}\n")
  print(f"Accuracy score:\n{accuracy_score(y_true, y_pred)}")

#### Logistic

In [144]:
# Baseline on the reduced feature set: default logistic regression.
# fit() returns the estimator itself, so construction and training are chained.
logreg_kbest = LogisticRegression().fit(X_train_kbest, y_train)
print_results_kbest(logreg_kbest)

LogisticRegression results

Conflusion matrix:
[[558   8]
 [ 93   8]]

Accuracy score:
0.848575712143928


#### Stacking

In [145]:
# Stacking on the selected features: logistic regression and a decision tree
# as base learners, with an SVC combining their predictions.
# The tree is seeded because an unseeded DecisionTreeClassifier can give a
# different model on every run, which makes the reported scores unrepeatable.
stacking_kbest = StackingClassifier(
    estimators=[
        ('lr', LogisticRegression()),
        ('dt', DecisionTreeClassifier(random_state=12)),
    ],
    final_estimator=SVC(),
)
stacking_kbest.fit(X_train_kbest, y_train)
print_results_kbest(stacking_kbest)

StackingClassifier results

Conflusion matrix:
[[563   3]
 [ 94   7]]

Accuracy score:
0.8545727136431784


#### Bagging

In [146]:
# Bagging over logistic regression models on the selected features.
# The base model is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed, while the first positional slot is the same in both.
# random_state fixes the bootstrap samples so the result is repeatable.
bagging_kbest = BaggingClassifier(LogisticRegression(), random_state=12)
bagging_kbest.fit(X_train_kbest, y_train)
print_results_kbest(bagging_kbest)

BaggingClassifier results

Conflusion matrix:
[[558   8]
 [ 93   8]]

Accuracy score:
0.848575712143928


#### Boosting

In [147]:
# AdaBoost over depth-2 decision trees on the selected features.
# The weak learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed, while the first positional slot is the same in both.
ada_boost_kbest = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=100,
    random_state=12,
)
ada_boost_kbest.fit(X_train_kbest, y_train)
print_results_kbest(ada_boost_kbest)

AdaBoostClassifier results

Conflusion matrix:
[[560   6]
 [ 91  10]]

Accuracy score:
0.8545727136431784


In [148]:
# Gradient boosting on the selected features: 150 depth-2 trees with a fixed
# seed and learning rate 1.
# fit() returns the estimator itself, so construction and training are chained.
gradient_boost_kbest = GradientBoostingClassifier(
    n_estimators=150,
    max_depth=2,
    learning_rate=1,
    random_state=12,
).fit(X_train_kbest, y_train)
print_results_kbest(gradient_boost_kbest)

GradientBoostingClassifier results

Conflusion matrix:
[[560   6]
 [ 91  10]]

Accuracy score:
0.8545727136431784


### 5. Сравнить результаты между собой как до понижения размерности или отбора признаков, так и после них.

In [149]:
# Side-by-side test accuracy of every model trained on all (scaled) features
# versus the same model type trained on the SelectKBest subset.
models = [logreg, stacking, bagging, ada_boost, gradient_boost]
models_kbest = [logreg_kbest, stacking_kbest, bagging_kbest, ada_boost_kbest, gradient_boost_kbest]
model_names = ['Logistic Regression', 'Stacking', 'Bagging', 'Ada Boosting', 'Gradient Boosting']

# The three lists are aligned by position, so one pass fills both columns.
results = {}
for name, model, model_kbest in zip(model_names, models, models_kbest):
    results[name] = {
        'simple': accuracy_score(y_test, model.predict(X_test_scaled)),
        'kbest': accuracy_score(y_test, model_kbest.predict(X_test_kbest)),
    }

# One shared row template keeps the header and the data rows aligned.
row_format = "{:<25} {:<25} {:<25}"
print(row_format.format('NAME', 'SIMPLE', 'KBEST'))
for name, accuracy in results.items():
    print(row_format.format(name, accuracy['simple'], accuracy['kbest']))

NAME                      SIMPLE                    KBEST                    
Logistic Regression       0.8530734632683659        0.848575712143928        
Stacking                  0.9385307346326837        0.8545727136431784       
Bagging                   0.8545727136431784        0.848575712143928        
Ada Boosting              0.9340329835082459        0.8545727136431784       
Gradient Boosting         0.9145427286356822        0.8545727136431784       
