<h1> Импорт библиотек </h1>

In [1]:
import pandas as pd
import numpy as np
import joblib
import os

from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier

<h1> Загрузка данных </h1>

In [20]:
data = pd.read_csv('../data/raw/train.csv')

data_train, val_train = train_test_split(
    data,
    test_size = 0.2,
    random_state = 42,
    stratify = data['Exited']
)

data_train.to_csv('../data/processed/data_train.csv', index = False)
val_train.to_csv('../data/processed/val_train.csv', index = False)

**Разбиваем данные, выделяем признаки**

In [4]:
# Тренировочкая выборка
y_train = data_train['Exited']
X_train = data_train.drop(columns=['id', 'CustomerId', 'Surname', 'Exited'])

# Валидационная выборка для проверки метрики
y_val = val_train['Exited']
X_val = val_train.drop(columns=['id', 'CustomerId', 'Surname', 'Exited'])

# Категориальные признаки (будут передаваться в catboost)
cat_features = ['Geography', 'Gender']

<h1> Обучаем CatBoostClassifier </h1>

In [7]:
# Определяем доступность GPU
import subprocess

def detect_catboost_task_type():
    try:
        output = subprocess.check_output(['nvidia-smi'], stderr=subprocess.DEVNULL)
        return 'GPU'
    except:
        return 'CPU'

task_type = detect_catboost_task_type()

In [11]:
cat_model = CatBoostClassifier(
    verbose=0,
    cat_features=cat_features,
    task_type=task_type
)

# Определяем сетку гиперпараметров для перебора
param_grid_catboost = {
    'learning_rate': [0.05, 0.2, 0.6],
    'depth': [4, 6, 8],
    'n_estimators': [200, 1000],MODEL_PATH
    'l2_leaf_reg' : [2, 5, 10] 
}

**Запуск модели**

In [None]:
grid_search_catboost = GridSearchCV(
    estimator=cat_model,
    param_grid=param_grid_catboost,
    verbose=1,
    scoring='roc_auc',
    cv=15
)

grid_search_catboost.fit(X_train, y_train)

**Поиск лучших параметров и выбор лучшей модели**

In [None]:
print("Лучшие параметры: ", grid_search_catboost.best_params_)
print("Лучшая модель: ", grid_search_catboost.best_estimator_)

**На валидации измеряем roc_auc**

In [None]:
best_catboost = grid_search_catboost.best_estimator
best_catboost_pred = best_catboost.predict_proba(X_val)[:, 1]
roc_auc_catb = roc_auc_score(y_val, best_catboost_pred)

print(f'ROC_AUC на отложенной выборке для CatBoostClassifier: {roc_auc:.4f}')

In [None]:
# Сохраняем нашу лучшую модель
joblib.dump(best_catboost, '../models/best_catboost.pkl')

<h1> Обучаем XGBClassifier </h1>

In [21]:
from xgboost import XGBClassifier

X_train_xgb = pd.get_dummies(X_train.copy())
y_train_xgb = y_train.copy()

X_val_xgb = pd.get_dummies(X_val.copy())
y_val_xgb = y_val.copy()

# Выравниваем колонки между train и validation
X_train_xgb, X_val_xgb = X_train_xgb.align(X_val_xgb, join='left', axis=1, fill_value=0)

In [14]:
params = {
    "learning_rate" : [0.01, 0.1, 0.2, 0.3],
    "n_estimators" : [30, 50, 70, 100, 120, 200],
    "max_depth" : [5, 7, 9, 11, 15, 20, 25, 50]
}

xgb_model = XGBClassifier()

**Запускаем модель**

In [None]:
grid_search_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    scoring='roc_auc',
    cv=15,
    verbose=1
)

grid_search_xgb.fit(X_train_xgb, y_train_xgb)

In [None]:
best_xgb = grid_search_xgb.best_estimator_

best_xgb_pred = best_xgb.predict_proba(X_val_xgb)[:, 1]
roc_auc_xgb = roc_auc_score(y_val_xgb, best_xgb_pred)

print(f'ROC_AUC на отложенной выборке для XGBClassifier: {roc_auc:.4f}')

In [None]:
# Сохраняем нашу лучшую модель
joblib.dump(best_xgb, '../models/best_xgb.pkl')

<h1>Обучаем ансамбль</h1>

- **Cоздаем ансамбль catboost'ов**
- **Усредняем предсказания**

**Создаем модели**

In [18]:
model1 = CatBoostClassifier(learning_rate=0.1, depth=6, iterations=100, random_state=42, verbose=0, cat_features=cat_features, task_type=task_type)
model2 = CatBoostClassifier(learning_rate=0.2, depth=6, iterations=100, random_state=42, verbose=0, cat_features=cat_features, task_type=task_type)
model3 = CatBoostClassifier(learning_rate=0.1, depth=3, iterations=100, random_state=42, verbose=0, cat_features=cat_features, task_type=task_type)
model4 = CatBoostClassifier(learning_rate=0.3, depth=5, iterations=100, random_state=42, verbose=0, cat_features=cat_features, task_type=task_type)
model5 = CatBoostClassifier(learning_rate=0.4, depth=4, iterations=100, random_state=42, verbose=0, cat_features=cat_features, task_type=task_type)


model6 = CatBoostClassifier(learning_rate=0.1, depth=6, iterations= 100, random_state=42, verbose=0, cat_features=cat_features, task_type=task_type)
model7 = CatBoostClassifier(learning_rate=0.1, depth=4, iterations=100, random_state=42, verbose=0, cat_features=cat_features, task_type=task_type)
model8 = CatBoostClassifier(learning_rate=0.15, depth=7, iterations= 100, random_state=42, verbose=0,   cat_features=cat_features, task_type=task_type)
model9 = CatBoostClassifier(learning_rate=0.25, depth=3, iterations=100, random_state=42, verbose=0, cat_features=cat_features, task_type=task_type)
model10 = CatBoostClassifier(learning_rate=0.35, depth=5, iterations=100, random_state=42, verbose=0, cat_features=cat_features, task_type=task_type)

**Строим ансамбль и запускаем**

In [None]:
ensemble = VotingClassifier(
    estimators=[('cat1', model1), ('cat2', model2), ('cat3', model3), ('cat4', model4), ('cat5', model5),
               ('cat6', model6), ('cat7', model7), ('cat8', model8), ('cat9', model9), ('cat10', model10)],
    voting='soft',  # 'soft' для усреднения вероятностей, 'hard' для большинства голосов
)

ensemble.fit(X_train, y_train)

**Измеряем roc_auc ансамбля на валидации**

In [None]:
ensemble_pred = ensemble.predict_proba(X_val)[:, 1]
roc_auc_ensemble = roc_auc_score(y_val, ensemble_pred)

print(f'ROC_AUC на отложенной выборке для Ensemble: {roc_auc:.4f}')

In [None]:
joblib.dump(ensemble, '../models/best_ensemble.pkl')

<h1>Находим лучшую</h1>

In [None]:
best_model = max([
    ("CatBoost", roc_auc_catb),
    ("XGBClassifier", roc_auc_xgb),
    ("Ensemble", roc_auc_ensemble)
], key=lambda x: x[1])

print(f"Best model: {best_model[0]} with ROC-AUC: {best_model[1]:.4f}")

In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               15000 non-null  int64  
 1   CustomerId       15000 non-null  float64
 2   Surname          15000 non-null  object 
 3   CreditScore      15000 non-null  float64
 4   Geography        15000 non-null  object 
 5   Gender           15000 non-null  object 
 6   Age              15000 non-null  float64
 7   Tenure           15000 non-null  float64
 8   Balance          15000 non-null  float64
 9   NumOfProducts    15000 non-null  float64
 10  HasCrCard        15000 non-null  float64
 11  IsActiveMember   15000 non-null  float64
 12  EstimatedSalary  15000 non-null  float64
 13  Exited           15000 non-null  float64
dtypes: float64(10), int64(1), object(3)
memory usage: 1.6+ MB


In [32]:
data.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15653521.0,Nkemakonam,667.0,Germany,Male,33.0,3.0,131769.04,1.0,1.0,1.0,162719.69,0.0
1,1,15699005.0,Chiekwugo,614.0,France,Female,31.0,2.0,110615.47,1.0,1.0,1.0,181879.56,0.0
2,2,15656912.0,Chiang,683.0,Germany,Female,24.0,6.0,115074.02,2.0,1.0,0.0,109688.82,0.0
3,3,15700772.0,Ch'ang,678.0,France,Female,38.0,9.0,0.0,1.0,1.0,0.0,122823.84,1.0
4,4,15583850.0,Chiang,588.0,Spain,Female,39.0,3.0,0.0,2.0,1.0,1.0,136910.18,0.0


In [50]:
data.CustomerId.unique

<bound method Series.unique of 0        15653521.0
1        15699005.0
2        15656912.0
3        15700772.0
4        15583850.0
            ...    
14995    15793331.0
14996    15651336.0
14997    15764072.0
14998    15792868.0
14999    15809872.0
Name: CustomerId, Length: 15000, dtype: float64>

In [57]:
print(data['CustomerId'].astype(int).duplicated().sum())

8641


In [58]:
test = pd.read_csv('/home/hoang/Рабочий стол/test.csv')

In [66]:
print(test['CustomerId'].unique)

<bound method Series.unique of 0       15646539.0
1       15628144.0
2       15687953.0
3       15585067.0
4       15746190.0
           ...    
9995    15758023.0
9996    15698528.0
9997    15696900.0
9998    15625023.0
9999    15571869.0
Name: CustomerId, Length: 10000, dtype: float64>


In [67]:
test_clean = test.drop_duplicates(subset='CustomerId')

In [68]:
test_clean

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,15000,15646539.0,Onyemauchechukwu,821.0,Spain,Male,32.0,3.0,0.00,1.0,1.0,1.0,120893.85
1,15001,15628144.0,Chikwado,634.0,France,Male,28.0,1.0,129299.28,1.0,1.0,0.0,179655.85
2,15002,15687953.0,Ting,713.0,France,Male,42.0,1.0,0.00,2.0,1.0,1.0,80552.12
3,15003,15585067.0,Achebe,611.0,France,Female,38.0,3.0,0.00,1.0,0.0,1.0,151335.24
4,15004,15746190.0,Chiazagomekpere,724.0,France,Male,29.0,9.0,0.00,1.0,1.0,1.0,88724.49
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9990,24990,15618155.0,Ch'ien,738.0,Spain,Male,37.0,6.0,0.00,2.0,1.0,0.0,139059.05
9991,24991,15772941.0,Bezrukov,745.0,Spain,Female,34.0,7.0,0.00,2.0,1.0,0.0,180134.88
9992,24992,15683383.0,Scott,705.0,Spain,Male,28.0,7.0,72535.45,1.0,1.0,1.0,40321.87
9994,24994,15613168.0,T'ien,593.0,France,Male,27.0,1.0,0.00,2.0,1.0,1.0,120107.10


In [70]:
test_clean.to_csv('~/test_clean.csv', index=False)