In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from joblib import dump
import pandas as pd
import numpy as np

In [2]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

df = pd.read_csv('data/mode_and_minmax.csv')
X = df.drop(columns=[ 'id', 'churn']).values
y = df['churn'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [3]:
lr_params = {'C': [1, 10, 100, 1000]
    , 'penalty': ['l1', 'l2']
    , 'solver': ['liblinear', 'saga']
    , 'tol': [0.0001, 0.001, 0.01]
    , 'class_weight': [None, 'balanced']
    , 'max_iter': [100, 1000, 5000, 10000]
}

lr_gridSCV = GridSearchCV(LogisticRegression(random_state=42)
                       , lr_params
                       , n_jobs=-1
                       , refit=True
                       , cv=k_fold)
lr_gridSCV.fit(X_train, y_train)

In [4]:
print(lr_gridSCV.best_params_)
lr_gridSCV_predict = lr_gridSCV.predict(X_test)
print(classification_report(y_test, lr_gridSCV_predict))

{'C': 10, 'class_weight': 'balanced', 'max_iter': 100, 'penalty': 'l2', 'solver': 'saga', 'tol': 0.01}
              precision    recall  f1-score   support

           0       0.88      0.83      0.86      6489
           1       0.87      0.91      0.89      7966

    accuracy                           0.87     14455
   macro avg       0.87      0.87      0.87     14455
weighted avg       0.87      0.87      0.87     14455



In [5]:
rf_params = {'n_estimators': [10, 50, 80, 100, 120]
    , 'max_depth': [None, 10, 20]
    , 'min_samples_split': [2, 5, 10]
    , 'min_samples_leaf': [1, 2, 4]
    , 'max_features': ['sqrt', 'log2']
    , 'class_weight': [None, 'balanced']
}

rf_gridSCV = GridSearchCV(RandomForestClassifier(random_state=42)
                           , rf_params
                           , n_jobs=-1
                           , refit=True
                           , cv=k_fold
                           )
rf_gridSCV.fit(X_train, y_train)

In [6]:
print(rf_gridSCV.best_params_)
rf_grid_predict = rf_gridSCV.predict(X_test)
print(classification_report(y_test, rf_grid_predict))

{'class_weight': 'balanced', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
              precision    recall  f1-score   support

           0       0.92      0.94      0.93      6489
           1       0.95      0.94      0.94      7966

    accuracy                           0.94     14455
   macro avg       0.94      0.94      0.94     14455
weighted avg       0.94      0.94      0.94     14455



In [7]:
svm_params = {'C': [0.1, 1, 10, 100]
    , 'kernel': ['linear', 'rbf']
    , 'gamma': ['scale', 'auto']
    , 'class_weight': [None, 'balanced']
}

svm_gridSCV = GridSearchCV(SVC(random_state=42)
                        , svm_params
                        , n_jobs=-1
                        , refit=True
                        , cv=k_fold)

svm_gridSCV.fit(X_train, y_train)

In [8]:
print(svm_gridSCV.best_params_)
svm_gridSCV_predict = svm_gridSCV.predict(X_test)
print(classification_report(y_test, svm_gridSCV_predict))

{'C': 100, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       0.91      0.90      0.90      6489
           1       0.92      0.93      0.92      7966

    accuracy                           0.91     14455
   macro avg       0.91      0.91      0.91     14455
weighted avg       0.91      0.91      0.91     14455



In [9]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
pred = rf_model.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93      6489
           1       0.95      0.93      0.94      7966

    accuracy                           0.94     14455
   macro avg       0.94      0.94      0.94     14455
weighted avg       0.94      0.94      0.94     14455



In [12]:
dump(rf_gridSCV.best_estimator_, 'model.joblib')

['model.joblib']

Вибір Алгоритму: В якості моделі було вибрано RandomForestClassifier так як на тесті вона показала найкращий результат.

In [11]:
data = np.array([0, 0, 0.0, 0.61, 0.05, 0.0, 0.00, 0.0051, 0.0])
data_2d = data.reshape(1, -1)
pred_2 = rf_model.predict_proba(data_2d)
for i, pred in enumerate(pred_2[0]):
    print(f'{i}: {pred: .4f}')

0:  0.4800
1:  0.5200
