In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.svm import SVC
import pandas as pd

In [60]:
df = pd.read_csv('mode_and_three_monthes2.csv')
df.head()

Unnamed: 0,id,is_tv_subscriber,is_movie_package_subscriber,subscription_age,bill_avg,reamining_contract,service_failure_count,download_avg,upload_avg,download_over_limit,churn
0,15,1,0,47.8,25.0,0.56,0.0,8.4,2.3,0.0,0
1,18,0,0,32.88,0.0,0.0,0.0,0.0,0.0,0.0,1
2,23,1,0,35.64,16.0,0.0,0.0,13.7,0.9,0.0,1
3,27,0,0,27.48,21.0,0.0,1.0,0.0,0.0,0.0,1
4,34,0,0,25.56,0.0,0.0,0.0,0.0,0.0,0.0,1


In [61]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

X = df.drop(columns=['id', 'churn'])
X_scaled = StandardScaler().fit_transform(X)
y = df['churn']

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

X.head()

(57819, 9) (14455, 9) (57819,) (14455,)


Unnamed: 0,is_tv_subscriber,is_movie_package_subscriber,subscription_age,bill_avg,reamining_contract,service_failure_count,download_avg,upload_avg,download_over_limit
0,1,0,47.8,25.0,0.56,0.0,8.4,2.3,0.0
1,0,0,32.88,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,35.64,16.0,0.0,0.0,13.7,0.9,0.0
3,0,0,27.48,21.0,0.0,1.0,0.0,0.0,0.0
4,0,0,25.56,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
X_scaled

array([[ 0.47603011, -0.7091697 ,  4.66833504, ..., -0.55371302,
        -0.19086036, -0.20821312],
       [-2.10070746, -0.7091697 ,  2.8353894 , ..., -0.68637825,
        -0.42561131, -0.20821312],
       [ 0.47603011, -0.7091697 ,  3.17445978, ..., -0.47000757,
        -0.33375224, -0.20821312],
       ...,
       [ 0.47603011, -0.7091697 , -1.19414255, ..., -0.66268803,
        -0.40519819, -0.20821312],
       [-2.10070746, -0.7091697 , -1.19905662, ..., -0.68637825,
        -0.42561131, -0.20821312],
       [ 0.47603011,  1.41009972, -1.19905662, ..., -0.68637825,
        -0.42561131, -0.20821312]])

In [63]:
y.head()

0    0
1    1
2    1
3    1
4    1
Name: churn, dtype: int64

In [64]:
class_count = df['churn'].value_counts()
print(class_count)

churn
1    40050
0    32224
Name: count, dtype: int64


In [65]:
class_persentage = df['churn'].value_counts(normalize=True) * 100
print(class_persentage)

churn
1    55.414118
0    44.585882
Name: proportion, dtype: float64


In [66]:
lr_pipeline = Pipeline(
    [('scaler', StandardScaler()),
     ('lr', LogisticRegression())],
)

lr_params = {'lr__C': [1, 10, 100, 1000]
    , 'lr__penalty': ['l1', 'l2']
    , 'lr__solver': ['liblinear', 'saga']
    , 'lr__tol': [0.0001, 0.001, 0.01]
    , 'lr__class_weight': ['balanced']
    , 'lr__max_iter': [100, 1000, 5000, 10000]
}

lr_gridSCV = GridSearchCV(lr_pipeline
                       , lr_params
                       , n_jobs=-1
                       , refit='f1'
                       , scoring='f1'
                       , cv=k_fold)
lr_gridSCV.fit(X_train, y_train)

lr_randomSCV = RandomizedSearchCV(lr_pipeline
                               , lr_params
                               , n_jobs=-1
                               , n_iter=50
                               , random_state=42
                               , refit='f1'
                               , scoring='f1'
                               , cv=k_fold)
lr_randomSCV.fit(X_train, y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [67]:
print(lr_gridSCV.best_params_)
lr_gridSCV_predict = lr_gridSCV.predict(X_test)
print(classification_report(y_test, lr_gridSCV_predict))

{'lr__C': 10, 'lr__class_weight': 'balanced', 'lr__max_iter': 1000, 'lr__penalty': 'l2', 'lr__solver': 'saga', 'lr__tol': 0.01}
              precision    recall  f1-score   support

           0       0.88      0.83      0.86      6450
           1       0.87      0.91      0.89      8005

    accuracy                           0.88     14455
   macro avg       0.88      0.87      0.87     14455
weighted avg       0.88      0.88      0.88     14455



In [68]:
print(lr_randomSCV.best_params_)
lr_randomSCV_predict = lr_randomSCV.predict(X_test)
print(classification_report(y_test, lr_randomSCV_predict))

{'lr__tol': 0.01, 'lr__solver': 'liblinear', 'lr__penalty': 'l1', 'lr__max_iter': 100, 'lr__class_weight': 'balanced', 'lr__C': 100}
              precision    recall  f1-score   support

           0       0.88      0.84      0.86      6450
           1       0.87      0.91      0.89      8005

    accuracy                           0.88     14455
   macro avg       0.88      0.87      0.87     14455
weighted avg       0.88      0.88      0.88     14455



In [69]:
rf_params = {'n_estimators': [10, 50, 100]
    , 'max_depth': [None, 10, 20, 30, 40, 50]
    , 'min_samples_split': [2, 5, 10]
    , 'min_samples_leaf': [1, 2, 4]
    , 'max_features': ['sqrt', 'log2']
    , 'class_weight': ['balanced', 'balanced_subsample']
}

rf_gridSCV = GridSearchCV(RandomForestClassifier()
                           , rf_params
                           , n_jobs=-1
                           , refit='f1'
                           , scoring='f1'
                           , cv=k_fold)
rf_gridSCV.fit(X_train, y_train)

rf_randomSCV = RandomizedSearchCV(RandomForestClassifier()
                                   , rf_params
                                   , n_jobs=-1
                                   , n_iter=50
                                   , random_state=42
                                   , refit='f1'
                                   , scoring='f1'
                                   , cv=k_fold)
rf_randomSCV.fit(X_train, y_train)

In [70]:
print(rf_gridSCV.best_params_)
rf_grid_predict = rf_gridSCV.predict(X_test)
print(classification_report(y_test, rf_grid_predict))

{'class_weight': 'balanced', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
              precision    recall  f1-score   support

           0       0.93      0.94      0.93      6450
           1       0.95      0.94      0.95      8005

    accuracy                           0.94     14455
   macro avg       0.94      0.94      0.94     14455
weighted avg       0.94      0.94      0.94     14455



In [71]:
print(rf_randomSCV.best_params_)
rf_random_predict = rf_randomSCV.predict(X_test)
print(classification_report(y_test, rf_random_predict))

{'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40, 'class_weight': 'balanced'}
              precision    recall  f1-score   support

           0       0.92      0.94      0.93      6450
           1       0.95      0.94      0.95      8005

    accuracy                           0.94     14455
   macro avg       0.94      0.94      0.94     14455
weighted avg       0.94      0.94      0.94     14455



In [72]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [73]:
predict_rf = rf_model.predict(X_test)
print(classification_report(y_test, predict_rf))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93      6450
           1       0.95      0.94      0.95      8005

    accuracy                           0.94     14455
   macro avg       0.94      0.94      0.94     14455
weighted avg       0.94      0.94      0.94     14455



In [55]:
svm_params = {'C': [0.1, 1, 10, 100]
    , 'kernel': ['linear', 'rbf']
    ,'gamma': ['scale', 'auto']
    , 'class_weight': ['balanced']
    , 'max_iter': [1000, 5000, 10000]
}

svm_gridSCV = GridSearchCV(SVC()
                        , svm_params
                        , n_jobs=-1
                        , refit='f1'
                        , scoring='f1'
                        , cv=k_fold)

svm_gridSCV.fit(X_train, y_train)

svm_randomSCV = RandomizedSearchCV(SVC()
                                , svm_params
                                , n_jobs=-1
                                , n_iter=50
                                , random_state=42
                                , refit='f1'
                                , scoring='f1'
                                , cv=k_fold)

svm_randomSCV.fit(X_train, y_train)



In [56]:
print(svm_gridSCV.best_params_)
svm_gridSCV_predict = svm_gridSCV.predict(X_test)
print(classification_report(y_test, svm_gridSCV_predict, zero_division=0))

{'C': 100, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       0.91      0.91      0.91      6444
           1       0.93      0.93      0.93      8011

    accuracy                           0.92     14455
   macro avg       0.92      0.92      0.92     14455
weighted avg       0.92      0.92      0.92     14455



In [57]:
print(svm_randomSCV.best_params_)
svm_randomSCV_predict = svm_randomSCV.predict(X_test)
print(classification_report(y_test, svm_randomSCV_predict, zero_division=0))

{'kernel': 'rbf', 'gamma': 'auto', 'class_weight': 'balanced', 'C': 100}
              precision    recall  f1-score   support

           0       0.91      0.91      0.91      6444
           1       0.93      0.93      0.93      8011

    accuracy                           0.92     14455
   macro avg       0.92      0.92      0.92     14455
weighted avg       0.92      0.92      0.92     14455

