In [None]:
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
from catboost import CatBoostClassifier


from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.metrics import recall_score

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# so convergence or failed-fit warnings raised by the searches below are not shown.
warnings.filterwarnings("ignore")

In [53]:
# Read the churn table from disk and do all cleaning on a separate copy,
# so `df` keeps the file contents exactly as loaded.
df = pd.read_csv(filepath_or_buffer="data/credit_card_churn.csv")
df2 = df.copy(deep=True)

In [44]:
# Preview the first rows of the working copy.
df2.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


In [54]:
# Remove the client number and the two "Naive_Bayes_Classifier_*" columns before
# modelling (presumably a row identifier and leftover classifier outputs —
# confirm against the dataset description).
# The frame is rebuilt from the raw `df` instead of being dropped in place, so the
# cell can be re-run without raising KeyError on columns that are already gone.
df2 = df.drop(
    columns=[
        "CLIENTNUM",
        "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
        "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2",
    ]
)

In [55]:
# Encode the target: 1 for "Attrited Customer", 0 for every other value.
# The labels are read from the raw frame `df` (same row index as `df2`) rather
# than from `df2` itself: comparing the already-encoded integer column with the
# string would be False everywhere, so re-running the cell would otherwise
# silently turn every label into 0.
df2["Attrition_Flag"] = (df["Attrition_Flag"] == "Attrited Customer").astype(int)

In [56]:
# Separate the target vector from the feature columns.
y = df2.loc[:, "Attrition_Flag"]
X = df2.drop(columns="Attrition_Flag")

In [48]:
# First preprocessing variant: ordinal codes for three categorical columns,
# one-hot encoding for the other two, and standard scaling for the rest.
# NOTE(review): `preprocessor` is rebuilt by the next cell, so on a top-to-bottom
# run this definition is overwritten before it is ever fitted.
# NOTE(review): OrdinalEncoder() is created without an explicit `categories`
# argument, so the integer codes are not tied to any declared level order —
# confirm that is intended before reviving this variant.
standardScaler = StandardScaler()
oneHotEncoder = OneHotEncoder()
ordinalEncoder = OrdinalEncoder()

# Every column whose dtype is not `object` is routed to the scaler.
num_features = X.select_dtypes(exclude=["object"]).columns

preprocessor = ColumnTransformer(
    transformers=[
        (
            "OrdinalEncoder",
            ordinalEncoder,
            ["Education_Level", "Income_Category", "Card_Category"],
        ),
        (
            "OneHotEncoder",
            oneHotEncoder,
            ["Gender", "Marital_Status"],
        ),
        (
            "StandardScaler",
            standardScaler,
            num_features,
        ),
    ]
)


In [57]:
# Preprocessing actually used below: one-hot encode all five categorical
# columns and standard-scale every non-object column.
standardScaler = StandardScaler()
oneHotEncoder = OneHotEncoder()
# Created for parity with the previous cell; not referenced by this transformer.
ordinalEncoder = OrdinalEncoder()

# Every column whose dtype is not `object` is routed to the scaler.
num_features = X.select_dtypes(exclude=["object"]).columns

preprocessor = ColumnTransformer(
    transformers=[
        (
            "OneHotEncoder",
            oneHotEncoder,
            ["Gender", "Marital_Status", "Education_Level", "Income_Category", "Card_Category"],
        ),
        (
            "StandardScaler",
            standardScaler,
            num_features,
        ),
    ]
)


In [58]:
# Encode and scale the feature columns.
# The input is taken from `df2` rather than from `X` itself: the original
# `X = preprocessor.fit_transform(X)` replaced the DataFrame with the transformed
# array, so a second run of the cell failed when the transformer tried to select
# columns by name.
# NOTE(review): the transformer is fitted on all rows before the train/test split
# made further down, so the scaler statistics also see the test rows — consider
# fitting on the training split only.
X = preprocessor.fit_transform(df2.drop(columns=["Attrition_Flag"]))

In [59]:
# Dimensions of the preprocessed feature matrix.
X.shape

(10127, 37)

In [60]:
# Hold out 20% of the rows for testing with a fixed seed, then print the class
# counts of the training labels.
# NOTE(review): the split is not stratified on `y` although the recorded counts
# (6801 vs 1300) show a clear class imbalance — consider `stratify=y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
print(y_train.value_counts())


Attrition_Flag
0    6801
1    1300
Name: count, dtype: int64


In [61]:
# Oversample the minority class of the training split with SMOTE until both
# classes have the same number of rows (sampling_strategy=1.0); the test split
# is left untouched.
sm = SMOTE(sampling_strategy=1.0, random_state=69)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [62]:
# Class counts of the training labels after the resampling step.
print(y_train.value_counts())

Attrition_Flag
0    6801
1    6801
Name: count, dtype: int64


In [63]:
def evaluate_model(true, predicted):
    """Return the recall of `predicted` measured against the labels in `true`.

    Thin wrapper around `recall_score` called with its default arguments.
    """
    return recall_score(true, predicted)

In [64]:
# Tune logistic regression for recall.
# The default solver ("lbfgs") cannot fit an L1 penalty. With one flat grid every
# "l1" candidate therefore fails and drops out of the search, and because
# warnings are silenced at the top of the notebook nothing reports it. Each
# penalty is paired here with a solver that supports it.
params = [
    # L2 penalty with the default solver.
    {"solver": ["lbfgs"], "penalty": ["l2"], "C": [0.001, 0.01, 0.1, 0.5, 1, 10]},
    # No penalty: C is ignored in this case, so it is not searched.
    {"solver": ["lbfgs"], "penalty": [None]},
    # L1 penalty needs a solver that implements it.
    {"solver": ["liblinear"], "penalty": ["l1"], "C": [0.001, 0.01, 0.1, 0.5, 1, 10]},
]

# The seed only matters for solvers that shuffle the data (liblinear here).
model = LogisticRegression(random_state=42)
models = GridSearchCV(model, param_grid=params, scoring="recall", cv=5)
models.fit(X_train, y_train)

# Score the best estimator (refit on the whole training split) on the test split.
lr_model = models.best_estimator_
y_pred = lr_model.predict(X_test)

print("Recall test: ", evaluate_model(y_test, y_pred))

Recall test:  0.8256880733944955


In [65]:
# Gaussian naive Bayes with default settings; nothing is tuned for this model.
gnb_model = GaussianNB().fit(X_train, y_train)
y_pred = gnb_model.predict(X_test)
print("Recall: ", evaluate_model(y_test, y_pred))

Recall:  0.6880733944954128


In [66]:
# Tune an SVM classifier for recall.
# `degree` is only used by the polynomial kernel. Crossing it with every kernel
# refits identical linear/rbf/sigmoid models once per degree value. The two
# sub-grids below cover the same distinct models with 48 candidates instead of 120.
params = [
    {
        "kernel": ["linear", "rbf", "sigmoid"],
        "C": [0.001, 0.01, 0.1, 0.5, 1, 10],
    },
    {
        "kernel": ["poly"],
        "degree": [1, 2, 3, 4, 5],
        "C": [0.001, 0.01, 0.1, 0.5, 1, 10],
    },
]

model = SVC()
models = GridSearchCV(model, param_grid=params, scoring="recall", cv=5, n_jobs=-1)
models.fit(X_train, y_train)

# Score the best estimator (refit on the whole training split) on the test split.
svc_model = models.best_estimator_
y_pred = svc_model.predict(X_test)

print("Recall test: ", evaluate_model(y_test, y_pred))

Recall test:  0.7431192660550459


In [67]:
# Tune k-nearest-neighbours for recall over k = 1..5 and both weighting schemes.
params = dict(
    n_neighbors=list(range(1, 6)),
    weights=["uniform", "distance"],
)

model = KNeighborsClassifier()
models = GridSearchCV(estimator=model, param_grid=params, cv=5, scoring="recall")
models.fit(X_train, y_train)

# Score the best estimator (refit on the whole training split) on the test split.
knn_model = models.best_estimator_
y_pred = knn_model.predict(X_test)
print("Recall test: ", evaluate_model(y_test, y_pred))

Recall test:  0.8103975535168195


In [68]:
# Tune a random forest for recall over forest size and tree depth.
params = {"n_estimators" : [30, 35, 40],
          "max_depth" : [11, 13, 15, 17, 19, 22],
          }

# Fixed seed: the forest draws random bootstrap samples and feature subsets, so
# without it the selected parameters and the test recall change from run to run.
model = RandomForestClassifier(random_state=42)
models = GridSearchCV(model, param_grid=params, scoring="recall", cv=5)
models.fit(X_train, y_train)

# Score the best estimator (refit on the whole training split) on the test split.
rf_model = models.best_estimator_
y_pred = rf_model.predict(X_test)

print("Recall test: ", evaluate_model(y_test, y_pred))

Recall test:  0.8318042813455657


In [72]:
# Tune XGBoost for recall.
# NOTE(review): the grid has 3 * 4 * 2 * 2 * 3 = 144 combinations, i.e. 720
# cross-validation fits with cv=5; the recorded run of this cell ended in a
# KeyboardInterrupt, so no XGBoost result was produced.
params = dict(
    learning_rate=[0.01, 0.1, 0.5],
    max_depth=[5, 10, 25, 40],
    subsample=[0.5, 0.7],
    colsample_bytree=[0.5, 0.7],
    n_estimators=[100, 200, 400],
)

model = XGBClassifier(eval_metric="logloss")
models = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=5,
    scoring="recall",
    n_jobs=-1,
)
models.fit(X_train, y_train)

# Score the best estimator (refit on the whole training split) on the test split.
xgb_model = models.best_estimator_
y_pred = xgb_model.predict(X_test)
print("Recall test: ", evaluate_model(y_test, y_pred))

KeyboardInterrupt: 

In [70]:
# NOTE(review): `models` is reassigned by every tuning cell, so this shows the
# parameters of whichever search was fitted most recently. The recorded output
# below contains only `max_depth` and `n_estimators`, which are the keys of the
# random-forest grid, not of the XGBoost grid in the preceding cell.
models.best_params_

{'max_depth': 13, 'n_estimators': 30}

In [None]:
# Tune CatBoost for recall.
# Two corrections to the original grid, which mirrored the XGBoost one:
#   * CatBoost documents a maximum tree depth of 16, so the values 25 and 40
#     cannot be fitted and were removed.
#   * `colsample_bytree` is an XGBoost name and is not among CatBoost's
#     documented parameters; its column-sampling rate is `rsm`, also accepted
#     as `colsample_bylevel`.
params = {
    "learning_rate": [0.01, 0.1, 0.5],
    "depth": [5, 10],
    "subsample": [0.5, 0.7],
    "colsample_bylevel": [0.5, 0.7],
    "iterations": [100, 200, 400]
}

# verbose=0 keeps CatBoost from printing per-iteration training logs.
model = CatBoostClassifier(eval_metric="Logloss", verbose=0)

models = GridSearchCV(model, param_grid=params, scoring="recall", cv=5, n_jobs=-1)
models.fit(X_train, y_train)

# Score the best estimator (refit on the whole training split) on the test split.
catboost_model = models.best_estimator_
y_pred = catboost_model.predict(X_test)

print("Recall test: ", evaluate_model(y_test, y_pred))

In [None]:
# Best hyper-parameter combination found by the most recently fitted search.
models.best_params_