In [1]:
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.metrics import recall_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("data/credit_card_churn.csv")
df2 = df.copy()

In [3]:
df2.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


In [4]:
df2.drop(["CLIENTNUM", "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1", "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2"], axis=1, inplace=True)

In [5]:
df2["Attrition_Flag"] = np.where(df2["Attrition_Flag"] == "Attrited Customer", 1, 0)

In [6]:
X = df2.drop(["Attrition_Flag"], axis=1)
y = df2["Attrition_Flag"]

In [7]:
ordinalEncoder = OrdinalEncoder()
oneHotEncoder = OneHotEncoder()
standardScaler = StandardScaler()

num_features = X.select_dtypes(exclude="object").columns


preprocessor = ColumnTransformer(
    [
        ("OrdinalEncoder", ordinalEncoder, ["Education_Level", "Income_Category", "Card_Category",]),
        ("OneHotEncoder", oneHotEncoder, ["Gender", "Marital_Status"]),
        ("StandardScaler", standardScaler, num_features)

    ]
)


In [8]:
X = preprocessor.fit_transform(X)

In [9]:
X.shape

(10127, 23)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)
print(y_train.value_counts())


Attrition_Flag
0    5917
1    1171
Name: count, dtype: int64


In [11]:
sm = SMOTE()

X_train, y_train = sm.fit_resample(X_train, y_train)

In [12]:
print(y_train.value_counts())

Attrition_Flag
0    5917
1    5917
Name: count, dtype: int64


In [13]:
def evaluate_model(true, predicted):
    recall = recall_score(true, predicted)
    return recall

In [14]:
params = {"penalty" : ["l1", "l2", None],
          "C" : [0.001, 0.01, 0.1, 0.5, 1, 10],
          }

model = LogisticRegression()
models = GridSearchCV(model, param_grid=params, scoring="recall", cv=5)
models.fit(X_train, y_train)

lr_model = models.best_estimator_
y_pred = lr_model.predict(X_test)

print("Recall test: ", evaluate_model(y_test, y_pred))

Recall test:  0.8442982456140351


In [15]:
gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)
y_pred = gnb_model.predict(X_test)
print("Recall: ", evaluate_model(y_test, y_pred))

Recall:  0.7741228070175439


In [17]:
params = {"kernel" : ["rbf", "sigmoid"],
          "degree" : [1],
          "C" : [0.001]}

model = SVC()
models = GridSearchCV(model, param_grid=params, scoring="recall", cv=5, n_jobs=-1)
models.fit(X_train, y_train)


svc_model = models.best_estimator_
y_pred = svc_model.predict(X_test)

print("Recall test: ", evaluate_model(y_test, y_pred))


Recall test:  0.8135964912280702


In [18]:
params = {"n_neighbors" : [1, 2, 3, 4, 5],
          "weights" : ["uniform", "distance"],
          }

model = KNeighborsClassifier()
models = GridSearchCV(model, param_grid=params, scoring="recall", cv=5)
models.fit(X_train, y_train)

knn_model = models.best_estimator_
y_pred = knn_model.predict(X_test)

print("Recall test: ", evaluate_model(y_test, y_pred))

Recall test:  0.6535087719298246


In [22]:
params = {
    "n_estimators": [30, 35, 40, 50, 100],
    "max_depth": [11, 13, 15, 17, 19, 22, None]
}

model = RandomForestClassifier(random_state=123)

search = RandomizedSearchCV(model, param_distributions=params, n_iter=10, scoring="recall", cv=5, random_state=123, n_jobs=-1, verbose=2)

search.fit(X_train, y_train)

rf_model = search.best_estimator_
y_pred = rf_model.predict(X_test)

recall = recall_score(y_test, y_pred)
print("Recall test: ", recall)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Recall test:  0.8223684210526315


In [None]:
params = {
    "colsample_bytree": 0.6109547429295861,
    "learning_rate": 0.019161672175990796,
    "max_depth": 8,
    "min_child_weight": 2,
    "n_estimators": 153,
    "subsample": 0.9788478159629733
}

xgb_model = XGBClassifier(**params)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

recall = recall_score(y_test, y_pred)
print("Recall test: ", recall)

Recall test:  0.9035087719298246
