In [None]:
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
from sklearn.utils import shuffle
from tqdm import tqdm
import numpy as np

In [None]:
# Load the churn dataset: use the copy next to the notebook when it exists,
# otherwise read it from the /datasets directory.
try:
    data = pd.read_csv("Churn.csv")
except FileNotFoundError:
    # Only a missing local file should trigger the fallback; parse errors or
    # permission problems with the local copy are real failures and must not
    # be hidden behind a second read attempt.
    data = pd.read_csv("/datasets/Churn.csv")

In [None]:
# Print the column dtypes and non-null counts of the freshly loaded frame.
data.info()

In [None]:
# Show the first 20 rows as plain text.
print(data.head(20))

In [None]:
# Keep only the rows where Tenure is present and renumber the index from zero,
# then print the frame summary again to confirm the remaining row count.
data = data[data["Tenure"].notna()].reset_index(drop=True)
data.info()

In [None]:
# Remove the RowNumber, CustomerId and Surname columns
# (presumably per-customer identifiers - confirm against the dataset description).
data = data.drop(columns=["RowNumber", "CustomerId", "Surname"])
# One-hot encode the remaining categorical columns; drop_first removes one
# dummy per category so the encoded columns are not perfectly collinear.
data = pd.get_dummies(data, drop_first=True)
# Shuffle the rows with a fixed seed. Without it the row order differs on every
# run, so the seeded train_test_split calls further down would still produce
# different splits and the reported metrics could not be reproduced.
data = data.sample(frac=1, random_state=12345)

In [None]:
# Separate the model inputs from the label column 'Exited'.
features = data.drop(columns='Exited')
target = data['Exited']

In [None]:
# Stage 1: set aside 25% of all rows as the validation split, stratified on
# the label so the class ratio is preserved.
features_t, features_valid, target_t, target_valid = train_test_split(
    features,
    target,
    stratify=target,
    test_size=0.25,
    random_state=12345,
)

# Stage 2: split the remaining rows again, 75% for training and 25% for the
# test split, with the same seed and stratification.
features_train, features_test, target_train, target_test = train_test_split(
    features_t,
    target_t,
    stratify=target_t,
    test_size=0.25,
    random_state=12345,
)


In [None]:
# Probability cut-off used for the grid-searched random forest: a row is
# predicted as class 1 when its class-1 probability is above this value.
threshold = 0.32

In [None]:
# Running record of the best forest seen so far and the metrics it achieved;
# everything starts at zero so the first candidate with a positive F1 wins.
best_model_forest = None
best_f1_score = best_precision = best_recall = best_auc_roc = 0

In [None]:
# Grid search over the number of trees (i) and the maximum depth (g).
# Every candidate is fitted on the training split and scored on the validation
# split; the candidate with the highest F1 at `threshold` is remembered.
for i in tqdm(range(1, 100)):
    for g in range(3, 20):
        model = RandomForestClassifier(random_state=12345, n_estimators=i, max_depth=g)
        model.fit(features_train, target_train)

        # Second column of predict_proba: probability of class 1.
        probabilities = model.predict_proba(features_valid)
        probabilities_one = probabilities[:, 1]
        prediction_valid = probabilities_one > threshold

        precision = precision_score(target_valid, prediction_valid)
        recall = recall_score(target_valid, prediction_valid)
        f1 = f1_score(target_valid, prediction_valid)
        # AUC-ROC measures how well the scores rank the rows, so it must be
        # computed from the probabilities. Passing the already-thresholded
        # labels collapses the curve to a single operating point.
        roc_auc = roc_auc_score(target_valid, probabilities_one)

        if f1 > best_f1_score:
            best_f1_score = f1
            best_precision = precision
            best_recall = recall
            best_model_forest = model
            best_auc_roc = roc_auc

In [None]:
# Report the validation metrics stored for the best forest, four decimals each.
print(f"Best F1 = {'%.4f' % (best_f1_score)}\nAUC-ROC = {'%.4f' % (best_auc_roc)}\nPecision = {'%.4f' % (best_precision)}\nRecall = {'%.4f' % (best_recall)}")

In [None]:
# Extra-trees model: fit on the training split, evaluate on the validation split.
# fit() returns the estimator itself, so construction and fitting are chained.
model_extra_trees = ExtraTreesClassifier(
    n_estimators=1000,
    max_depth=2000,
    random_state=12345,
).fit(features_train, target_train)

# Second column of predict_proba: probability of class 1.
probabilities_extra_trees = model_extra_trees.predict_proba(features_valid)
probabilities_extra_trees_one_valid = probabilities_extra_trees[:, 1]

# AUC-ROC is computed from the raw probabilities; the other three metrics are
# computed from the labels obtained by cutting the probabilities at 0.36.
roc_auc_extra_trees = roc_auc_score(target_valid, probabilities_extra_trees_one_valid)
predicted_extra_trees_valid = probabilities_extra_trees_one_valid > 0.36
f1_extra_trees = f1_score(target_valid, predicted_extra_trees_valid)
recall_extra_trees = recall_score(target_valid, predicted_extra_trees_valid)
precision_extra_trees = precision_score(target_valid, predicted_extra_trees_valid)

print(f"Best F1 = {'%.4f' % (f1_extra_trees)}\nAUC-ROC = {'%.4f' % (roc_auc_extra_trees)}\nPecision = {'%.4f' % (precision_extra_trees)}\nRecall = {'%.4f' % (recall_extra_trees)}")


In [None]:
# Evaluate the grid-search winner on the test split, using the same
# probability cut-off that was applied during the search.
probabilities = best_model_forest.predict_proba(features_test)
probabilities_one = probabilities[:, 1]
prediction_test = probabilities_one > threshold

precision = precision_score(target_test, prediction_test)
recall = recall_score(target_test, prediction_test)
f1 = f1_score(target_test, prediction_test)
# AUC-ROC is a ranking metric and needs the class-1 probabilities; feeding it
# the thresholded labels would report a single-point curve instead.
roc_auc = roc_auc_score(target_test, probabilities_one)

print(f"Best F1 = {'%.4f' % (f1)}\nAUC-ROC = {'%.4f' % (roc_auc)}\nPecision = {'%.4f' % (precision)}\nRecall = {'%.4f' % (recall)}")


In [None]:
# Random forest with class_weight="balanced" to compensate for class imbalance;
# fitted on the training split and scored on the validation split.
model_weigted = RandomForestClassifier(random_state=12345, n_estimators=100, max_depth=20, class_weight="balanced")
model_weigted.fit(features_train, target_train)

# Second column of predict_proba: probability of class 1.
probabilities_weigted = model_weigted.predict_proba(features_valid)
probabilities_one_weigted = probabilities_weigted[:, 1]
# Labels for precision / recall / F1 come from cutting the probabilities at 0.38.
prediction_weigted_valid = probabilities_one_weigted > 0.38
precision_weigted = precision_score(target_valid, prediction_weigted_valid)
recall_weigted = recall_score(target_valid, prediction_weigted_valid)
f1_weigted = f1_score(target_valid, prediction_weigted_valid)
# AUC-ROC must be computed from the probabilities, not from the thresholded
# labels, otherwise the curve degenerates to one operating point.
roc_auc_weigted = roc_auc_score(target_valid, probabilities_one_weigted)

print(f"Best F1 = {'%.4f' % (f1_weigted)}\nAUC-ROC = {'%.4f' % (roc_auc_weigted)}\nPecision = {'%.4f' % (precision_weigted)}\nRecall = {'%.4f' % (recall_weigted)}")


In [None]:
# Evaluate the class-weighted forest on the test split with the 0.38 cut-off.
probabilities_weigted = model_weigted.predict_proba(features_test)
probabilities_one_weigted = probabilities_weigted[:, 1]
prediction_weigted_test = probabilities_one_weigted > 0.38
precision_weigted = precision_score(target_test, prediction_weigted_test)
recall_weigted = recall_score(target_test, prediction_weigted_test)
f1_weigted = f1_score(target_test, prediction_weigted_test)
# AUC-ROC must be computed from the probabilities, not from the thresholded
# labels, otherwise the curve degenerates to one operating point.
roc_auc_weigted = roc_auc_score(target_test, probabilities_one_weigted)

print(f"Best F1 = {'%.4f' % (f1_weigted)}\nAUC-ROC = {'%.4f' % (roc_auc_weigted)}\nPecision = {'%.4f' % (precision_weigted)}\nRecall = {'%.4f' % (recall_weigted)}")

In [None]:
def upsample(features, target, repeat, random_state=12345):
    """Oversample class 1 by repeating its rows, then shuffle the result.

    Args:
        features: DataFrame of model inputs. It is filtered with boolean masks
            built from ``target``, so the two must share the same index.
        target: Series of labels. Rows equal to 0 are kept once, rows equal to
            1 are repeated; rows with any other label are not carried over.
        repeat: Number of copies of the class-1 rows placed in the output.
        random_state: Seed forwarded to ``sklearn.utils.shuffle``. The default
            reproduces the previously hard-coded value.

    Returns:
        Tuple ``(features_upsampled, target_upsampled)`` shuffled with the same
        permutation, so rows and labels stay aligned.
    """
    is_negative = target == 0
    is_positive = target == 1

    # One copy of the class-0 rows followed by `repeat` copies of the class-1 rows.
    features_upsampled = pd.concat([features[is_negative]] + [features[is_positive]] * repeat)
    target_upsampled = pd.concat([target[is_negative]] + [target[is_positive]] * repeat)

    # Shuffle so the repeated rows are not grouped at the end of the frame.
    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=random_state)

    return features_upsampled, target_upsampled

In [None]:
# Build an oversampled training set in which the class-1 rows appear 4 times.
features_upsampled, target_upsampled = upsample(features_train, target_train, 4)

In [None]:
# Random forest fitted on the oversampled training set and scored on the
# (untouched) validation split.
model_upsemled = RandomForestClassifier(random_state=12345, n_estimators=1000, max_depth=20)
model_upsemled.fit(features_upsampled, target_upsampled)

# Second column of predict_proba: probability of class 1.
probabilities_upsemled = model_upsemled.predict_proba(features_valid)
probabilities_one_upsemled = probabilities_upsemled[:, 1]
# Labels for precision / recall / F1 come from cutting the probabilities at 0.38.
prediction_upsemled_valid = probabilities_one_upsemled > 0.38
precision_upsemled = precision_score(target_valid, prediction_upsemled_valid)
recall_upsemled = recall_score(target_valid, prediction_upsemled_valid)
f1_upsemled = f1_score(target_valid, prediction_upsemled_valid)
# AUC-ROC must be computed from the probabilities, not from the thresholded
# labels, otherwise the curve degenerates to one operating point.
roc_auc_upsemled = roc_auc_score(target_valid, probabilities_one_upsemled)

print(f"Best F1 = {'%.4f' % (f1_upsemled)}\nAUC-ROC = {'%.4f' % (roc_auc_upsemled)}\nPecision = {'%.4f' % (precision_upsemled)}\nRecall = {'%.4f' % (recall_upsemled)}")

In [None]:
# Evaluate the forest trained on oversampled data on the test split (0.38 cut-off).
probabilities_upsemled = model_upsemled.predict_proba(features_test)
probabilities_one_upsemled = probabilities_upsemled[:, 1]
prediction_upsemled_test = probabilities_one_upsemled > 0.38
precision_upsemled = precision_score(target_test, prediction_upsemled_test)
recall_upsemled = recall_score(target_test, prediction_upsemled_test)
f1_upsemled = f1_score(target_test, prediction_upsemled_test)
# AUC-ROC must be computed from the probabilities, not from the thresholded
# labels, otherwise the curve degenerates to one operating point.
roc_auc_upsemled = roc_auc_score(target_test, probabilities_one_upsemled)

print(f"Best F1 = {'%.4f' % (f1_upsemled)}\nAUC-ROC = {'%.4f' % (roc_auc_upsemled)}\nPecision = {'%.4f' % (precision_upsemled)}\nRecall = {'%.4f' % (recall_upsemled)}")