In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv(r'D:\Data_Engineer\projects\django_aws\customer_churn\customer_churn_server\Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# Drop insignificant columns
df.drop(columns=["RowNumber","CustomerId","Surname"],inplace=True)

In [None]:
df.rename(columns={"Exited":"Churned"},inplace=True)
df["Churned"].replace({0:"No",1:"Yes"},inplace=True)
df.head()

In [5]:
conditions = [(df["NumOfProducts"]==1), (df["NumOfProducts"]==2), (df["NumOfProducts"]>2)]
values = ["One Product", "Two Products", "More Than 2 Products"]

In [7]:
df["Total_Products"] = np.select(conditions, values, default="Unknown")
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Total_Products
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1,One Product
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,One Product
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,More Than 2 Products
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0,Two Products
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,One Product


In [8]:
df["Total_Products"].unique()

array(['One Product', 'More Than 2 Products', 'Two Products'],
      dtype=object)

In [9]:
df.drop(columns="NumOfProducts", inplace=True)
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Total_Products
0,619,France,Female,42,2,0.0,1,1,101348.88,1,One Product
1,608,Spain,Female,41,1,83807.86,0,1,112542.58,0,One Product
2,502,France,Female,42,8,159660.8,1,0,113931.57,1,More Than 2 Products
3,699,France,Female,39,1,0.0,0,0,93826.63,0,Two Products
4,850,Spain,Female,43,2,125510.82,1,1,79084.1,0,One Product


In [10]:
conditions = [(df["Balance"]==0), (df["Balance"]>0)]
values = ["Zero Balance", "More Than Zero Balance"]

In [12]:
df["Account_Balance"] = np.select(conditions, values, default="Unknown")
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Total_Products,Account_Balance
0,619,France,Female,42,2,0.0,1,1,101348.88,1,One Product,Zero Balance
1,608,Spain,Female,41,1,83807.86,0,1,112542.58,0,One Product,More Than Zero Balance
2,502,France,Female,42,8,159660.8,1,0,113931.57,1,More Than 2 Products,More Than Zero Balance
3,699,France,Female,39,1,0.0,0,0,93826.63,0,Two Products,Zero Balance
4,850,Spain,Female,43,2,125510.82,1,1,79084.1,0,One Product,More Than Zero Balance


In [13]:
df.drop(columns="Balance", inplace=True)
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Total_Products,Account_Balance
0,619,France,Female,42,2,1,1,101348.88,1,One Product,Zero Balance
1,608,Spain,Female,41,1,0,1,112542.58,0,One Product,More Than Zero Balance
2,502,France,Female,42,8,1,0,113931.57,1,More Than 2 Products,More Than Zero Balance
3,699,France,Female,39,1,0,0,93826.63,0,Two Products,Zero Balance
4,850,Spain,Female,43,2,1,1,79084.1,0,One Product,More Than Zero Balance


In [14]:
cat_cols = ["Geography", "Gender", "Total_Products", "Account_Balance"]
for column in cat_cols:
    print(f"Unique Values in {column} column is:",df[column].unique())
    print("-"*100,"\n")

Unique Values in Geography column is: ['France' 'Spain' 'Germany']
---------------------------------------------------------------------------------------------------- 

Unique Values in Gender column is: ['Female' 'Male']
---------------------------------------------------------------------------------------------------- 

Unique Values in Total_Products column is: ['One Product' 'More Than 2 Products' 'Two Products']
---------------------------------------------------------------------------------------------------- 

Unique Values in Account_Balance column is: ['Zero Balance' 'More Than Zero Balance']
---------------------------------------------------------------------------------------------------- 



In [15]:
df = pd.get_dummies(columns=cat_cols, data=df)
#   convert categorical column to many columns that have all unique values of them

In [18]:
df.head()

Unnamed: 0,CreditScore,Age,Tenure,HasCrCard,IsActiveMember,EstimatedSalary,Churned,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,Total_Products_More Than 2 Products,Total_Products_One Product,Total_Products_Two Products,Account_Balance_More Than Zero Balance,Account_Balance_Zero Balance
0,619,42,2,1,1,101348.88,1,True,False,False,True,False,False,True,False,False,True
1,608,41,1,0,1,112542.58,0,False,False,True,True,False,False,True,False,True,False
2,502,42,8,1,0,113931.57,1,True,False,False,True,False,True,False,False,True,False
3,699,39,1,0,0,93826.63,0,True,False,False,True,False,False,False,True,False,True
4,850,43,2,1,1,79084.1,0,False,False,True,True,False,False,True,False,True,False


In [17]:
df.rename(columns={"Exited": "Churned"}, inplace=True)

In [19]:
cols = ["CreditScore", "Age", "EstimatedSalary"]
df[cols].skew().to_frame().rename(columns={0:"Feature Sknewness"})

Unnamed: 0,Feature Sknewness
CreditScore,-0.071607
Age,1.01132
EstimatedSalary,0.002085


In [20]:
X = df.drop(columns="Churned")
y = df["Churned"]

In [21]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score ,f1_score
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score

from imblearn.over_sampling import SMOTE

In [22]:
#   splitting data for Model Traning and Testing
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print("Shape of x_train is ", x_train.shape)
print("Shape of x_test is ", x_test.shape)
print("Shape of y_train is ", y_train.shape)
print("Shape of y_test is ", y_test.shape)



Shape of x_train is  (8000, 16)
Shape of x_test is  (2000, 16)
Shape of y_train is  (8000,)
Shape of y_test is  (2000,)


In [23]:
y_train.value_counts()

# Applying SMOTE to overcome the class-imbalance in taget variable
smt = SMOTE(random_state=42)

x_train_resampled, y_train_resampled = smt.fit_resample(x_train, y_train)
print(x_train_resampled.shape, y_train_resampled.shape)

y_train_resampled.value_counts().to_frame()

(12736, 16) (12736,)


Unnamed: 0_level_0,count
Churned,Unnamed: 1_level_1
0,6368
1,6368


In [24]:
dtree = DecisionTreeClassifier()

In [25]:
param_grid = {"max_depth":[3,4,5,6,7,8,9,10],
              "min_samples_split":[2,3,4,5,6,7,8],
              "min_samples_leaf":[1,2,3,4,5,6,7,8],
              "criterion":["gini","entropy"],
              "splitter":["best","random"],
              "max_features":["auto",None],
              "random_state":[0,42]}

In [26]:
grid_search = GridSearchCV(dtree, param_grid, cv=5, n_jobs=-1)

grid_search.fit(x_train_resampled,y_train_resampled)

17920 fits failed out of a total of 35840.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8920 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Data_Engineer\projects\django_aws\customer_churn\customer_churn_server\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Data_Engineer\projects\django_aws\customer_churn\customer_churn_server\venv\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "d:\Data_Engineer\projects\django_aws\customer_churn\customer_churn_server\venv\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_

In [27]:
best_parameters = grid_search.best_params_

print("Best Parameters for DecisionTree Model is:\n\n")
best_parameters

Best Parameters for DecisionTree Model is:




{'criterion': 'gini',
 'max_depth': 10,
 'max_features': None,
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'random_state': 0,
 'splitter': 'random'}

In [28]:
dtree = DecisionTreeClassifier(**best_parameters)

dtree.fit(x_train_resampled,y_train_resampled)

In [29]:
y_train_pred = dtree.predict(x_train_resampled)
y_test_pred = dtree.predict(x_test)

print("Accuracy Score of Model on Training Data is =>",round(accuracy_score(y_train_resampled,y_train_pred)*100,2),"%")
print("Accuracy Score of Model on Testing Data  is =>",round(accuracy_score(y_test,y_test_pred)*100,2),"%")

Accuracy Score of Model on Training Data is => 89.53 %
Accuracy Score of Model on Testing Data  is => 82.85 %


In [30]:
def predict_churn(model, input_features):
    # Preprocess input features
    input_df = pd.DataFrame([input_features], columns=x_test.columns)
    
    # Make predictions
    predictions = model.predict(input_df)
    
    return predictions

# Example usage of the function
input_features = [597, 3.5553480614894135, 8, 1, 1, 192852.67, False, True, False, True, False, False, True, False, True, False]

# Assuming 'rfc' is your trained RandomForestClassifier
prediction = predict_churn(dtree, input_features)
print("Predicted churn:", prediction)

Predicted churn: [0]
