# CUSTOMER CHURN PREDICTION

 Develop a model to predict customer churn for a subscription
based service or business. Use historical customer data, including
 features like usage behavior and customer demographics, and try
 algorithms like Logistic Regression, Random Forests, or Gradient
 Boosting to predict churn.

In [1]:
## Import all the necessary libraries and models

import pandas as pd
import numpy as np 
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [2]:
## Read the data-set
df=pd.read_csv('Churn_modelling.csv')

In [3]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [5]:
## Checking the data types of all columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [6]:
## Get unique values from the 'Geography' column
df.Geography.unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [7]:
## data encoding for geography
encoder_geography=OneHotEncoder(sparse_output=False)
encoded_geography=encoder_geography.fit_transform(df[['Geography']])
New_geography=pd.DataFrame(encoded_geography,columns=encoder_geography.get_feature_names_out())


In [8]:
## data encoding for gender
encoder_gender=OneHotEncoder(sparse_output=False)
encoded_gender=encoder_gender.fit_transform(df[['Gender']])
New_gender=pd.DataFrame(encoded_gender,columns=encoder_gender.get_feature_names_out())


In [9]:
# removing unwanted columns 
df.drop(['RowNumber','CustomerId', 'Surname'],inplace=True,axis=1)

In [10]:
## Adding new columns to the dataset
df=pd.concat([df,New_geography,New_gender],axis=1)

In [11]:
## Droping unwanted columns
df.drop(['Gender','Geography'],axis=1,inplace=True)

In [12]:
# Save to CSV
df.to_csv('modified_Churn_Modelling.csv')

In [13]:
# Split features (X) and labels (y)
X=df.drop('Exited',axis=1)
y=df['Exited']

In [14]:
## spliting the dataset
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.3,random_state=42)

In [15]:
## Performing standardization
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [16]:
## Model training and metric evaluation

models={
    "Logisitic Regression":LogisticRegression(),
    "Random Forest":RandomForestClassifier(),
    "Gradient Boost":GradientBoostingClassifier(),
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
    model_train_precision = precision_score(y_train, y_train_pred) # Calculate Precision
    model_train_recall = recall_score(y_train, y_train_pred) # Calculate Recall
    model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)


    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
    model_test_precision = precision_score(y_test, y_test_pred) # Calculate Precision
    model_test_recall = recall_score(y_test, y_test_pred) # Calculate Recall
    model_test_rocauc_score = roc_auc_score(y_test, y_test_pred) #Calculate Roc


    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))

    
    
    print('----------------------------------')
    
    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))

    
    print('='*35)
    print('\n')

Logisitic Regression
Model performance for Training set
- Accuracy: 0.8101
- F1 score: 0.7727
- Precision: 0.6188
- Recall: 0.2223
- Roc Auc Score: 0.5932
----------------------------------
Model performance for Test set
- Accuracy: 0.8113
- F1 score: 0.7743
- Precision: 0.5421
- Recall: 0.1986
- Roc Auc Score: 0.5790


Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.8687
- F1 score: 0.8557
- Precision: 0.7669
- Recall: 0.4675
- Roc Auc Score: 0.7166


Gradient Boost
Model performance for Training set
- Accuracy: 0.8719
- F1 score: 0.8602
- Precision: 0.8048
- Recall: 0.5052
- Roc Auc Score: 0.7365
----------------------------------
Model performance for Test set
- Accuracy: 0.8720
- F1 score: 0.8601
- Precision: 0.7732
- Recall: 0.4846
- Roc Auc Score: 0.7251


