In [92]:
import numpy as npy
import pandas as pds

from pandas import Series, DataFrame

import scipy

import seaborn as sbn

from pylab import rcParams
import matplotlib.pyplot as plot

import sklearn
from sklearn.preprocessing import scale
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Data Visualisation Libraries
%config InlineBackend.figure_format = 'retina'

# Statistics
from scipy.stats import chi2_contingency

# Machine Learning
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

from sklearn.metrics import accuracy_score, recall_score, precision_score, auc, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix


In [93]:
font_size = 30
plot.rcParams['axes.labelsize'] = font_size
plot.rcParams['axes.titlesize'] = font_size + 5
plot.rcParams['ytick.labelsize'] = font_size - 5
plot.rcParams['legend.fontsize'] = font_size - 5

random_state = 23
scoring_metric = 'recall'
comparison_dict, comparison_test_dict = {}, {}

In [94]:
address = 'D:\ME781\PROJECT\Churn_Modelling.csv'
churn = pds.read_csv(address)
churn.columns = ('CustomerId','Surname','CreditScore','Geography','Gender','Age','Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember','EstimatedSalary','Excited')
churn.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Excited
0,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [95]:
churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       10000 non-null  int64  
 1   Surname          10000 non-null  object 
 2   CreditScore      10000 non-null  int64  
 3   Geography        10000 non-null  object 
 4   Gender           10000 non-null  object 
 5   Age              10000 non-null  int64  
 6   Tenure           10000 non-null  int64  
 7   Balance          10000 non-null  float64
 8   NumOfProducts    10000 non-null  int64  
 9   HasCrCard        10000 non-null  int64  
 10  IsActiveMember   10000 non-null  int64  
 11  EstimatedSalary  10000 non-null  float64
 12  Excited          10000 non-null  int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 1015.8+ KB


In [96]:
churn.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CustomerId,10000.0,15690940.0,71936.186123,15565701.0,15628528.25,15690740.0,15753230.0,15815690.0
CreditScore,10000.0,650.5288,96.653299,350.0,584.0,652.0,718.0,850.0
Age,10000.0,38.9218,10.487806,18.0,32.0,37.0,44.0,92.0
Tenure,10000.0,5.0128,2.892174,0.0,3.0,5.0,7.0,10.0
Balance,10000.0,76485.89,62397.405202,0.0,0.0,97198.54,127644.2,250898.09
NumOfProducts,10000.0,1.5302,0.581654,1.0,1.0,1.0,2.0,4.0
HasCrCard,10000.0,0.7055,0.45584,0.0,0.0,1.0,1.0,1.0
IsActiveMember,10000.0,0.5151,0.499797,0.0,0.0,1.0,1.0,1.0
EstimatedSalary,10000.0,100090.2,57510.492818,11.58,51002.11,100193.9,149388.2,199992.48
Excited,10000.0,0.2037,0.402769,0.0,0.0,0.0,0.0,1.0


In [97]:
feature_cols = ['CreditScore','Age','Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember','EstimatedSalary']
X = churn[feature_cols] # Features
y = churn.Excited # Target variable

In [98]:
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state= 23)

In [99]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(random_state= 23)
logreg.fit(Xtrain, ytrain)
ypred = logreg.predict(Xtest)

In [100]:
from sklearn import metrics

cnf_matrix = metrics.confusion_matrix(ytest, ypred)
cnf_matrix

array([[1948,   50],
       [ 474,   28]], dtype=int64)

In [101]:
from sklearn.metrics import classification_report
target_names = ['excited', 'not excited']
print(classification_report(ytest, ypred, target_names=target_names))

              precision    recall  f1-score   support

     excited       0.80      0.97      0.88      1998
 not excited       0.36      0.06      0.10       502

    accuracy                           0.79      2500
   macro avg       0.58      0.52      0.49      2500
weighted avg       0.71      0.79      0.72      2500

