# Customer Churn Prediction

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the churn workbook into a DataFrame and preview its first rows.
p3=pd.read_excel('P3- Churn-Modelling Data.xlsx')
p3.head()

In [None]:
# Dimensions of the dataset as (rows, columns).
p3.shape

In [None]:
# Summary of the columns (dtypes and non-null counts).
p3.info()

In [None]:
# How many rows fall into each value of the 'churned' column.
p3['churned'].value_counts()

## Customer Demographics:

In [None]:
# Frequency of each distinct age value.
p3['Age'].value_counts()

In [None]:
# Bucket customers into labelled age bands.
# The bins are right-closed (the pd.cut default) so every label matches its
# interval exactly: (0, 18] -> '0-18', (18, 30] -> '19-30', (30, 40] -> '31-40', ...
# With left-closed bins an 18-year-old would be labelled '19-30' and a
# 30-year-old '31-40'.
# NOTE(review): ages above 90 fall outside the last edge and become NaN —
# confirm against the maximum age in the data.
bins=[0,18,30,40,50,60,70,80,90]
age_groups=['0-18','19-30','31-40','41-50','51-60','61-70','71-80','81-90']
p3['Age Group']=pd.cut(p3['Age'],bins=bins,labels=age_groups)
# Column of ones so that group sizes can be obtained by summing.
p3['num']=1
p3.head()

In [None]:
# Number of customers per age group: sum the helper column of ones within each group.
age_dist=p3.groupby('Age Group')['num'].sum().reset_index()
age_dist

In [None]:
# Bar chart of customer counts per age group, ordered by the group index.
age_axes=p3['Age Group'].value_counts().sort_index().plot(kind='bar')
age_axes.set_xlabel('Age Group')
age_axes.set_ylabel('Count of customer')
age_axes.set_title('Distribution of customers across different age groups')

In [None]:
# Gender split of the customer base.
# NOTE(review): in a notebook only a cell's last expression is rendered, so the
# table built on the next line is computed but presumably never shown; the pie
# chart is the visible output.
p3['Gender'].value_counts().reset_index()
p3['Gender'].value_counts().plot(kind='pie')

## Churn Analysis:

In [None]:
# Overall churn rate: churned rows over all rows, as a percentage with two decimals.
churn_flags=p3['churned']
churn_ratio=churn_flags.sum()/len(churn_flags)
percent_cust_churned=(churn_ratio*100).round(2)
print(f'Percentage of customer churned is {percent_cust_churned}%')

In [None]:
# For each product count, the share of all churned customers who hold that many products.
total_churned=p3['churned'].sum()
for num in range(1,5):
    churned_with_num=p3.loc[p3['NumOfProducts']==num,'churned'].sum()
    print(f"percentage of customer churned having number of product = {num} is {((churned_with_num/total_churned)*100).round(2)}")

In [None]:
#share of all churned customers who are not active members
inactive_churned=p3.loc[p3['IsActiveMember']==0,'churned'].sum()
print(f"percentage of customer churned who is not an active member is {((inactive_churned/p3['churned'].sum())*100).round(2)}")

In [None]:
#churned-customer count for every age group, in the order returned by unique()
churn_agewise=[
    p3.loc[p3['Age Group']==group,'churned'].sum()
    for group in p3['Age Group'].unique()
]
print(churn_agewise)

In [None]:
# Pair each age group with its churn count; both columns follow the order of
# p3['Age Group'].unique(), which is also the order churn_agewise was built in.
agewise_churn_df=pd.DataFrame({'Age Group':p3['Age Group'].unique(),'Churn':churn_agewise})

In [None]:
# Show the table ordered by age group (the sorted copy is not stored).
agewise_churn_df.sort_values(by='Age Group')

In [None]:
# Bar chart of churned-customer counts per age group.
sns.barplot(data=agewise_churn_df,x='Age Group',y='Churn')

In [None]:
# Pie chart of the 'churned' value counts.
p3['churned'].value_counts().plot(kind='pie')

In [None]:
# Print the highest and lowest credit score, then show the number of churned
# customers whose credit score is above 600.
print(p3['CreditScore'].max(), p3['CreditScore'].min())
p3[p3['CreditScore']>600]['churned'].sum()

In [None]:
# Churned-customer count for every region, in the order returned by unique().
regionwise_churned=[
    p3.loc[p3['Geography']==region,'churned'].sum()
    for region in p3['Geography'].unique()
]
print(regionwise_churned)

In [None]:
# Bar chart of churned-customer counts per region; the labels use the same
# unique() order that regionwise_churned was built in.
plt.bar(p3['Geography'].unique().tolist(),regionwise_churned)

In [None]:
# Pairwise correlations between the numeric features and the churn flag.
# 'Geography' is left out: it is a categorical text column (one-hot encoded
# later in this notebook), and DataFrame.corr() cannot correlate strings —
# pandas >= 2.0 raises on it, while older versions silently dropped it.
p3[['CreditScore','Age','Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember','churned']].corr()

## Product Usage:

In [None]:
# Count customers per number of products held, as a two-column DataFrame.
prod_usage=p3['NumOfProducts'].value_counts().reset_index()

In [None]:
# Bar chart of how many customers hold each number of products.
# The two columns are addressed by position because value_counts().reset_index()
# names them differently across pandas versions ('index' and the column name
# before 2.0; the column name and 'count' from 2.0 on).
label_col, count_col=prod_usage.columns[:2]
prod_usage.plot(x=label_col, y=count_col, kind='bar')
plt.ylabel('Count')
plt.xlabel('Number of Products')
plt.title('most commonly used product or services')

In [None]:
# Display the DataFrame in its current state.
p3

In [None]:
# Customers holding exactly one product, counted per age group and ordered by
# the age-group index.
# rename_axis / reset_index(name=...) set the column names explicitly, so the
# result is ['Age Group', 'count'] regardless of how the installed pandas
# version labels value_counts() output (sorting by an 'index' column only
# works before pandas 2.0).
usage1_agewise=(
    p3.loc[p3['NumOfProducts']==1,'Age Group']
    .value_counts()
    .sort_index()
    .rename_axis('Age Group')
    .reset_index(name='count')
)
usage1_agewise.plot(x='Age Group')
plt.ylabel('Frequency')
plt.title('Usage pattern of different customer segment according to age group of number of product = 1')
plt.grid(True)

In [None]:
# Customers holding exactly two products, counted per age group and ordered by
# the age-group index.
# rename_axis / reset_index(name=...) set the column names explicitly, so the
# result is ['Age Group', 'count'] regardless of how the installed pandas
# version labels value_counts() output (sorting by an 'index' column only
# works before pandas 2.0).
usage2_agewise=(
    p3.loc[p3['NumOfProducts']==2,'Age Group']
    .value_counts()
    .sort_index()
    .rename_axis('Age Group')
    .reset_index(name='count')
)
usage2_agewise.plot(x='Age Group')
plt.ylabel('Frequency')
plt.title('Usage pattern of different customer segment according to age group of number of product = 2')
plt.grid(True)

In [None]:
# Customers holding exactly three products, counted per age group and ordered
# by the age-group index.
# rename_axis / reset_index(name=...) set the column names explicitly, so the
# result is ['Age Group', 'count'] regardless of how the installed pandas
# version labels value_counts() output (sorting by an 'index' column only
# works before pandas 2.0).
usage3_agewise=(
    p3.loc[p3['NumOfProducts']==3,'Age Group']
    .value_counts()
    .sort_index()
    .rename_axis('Age Group')
    .reset_index(name='count')
)
usage3_agewise.plot(x='Age Group')
plt.ylabel('Frequency')
plt.title('Usage pattern of different customer segment according to age group of number of product = 3')
plt.grid(True)

In [None]:
# Customers holding exactly four products, counted per age group and ordered
# by the age-group index.
# rename_axis / reset_index(name=...) set the column names explicitly, so the
# result is ['Age Group', 'count'] regardless of how the installed pandas
# version labels value_counts() output (sorting by an 'index' column only
# works before pandas 2.0).
usage4_agewise=(
    p3.loc[p3['NumOfProducts']==4,'Age Group']
    .value_counts()
    .sort_index()
    .rename_axis('Age Group')
    .reset_index(name='count')
)
usage4_agewise.plot(x='Age Group')
plt.ylabel('Frequency')
plt.title('Usage pattern of different customer segment according to age group of number of product = 4')
plt.grid(True)

In [None]:
# Overlay the per-age-group customer counts of the four product segments on a
# single axes, one line per segment.
fig, axes=plt.subplots(figsize=(12,5))
segment_tables=[usage1_agewise, usage2_agewise, usage3_agewise, usage4_agewise]
for n_products, segment in enumerate(segment_tables, start=1):
    axes.plot(segment['Age Group'], segment['count'], label=f'NumOfProduct={n_products}')
axes.set_xlabel('Age Group')
axes.set_ylabel('Frequency')
axes.set_title('Usage Pattern of different customer segment according to age group')
axes.legend(title='Customer Segment')
# Also show the one-product table as the cell's output.
usage1_agewise

## Financial Analysis:

In [None]:
# Overall average balance, followed by a side-by-side profile of churned and
# non-churned customers.
avg_acc_bal=p3['Balance'].mean().round(2)
print(f'Average account Balance of the customers are {avg_acc_bal}')

# Slice each cohort once instead of re-filtering p3 for every statistic.
churned_df=p3[p3['churned']==1]
retained_df=p3[p3['churned']==0]

# Credit-card ownership is expressed as a share of the cohort it is reported
# under (mean of the HasCrCard flag), not of a hard-coded dataset size, so the
# figure stays correct for any number of rows.
# NOTE(review): assumes HasCrCard is a 0/1 flag — confirm.
churned_card_pct=round(churned_df['HasCrCard'].mean()*100,2)
retained_card_pct=round(retained_df['HasCrCard'].mean()*100,2)

# The \u200b escapes emit the zero-width-space separator lines of the report.
print(f'''
Churned Customer:
\u200b
Number of customer churned is {churned_df['churned'].sum()}
\u200b
Average Credit Score is {churned_df['CreditScore'].mean().round()}
Median Credit Score is {churned_df['CreditScore'].median()}
Max and Min Credit Score is {churned_df['CreditScore'].max()} and {churned_df['CreditScore'].min()}
Average Balance is {churned_df['Balance'].mean().round()}
Average Tenure is {churned_df['Tenure'].mean().round()}
Average Number of Product used is {churned_df['NumOfProducts'].mean().round()}
Percentage of customers having credit card is {churned_card_pct}
Number of person who is active member is {churned_df['IsActiveMember'].sum()}
Average estimated Salary is {churned_df['EstimatedSalary'].mean().round()}
\u200b
Non-Churned Customer:
\u200b
Number of Non-churned customer is {len(retained_df)}
\u200b
Average Credit Score is {retained_df['CreditScore'].mean().round()}
Median Credit Score is {retained_df['CreditScore'].median()}
Max and Min Credit Score is {retained_df['CreditScore'].max()} and {retained_df['CreditScore'].min()}
Average Balance is {retained_df['Balance'].mean().round()}
Average Tenure is {retained_df['Tenure'].mean().round()}
Average Number of Product used is {retained_df['NumOfProducts'].mean().round()}
Percentage of customers having credit card is {retained_card_pct}
Number of person who is active member is {retained_df['IsActiveMember'].sum()}
Average estimated Salary is {retained_df['EstimatedSalary'].mean().round()}
\u200b
''')

In [None]:
# Number of distinct customer IDs.
p3['CustomerId'].nunique()

## Predictive Modeling:

In [None]:
# Columns to be drawn as box plots in the next cell.
col=['CreditScore','Balance','EstimatedSalary']

In [None]:
# One box plot per selected column, side by side.
# The loop variable is deliberately not named `col`: rebinding the list to its
# last element would make a second run of this cell iterate over the
# characters of a string instead of over the column names.
fig, axes=plt.subplots(1,3)
for i, col_name in enumerate(col):
    axes[i].boxplot(x=col_name,data=p3)
    axes[i].set_title(f'Box Plot of {col_name}')

plt.tight_layout()
plt.show()

In [None]:
# List all column names.
p3.columns

In [None]:
# Modelling table: p3 without the listed identifier and helper columns.
unused_columns=['Surname','Age Group','RowNumber','CustomerId','num']
p3_new=p3.drop(columns=unused_columns)
p3_new.head()

In [None]:
# Count missing values per column.
p3_new.isnull().sum()

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each so the dummies are not redundant.
cat_feats=['Geography','Gender']
final_p3=pd.get_dummies(p3_new,columns=cat_feats,drop_first=True)
final_p3.head()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
#function of VIF
def calc_vif(X):
    """Return the variance inflation factor of every column in ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; every column must be convertible to float.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X`` with two columns: ``variables`` (the
        column name) and ``VIF`` (its variance inflation factor).
    """
    # Convert once to a float array. When bool dummy columns are mixed with
    # numeric ones, ``X.values`` is an object array, which the numeric
    # routines behind variance_inflation_factor may reject.
    features = np.asarray(X, dtype=float)

    vif = pd.DataFrame({
        "variables": X.columns,
        "VIF": [variance_inflation_factor(features, i) for i in range(features.shape[1])],
    })

    return vif

In [None]:
# Display the encoded table.
final_p3

In [None]:
# Feature table for the VIF check: the encoded data without the target and
# without CreditScore and Age.
vif_excluded=['churned','CreditScore','Age']
a = final_p3.drop(columns=vif_excluded)
a

In [None]:
# Variance inflation factor of each column in `a`.
calc_vif(a)

In [None]:
# Remove CreditScore and Age from the modelling table itself.
final_p3=final_p3.drop(columns=['CreditScore','Age'])
final_p3

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# SMOTE resampler configured with the 'minority' sampling strategy and a fixed seed.
sm=SMOTE(sampling_strategy='minority', random_state=42)

In [None]:
# Resample the features (all columns except 'churned') together with the target.
oversampled_X, oversampled_y=sm.fit_resample(final_p3.drop('churned',axis=1), final_p3['churned'])

In [None]:
# View the resampled features as a DataFrame.
pd.DataFrame(oversampled_X)

In [None]:
# View the resampled target as a DataFrame.
pd.DataFrame(oversampled_y)

In [None]:
# Put the resampled features and target side by side in a single DataFrame.
oversampled=pd.concat([pd.DataFrame(oversampled_X), pd.DataFrame(oversampled_y)], axis=1)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out the test set from the original (non-resampled) data first, then
# oversample the training portion only. Splitting after SMOTE leaks
# information: synthetic rows generated from test-set neighbours end up in the
# training data and the test set itself contains synthetic rows, which
# inflates the reported scores. The test set now keeps the real class balance.
X=final_p3.drop('churned',axis=1)
y=final_p3['churned']
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.33, random_state=42)
X_train, y_train=sm.fit_resample(X_train, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a default logistic regression on the training split and predict the test split.
logmodel=LogisticRegression().fit(X_train,y_train)
predictions=logmodel.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# Classification report and confusion matrix for the logistic regression predictions.
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fit a default decision tree on the training split and predict the test split.
dtree=DecisionTreeClassifier().fit(X_train,y_train)
pred=dtree.predict(X_test)

In [None]:
# Classification report and confusion matrix for the decision tree predictions.
print(classification_report(y_test,pred))
print(confusion_matrix(y_test,pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a default random forest on the training split and predict the test split.
rfc=RandomForestClassifier().fit(X_train,y_train)
pred1=rfc.predict(X_test)

In [None]:
# Classification report and confusion matrix for the random forest predictions.
print(classification_report(y_test,pred1))
print(confusion_matrix(y_test,pred1))

In [None]:
# Configure a PyCaret classification experiment on final_p3 (the encoded
# table, not the SMOTE-resampled one) with 'churned' as the target.
# NOTE(review): the star import pulls every public PyCaret name into the
# notebook namespace; the later cells rely on it (compare_models, create_model, ...).
from pycaret.classification import *
class_experiment = setup(final_p3, 
                       target = 'churned', 
                       session_id=42, 
                       experiment_name='customer churn model',
                       normalize = True, 
                       transformation = True, 
                       remove_multicollinearity = True, #drop one of the two features that are highly correlated with each other
                       multicollinearity_threshold = 0.5
                           )

In [None]:
# Run PyCaret's model comparison and keep what it returns as best_model.
best_model=compare_models()

In [None]:
# Highest score per metric in the model comparison above:
# Gradient Boosting Classifier (gbc): accuracy 0.8616, AUC 0.8593, precision 0.7768, F1 0.5685, kappa 0.4934, MCC 0.5202
# Decision Tree (dt): recall 0.4887

In [None]:
# Create the 'rf' model within the PyCaret experiment.
rf=create_model('rf')

In [None]:
# Tune the model, optimising for F1, and keep the tuned version under the same name.
rf=tune_model(rf,optimize='F1')

In [None]:
# Default diagnostic plot for the tuned model.
plot_model(rf)

In [None]:
# 'feature' plot for the tuned model.
plot_model(rf,plot='feature')

In [None]:
# Open PyCaret's evaluation view for the tuned model. The call is made for
# its display side effect; it returns None, so wrapping it in print() would
# only add a stray "None" to the output.
evaluate_model(rf)

In [None]:
# Interpretation plot for the tuned model.
interpret_model(rf)