# Importing Libraries

In [3]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore")

# Loading Dataset

In [4]:
# The CSV is looked up relative to the current working directory, so it must
# sit next to the notebook (or the path below must be adjusted).
dataset_path = 'Churn_Modelling.csv'
df = pd.read_csv(dataset_path)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Churn_Modelling.csv'

In [None]:
# Show the last rows of the frame as a sanity check on the end of the file.
df.tail()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
# Author's note: the data is clean -- confirm against the non-null counts.
df.info()

In [None]:
# List the distinct values of the Geography column
# (the author observed three unique countries).
df["Geography"].unique()

In [None]:
# Summary statistics for the frame's columns.
df.describe()

In [None]:
#Gender distribution
# value_counts() orders categories by frequency, so the wedge labels are taken
# from its index rather than hard-coded; otherwise a different frequency order
# would silently swap the 'Male' and 'Female' labels.
gender_counts = df['Gender'].value_counts()
# Pull every wedge except the first slightly out of the pie; sized from the
# data so the list always matches the number of wedges.
wedge_offsets = [0 if position == 0 else 0.1 for position in range(len(gender_counts))]
plt.figure(figsize=(10, 5))
plt.pie(
    gender_counts.values,
    labels=gender_counts.index,
    autopct='%.f%%',
    explode=wedge_offsets,
    shadow=True,
)
plt.show()

In [None]:
#Customers per country
# Turn the per-country counts into a two-column frame for plotting.
country_counts = df['Geography'].value_counts()
region = country_counts.to_frame().reset_index()
region.columns = ['Country', 'Count']

plt.figure(figsize=(5, 5))
ax = sns.barplot(data=region, x='Country', y='Count', palette='GnBu')
# Annotate each bar with its count.
for bars in ax.containers:
    ax.bar_label(bars)

# Data Cleaning

In [None]:
#Delete unnecessary columns
# These are the columns the author marked as not needed for modelling.
# The drop mutates df in place, so running this cell twice raises KeyError.
unneeded_columns = ['RowNumber', 'CustomerId', 'Surname']
df.drop(columns=unneeded_columns, inplace=True)
df.head()

In [None]:
#Encoding
# Fit one encoder per column and keep each of them. Refitting a single
# LabelEncoder on the second column overwrites its classes_, which loses the
# Gender mapping needed to decode the integer codes later.
label_encoders = {}
for column in ('Gender', 'Geography'):
    labelencoder = LabelEncoder()
    df[column] = labelencoder.fit_transform(df[column])
    label_encoders[column] = labelencoder
# NOTE(review): label encoding gives Geography arbitrary integer codes, which a
# linear model treats as ordered; consider one-hot encoding if that matters.
df.head()

In [None]:
# Inspect the column dtypes after the encoding step.
df.dtypes

In [None]:
#Separate the target column from the features
# 'Exited' is the label to predict; every remaining column is a feature.
y = df['Exited']
x = df.drop(columns=['Exited'])

In [None]:
# Hold out 30% of the rows for testing. Stratifying on the target keeps the
# class ratio the same in both partitions, so test metrics are not skewed by
# an unlucky split; random_state pins the shuffle for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=40,
    stratify=y,
)

# Random Forest

In [None]:
#Create Random Forest Classifier model
# Seed the forest so that re-running the notebook reproduces the same trees
# and therefore the same scores; an unseeded forest changes on every run.
model = RandomForestClassifier(random_state=40)
model.fit(x_train, y_train)

In [None]:
#Model score
# NOTE: this is the accuracy on the *training* split, so it measures how well
# the model fits the data it has seen, not how well it generalises. The
# held-out accuracy is computed in a later cell.
model.score(x_train,y_train)

In [None]:
#Prediction
# Predict class labels for the held-out test features; the bare name on the
# last line makes the notebook display the resulting array.
y_pred = model.predict(x_test)
y_pred

In [None]:
#Length
# Number of predictions produced for the test split.
len(y_pred)

In [None]:
#Accuracy
# Fraction of test rows whose predicted label matches the true label,
# reported as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy_pct = accuracy * 100
print("Accuracy: {:.2f}%".format(accuracy_pct))

In [None]:
#Visualization
# ROC curve built from the probability assigned to the second class
# (column 1 of predict_proba) on the test split.
y_prob = model.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

# The title reuses `accuracy` from the accuracy cell, so that cell must have
# been run for this model first.
roc_title = 'RandomForestClassifier\nAccuracy: {:.2f}%'.format(accuracy * 100)
diagonal = [0, 1]

plt.figure(figsize=(10, 5))
plt.plot(fpr, tpr, color='red', lw=2)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot(diagonal, diagonal, color='black', lw=2)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(roc_title)
plt.show()

# Logistic Regression

In [None]:
#Create Logistic Regression model
# Standardise the features before the linear model. They are fed in unscaled
# here, which hampers solver convergence, and the resulting ConvergenceWarning
# would be hidden by the blanket warnings filter at the top of the notebook.
# The pipeline exposes the same fit/score/predict/predict_proba interface, so
# the cells below work unchanged.
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(x_train, y_train)

In [None]:
#Model Score
# NOTE: scored on the training split, so this is training accuracy rather
# than a measure of generalisation.
model_score=model.score(x_train,y_train)
print(model_score)

In [None]:
#Prediction
# Predict class labels for the held-out test features and print them.
y_pred=model.predict(x_test)
print(y_pred)

In [None]:
#Accuracy
# Share of test rows predicted correctly, printed as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy_pct = accuracy * 100
print("Accuracy: {:.2f}%".format(accuracy_pct))

In [None]:
#Visualization
# ROC curve built from the probability assigned to the second class
# (column 1 of predict_proba) on the test split.
y_prob = model.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

# The title reuses `accuracy` from the accuracy cell, so that cell must have
# been run for this model first.
roc_title = 'Logistic Regression\nAccuracy: {:.2f}%'.format(accuracy * 100)
diagonal = [0, 1]

plt.figure(figsize=(10, 5))
plt.plot(fpr, tpr, color='blue', lw=2)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot(diagonal, diagonal, color='black', lw=2)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(roc_title)
plt.show()

# Gradient Boosting

In [None]:
#Create Gradient Boosting model
# Seed the estimator so repeated runs of the notebook build the same ensemble
# and report the same scores.
model = GradientBoostingClassifier(random_state=40)
model.fit(x_train, y_train)

In [None]:
#Model Score
# NOTE: scored on the training split, so this is training accuracy rather
# than a measure of generalisation.
model_score=model.score(x_train, y_train)
print(model_score)

In [None]:
#Prediction
# Predict class labels for the held-out test features and print them.
y_pred = model.predict(x_test)
print(y_pred)

In [None]:
#Accuracy
# Share of test rows predicted correctly, printed as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy_pct = accuracy * 100
print("Accuracy: {:.2f}%".format(accuracy_pct))

In [None]:
#Visualization
# ROC curve built from the probability assigned to the second class
# (column 1 of predict_proba) on the test split.
y_prob = model.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

# The title reuses `accuracy` from the accuracy cell, so that cell must have
# been run for this model first.
roc_title = 'Gradient Boosting\nAccuracy: {:.2f}%'.format(accuracy * 100)
diagonal = [0, 1]

plt.figure(figsize=(10, 5))
plt.plot(fpr, tpr, color='green', lw=2)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot(diagonal, diagonal, color='black', lw=2)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(roc_title)
plt.show()

# Comparing all models

In [None]:
#Display names for the compared models
# The order must match the order in which the models are evaluated, because
# names and accuracies are paired by position in the results table.
model_names = [
    "Random Forest",
    "Logistic Regression",
    "Gradient Boosting",
]

In [None]:
#Define lists to store accuracies
# Filled by the comparison loop with one test accuracy per fitted model and
# later paired by position with model_names.
acc_list = []

In [None]:
#Calculate and store accuracies for each model
# Candidates are listed in the same order as model_names, since the results
# table pairs names and accuracies by position. The estimators are seeded for
# reproducibility, and the logistic regression is wrapped in a scaling
# pipeline because the features are otherwise fed in unscaled.
candidate_models = [
    RandomForestClassifier(random_state=40),
    make_pipeline(StandardScaler(), LogisticRegression()),
    GradientBoostingClassifier(random_state=40),
]

# Rebuild the list from scratch rather than appending to the one created in
# another cell: re-running this cell would otherwise accumulate stale entries
# and break the results table with a length mismatch.
acc_list = []
# Dedicated loop names keep the notebook-level `model`, `y_pred` and
# `accuracy` from the earlier sections intact.
for candidate in candidate_models:
    candidate.fit(x_train, y_train)
    candidate_predictions = candidate.predict(x_test)
    acc_list.append(accuracy_score(y_test, candidate_predictions))

In [None]:
#Results table
# One row per model: display name alongside its test accuracy.
results_by_column = {
    "Model": model_names,
    "Accuracy_Score": acc_list,
}
model_results = pd.DataFrame(results_by_column)
model_results

In [None]:
#Bar chart comparing test accuracy across the models
sns.barplot(
    data=model_results,
    x="Model",
    y="Accuracy_Score",
)