In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble  import RandomForestClassifier , GradientBoostingClassifier

In [2]:
data = pd.read_csv(r"D:\Visual Studio Code\ML\DataSet\Churn_Modelling.csv")
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
#Removing data which are not required 
data.drop(columns=['RowNumber','CustomerId','Surname'],inplace=True,axis=1)
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
data.shape

(10000, 11)

So, Now data contain only required informations like :
- CreditScore
- Geography
- Age 
- Gender
- Tenure
- Balance
- NumOfProducts
- HasCrCard
- IsActiveMember
- EstimatedSalary
- Exited

In [5]:
data.describe()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [6]:
#Checking for any missing data
data.isna().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [7]:
# Converting the Str Data into a label and standardizing the numerical data
from sklearn.preprocessing import LabelEncoder , StandardScaler

le = LabelEncoder()
data['Geography'] = le.fit_transform(data['Geography'])
data['Gender'] = le.fit_transform(data['Gender'])

sc = StandardScaler()
data[['CreditScore','Age','Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember','EstimatedSalary']] = sc.fit_transform(data[['CreditScore','Age','Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember','EstimatedSalary']])


In [8]:
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,-0.326221,0,0,0.293517,-1.04176,-1.225848,-0.911583,0.646092,0.970243,0.021886,1
1,-0.440036,2,0,0.198164,-1.387538,0.11735,-0.911583,-1.547768,0.970243,0.216534,0
2,-1.536794,0,0,0.293517,1.032908,1.333053,2.527057,0.646092,-1.03067,0.240687,1
3,0.501521,0,0,0.007457,-1.387538,-1.225848,0.807737,-1.547768,-1.03067,-0.108918,0
4,2.063884,2,0,0.388871,-1.04176,0.785728,-0.911583,0.646092,0.970243,-0.365276,0


In [9]:
#Now Divide the data 
x = data.drop('Exited',axis=1)
y = data['Exited']


In [10]:
#Now dividing the data into training and Testing 
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size=0.2,random_state=42)


In [11]:
#Calling the Logestic Regression Model
model = LogisticRegression()
model.fit(x_train, y_train)

In [12]:
#Checking model in training data
train_pred = model.predict(x_train)
accuracy = accuracy_score(train_pred,y_train)
print(f"Accuracy on training data: {accuracy:.2f}")

Accuracy on training data: 0.82


In [13]:
#Checking model in Testing data
test_pred = model.predict(x_test)
accuracy = accuracy_score(test_pred,y_test)
print(f"Accuracy on training data: {accuracy:.2f}")

Accuracy on training data: 0.81


In [14]:
#Using Random Forest Classifier 
model = RandomForestClassifier(n_estimators= 200)
model.fit(x_train, y_train)

In [15]:
#Checking model in training data
train_pred = model.predict(x_train)
accuracy = accuracy_score(train_pred,y_train)
print(f"Accuracy on training data: {accuracy:.2f}")

Accuracy on training data: 1.00


In [16]:
#Checking model in Testing data
test_pred = model.predict(x_test)
accuracy = accuracy_score(test_pred,y_test)
print(f"Accuracy on training data: {accuracy:.2f}")

Accuracy on training data: 0.86


In [17]:
#Using Gradient Boosting Classifier 
model = GradientBoostingClassifier(n_estimators= 200,learning_rate=0.5,random_state=42)
model.fit(x_train, y_train)

In [18]:
#Checking model in training data
train_pred = model.predict(x_train)
accuracy = accuracy_score(train_pred,y_train)
print(f"Accuracy on training data: {accuracy:.2f}")

Accuracy on training data: 1.00


In [19]:
#Checking model in Testing data
test_pred = model.predict(x_test)
accuracy = accuracy_score(test_pred,y_test)
print(f"Accuracy on training data: {accuracy:.2f}")

Accuracy on training data: 0.83
