# **1. Download Dataset: Churn_Modelling.csv**

# **2. Load The Dataset**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

: 

In [None]:
# Read the churn dataset from the working directory and preview its first rows.
df = pd.read_csv('Churn_Modelling.csv')
df.head()

: 

In [None]:
# Remove the row number, customer id and surname columns, then preview the result.
identifier_columns = ['RowNumber', 'CustomerId', 'Surname']
df = df.drop(columns=identifier_columns)
df.head()

: 

In [None]:
# Cast the flag-like columns to the pandas 'category' dtype.
for flag_column in ('IsActiveMember', 'Exited', 'HasCrCard'):
    df[flag_column] = df[flag_column].astype('category')

: 

# **3. Perform the following visualizations**
# *   Univariate Analysis
# *   Bi - Variate Analysis
# *   Multi - Variate Analysis 

In [None]:
# Kernel density estimate of CreditScore, drawn separately for each Exited value.
sns.kdeplot(x='CreditScore', data = df , hue = 'Exited')
plt.show()

: 

In [None]:
# Share of each Exited class, shown as a bar plot and as a table.
# The columns are named explicitly because the labels produced by
# value_counts().reset_index() changed in pandas 2.0 (from 'index'/'Exited'
# to 'Exited'/'proportion'), which would break the x/y lookups below.
density = (
    df['Exited']
    .value_counts(normalize=True)
    .rename_axis('index')
    .reset_index(name='Exited')
)
sns.barplot(data=density, x='index', y='Exited');
density

: 

In [None]:
# Count plots of every remaining (non-continuous) feature, split by Exited.
categorical = df.drop(columns=['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary'])

# 'Exited' is only used as the hue, so it gets no panel of its own. The earlier
# "ceil(n / 2) - 1" row count removed a whole grid row instead, so with an even
# number of columns the last feature before the target was skipped as well.
feature_names = [name for name in categorical.columns if name != 'Exited']
rows = max(1, int(np.ceil(len(feature_names) / 2)))

# Three inches of height per row keeps the original 10x6 size for a two-row grid.
fig, axes = plt.subplots(nrows=rows, ncols=2, figsize=(10, 3 * rows))
axes = axes.flatten()

for ax, col_name in zip(axes, feature_names):
    sns.countplot(data=categorical, x=col_name, hue="Exited", ax=ax);

# Hide the panel left empty when the number of features is odd.
for ax in axes[len(feature_names):]:
    ax.set_visible(False)

plt.tight_layout()

: 

# **4. Descriptive statistics**

In [None]:
# Column names, dtypes and non-null counts.
df.info()

: 

In [None]:
# Summary statistics as reported by DataFrame.describe().
df.describe()

: 

# **5. Handle Missing Values**

In [None]:
# Number of missing values per column.
df.isna().sum()

: 

# There are no missing values in this dataset.

# **6. Find the outliers and replace the outliers**

# **Finding Outliers**

In [None]:
def box_scatter(data, x, y):
    """Draw a box plot of ``x`` above a scatter plot of ``x`` against ``y``.

    Args:
        data: Table handed to seaborn as ``data``.
        x: Column shown in the box plot and on the scatter plot's x-axis.
        y: Column shown on the scatter plot's y-axis.

    A new 16x6 figure with two stacked axes is created; nothing is returned.
    """
    _, axes = plt.subplots(nrows=2, ncols=1, figsize=(16, 6))
    box_ax, scatter_ax = axes
    sns.boxplot(data=data, x=x, ax=box_ax)
    sns.scatterplot(data=data, x=x, y=y, ax=scatter_ax)

: 

In [None]:
# Box/scatter view of CreditScore, then count the rows below the 400 cut-off.
box_scatter(df, 'CreditScore', 'Exited')
plt.tight_layout()
low_credit_score = df['CreditScore'] < 400
print(f"# of Bivariate Outliers: {low_credit_score.sum()}")

: 

In [None]:
# Box/scatter view of Age, then count the rows above the 87 cut-off.
box_scatter(df, 'Age', 'Exited')
plt.tight_layout()
high_age = df['Age'] > 87
print(f"# of Bivariate Outliers: {high_age.sum()}")

: 

In [None]:
# Box/scatter view of Balance, then count the rows above the 220000 cut-off.
box_scatter(df, 'Balance', 'Exited')
plt.tight_layout()
high_balance = df['Balance'] > 220000
print(f"# of Bivariate Outliers: {high_balance.sum()}")

: 

In [None]:
# Box/scatter view of EstimatedSalary; no outlier count is printed for this column.
box_scatter(df,'EstimatedSalary','Exited');
plt.tight_layout()

: 

# **Replacing The Outliers (IQR Capping)**

In [None]:
# Cap every int64/float64 column at 1.5 * IQR beyond its quartiles: values
# outside those fences are replaced by the fence itself, not dropped.
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
for column in numeric_columns:
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    lower = first_quartile - 1.5 * spread
    upper = third_quartile + 1.5 * spread
    # Upper fence first, then lower fence, applied to the already-capped values.
    capped_high = np.where(values > upper, upper, values)
    df[column] = np.where(capped_high < lower, lower, capped_high)

: 

In [None]:
# Re-draw CreditScore and recount the rows below 400 to check the capping.
box_scatter(df, 'CreditScore', 'Exited')
plt.tight_layout()
low_credit_score = df['CreditScore'] < 400
print(f"# of Bivariate Outliers: {low_credit_score.sum()}")

: 

In [None]:
# Re-draw Age and recount the rows above 87 to check the capping.
box_scatter(df, 'Age', 'Exited')
plt.tight_layout()
high_age = df['Age'] > 87
print(f"# of Bivariate Outliers: {high_age.sum()}")

: 

In [None]:
# Re-draw Balance and recount the rows above 220000 to check the capping.
box_scatter(df, 'Balance', 'Exited')
plt.tight_layout()
high_balance = df['Balance'] > 220000
print(f"# of Bivariate Outliers: {high_balance.sum()}")

: 

# **7. Check for Categorical columns and perform encoding.**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every object/category column with integer labels. One encoder is
# refitted per column, so afterwards it only holds the classes of the last
# column it processed.
encoder = LabelEncoder()
label_columns = [
    name for name in df
    if df[name].dtype == 'object' or df[name].dtype == 'category'
]
for name in label_columns:
    df[name] = encoder.fit_transform(df[name])

: 

# **8. Split the data into dependent and independent variables.**

In [None]:
# Independent variables: every column of df except the last one.
x=df.iloc[:,:-1]
x.head()

: 

In [None]:
# Dependent variable: the last column of df.
y=df.iloc[:,-1]
y.head()

: 

# **9. Scale the independent variables**

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the independent variables and print the scaled array.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test-set statistics influence the scaling - confirm this is intended.
scaler = StandardScaler()
scaler.fit(x)
x = scaler.transform(x)
print(x)

: 

# **10. Split the data into training and testing.**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split, and anything evaluated on it, is reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42
)

: 

In [None]:
# Shapes of the training and test feature matrices, in that order.
for features in (x_train, x_test):
    print(features.shape)

: 

In [None]:
# Shapes of the training and test targets, in that order.
for target in (y_train, y_test):
    print(target.shape)

: 