In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import wget
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# 1. Download the dataset: Churn_Modelling_dataset.csv

# 2. Load the dataset

In [None]:
# Location of the churn dataset. Setting the CHURN_DATA_PATH environment
# variable overrides the default, so the notebook is not tied to the directory
# layout of a single machine; without it the original path is used unchanged.
DATA_PATH = os.environ.get('CHURN_DATA_PATH', 'C:/Users/Praneeth/Churn_Modelling.csv')
# index_col=0: the first CSV column becomes the row index instead of a feature.
df = pd.read_csv(DATA_PATH, index_col=0)

In [None]:
# Show the sorted distinct values that occur in the target column 'Exited'.
np.unique(df['Exited'])

# 5. Handle the Missing values

In [None]:
# Count the missing values in each column. This cell only reports the counts;
# it does not drop or fill anything.
df.isnull().sum()

# 4. Perform descriptive statistics on the dataset.

In [None]:
# include='all' summarises every column of df, not just the numeric ones.
df.describe(include='all')

# 3. Perform the visualizations below.
* Univariate analysis
* Bivariate analysis
* Multivariate analysis

In [None]:
# 'Exited' plotted against the row index of df.
plt.scatter(df.index,df['Exited'])

In [None]:
# Univariate: distribution of 'Balance'.
sns.displot(df['Balance'])

In [None]:
# Univariate: rug plot of 'CreditScore'.
sns.rugplot(df['CreditScore'])

In [None]:
# Bivariate: 'CreditScore' on the x-axis against 'Balance' on the y-axis.
sns.scatterplot(x=df['CreditScore'], y=df['Balance'])

In [None]:
# Bivariate: box plot of 'Exited' for each value of 'IsActiveMember'.
# NOTE(review): if 'Exited' is a two-valued flag, a box plot of it carries
# little information; a count plot may be what is wanted - confirm.
sns.boxplot(x='IsActiveMember', y='Exited', data=df)

In [None]:
# Multivariate: 'Tenure' against 'CreditScore', coloured by 'Exited'.
# fit_reg=False turns the regression line off; the trailing ';' keeps the
# returned object's repr out of the cell output.
sns.lmplot(x='Tenure', y='CreditScore', data=df, hue="Exited", fit_reg=False);

# 7. Check for Categorical columns and perform encoding.

In [None]:
# Inspect the distinct labels of each categorical column before encoding.
df['Gender'].unique()

In [None]:
df['Geography'].unique()

In [None]:
# Overwrite 'Geography' with one integer code per distinct label.
# NOTE(review): integer codes put the categories on an ordered scale;
# OneHotEncoder is imported but not used in the cells shown here - confirm
# which encoding is intended.
df['Geography']=preprocessing.LabelEncoder().fit_transform(df['Geography'])

In [None]:
# Overwrite 'Gender' with 0 for 'Male' and 1 for 'Female'. Series.map yields
# NaN for any value that is not a key of the mapping, so running this cell a
# second time (when the column already holds 0/1) turns the column into NaN.
df['Gender']=df['Gender'].map({'Male':0,'Female':1})

# 6. Find the outliers and replace the outliers

In [None]:
# Summary statistics of 'Balance' before any outlier treatment.
df.describe()['Balance']

In [None]:
def impute_outliers_IQR(df, factor=1.5):
    """Replace IQR outliers with the mean of the input.

    A value counts as an outlier when it lies more than ``factor``
    interquartile ranges below the first quartile or above the third
    quartile.

    Parameters
    ----------
    df : pandas.Series
        Numeric values to clean (for example a single DataFrame column).
        The input object is not modified.
    factor : float, default 1.5
        Distance of the fences from the quartiles, in units of the IQR.

    Returns
    -------
    numpy.ndarray
        The values in their original order, with every outlier replaced by
        the mean of the input. The mean is taken over all input values,
        outliers included.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    iqr = q3 - q1

    lower_fence = q1 - factor * iqr
    upper_fence = q3 + factor * iqr

    # Comparisons involving NaN are False, so missing values are never
    # flagged and pass through unchanged.
    is_outlier = (df < lower_fence) | (df > upper_fence)

    # np.where drops the pandas index and returns a plain array.
    return np.where(is_outlier, df.mean(), df)

In [None]:
# Overwrite 'Balance' with its outlier-imputed values.
df['Balance'] = impute_outliers_IQR(df['Balance'])

In [None]:
# Summary statistics of 'Balance' after imputation.
df.describe()['Balance']

In [None]:
# Same sequence for 'EstimatedSalary': summary, impute, summary again.
df.describe()['EstimatedSalary']

In [None]:
df['EstimatedSalary'] = impute_outliers_IQR(df['EstimatedSalary'])

In [None]:
df.describe()['EstimatedSalary']

In [None]:
def find_outliers_IQR(df, factor=1.5):
    """Return the values that fall outside the IQR fences.

    Parameters
    ----------
    df : pandas.Series
        Numeric values to examine (for example a single DataFrame column).
    factor : float, default 1.5
        Distance of the fences from the quartiles, in units of the IQR.

    Returns
    -------
    pandas.Series
        The entries of ``df`` lying below ``Q1 - factor * IQR`` or above
        ``Q3 + factor * IQR``, with their original index labels. Empty when
        nothing is flagged.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    iqr = q3 - q1

    too_low = df < (q1 - factor * iqr)
    too_high = df > (q3 + factor * iqr)
    return df[too_low | too_high]

In [None]:
# Values of 'EstimatedSalary' still flagged as outliers; the column was
# already passed through impute_outliers_IQR in an earlier cell.
outliers = find_outliers_IQR(df['EstimatedSalary'])

In [None]:
print(outliers)

In [None]:
# Rebind `outliers` to the flagged values of 'Balance'.
outliers = find_outliers_IQR(df['Balance'])

In [None]:
# Number of flagged values, then the largest and the smallest of them.
print(len(outliers))
print(outliers.max())
print(outliers.min())

In [None]:
# Box plots of the three continuous columns, one per cell.
sns.boxplot(y=df['Balance'])

In [None]:
sns.boxplot(y=df['EstimatedSalary'])

In [None]:
# 'CreditScore' has not been imputed yet at this point.
sns.boxplot(y=df['CreditScore'])

In [None]:
# Flag the 'CreditScore' outliers before imputing them.
outliers = find_outliers_IQR(df['CreditScore'])

In [None]:
# Number of flagged values, then the largest and the smallest of them.
print(len(outliers))
print(outliers.max())
print(outliers.min())

In [None]:
# Overwrite 'CreditScore' with its outlier-imputed values.
df['CreditScore'] = impute_outliers_IQR(df['CreditScore'])

In [None]:
# Summary statistics and box plot of 'CreditScore' after imputation.
df.describe()['CreditScore']

In [None]:
sns.boxplot(y=df['CreditScore'])

# 8. Split the data into dependent and independent variables.

In [None]:
# Every column except the target, kept in df's own column order. Building the
# list from a set gives an ordering that can change between kernel sessions,
# which would make the column layout of the feature matrix irreproducible.
features = [column for column in df.columns if column != 'Exited']

In [None]:
# Display the selected feature names.
features

In [None]:
# Feature matrix as a NumPy array, one column per entry of `features`.
x=df[features].values

In [None]:
x

In [None]:
# Target vector taken from the 'Exited' column.
y=df['Exited'].values

In [None]:
y

In [None]:
# The same feature columns, shown as a DataFrame.
df[features]

# 9. Scale the independent variables

In [None]:
scale = StandardScaler()
# Rebinds x: from here on it holds only the three standardised columns named
# below, not the feature matrix built from `features` in the earlier cell.
# NOTE(review): confirm that leaving the remaining feature columns out of x is
# intended, and that fitting the scaler on all rows before the train/test
# split is acceptable for this exercise.
x = scale.fit_transform(df[['EstimatedSalary','Balance','CreditScore']])
x

# 10. Split the data into training and testing

In [None]:
# Hold out 30% of the rows for testing; random_state=0 fixes the shuffle so
# the same split is produced on every run.
train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=0.3,random_state=0)

In [None]:
# Display the four resulting arrays.
train_x,test_x,train_y,test_y