In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
# Read the raw churn CSV from the working directory into `df`.
df = pd.read_csv(filepath_or_buffer="Churn_Modelling.csv")

Handling Missing Values

In [3]:
# Per-column count of null entries in `df`.
print(
    "Missing Values:\n",
    df.isna().sum(axis=0),
    '\n',
)

Missing Values:
 RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64 



Handling Duplicates (if any)

In [4]:
# `duplicated` marks each row that exactly repeats an earlier row;
# summing the resulting boolean mask gives the number of such rows.
duplicates = df.duplicated(keep="first").sum()
print(f"Number of duplicate rows: {duplicates}\n")

Number of duplicate rows: 0



Checking For Outliers using describe()

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns, printed so implausible extremes can be spotted by eye.
print(
    "Outliers (Summary Statistics):\n",
    df.describe(),
    '\n',
)

Outliers (Summary Statistics):
          RowNumber    CustomerId   CreditScore           Age        Tenure  \
count  10000.00000  1.000000e+04  10000.000000  10000.000000  10000.000000   
mean    5000.50000  1.569094e+07    650.528800     38.921800      5.012800   
std     2886.89568  7.193619e+04     96.653299     10.487806      2.892174   
min        1.00000  1.556570e+07    350.000000     18.000000      0.000000   
25%     2500.75000  1.562853e+07    584.000000     32.000000      3.000000   
50%     5000.50000  1.569074e+07    652.000000     37.000000      5.000000   
75%     7500.25000  1.575323e+07    718.000000     44.000000      7.000000   
max    10000.00000  1.581569e+07    850.000000     92.000000     10.000000   

             Balance  NumOfProducts    HasCrCard  IsActiveMember  \
count   10000.000000   10000.000000  10000.00000    10000.000000   
mean    76485.889288       1.530200      0.70550        0.515100   
std     62397.405202       0.581654      0.45584        0.499

In [6]:
# Remove the non-numeric columns (they are absent from the describe() summary
# above) so that only numeric columns are handed to the scaler.
#
# `errors='ignore'` keeps this cell re-runnable: `df` is reassigned from itself,
# so without it a second execution raises KeyError because the labels are
# already gone.
#
# NOTE(review): RowNumber and CustomerId stay in `df` and are scaled as features
# later on; they look like row identifiers - confirm whether they should be
# dropped as well.
# NOTE(review): Geography and Gender are discarded rather than encoded -
# confirm that losing them is intended.
df = df.drop(columns=['Surname', 'Geography', 'Gender'], errors='ignore')

In [7]:
# Min-max scale every column except the 'Exited' target into the [0, 1] range.
#
# Column labels and index are taken from the frame that was actually scaled.
# The previous `columns=df.columns[:-1]` was only correct while 'Exited' was the
# last column of `df`; with any other column order the labels would silently
# shift onto the wrong data.
#
# NOTE(review): the scaler is fitted on the whole dataset, before the
# train/test split performed later in this notebook, so test rows influence the
# learned min/max. Consider fitting on the training rows only.
features = df.drop(columns='Exited')
scaler = MinMaxScaler()
df_normalized = pd.DataFrame(
    scaler.fit_transform(features),
    columns=features.columns,
    index=features.index,
)

Normalized Dataset

In [8]:
# Preview the first five rows of the scaled feature frame.
print(
    "Normalized dataset:\n",
    df_normalized.head(n=5),
    "\n",
)

Normalized dataset:
    RowNumber  CustomerId  CreditScore       Age  Tenure   Balance  \
0     0.0000    0.275616        0.538  0.324324     0.2  0.000000   
1     0.0001    0.326454        0.516  0.310811     0.1  0.334031   
2     0.0002    0.214421        0.304  0.324324     0.8  0.636357   
3     0.0003    0.542636        0.698  0.283784     0.1  0.000000   
4     0.0004    0.688778        1.000  0.337838     0.2  0.500246   

   NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  
0       0.000000        1.0             1.0         0.506735  
1       0.000000        0.0             1.0         0.562709  
2       0.666667        1.0             0.0         0.569654  
3       0.333333        0.0             0.0         0.469120  
4       0.000000        1.0             1.0         0.395400   



In [9]:
# Feature matrix: the scaled columns as a plain NumPy array.
X = df_normalized.to_numpy()
# Target vector: the 'Exited' column of `df`, in the same row order as X.
y = df['Exited'].to_numpy()

Input And Output Values

In [10]:
# Show the first five feature rows, then the first five target values.
print("Input Values (Features):\n", X[0:5])
print("\nOutput Values (Target):\n", y[0:5])

Input Values (Features):
 [[0.00000000e+00 2.75616127e-01 5.38000000e-01 3.24324324e-01
  2.00000000e-01 0.00000000e+00 0.00000000e+00 1.00000000e+00
  1.00000000e+00 5.06734893e-01]
 [1.00010001e-04 3.26454364e-01 5.16000000e-01 3.10810811e-01
  1.00000000e-01 3.34031479e-01 0.00000000e+00 0.00000000e+00
  1.00000000e+00 5.62708739e-01]
 [2.00020002e-04 2.14421435e-01 3.04000000e-01 3.24324324e-01
  8.00000000e-01 6.36357176e-01 6.66666667e-01 1.00000000e+00
  0.00000000e+00 5.69654352e-01]
 [3.00030003e-04 5.42635876e-01 6.98000000e-01 2.83783784e-01
  1.00000000e-01 0.00000000e+00 3.33333333e-01 0.00000000e+00
  0.00000000e+00 4.69120051e-01]
 [4.00040004e-04 6.88778306e-01 1.00000000e+00 3.37837838e-01
  2.00000000e-01 5.00246216e-01 0.00000000e+00 1.00000000e+00
  1.00000000e+00 3.95400361e-01]]

Output Values (Target):
 [1 0 1 0 0]


In [11]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle repeatable across runs.
# NOTE(review): no `stratify=` is passed, so the class proportions of `y` are
# not guaranteed to match between the two partitions - confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Splitting The Data For Training and Testing

In [12]:
# Report how many rows landed in each partition of the split.
print(
    f"\nTraining data size: {len(X_train)}",
    f"Testing data size: {len(X_test)}",
    sep="\n",
)


Training data size: 8000
Testing data size: 2000
