In [1]:
# Importing Libarires
import pandas as pd
import numpy as np

In [2]:
# Load the CSV file
df = pd.read_csv("Churn_Modelling.csv")

In [3]:
# Display the data
print(df.head())

   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [4]:
# Check Dataset Info
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB
None


# Preprocessing

In [5]:
# Check Null Values
print(df.isnull().sum())

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


In [6]:
# Check Empty Strings
print(df[df.eq("").any(axis=1)])

Empty DataFrame
Columns: [RowNumber, CustomerId, Surname, CreditScore, Geography, Gender, Age, Tenure, Balance, NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary, Exited]
Index: []


In [7]:
# Checking Placeholder Values
for col in df.columns:
    count = df[col].isin(["?", "NA", "-", "--"]).sum()
    print(f"{col} → {count} suspicious values")


RowNumber → 0 suspicious values
CustomerId → 0 suspicious values
Surname → 0 suspicious values
CreditScore → 0 suspicious values
Geography → 0 suspicious values
Gender → 0 suspicious values
Age → 0 suspicious values
Tenure → 0 suspicious values
Balance → 0 suspicious values
NumOfProducts → 0 suspicious values
HasCrCard → 0 suspicious values
IsActiveMember → 0 suspicious values
EstimatedSalary → 0 suspicious values
Exited → 0 suspicious values


In [8]:
# Checking for duplicates
print(df.duplicated().sum())

0


In [9]:
# Dropping non-useful columns

## Check 1--getting the high unique values
for col in df.columns:
    print(col, "→ unique values:", df[col].nunique())



RowNumber → unique values: 10000
CustomerId → unique values: 10000
Surname → unique values: 2932
CreditScore → unique values: 460
Geography → unique values: 3
Gender → unique values: 2
Age → unique values: 70
Tenure → unique values: 11
Balance → unique values: 6382
NumOfProducts → unique values: 4
HasCrCard → unique values: 2
IsActiveMember → unique values: 2
EstimatedSalary → unique values: 9999
Exited → unique values: 2


In [10]:
## Analysis :- Since, RowNumber, CustomerId and Surname has high frequency unique data, so will drop that
df.drop(columns=["RowNumber", "CustomerId", "Surname"], inplace=True)

In [11]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB
None


In [12]:
print(df['Geography'].nunique())

3


# HANDLING CATEGORICAL DATAS

In [13]:
# We should see mostly int / float types and 2 categorical cols:-
# Geography
# Gender

## Handling Gender Data
df["Gender"] = df["Gender"].map({"Male": 1, "Female": 0})


In [14]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  int64  
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(8), object(1)
memory usage: 859.5+ KB
None


In [15]:
print(df.head)

<bound method NDFrame.head of       CreditScore Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0             619    France       0   42       2       0.00              1   
1             608     Spain       0   41       1   83807.86              1   
2             502    France       0   42       8  159660.80              3   
3             699    France       0   39       1       0.00              2   
4             850     Spain       0   43       2  125510.82              1   
...           ...       ...     ...  ...     ...        ...            ...   
9995          771    France       1   39       5       0.00              2   
9996          516    France       1   35      10   57369.61              1   
9997          709    France       0   36       7       0.00              1   
9998          772   Germany       1   42       3   75075.31              2   
9999          792    France       0   28       4  130142.79              1   

      HasCrCard  IsActiveMember  

In [None]:
# Encoding the "Geography" in One Hot Encoding
df = df.join(pd.get_dummies(df["Geography"], dtype=int))

In [None]:
# Dropping the values
df = df.drop(columns=["Geography"])

# Scale / Normalize numeric values

In [17]:
for col in ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary"]:
    min_val = df[col].min()
    max_val = df[col].max()
    df[col] = df[col].apply(lambda x: (x - min_val) / (max_val - min_val))


In [18]:
print(df.head())

   CreditScore Geography  Gender       Age  Tenure   Balance  NumOfProducts  \
0        0.538    France       0  0.324324     0.2  0.000000       0.000000   
1        0.516     Spain       0  0.310811     0.1  0.334031       0.000000   
2        0.304    France       0  0.324324     0.8  0.636357       0.666667   
3        0.698    France       0  0.283784     0.1  0.000000       0.333333   
4        1.000     Spain       0  0.337838     0.2  0.500246       0.000000   

   HasCrCard  IsActiveMember  EstimatedSalary  Exited  
0          1               1         0.506735       1  
1          0               1         0.562709       0  
2          1               0         0.569654       1  
3          0               0         0.469120       0  
4          1               1         0.395400       0  


# Seperating Features(X) and Target(y)

In [19]:
# Taking target values
y = df["Exited"]

# Removing Target values from features section
X = df.drop(columns=["Exited"])
