In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle

In [7]:
# Load the raw churn dataset from the CSV in the working directory,
# then preview the first five rows to check that the columns parsed as expected.
data = pd.read_csv(filepath_or_buffer="Churn_Modelling.csv")
data.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [8]:
## Preprocess the data
### 1. Drop irrelevant features
# RowNumber, CustomerId and Surname are removed here and never reach the model inputs.
data = data.drop(columns=["RowNumber", "CustomerId", "Surname"])
data.head(5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


Here we encounter categorical variables `Gender` and `Geography`. These need to be converted into numerical format before feeding into a neural network.

### ✳️ Label Encoding for Gender

Gender = ["Male", "Female"]
Label Encoded: Male → 1, Female → 0

✅ Why Label Encoding is suitable for Gender:

Only two categories: "Male" and "Female" — this makes it a binary feature.

With only two categories, encoding them as 0 and 1 cannot introduce a misleading order between them, so it is safe and efficient.

Saves memory compared to one-hot (which would require two columns).

Most ML models, including neural networks, can handle binary features well.

In [9]:
# Fit the encoder on the Gender column first, then replace the column with its integer codes.
# The fitted encoder is kept in `le_gender` so the same mapping can be reused later.
le_gender = LabelEncoder().fit(data["Gender"])
data["Gender"] = le_gender.transform(data["Gender"])
data.head(5)


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


### ✳️ One-Hot Encoding (OHE) for Geography

Geography = ["France", "Spain", "Germany"]
One-Hot Encoded:
France  → [1, 0, 0]  
Spain   → [0, 1, 0]  
Germany → [0, 0, 1]

✅ Why One-Hot Encoding is used for Geography:

More than two categories, and no natural ordering (nominal data).

If we use Label Encoding here (e.g., Spain → 0, France → 1, Germany → 2), the model might mistakenly assume that Germany > France > Spain — which introduces false ordinal relationships.

One-hot encoding solves this by treating each category independently and equally.

---

## Steps to Apply OneHotEncoder() in sklearn

* Import the encoder:
  from sklearn.preprocessing import OneHotEncoder

* Initialize the encoder with parameters:
  ohe = OneHotEncoder(sparse_output=False)  # scikit-learn >= 1.2; older versions use sparse=False instead

* Fit and transform the target column (as 2D):
  transformed = ohe.fit_transform(df[['ColumnName']])
  
  🔹 Note: The input must be 2D (i.e., df[['ColumnName']], not df['ColumnName']) because OneHotEncoder() expects a 2D array as input — even for a single column — since it can process multiple columns simultaneously.

* Get new feature names:
  ohe.get_feature_names_out(['ColumnName'])

* Create a DataFrame from the encoded array:
  pd.DataFrame(transformed, columns=feature_names)

* Join encoded features with original DataFrame after dropping the original column.


In [10]:
from sklearn.preprocessing import OneHotEncoder

# Ask for a dense NumPy array instead of a SciPy sparse matrix so the result can
# be passed straight to pd.DataFrame.
# The keyword was renamed from `sparse` to `sparse_output` in scikit-learn 1.2
# and the old name was removed in 1.4, so try the new name first and fall back
# to the old one on older installations.
try:
    ohe_geo = OneHotEncoder(sparse_output=False)
except TypeError:
    # scikit-learn < 1.2 does not know `sparse_output`.
    ohe_geo = OneHotEncoder(sparse=False)

# The input must be 2D, hence the double brackets around the column name.
geo_transformed = ohe_geo.fit_transform(data[['Geography']])



In [11]:
# Ask the fitted encoder for the generated column names,
# one per category, prefixed with the source column name.
ohe_feature_names = ohe_geo.get_feature_names_out(input_features=["Geography"])
ohe_feature_names

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [12]:
# Inspect the encoded array: one row per input row of data[['Geography']],
# one column per Geography category.
geo_transformed

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [13]:
# Wrap the encoded array in a DataFrame with readable column names.
# Reusing `data.index` keeps every encoded row attached to the row it came from;
# without it the frame gets a fresh 0..n-1 index, and a later column-wise concat
# would misalign (and introduce NaNs) if `data` ever had a non-default index,
# for example after filtering rows.
geo_encoded_df = pd.DataFrame(
    geo_transformed,
    columns=ohe_feature_names,
    index=data.index,
)
geo_encoded_df.head()

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0


In [14]:
## Combine the encoded columns with the original data and drop the original "Geography" column
# Note: this cell overwrites `data`, so running it a second time fails because
# "Geography" has already been removed.
data = pd.concat(
    objs=[data.drop(columns="Geography"), geo_encoded_df],
    axis="columns",
)
data.head(5)


Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [15]:
# Persist both fitted encoders so the same encodings can be reused during
# testing or deployment:
#   - le_gender.pkl : LabelEncoder fitted on the Gender column
#   - ohe_geo.pkl   : OneHotEncoder fitted on the Geography column
for encoder, pkl_name in ((le_gender, 'le_gender.pkl'), (ohe_geo, 'ohe_geo.pkl')):
    with open(pkl_name, 'wb') as file:
        pickle.dump(encoder, file)


In [16]:
## Separate the target (dependent variable) from the input features
# `Exited` is the label to predict; every remaining column is a feature.
y = data["Exited"]
X = data.drop(columns="Exited")

In [17]:
## Train/test split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [18]:
## Scale the features
std_scaler = StandardScaler()

# Learn the scaling statistics from the training split only, and scale it.
X_train = std_scaler.fit_transform(X_train)

# Reuse those training statistics for the test split; the scaler is not refitted.
X_test = std_scaler.transform(X_test)

✅ Why Apply a Scaler (e.g., StandardScaler) Before ANN
Brings All Features to the Same Scale
Neural networks perform better when input features are standardized (e.g., mean = 0, std = 1) or normalized to a common range (e.g., 0 to 1).
➤ This ensures that no single feature dominates the training just because of its magnitude.

Speeds Up Convergence
If features have very different scales, the model takes longer to find the optimal weights.
➤ Scaling leads to faster and more stable training.

Improves Gradient Descent Performance
ANNs use gradient descent to update weights. When input features are on wildly different scales, the optimization may become unstable or slow.

Helps Saturating Activation Functions
Some activation functions (like sigmoid or tanh) are sensitive to input range. If the inputs are very large in magnitude (strongly positive or strongly negative), these functions saturate, causing gradients to vanish.

💡 Common Scalers:
StandardScaler() → Scales to mean 0, standard deviation 1

MinMaxScaler() → Scales features to a fixed range, usually [0, 1]

---
## fit_transform() vs transform()

### fit_transform(X_train)

* Used only on training data

* The scaler learns the mean and standard deviation

* Then applies scaling to standardize the training data

### transform(X_test)

* Used on test, validation, or any new data — the scaler is never refitted on these

* Applies the same scaling learned from training data

* Ensures no data leakage from test data into training

In [19]:
## Save the fitted scaler
# Persist the scaler fitted on the training split to std_scaler.pkl.
with open('std_scaler.pkl', mode='wb') as file:
    pickle.dump(std_scaler, file)

## ANN Implementation
### Prerequisites
- ANN workflow: trainable parameters, Sequential network, dense layers, activation functions, optimizers, performance metrics

In [21]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime

In [24]:
## Build the ANN model
# Layers are added one at a time; the first layer declares the input width,
# which is the number of feature columns in the training matrix.
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))  # Hidden layer 1
model.add(Dense(32, activation='relu'))                                   # Hidden layer 2
model.add(Dense(1, activation='sigmoid'))                                 # Output layer
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 64)                832       
                                                                 
 dense_7 (Dense)             (None, 32)                2080      
                                                                 
 dense_8 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2,945
Trainable params: 2,945
Non-trainable params: 0
_________________________________________________________________


In [27]:
## Compile the model
# Adam optimizer with an explicit learning rate; binary cross-entropy is the
# loss used with the single sigmoid output unit, and accuracy is tracked as the metric.
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(
    optimizer=opt,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [28]:
## Set up TensorBoard

# Each run logs into its own timestamped sub-directory: logs/fit/<YYYYmmdd-HHMMSS>.
# Forward slashes are used deliberately. In a normal string literal "\f" is the
# form-feed escape, so a Windows-style "logs\fit" would silently put a control
# character in the path instead of naming a "fit" folder. The trailing "/" keeps
# the timestamp from being glued onto the folder name.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# histogram_freq=1 is passed through to the TensorBoard callback unchanged.
tf_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)