In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import pickle

In [6]:
# Load the dataset
df = pd.read_csv('Churn_Modelling.csv')

# Display the first few rows of the dataset.
# Ending the cell with the expression (instead of print) lets the notebook render
# the frame as a table; print() falls back to plain text that elides the middle
# columns with "...".
df.head()



   RowNumber  CustomerId   Surname  ...  IsActiveMember EstimatedSalary Exited
0          1    15634602  Hargrave  ...               1       101348.88      1
1          2    15647311      Hill  ...               1       112542.58      0
2          3    15619304      Onio  ...               0       113931.57      1
3          4    15701354      Boni  ...               0        93826.63      0
4          5    15737888  Mitchell  ...               1        79084.10      0

[5 rows x 14 columns]


In [7]:
# Remove the three columns that are not used as model inputs

unused_columns = ["RowNumber", "CustomerId", "Surname"]
df = df.drop(columns=unused_columns)
df


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [8]:
## Convert categorical variables to numerical

# LabelEncoder converts categorical text data into numerical values (0, 1, 2, etc.)
#
# Guard against re-running this cell: once 'Gender' has been replaced by integers,
# fitting a fresh encoder on it would learn the classes [0, 1] instead of the
# original text labels, and that useless mapping is what a later cell would pickle.
# When the column is already numeric, the previously fitted encoder is kept as-is.
if not pd.api.types.is_numeric_dtype(df['Gender']):
    label_encoder_gender = LabelEncoder()
    df['Gender'] = label_encoder_gender.fit_transform(df['Gender'])
df
# What does fit_transform do?
# fit: The encoder learns the possible categories in the 'Gender' column (in this case, 'Male' and 'Female').
# transform: It converts those categories to numerical values (like 'Female' to 0 and 'Male' to 1).

# What's happening behind the scenes:

# The encoder sees all unique values in the 'Gender' column
# It assigns a unique integer to each value (alphabetically by default)
# It replaces each original value with its corresponding integer
# The mapping is stored in the encoder for future use (like when processing new data)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.80,3,1,0,113931.57,1
3,699,France,0,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,1,39,5,0.00,2,1,0,96270.64,0
9996,516,France,1,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,0,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,1,42,3,75075.31,2,1,0,92888.52,1


In [9]:
# OneHotEncode the Geography column

# OneHotEncoder creates binary columns for each category in a categorical feature. Each column represents one possible category and uses 1 or 0 to indicate if that category is present.

# Original data: ['France', 'Spain', 'Germany']
# After OneHotEncoder:
# France_column: [1, 0, 0]
# Spain_column:  [0, 1, 0]
# Germany_column:[0, 0, 1]

# Learn the categories of the single 'Geography' column, then encode that same
# column; the result is a sparse matrix with one indicator column per category.
onehot_encoder_geography = OneHotEncoder()
geography_column = df[['Geography']]
geography_encoder = onehot_encoder_geography.fit(geography_column).transform(geography_column)
geography_encoder


# For Geography (OneHotEncoder):
# Geography has three categories: France, Germany, and Spain
# If we used LabelEncoder, we'd get: France=0, Germany=1, Spain=2
# This would be a problem because:
# The model might think Germany (1) is "between" France (0) and Spain (2)
# The model might think Spain (2) is "twice as far" from France (0) as Germany (1)
# These numerical relationships don't make sense for country names

# Simple explanation:
# For Gender, we can use a simple 0/1 encoding because there are only two options
# For Geography, we need to create separate yes/no columns for each country to avoid implying that countries have a numerical order or relationship to each other


# A simple rule of thumb:

# Use LabelEncoder when:

# The category is the target variable (what you're predicting)
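# The category has a natural order (like Small, Medium, Large) - but note that LabelEncoder numbers classes alphabetically (Large=0, Medium=1, Small=2), so use OrdinalEncoder with an explicit categories list when the numbers must follow the real order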
# There are only two categories (binary) and you're comfortable with them being 0 and 1

# Use OneHotEncoder when:

# The category has no natural order
# There are more than two categories
# The category is an input feature (not the target)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10000 stored elements and shape (10000, 3)>

In [10]:
# Column labels the fitted encoder produces for the 'Geography' input feature.
onehot_encoder_geography.get_feature_names_out(input_features=["Geography"])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [11]:
# Turn the sparse one-hot matrix into a labelled DataFrame.
# Reusing df.index keeps every encoded row tied to the row it came from, so the
# column-wise concat in the next step aligns correctly even if df no longer has
# a default 0..n-1 index (e.g. after rows were filtered out).
geo_encoder_df = pd.DataFrame(
    geography_encoder.toarray(),
    columns=onehot_encoder_geography.get_feature_names_out(["Geography"]),
    index=df.index,
)
geo_encoder_df

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
...,...,...,...
9995,1.0,0.0,0.0
9996,1.0,0.0,0.0
9997,1.0,0.0,0.0
9998,0.0,1.0,0.0


In [12]:
# Swap the text 'Geography' column for its one-hot encoded columns

features_without_geography = df.drop(columns="Geography")
df = pd.concat([features_without_geography, geo_encoder_df], axis="columns")
df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [33]:
## Save the fitted encoders (the scaler is pickled in its own cell further down)

for pickle_path, fitted_encoder in (
    ('label_encoder_gender.pkl', label_encoder_gender),
    ('onehot_encoder_geography.pkl', onehot_encoder_geography),
):
    # Binary write mode is required by pickle.
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(fitted_encoder, pickle_file)

In [13]:
# Separate the target from the predictors, then split into training and testing sets

y = df["Exited"]
X = df.drop(columns=["Exited"])

# random_state seeds the random number generator used for shuffling.
# A fixed seed (here 42) makes the split reproducible: every run yields the same
# train/test rows, whereas leaving it unset gives a different split each time.
# Reproducible splits matter for:
#   1. Debugging and troubleshooting model issues
#   2. Comparing different models on the same data splits
#   3. Ensuring scientific reproducibility of results
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# StandardScaler standardizes each numerical feature by subtracting its mean and
# dividing by its standard deviation, giving mean 0 and standard deviation 1.
# It is used when features live on different scales (e.g. age in years vs. salary in thousands).
# The statistics are learned from the training rows only and then reused for the test rows.

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# StandardScaler makes all your numerical features have similar scales by:
# Subtracting the average (so values center around 0)
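# Dividing by the standard deviation (so every feature has a spread of 1; for roughly bell-shaped data about two-thirds of the values land between -1 and 1, but values are not capped to that range)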

# Is it necessary for all ML models?
# Not always necessary, but often helpful for:
# Neural Networks (like in your code)
# Support Vector Machines
# K-means clustering

# Any algorithm that uses distances or gradients
# Less important for:
# Decision Trees
# Random Forests
# Some other tree-based methods

# Why it helps neural networks specifically:
# Faster learning: Neural networks learn faster when features have similar scales
# More stable: Prevents some weights from becoming extremely large while others stay tiny
# Better accuracy: Often leads to better predictions

# Real-world example:
# Imagine trying to predict house prices (typically $100,000-$1,000,000) using:
# Number of rooms (typically 1-10)
# Square footage (typically 500-5000)
# Price is the value being predicted here, so it is not one of the inputs that get scaled.
# Without scaling, the model might focus too much on square footage (because the numbers are bigger) and ignore the number of rooms (because the numbers are smaller), even though both features might be equally important for predictions.

In [14]:
# Inspect the standardized test features (a NumPy array returned by scaler.transform).
X_test

array([[-0.57749609,  0.91324755, -0.6557859 , ..., -0.99850112,
         1.72572313, -0.57638802],
       [-0.29729735,  0.91324755,  0.3900109 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [-0.52560743, -1.09499335,  0.48508334, ..., -0.99850112,
        -0.57946723,  1.73494238],
       ...,
       [ 0.81311987, -1.09499335,  0.77030065, ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.41876609,  0.91324755, -0.94100321, ...,  1.00150113,
        -0.57946723, -0.57638802],
       [-0.24540869,  0.91324755,  0.00972116, ..., -0.99850112,
         1.72572313, -0.57638802]])

In [36]:
# Persist the fitted scaler next to the pickled encoders.
with open('scaler.pkl', mode='wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


In [15]:
# Show the encoded DataFrame; scaling was applied to X_train / X_test only, so df still holds the unscaled values.
df

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.00,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.80,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.00,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.10,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,39,5,0.00,2,1,0,96270.64,0,1.0,0.0,0.0
9996,516,1,35,10,57369.61,1,1,1,101699.77,0,1.0,0.0,0.0
9997,709,0,36,7,0.00,1,0,1,42085.58,1,1.0,0.0,0.0
9998,772,1,42,3,75075.31,2,1,0,92888.52,1,0.0,1.0,0.0


In [19]:
# ANN Implementation

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime

In [17]:
# (rows, features) of the training matrix; the second entry is what the first Dense layer receives as input_shape.
X_train.shape

(8000, 12)

In [20]:
# Assemble the feed-forward network used to predict the 'Exited' target

n_features = X_train.shape[1]  # one input per column of the scaled training matrix

hidden_layer_1 = Dense(64, activation="relu", input_shape=(n_features,))  # Hidden Layer 1
hidden_layer_2 = Dense(32, activation="relu")  # Hidden Layer 2
output_layer = Dense(1, activation="sigmoid")  # Output Layer (single probability)

model = Sequential([hidden_layer_1, hidden_layer_2, output_layer])

In [None]:
# Think of the below code as designing the "brain" of your prediction model.

# model = Sequential([...]):
# What it is: This creates a basic type of neural network where information flows straight through, layer by layer, like a production line. Sequential means the layers are stacked one after the other.
# Analogy: Imagine building with LEGOs; you stack one brick on top of the next.

# Dense(...):
# What it is: This defines a "layer" in your network. "Dense" means every "neuron" (or processing unit) in this layer is connected to every neuron in the previous layer. It's the most common type of layer.
# Analogy: Think of it as a group of workers. In a dense layer, every worker in this group talks to every worker in the previous group to get information.

# Dense(64, activation="relu", input_shape=(X_train.shape[1],)) - Hidden Layer 1:
# 64 (Number of Neurons): This is the number of processing units or "neurons" in this first hidden layer.
# Why 64? This number is a hyperparameter – a setting you choose before training. There's no magic formula! Common practices include:
# Starting with powers of 2 (like 32, 64, 128).
# Choosing a number related to the input features (you have X_train.shape[1] features, which was 12 in your notebook). 64 is larger than 12, allowing the network to potentially learn complex combinations of those features.
# Experimentation: Often, developers try different numbers (e.g., 32, 64, 128) and see which performs best on their specific problem. 64 is a reasonable starting point.

# activation="relu" (Activation Function): This is like an "on/off" switch or a filter for each neuron. It decides what information gets passed to the next layer.
# What ReLU does: If the neuron's input is positive, it passes that value along. If it's zero or negative, it outputs zero. (Think: max(0, input)).
# Why ReLU? It's very popular for hidden layers because:
# It's simple and fast to compute.
# It helps prevent a problem called the "vanishing gradient," making training more effective, especially in deeper networks.
#  input_shape=(X_train.shape[1],): This tells the first layer what the input data looks like. X_train.shape[1] is the number of columns (features) in your training data (which was 12 after preprocessing). You only need to specify this for the very first layer.

# Dense(32, activation="relu") - Hidden Layer 2:
# 32 (Number of Neurons): This second hidden layer has 32 neurons.
# Why 32? Again, it's a hyperparameter. It's common to gradually decrease the number of neurons in deeper hidden layers. The idea is that the network combines features in the first layer and then refines those combinations into more abstract patterns in subsequent layers, requiring fewer neurons. Going from 64 to 32 follows this pattern.
# activation="relu": Same reason as before – a good default choice for hidden layers.

# Dense(1, activation="sigmoid") - Output Layer:
# 1 (Number of Neurons): This final layer has only one neuron.
# Why 1? Because you are doing binary classification. You want to predict one of two outcomes: either the customer Exited (1) or Did Not Exit (0). A single neuron is sufficient to output a single value representing this prediction.
# activation="sigmoid": This activation function is crucial for the output layer in binary classification.
# What Sigmoid does: It squishes any input value into a range between 0 and 1.
# Why Sigmoid? The output (between 0 and 1) can be interpreted as a probability. For example, an output of 0.85 means the model is 85% confident that the customer will exit. An output of 0.1 means it's only 10% confident they will exit. This is exactly what we need for predicting yes/no outcomes.

In [None]:
# Layers Similar to Dense
# Convolutional Layers (Conv1D, Conv2D, Conv3D)
# Specialized for grid-like data (images, time series)
# Detect patterns regardless of where they appear in the input
# Example: Conv2D(32, kernel_size=(3,3)) for image processing

# Recurrent Layers (SimpleRNN, LSTM, GRU)
# Process sequential data where order matters
# Maintain internal memory of previous inputs
# Example: LSTM(64) for text or time series analysis

# Embedding Layers
# Convert categorical data (like words) into dense vectors
# Learn meaningful representations of categories
# Example: Embedding(vocab_size, embedding_dim) for text processing

# Attention Layers
# Help models focus on relevant parts of the input
# Critical for modern language models
# Example: MultiHeadAttention(num_heads=8) for transformer-based models


# Hidden Layer Size and Count

# Layer Size Progression
# The "half of previous" rule (64→32) is a common starting point but not a strict requirement

# Other common patterns:
# Same size for all layers (64→64→64)
# Gradual reduction (128→64→32)
# Bottleneck architecture (64→32→64) for autoencoders

# Deciding Number of Hidden Layers

# There's no one-size-fits-all rule. Consider:
# 1. Problem Complexity:
# Simple problems: 1-2 layers often sufficient
# Complex problems: May benefit from 3+ layers

# 2. Data Amount:
# More data can support deeper networks
# Limited data may overfit with too many layers

# 3. Computational Resources:
# Deeper networks require more computation
# Consider your hardware limitations

# 4. Empirical Testing:
# Start simple (1-2 layers)
# Add layers if underfitting (poor performance on both training and validation)
# Remove layers if overfitting (good on training, poor on validation)
# Monitor validation performance to guide decisions


# Understanding Sigmoid vs. Softmax Activation Functions

## Sigmoid Activation

# **What it does:**
# - Takes any input value and squeezes it into a range between 0 and 1
# - Formula: `f(x) = 1 / (1 + e^(-x))`
# - Outputs a single value that can be interpreted as a probability

# **When to use:**
# - **Binary classification problems** (predicting one of two classes)
# - In the output layer when you need a yes/no prediction
# - Examples: spam detection, customer churn prediction, disease diagnosis

# **Why it works for binary tasks:**
# - The output (between 0 and 1) represents the probability of the positive class
# - Values closer to 1 indicate higher confidence in the positive class
# - Values closer to 0 indicate higher confidence in the negative class
# - Decision threshold is typically 0.5 (above = positive class, below = negative class)

# ## Softmax Activation

# **What it does:**
# - Takes a vector of values and transforms them into a probability distribution
# - All outputs sum to exactly 1.0
# - Formula: `softmax(x)_i = e^(x_i) / Σ(e^(x_j))` for all j
# - Emphasizes the largest values while suppressing lower values

# **When to use:**
# - **Multi-class classification problems** (predicting one of 3+ classes)
# - In the output layer when you need to choose among several options
# - Examples: image classification, language identification, product categorization

# **Why it works for multi-class tasks:**
# - Each output represents the probability for one specific class
# - The class with the highest probability is the model's prediction
# - Allows the model to express uncertainty across multiple classes
# - Enables meaningful comparisons between different class probabilities

# ## Key Differences

# - **Sigmoid**: One output (0-1) for binary decisions
# - **Softmax**: Multiple outputs (all sum to 1) for choosing among multiple options

# Think of sigmoid as answering "How likely is this single outcome?" while softmax answers "Which of these multiple outcomes is most likely?"


In [21]:
# Print each layer's output shape and parameter count, plus the model totals.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                832       
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2945 (11.50 KB)
Trainable params: 2945 (11.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
## Understanding `model.summary()`

# This summary gives you a quick overview of your model's architecture:

# 1.  **`Model: "sequential"`**: Confirms you built a `Sequential` model.
# 2.  **`Layer (type)`**: Lists each layer you added (`dense`, `dense_1`, `dense_2` are just default names) and their type (`Dense`).
# 3.  **`Output Shape`**: Shows the shape of the data *coming out* of each layer.
#     *   `(None, 64)`: The first layer outputs a batch of data (the `None` part means the batch size can vary) where each item has 64 values (one from each neuron).
#     *   `(None, 32)`: The second layer outputs data with 32 values per item.
#     *   `(None, 1)`: The final layer outputs data with 1 value per item (our probability).
# 4.  **`Param #` (Parameters):** This is the most important part – it shows the number of *learnable values* (weights and biases) in each layer. These are the numbers the model adjusts during training to make better predictions.
#     *   **`dense` (Layer 1):** 832 parameters.
#         *   *How it's calculated:* You have 12 input features (`X_train.shape[1]`). Each input connects to each of the 64 neurons (12 * 64 = 768 weights). Each neuron also has 1 bias value (+ 64 biases). Total = 768 + 64 = 832.
#     *   **`dense_1` (Layer 2):** 2080 parameters.
#         *   *How it's calculated:* This layer takes input from the 64 neurons of the previous layer. Each of these 64 connects to the 32 neurons here (64 * 32 = 2048 weights). Each of the 32 neurons has 1 bias (+ 32 biases). Total = 2048 + 32 = 2080.
#     *   **`dense_2` (Layer 3 - Output):** 33 parameters.
#         *   *How it's calculated:* Takes input from the 32 neurons of the previous layer. Each connects to the 1 output neuron (32 * 1 = 32 weights). The output neuron has 1 bias (+ 1 bias). Total = 32 + 1 = 33.
# 5.  **`Total params: 2945`**: The total number of values the model needs to learn during training.
# 6.  **`Trainable params: 2945`**: The number of parameters that will actually be updated during training. Usually, this is the same as the total params unless you specifically "freeze" some layers.
# 7.  **`Non-trainable params: 0`**: Parameters that are not updated during training (none in this case).




# Understanding Neural Network Parameters: Weights and Biases

# Let me explain these fundamental concepts in simple terms:

## What Are Parameters?

# **Parameters** are the values that a neural network learns during training. They're like the "knobs" the model adjusts to get better at making predictions. There are two types of parameters:

# 1. **Weights**
# 2. **Biases**

# ## Weights Explained

# **Weights** determine how strongly each input affects a neuron's output.

# **Simple analogy:** Imagine you're deciding whether to go to the beach. Different factors influence your decision:
# - Temperature (very important) → high weight
# - Wind speed (somewhat important) → medium weight
# - Day of week (less important) → low weight

# In a neural network:
# - Each connection between neurons has a weight
# - Higher weight = stronger connection
# - Weights can be positive (encouraging) or negative (discouraging)
# - The network learns which connections should be strong and which should be weak

# ## Biases Explained

# **Biases** are additional values that help the model make better predictions even when all inputs are zero or very small.

# **Simple analogy:** Continuing with the beach example:
# - Even if all conditions (temperature, wind, etc.) are neutral, you might still have a personal preference
# - If you generally like beaches, you have a positive bias toward going
# - If you generally dislike beaches, you have a negative bias against going

# In a neural network:
# - Each neuron has its own bias
# - Bias allows neurons to activate (or not) even when inputs are minimal
# - It's like each neuron's "default opinion" before considering any inputs

# ## Why One Bias Per Neuron?

# Each neuron gets exactly one bias because:

# 1. **Purpose of bias:** The bias shifts the activation function left or right. One value is sufficient to create this shift.

# 2. **Mathematical necessity:** In the neuron's equation: `output = activation_function(sum(inputs × weights) + bias)`, one bias term is all that's needed.

# 3. **Unique threshold:** Each neuron needs its own unique "firing threshold" - the bias provides this.


# ## What Happens During Training

# 1. The network starts with random weights and biases
# 2. For each batch of training examples:
#    - The network makes a prediction
#    - The error is calculated (how far off the prediction was)
#    - Weights and biases are adjusted slightly to reduce the error
# 3. This process repeats thousands of times
# 4. Gradually, the weights and biases evolve to capture patterns in your data

# The beauty of neural networks is that they automatically learn which features are important (by assigning higher weights) and how they should be combined to make accurate predictions.


In [23]:
import tensorflow

# Loss: binary cross-entropy, matching the single sigmoid output of the model.
loss = tensorflow.keras.losses.BinaryCrossentropy()

# Optimizer: Adam with an explicit step size of 0.01.
optimizer = tensorflow.keras.optimizers.Adam(learning_rate=0.01)

In [None]:


# Understanding Optimizers and Loss Functions in Neural Networks

## The Optimizer (Adam)

# ```python
# optimizer = tensorflow.keras.optimizers.Adam(learning_rate=0.01)
# ```

# **What it does:** The optimizer controls *how* the model learns from its mistakes. It's like the "learning strategy" of your neural network.

# **Adam specifically:**
# - A popular and efficient optimizer that adapts the learning rate for each parameter
# - Combines the benefits of two other optimizers (AdaGrad and RMSProp)
# - Works well for most problems without much tuning

# **Learning rate (0.01):**
# - Controls how big of a step the model takes when updating weights
# - Higher value (e.g., 0.1): Learns faster but might overshoot the optimal solution
# - Lower value (e.g., 0.001): Learns more carefully but might take too long
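# - 0.01 is ten times the Keras default for Adam (0.001); it can speed up training on a small tabular problem like this one, but if the loss oscillates or diverges, move back toward 0.001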

# **Other popular optimizers:**
# - **SGD** (Stochastic Gradient Descent): Simple but requires more tuning
# - **RMSProp**: Good for recurrent neural networks
# - **Adagrad**: Adapts learning rates based on parameter frequency
# - **Adamax**: A variant of Adam that can be more stable in some cases

# ## The Loss Function (Binary Cross-Entropy)

# ```python
# loss = tensorflow.keras.losses.BinaryCrossentropy()
# ```

# **What it does:** The loss function measures how wrong the model's predictions are. It's like the "scoring system" that tells the model how badly it's performing.

# **Binary Cross-Entropy specifically:**
# - Designed for binary classification problems (yes/no, 0/1)
# - Measures the difference between predicted probabilities and actual values
# - Heavily penalizes confident wrong predictions (e.g., predicting 0.9 when the true value is 0)
# - Works well with sigmoid activation in the output layer

# **When to use Binary Cross-Entropy:**
# - When predicting one of two possible outcomes (like customer churn/no churn)
# - When your output layer uses sigmoid activation
# - When your target values are 0 or 1

# **Other common loss functions:**
# - **Categorical Cross-Entropy**: For multi-class classification (3+ classes)
# - **Sparse Categorical Cross-Entropy**: Same as above but with integer labels
# - **Mean Squared Error**: For regression problems (predicting continuous values)
# - **Huber Loss**: For regression that's less sensitive to outliers

# ## Compiling the Model

# ```python
# model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])
# ```

# This line puts everything together and prepares the model for training by specifying:

# 1. **optimizer**: How the model should update its weights (Adam in this case)
# 2. **loss**: How to measure prediction errors (Binary Cross-Entropy)
# 3. **metrics**: What to track during training ("accuracy" measures the percentage of correct predictions)

# You can specify the loss function either as a string (`"binary_crossentropy"`) or as an object (`loss=tensorflow.keras.losses.BinaryCrossentropy()`). Both approaches work the same way.

# **The relationship between these components:**
# - The model makes predictions
# - The loss function calculates how wrong those predictions are
# - The optimizer determines how to adjust the weights to reduce the loss
# - The metrics track how well the model is performing

# This process repeats for each batch of training data until the model learns to make accurate predictions.


In [24]:
# Compile the model
# Pass the optimizer and loss objects configured in the previous cell, so that any
# change made there (e.g. loss options) actually takes effect; previously the loss
# object was built but a separate string spec was handed to compile instead.
model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

In [25]:
# Configure TensorBoard logging for this training run

from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

# Each run writes to its own timestamped sub-directory under logs/fit/.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + run_stamp

# histogram_freq=1 records weight histograms every epoch.
tensorflow_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# TensorBoard is a visualization tool that helps you track and analyze your model's training progress.

# Key features:
# - Creates interactive graphs of metrics (loss, accuracy) over time
# - Shows distributions of weights and biases
# - Helps identify problems like overfitting
# - Saves all data to the specified log directory with a timestamp

In [29]:
# Stop training once val_loss has gone 15 epochs without improving,
# and keep the best weights seen rather than the final ones
early_stopping_callback = EarlyStopping(
    monitor="val_loss",
    patience=15,
    restore_best_weights=True,
)


# Understanding the EarlyStopping Callback

## EarlyStopping

# ```python
# early_stopping_callback = EarlyStopping(monitor="val_loss", patience=15, restore_best_weights=True)
# ```

# **What it does:** Automatically stops training when the model stops improving.

# **Key parameters:**
# - `monitor="val_loss"`: Watches validation loss to determine improvement
# - `patience=15`: Waits 15 epochs of no improvement before stopping
# - `restore_best_weights=True`: Keeps the best model version, not the final one

# **Why it's useful:** Prevents wasting time and computational resources on unproductive training and helps avoid overfitting by stopping before the model starts memorizing the training data.

# Both callbacks (TensorBoard and EarlyStopping) are passed to `model.fit()` through its `callbacks` argument, so they enhance the training process without changing the model itself.


In [30]:
# Fit the network on the training data, validating against the held-out data

# NOTE(review): X_test/y_test are used as the validation data here, and the
# early-stopping callback monitors val_loss, so the test split influences
# which weights are kept. Consider a separate validation split so the test
# set stays untouched for the final evaluation.
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[tensorflow_callback, early_stopping_callback],
)



# Understanding `model.fit()` - Training Your Neural Network

## What's Happening

# 1. **Starting the Training Process**: `model.fit()` begins the training of your neural network.

# 2. **Feeding Data**:
#    - `X_train`: Your input features (customer data like age, credit score, etc.)
#    - `y_train`: Your target values (whether customers exited or not)
#    - `validation_data=(X_test, y_test)`: Separate data to check how well the model generalizes

# 3. **Setting Training Duration**:
#    - `epochs=100`: The model will go through the entire dataset up to 100 times
#    - Each "epoch" is one complete pass through all training data

# 4. **Using Callbacks for Monitoring and Control**:
#    - `tensorflow_callback`: Records training metrics for visualization in TensorBoard
#    - `early_stopping_callback`: Watches for when training stops improving and ends early

# 5. **Behind the Scenes**:
#    - For each batch of data:
#      - The model makes predictions
#      - Compares predictions to actual values
#      - Calculates the loss (error)
#      - Updates weights and biases to reduce the error
#    - After each epoch, it evaluates on the validation data
#    - Callbacks monitor progress and can take actions (like stopping training)

# 6. **Storing Results**:
#    - `history`: Captures training metrics over time (loss, accuracy for both training and validation)
#    - This can be used later to plot learning curves

# ## What You See in the Output

# The output shows progress for each epoch:

# ```text
# Epoch 1/100
# 250/250 [==============================] - 1s 3ms/step - loss: 0.3435 - accuracy: 0.8590 - val_loss: 0.3397 - val_accuracy: 0.8590
# ```


# - `250/250`: Processed 250 batches out of 250 total
# - `loss: 0.3435`: Training loss (error) for this epoch
# - `accuracy: 0.8590`: Training accuracy (85.9% correct predictions)
# - `val_loss: 0.3397`: Validation loss
# - `val_accuracy: 0.8590`: Validation accuracy

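# The training stopped after 16 epochs (instead of the maximum 100) because the early stopping callback detected that the validation loss wasn't improving anymore, preventing overfitting and saving computation time. With `patience=15`, stopping at epoch 16 means the best validation loss was recorded at epoch 1 and the following 15 epochs never beat it; `restore_best_weights=True` then rolls the model back to the weights from that best epoch.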




# Understanding Epochs and Batches in Neural Network Training

## What Are Epochs?

# An **epoch** means training your model on the entire dataset exactly once. 

# **Simple explanation:**
# - Imagine you have a textbook with 1000 pages
# - Reading the entire book once = 1 epoch
# - Reading it again from start to finish = another epoch

# In your code, `epochs=100` means:
# - The model will process all your training data up to 100 times
# - Each time through the data, the model learns and adjusts its weights
# - It's like giving your model multiple chances to study the same material

# ## Why 100 Epochs?

# The number 100 is just a maximum limit, not a requirement. It means:
# - "Keep training for up to 100 complete passes through the data"
# - The early stopping callback will likely stop training before reaching 100

# **How to choose the number:**
# - Too few epochs: The model might not learn enough (underfitting)
# - Too many epochs: The model might memorize the training data (overfitting)
# - The "right" number depends on your specific data and model

# **Common approaches:**
# 1. Set a high number (like 100) but use early stopping to prevent overfitting
# 2. Try different values and see which gives the best validation performance
# 3. Look at learning curves to see when improvement plateaus

# ## What Are Batches?

# **Batches** are smaller chunks of your training data processed at one time.

# **Simple explanation:**
# - Instead of studying the entire textbook at once, you study it chapter by chapter
# - Each "chapter" is a batch of data

# In your output, `250/250` means:
# - Your training data was divided into 250 batches
# - The model has processed all 250 batches (completing one epoch)

# ## Why Use Batches?

# 1. **Memory efficiency**: Processing all data at once might not fit in memory
# 2. **Training speed**: Updates happen more frequently, potentially speeding up learning
# 3. **Better generalization**: Small random variations between batches can help the model generalize better

# ## How Batch Size Is Determined

# The default batch size in Keras is 32, meaning:
# - If you have 8000 training examples, you'd have 8000 ÷ 32 = 250 batches

# You can explicitly set the batch size with the `batch_size` parameter:
# ```python
# model.fit(X_train, y_train, batch_size=64, epochs=100, ...)
# ```

# **Choosing batch size:**
# - Smaller batches (8-64): More updates, potentially better generalization, but slower training
# - Larger batches (128-512): Faster training, but might generalize less well
# - Very large batches: May require more epochs to reach the same performance

# The optimal values for both epochs and batch size depend on your specific dataset and model architecture, and often require experimentation to find the best balance.




Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100


In [33]:
# Persist the trained model to "model.h5" in the current working directory.
# NOTE(review): the truncated `saving_api.save_model(` line in this cell's
# recorded output looks like a warning raised by the save call (presumably
# about the HDF5 format) -- confirm before changing the file name, since
# other code may load "model.h5" by this exact path.
model.save("model.h5")


# Understanding .h5 vs .pkl File Formats

## .h5 (HDF5) Format

# **What it is:**
# - HDF5 (Hierarchical Data Format version 5) is a file format designed for storing large, complex datasets
# - It's optimized for storing numerical data arrays and supports complex hierarchical structures
# - In deep learning, it's commonly used to save entire neural network models including architecture and weights

# **Key features:**
# - Efficient storage of large numerical arrays
# - Hierarchical organization (like folders within folders)
# - Fast read/write operations
# - Cross-platform compatibility
# - Supports partial reading (you can load just part of a file)

# **When to use .h5:**
# - When saving complete neural network models (architecture + weights)
# - When working with large datasets that need efficient storage
# - When you need to preserve the exact structure of complex data
# - When using frameworks like TensorFlow/Keras that natively support it

# ## .pkl (Pickle) Format

# **What it is:**
# - Pickle is Python's native serialization format
# - It converts Python objects into byte streams that can be saved to disk and loaded later
# - It can store almost any Python object (not just numerical data)

# **Key features:**
# - Python-specific (not easily used by other programming languages)
# - Can serialize almost any Python object
# - Simple to use with Python's built-in pickle module
# - Less efficient for large numerical data compared to HDF5

# **When to use .pkl:**
# - When saving scikit-learn models or preprocessors (like your LabelEncoder)
# - When storing Python-specific objects that aren't just numerical data
# - When you need to quickly save and load Python objects without worrying about their internal structure
# - When working with smaller datasets or models

# ## Key Differences

# 1. **Compatibility:**
#    - .h5: Cross-platform, can be used by multiple programming languages
#    - .pkl: Python-specific, not easily used outside Python

# 2. **Performance:**
#    - .h5: More efficient for large numerical datasets
#    - .pkl: Less efficient for large data, but simpler for small objects

# 3. **Use cases:**
#    - .h5: Better for neural networks and large numerical datasets
#    - .pkl: Better for scikit-learn models and general Python objects

# 4. **Structure:**
#    - .h5: Maintains hierarchical structure, allows partial access
#    - .pkl: Serializes the entire object at once

# In your notebook, each format has a natural role:
# - `.h5` for saving your neural network model
# - `.pkl` for saving preprocessors like your encoders


  saving_api.save_model(


In [31]:
# Load the TensorBoard notebook extension so the %tensorboard magic is available
%load_ext tensorboard

In [1]:
# Terminate the process with PID 113682 (the PID TensorBoard reports for its
# reused instance in this notebook's output).
# NOTE(review): the PID is hard-coded from one particular session; the recorded
# output of this cell is "No such process", so it will likely fail on a fresh
# run. Look up the current PID (it is printed by %tensorboard) before using it.
!kill 113682

/bin/bash: line 1: kill: (113682) - No such process


In [34]:
# Open TensorBoard inline, reading every run logged under logs/fit
# (the parent directory of the timestamped log_dir used during training)
%tensorboard --logdir logs/fit

Reusing TensorBoard on port 6006 (pid 113682), started 0:07:15 ago. (Use '!kill 113682' to kill it.)