In [None]:
import pandas as pd
import numpy as np
import io
from google.colab import files # For file upload in Colab

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer # Was missing from original imports
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Read the uploaded CSV file into a pandas DataFrame
df = pd.read_csv("Churn_Modelling.csv")

In [None]:
print(df.head())
print("\n--- Info on missing values ---")
print(df.isnull().sum())

   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [None]:
# --- 2. Define Features and Target ---
# IMPORTANT: Make sure 'Exited' is the correct name of your target column.
# If your target column has a different name, change it below.
X = df.drop('Exited', axis=1)
y = df['Exited']


In [None]:
# --- 3. Preprocessing Steps ---

# Identify numerical and categorical features automatically
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

print(f"\nIdentified Numerical Features: {numerical_features}")
print(f"Identified Categorical Features: {categorical_features}")


Identified Numerical Features: ['RowNumber', 'CustomerId', 'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
Identified Categorical Features: ['Surname', 'Geography', 'Gender']


In [None]:
# Create preprocessing pipelines for both feature types
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), # Handle missing numerical values
    ('scaler', StandardScaler())                    # Standardize numerical columns
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # Handle missing categorical values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))   # One-hot encode categorical columns
])

In [None]:
# Combine pipelines into a single preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

In [None]:
# --- 4. Split the dataset into training and testing sets ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
# --- 5. Apply the preprocessing pipeline to the data ---
# The preprocessor learns from the training data and transforms both sets
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"\nShape of processed training data: {X_train_processed.shape}")
print(f"Shape of processed testing data: {X_test_processed.shape}")


Shape of processed training data: (8000, 2635)
Shape of processed testing data: (2000, 2635)


In [None]:
# --- 6. Build the Keras Deep Learning Model ---
model = Sequential([
    # Input layer: The input_shape must match the number of features after preprocessing
    Dense(16, activation='relu', input_shape=(X_train_processed.shape[1],)),

    # Hidden layer
    Dense(8, activation='relu'),

    # Output layer: 1 neuron with a sigmoid activation for binary classification
    Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Display the model's architecture
print("\n--- Model Summary ---")
model.summary()



--- Model Summary ---


In [None]:
# --- 7. Train the Model ---
print("\n--- Training the model ---")
history = model.fit(
    X_train_processed,
    y_train,
    epochs=50,
    batch_size=32, # A batch size of 32 is a common default
    validation_split=0.2,
    verbose=1
)


--- Training the model ---
Epoch 1/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.7359 - loss: 0.5959 - val_accuracy: 0.8100 - val_loss: 0.4251
Epoch 2/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8248 - loss: 0.3932 - val_accuracy: 0.8456 - val_loss: 0.3874
Epoch 3/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8588 - loss: 0.3430 - val_accuracy: 0.8562 - val_loss: 0.3693
Epoch 4/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8737 - loss: 0.3123 - val_accuracy: 0.8506 - val_loss: 0.3720
Epoch 5/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9018 - loss: 0.2535 - val_accuracy: 0.8450 - val_loss: 0.3876
Epoch 6/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9160 - loss: 0.2253 - val_accuracy: 0.8338 - val_loss: 0.4183


In [None]:
# --- 8. Evaluate the Model ---
print("\n--- Evaluating the model on the test set ---")
loss, accuracy = model.evaluate(X_test_processed, y_test, verbose=0)
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Test Loss: {loss:.4f}")


--- Evaluating the model on the test set ---
Test Accuracy: 77.60%
Test Loss: 1.3469
