In [1]:
# To perform Neural Network modeling on the provided dataset, you first need to prepare the data,
# including preprocessing and splitting it into training and testing sets.
# Using different types of Neural Networks for different types of features, we'll need to process the data accordingly.
# General overview of how to approach this task:

# 1. Data Preprocessing:
# Feature Engineering: Based on dataset, need to engineer features or encode categorical variables appropriately
# (e.g., one-hot encoding for categorical variables).
# Normalization/Scaling: Normalize or scale numerical features to ensure they have similar ranges.
# Train/Test Split: Split the dataset into training and testing sets.

# 2. Model Building:
# Feedforward Neural Networks (FNN):
# Design a Feedforward Neural Network architecture. we will Use libraries like TensorFlow or PyTorch to build and train your model.
# Define the input layer according to the number of features.
# Design hidden layers with appropriate activation functions (e.g., ReLU) and units.
# Define the output layer. Since it's a binary classification problem (churn prediction),
# we use a sigmoid activation function in the output layer.
# Convolutional Neural Networks (CNN):
# If you have image-related features, you need to reshape your data accordingly.
# Design a CNN architecture using libraries like TensorFlow or PyTorch.
# Convolutional layers followed by pooling layers to extract relevant features.
# Flatten the output and connect to fully connected layers for classification.
# Recurrent Neural Networks (RNN):
# If you have sequence-related features, you can use RNNs.
# LSTM or GRU layers can be suitable for capturing sequential patterns.
# Define the input shape considering the sequence length.
# Design the RNN architecture and connect it to fully connected layers for classification.

# 3. Model Training:
# Compile the models with appropriate loss functions (e.g., binary cross-entropy) and optimizers (e.g., Adam).
# Train the models on the training data.
# Monitor performance on the validation set to avoid overfitting.
# Tweak hyperparameters if necessary.

# 4. Model Evaluation:
# Evaluate the models on the test set using metrics like accuracy, precision, recall, F1-score, ROC AUC, etc.
# Compare the performance of different models to choose the best one.

In [120]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np
from scipy.sparse import issparse

In [87]:
# 1. Data Loading and Preprocessing:

In [88]:
# load Data
from google.colab import files
uploaded = files.upload()

Saving E_Commerce_Customer_Behavior.csv to E_Commerce_Customer_Behavior.csv


In [89]:
# Read the file with tab delimiter
data = pd.read_csv("E_Commerce_Customer_Behavior.csv", sep='\t')

In [90]:
# 1. Handling Missing Values
# Drop rows with missing values
data.dropna(inplace=True)

In [91]:
# 3. Removing Duplicates
data.drop_duplicates(inplace=True)

In [92]:
# load to dataframe
df = pd.DataFrame(data)

In [93]:
# look at the data see whats cooking
df.head(3)

Unnamed: 0,account length;location code;user id;credit card info save;push status;add to wishlist;desktop sessions;app sessions;desktop transactions;total product detail views;session duration;promotion clicks;avg order value;sale product views;discount rate per visited products;product detail view per app session;app transactions;add to cart per session;customer service calls;churn
0,128;415;3824657;no;yes;25;265;45;17;110;197;87...
1,107;415;3717191;no;yes;26;162;27;17;123;196;10...
2,137;415;3581921;no;no;0;243;41;10;114;121;110;...


In [94]:
# Split the string data into separate columns based on semicolon delimiter to solve issue
df = pd.DataFrame([row.split(';') for row in data],
                  columns=['account length', 'location code', 'user id', 'credit card info save',
                           'push status', 'add to wishlist', 'desktop sessions', 'app sessions',
                           'desktop transactions', 'total product detail views', 'session duration',
                           'promotion clicks', 'avg order value', 'sale product views',
                           'discount rate per visited products', 'product detail view per app session',
                           'app transactions', 'add to cart per session', 'customer service calls', 'churn'])

# Remove the first row (header row)
df = df.iloc[1:]

# Now, you can view the DataFrame
print(df.head(3))

Empty DataFrame
Columns: [account length, location code, user id, credit card info save, push status, add to wishlist, desktop sessions, app sessions, desktop transactions, total product detail views, session duration, promotion clicks, avg order value, sale product views, discount rate per visited products, product detail view per app session, app transactions, add to cart per session, customer service calls, churn]
Index: []


In [95]:
# Load the data using read_csv with appropriate delimiter
df = pd.read_csv("E_Commerce_Customer_Behavior.csv", sep=';')

# Now, you can view the DataFrame
df.head(6)

Unnamed: 0,account length,location code,user id,credit card info save,push status,add to wishlist,desktop sessions,app sessions,desktop transactions,total product detail views,session duration,promotion clicks,avg order value,sale product views,discount rate per visited products,product detail view per app session,app transactions,add to cart per session,customer service calls,churn
0,128,415,3824657,no,yes,25,265,45,17,110,197,87,2447,91,1101,10,3,27,1,0
1,107,415,3717191,no,yes,26,162,27,17,123,196,103,2544,103,1145,137,3,37,1,0
2,137,415,3581921,no,no,0,243,41,10,114,121,110,1626,104,732,122,5,329,0,0
3,84,408,3759999,yes,no,0,299,51,5,71,62,88,1969,89,886,66,7,178,2,0
4,75,415,3306626,yes,no,0,167,28,13,113,148,122,1869,121,841,101,3,273,3,0
5,118,510,3918027,yes,no,0,223,38,19,98,221,101,2039,118,918,63,6,17,0,0


In [98]:
# be prepared to deal with various issues when it come to preprocessing. Use print function for debugging

In [99]:
# Define features and target variable
X = df.drop(columns=['churn'])
y = df['churn']

In [100]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train shape before preprocessing:", X_train.shape)
print("X_val shape before preprocessing:", X_val.shape)


X_train shape before preprocessing: (2666, 19)
X_val shape before preprocessing: (667, 19)


In [105]:
# Define preprocessing steps
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
numeric_transformer = StandardScaler()

categorical_features = X.select_dtypes(include=['object']).columns
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [106]:
# Preprocess the data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

print("X_train_preprocessed shape:", X_train_preprocessed.shape)
print("X_val_preprocessed shape:", X_val_preprocessed.shape)


X_train_preprocessed shape: (2666, 2667)
X_val_preprocessed shape: (667, 2667)


In [114]:
# an extra step to ensure the data matches
# Check if the data is in sparse format
if issparse(X_train_preprocessed):
    # Convert sparse matrix to dense array
    X_train_preprocessed = X_train_preprocessed.toarray()

# Check if the data is in sparse format
if issparse(X_val_preprocessed):
    # Convert sparse matrix to dense array
    X_val_preprocessed = X_val_preprocessed.toarray()

# Print the shapes of the preprocessed data to verify
print("X_train_preprocessed shape after conversion:", X_train_preprocessed.shape)
print("X_val_preprocessed shape after conversion:", X_val_preprocessed.shape)
# Print the first few rows of preprocessed data
print("X_train_preprocessed:")
print(X_train_preprocessed[:5])
print("X_val_preprocessed:")
print(X_val_preprocessed[:5])

X_train_preprocessed shape after conversion: (2666, 2667)
X_val_preprocessed shape after conversion: (667, 2667)
X_train_preprocessed:
[[ 3.60138166  1.73584027 -0.68323211 ...  0.          0.
   0.        ]
 [ 0.18495105 -0.51716801  0.89955655 ...  0.          0.
   0.        ]
 [-0.65017643 -0.51716801  0.34211893 ...  0.          0.
   0.        ]
 [ 1.02007854 -0.51716801 -1.24203922 ...  0.          0.
   0.        ]
 [-0.3718006   1.73584027 -0.61608715 ...  0.          0.
   0.        ]]
X_val_preprocessed:
[[ 0.31148552  1.73584027 -0.80201924 ...  0.          0.
   0.        ]
 [-0.85263157 -0.51716801 -0.19406621 ...  0.          0.
   0.        ]
 [-0.06811788 -0.51716801  0.63262153 ...  0.          0.
   0.        ]
 [ 1.1719199  -0.68317915  0.77895429 ...  0.          0.
   0.        ]
 [-0.11873167 -0.68317915 -1.651713   ...  0.          0.
   0.        ]]


In [115]:
# Define the FNN model
fnn_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_preprocessed.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])


In [116]:
# Compile the FNN model
fnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [117]:
# Train the FNN model
fnn_model.fit(X_train_preprocessed, y_train, epochs=10, batch_size=32, validation_data=(X_val_preprocessed, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7ab530180100>

In [121]:
# Model Evaluation
# Predict on the validation set
y_pred = (fnn_model.predict(X_val_preprocessed) > 0.5).astype("int32")

# Calculate evaluation metrics
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_pred)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)


Accuracy: 0.8785607196401799
Precision: 0.65625
Recall: 0.4158415841584158
F1 Score: 0.509090909090909
ROC AUC Score: 0.6884861631039428


In [122]:
# saving model to be run and improved later
# import pickle

# Save the model using pickle
# with open("fnn_model.pkl", "wb") as f:
#    pickle.dump(fnn_model, f)

# Optionally, you can load the saved model later
# with open("fnn_model.pkl", "rb") as f:
#     loaded_model = pickle.load(f)