In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from mpl_toolkits.mplot3d import Axes3D


In [2]:
# The superstore wants to predict the likelihood of the customer 
# giving a positive response and wants to identify the different factors which affect the customer's response
df = pd.read_csv("superstore_data.csv")
print(df.shape)
print(df.columns)

(2240, 22)
Index(['Id', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'Response', 'Complain'],
      dtype='object')


In [3]:
df.head()
# Me being biased, the following columns might be more relevant in this task then the others 
# NumDealsPurchases - number of purchases made with discount
# NumCatalogPurchases - number of purchases made using catalog (buying goods to be shipped through the mail)
# NumStorePurchases - number of purchases made directly in stores
# NumWebPurchases - number of purchases made through the company's website
# NumWebVisitsMonth - number of visits to company's website in the last month
# I will pick three of them and try to make a 3D plot with a decision boundary 

Unnamed: 0,Id,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Response,Complain
0,1826,1970,Graduation,Divorced,84835.0,0,0,6/16/2014,0,189,...,111,189,218,1,4,4,6,1,1,0
1,1,1961,Graduation,Single,57091.0,0,0,6/15/2014,0,464,...,7,0,37,1,7,3,7,5,1,0
2,10476,1958,Graduation,Married,67267.0,0,1,5/13/2014,0,134,...,15,2,30,1,3,2,5,2,0,0
3,1386,1967,Graduation,Together,32474.0,1,1,11/5/2014,0,10,...,0,0,0,1,1,0,2,7,0,0
4,5371,1989,Graduation,Single,21474.0,1,0,8/4/2014,0,6,...,11,0,34,2,3,1,2,7,1,0


In [4]:
df['Response'].unique()

array([1, 0], dtype=int64)

In [5]:
df['Response'].isnull().sum()
# so it is clean 

0

In [6]:
# how about the other colummns
df.isnull().sum()

Id                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
Response                0
Complain                0
dtype: int64

In [7]:
# Keep only numerical columns for PCA
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
# PCA is implemented on the features, but not the output(Response)
numerical_features = [col for col in numerical_features if col != "Response"]

In [8]:
# Drop rows with missing values again just in case
df = df[numerical_features + ["Response"]].dropna()

# Step 4: Prepare X and y
X = df[numerical_features].values
y = df["Response"].values

In [9]:
# Standardize Features: generally increases the efficiency of gradient descent
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [10]:
# Apply PCA, only using the first 3 components to spare the time and to facilitate a visualization
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

In [12]:
# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, random_state=42
)

In [24]:
# Use a common architecture here
model = Sequential([
    Dense(16, input_shape=(3,), activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid')
])
# Customize the learning rate (default of adam is 0.001)
from tensorflow.keras.optimizers import Adam
model.compile(optimizer=Adam(learning_rate=0.0001), loss="binary_crossentropy", metrics=["accuracy"])

In [25]:
# Train the Model
history = model.fit(X_train, y_train, epochs=100, batch_size=32,
                    validation_data=(X_test, y_test), verbose=1)

Epoch 1/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8584 - loss: 0.5708 - val_accuracy: 0.8536 - val_loss: 0.5619
Epoch 2/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8523 - loss: 0.5513 - val_accuracy: 0.8536 - val_loss: 0.5482
Epoch 3/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8542 - loss: 0.5433 - val_accuracy: 0.8536 - val_loss: 0.5375
Epoch 4/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8515 - loss: 0.5406 - val_accuracy: 0.8536 - val_loss: 0.5274
Epoch 5/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8445 - loss: 0.5315 - val_accuracy: 0.8536 - val_loss: 0.5186
Epoch 6/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8473 - loss: 0.5192 - val_accuracy: 0.8536 - val_loss: 0.5106
Epoch 7/100
[1m56/56[0m [32m━━━

In [26]:
# Here we are with a not so good loss
# Let's inspect possible causes
print(np.mean(y))  # Shows the proportion of class 1 (e.g., "Yes")

0.15027075812274368


In [29]:
# So most of the responses are NO = 0, this means an imbalanced data set
# which in turn possibly causes the Plateau-of-loss at 0.38 due to the lack of predictability of 1 (minority)
# To improve this let's build another model
# First, define how the classes shall be weighted
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = dict(enumerate(class_weights))
print("Calculated Class Weights:", class_weight_dict)

Calculated Class Weights: {0: 0.589095744680851, 1: 3.3059701492537314}


In [30]:
history_weighted = model.fit(X_train, y_train, epochs=100, batch_size=32, class_weight=class_weight_dict,
                    validation_data=(X_test, y_test), verbose=1)

Epoch 1/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8618 - loss: 0.8190 - val_accuracy: 0.8536 - val_loss: 0.3945
Epoch 2/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8577 - loss: 0.7883 - val_accuracy: 0.8536 - val_loss: 0.4110
Epoch 3/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8414 - loss: 0.8006 - val_accuracy: 0.8536 - val_loss: 0.4258
Epoch 4/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8412 - loss: 0.7498 - val_accuracy: 0.8536 - val_loss: 0.4402
Epoch 5/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8403 - loss: 0.7339 - val_accuracy: 0.8536 - val_loss: 0.4530
Epoch 6/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8488 - loss: 0.6910 - val_accuracy: 0.8514 - val_loss: 0.4635
Epoch 7/100
[1m56/56[0m [32m━━━

In [None]:
# It is even worse :P
# We will come back to this data set later, after we have tried out the other main algorithms :)
