# Superstore Membership Purchase - Predictive Modeling  
**Author:** Giovanna Cardenas  
**Date:** May 2025  
**Description:** This notebook builds and evaluates classification models (a decision tree, a boosted tree, and a bagged tree) to predict whether a customer will purchase a $499 membership. The final model selection is based on accuracy and feature importance.

In [185]:
# Import packages
%matplotlib inline
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split
from dmba import classificationSummary

warnings.filterwarnings('ignore')

In [187]:
# Read the cleaned Superstore data set and preview its first rows
data_file = Path('superstore_data_clean.csv')
df = pd.read_csv(data_file)
df.head()

Unnamed: 0,Id,Year_Birth,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,...,Response,Complain,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Marital_Status_Widow,Education_Basic,Education_Graduation,Education_PhD,Education_Postgrad
0,1826,1970,84835.0,0,0,0,189,104,379,111,...,1,0,True,False,False,False,False,True,False,False
1,1,1961,57091.0,0,0,0,464,5,64,7,...,1,0,False,False,True,False,False,True,False,False
2,10476,1958,67267.0,0,1,0,134,11,59,15,...,0,0,False,True,False,False,False,True,False,False
3,5371,1989,21474.0,1,0,0,6,16,24,11,...,1,0,False,False,True,False,False,True,False,False
4,7348,1958,71691.0,0,0,0,336,130,411,240,...,1,0,False,False,True,False,False,False,True,False


In [189]:
# Split the predictors from the target, then hold out 20% of the rows for testing
# NOTE(review): `Id` stays among the predictors here; it looks like a customer
# identifier rather than a real signal - confirm whether it should be excluded.
target = 'Response'
y = df[target]
x = df.drop(columns=[target])
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=1,
)

In [191]:
# Fit a decision tree with default hyperparameters as the baseline model
defaultTree = DecisionTreeClassifier(random_state=20).fit(x_train, y_train)

# Score the held-out rows and print the confusion matrix with its accuracy
classes = defaultTree.classes_
tree_predictions = defaultTree.predict(x_test)
classificationSummary(y_test, tree_predictions, class_names=classes)

Confusion Matrix (Accuracy 0.8384)

       Prediction
Actual   0   1
     0 248  28
     1  25  27


In [193]:
# Tabulate each predictor's importance in the fitted tree, sorted ascending by importance
importances = defaultTree.feature_importances_
df_i = pd.DataFrame(
    {'Feature': x_train.columns, 'Importance': importances}
).sort_values(by='Importance')
print(df_i)

                    Feature  Importance
18  Marital_Status_Divorced    0.000000
17                 Complain    0.000000
21     Marital_Status_Widow    0.000917
24            Education_PhD    0.002261
4                  Teenhome    0.003670
23     Education_Graduation    0.005780
20    Marital_Status_Single    0.007798
22          Education_Basic    0.009436
25       Education_Postgrad    0.012058
3                   Kidhome    0.012615
15        NumStorePurchases    0.028732
19   Marital_Status_Married    0.029380
1                Year_Birth    0.035336
13          NumWebPurchases    0.039325
16        NumWebVisitsMonth    0.040606
12        NumDealsPurchases    0.040814
9           MntFishProducts    0.045825
7                 MntFruits    0.046715
8           MntMeatProducts    0.048195
10         MntSweetProducts    0.049460
0                        Id    0.060618
5                   Recency    0.063761
14      NumCatalogPurchases    0.066441
6                  MntWines    0.085484


In [195]:
# Remove the predictors judged unimportant from the feature-importance table.
# The drop is reassigned (not done in place) and tolerates already-missing
# columns, so re-running this cell does not raise a KeyError.
low_importance_cols = ['Kidhome', 'Complain', 'Teenhome']
df = df.drop(columns=low_importance_cols, errors='ignore')

# Redefine features and target and split the reduced data set into training and testing sets
x = df.drop(columns=['Response'])
y = df['Response']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)

# Run tree classification again on the reduced feature set
defaultTree = DecisionTreeClassifier(random_state=20)
defaultTree.fit(x_train, y_train)

# `classes` is reused by the boosting and bagging cells further down
classes = defaultTree.classes_
classificationSummary(y_test, defaultTree.predict(x_test), class_names=classes)

Confusion Matrix (Accuracy 0.8445)

       Prediction
Actual   0   1
     0 251  25
     1  26  26


In [197]:
# Boost the tree with AdaBoost (up to 100 estimators) and check for an accuracy improvement
# NOTE(review): the base tree has no depth limit; if it fits the training rows
# perfectly, AdaBoost may stop after the first round - confirm boosting is effective.
base_tree = DecisionTreeClassifier(random_state=20)
boost = AdaBoostClassifier(base_tree, n_estimators=100, random_state=1)
boost.fit(x_train, y_train)

boost_predictions = boost.predict(x_test)
classificationSummary(y_test, boost_predictions, class_names=classes)

Confusion Matrix (Accuracy 0.8506)

       Prediction
Actual   0   1
     0 253  23
     1  26  26


In [199]:
# Bag 100 decision trees and check for an accuracy improvement
bagging = BaggingClassifier(
    DecisionTreeClassifier(random_state=20),
    n_estimators=100,
    random_state=1,
).fit(x_train, y_train)

bagging_predictions = bagging.predict(x_test)
classificationSummary(y_test, bagging_predictions, class_names=classes)

Confusion Matrix (Accuracy 0.8811)

       Prediction
Actual   0   1
     0 271   5
     1  34  18


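### Bagged tree is the best model as it has the highest accuracy score (0.8811, versus 0.8506 for the boosted tree and 0.8445 for the single decision tree).

Note that the bagged tree correctly identifies fewer actual purchasers (18 of 52) than the single or boosted tree (26 of 52 each), so accuracy alone should be weighed against how many buyers the model needs to find.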