In [1]:
#code Snippet 1
#setting pycaret for predictor ML model
!pip install pycaret



In [2]:
# Code snippet updated 2
import pandas as pd
from datetime import datetime

# Load the customer data used to train the 'eBook Subscriber Flag' classifier
df = pd.read_csv('https://raw.githubusercontent.com/Imjuandiaz/Customer_data_Ebook/refs/heads/main/Customer%20Data%20Apr%202024%20-%20Customer%20Data.csv')

# Split 'City-ZipCode-State' into its three parts. Splitting from the right
# with at most two splits keeps hyphenated city names intact, instead of
# producing extra columns that would break the three-column assignment.
df[['City', 'ZipCode', 'State']] = df['City-ZipCode-State'].str.rsplit('-', n=2, expand=True)

# Calculate Age, correcting two-digit years
def calculate_age(born, today=None):
    """Return the age in whole years of someone born on ``born``.

    Parameters
    ----------
    born : object with ``year``, ``month`` and ``day`` attributes
        The birth date (e.g. a ``datetime`` or pandas ``Timestamp``).
    today : object with ``year``, ``month`` and ``day`` attributes, optional
        Reference date for the calculation. Defaults to ``datetime.today()``,
        which is what the original one-argument call did; pass a fixed date
        to get reproducible ages.

    Returns
    -------
    int
        Completed years between ``born`` and ``today``. A negative result
        (birth date in the future, as happens when a two-digit year is
        parsed into the wrong century) is shifted forward by 100 years.
    """
    if today is None:
        today = datetime.today()
    # Subtract one year if this year's birthday has not happened yet.
    age = today.year - born.year - ((today.month, today.day) < (born.month, born.day))
    if age < 0:  # Correct negative ages caused by 1900s vs 2000s
        age += 100
    return age

# Parse 'Birth Date' (two-digit years), keep only rows whose date parsed,
# derive an integer 'Age', then remove the helper column together with the
# identifier / address / raw columns that are not used as model features.
df = (
    df.assign(
        birth_date_dt=pd.to_datetime(df['Birth Date'], format='%m/%d/%y', errors='coerce')
    )
    .dropna(subset=['birth_date_dt'])
    .assign(Age=lambda frame: frame['birth_date_dt'].apply(calculate_age).astype(int))
    .drop(columns=[
        'birth_date_dt',
        'City-ZipCode-State',
        'Title',
        'Suffix',
        'Middle Name',
        'Street Address2',
        'Customer ID',
        'First Name',
        'Last Name',
        'Street Address1',
        'Birth Date',
    ])
)

# Show the cleaned dataframe
df

Unnamed: 0,Education Level,Occupation,Gender,Marital Status,Home Owner Status,Number of Cars Owned,Number of Children At Home,Total Number of Children,Annual Income,Avg Monthly Spend,eBook Subscriber Flag,City,ZipCode,State,Age
0,Bachelors,Professional,M,M,1,0,0,2,137947,89,0,Cleveland,44101,Ohio,59
1,Bachelors,Professional,M,S,0,1,3,3,101141,117,1,Seattle,98101,Washington,60
2,Bachelors,Professional,M,M,1,1,3,3,91945,123,0,Omaha,68101,Nebraska,60
3,Bachelors,Professional,F,S,0,1,0,0,86688,50,0,Fort Worth,76101,Texas,57
4,Bachelors,Professional,F,S,1,4,5,5,92771,95,1,Oakland,94601,California,57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16514,Bachelors,Professional,F,M,1,4,5,5,101542,101,0,San Antonio,78201,Texas,60
16515,Partial College,Professional,F,S,1,2,0,3,46549,46,0,Pittsburgh,15201,Pennsylvania,89
16516,Bachelors,Management,M,M,1,2,0,5,133053,79,0,Honolulu,96801,Hawaii,85
16517,High School,Skilled Manual,M,M,1,2,0,4,31930,65,0,Anaheim,92801,California,78


In [3]:
# Code snippet 3
# Display the column labels that remain in df after cleaning
df.columns

Index(['Education Level', 'Occupation', 'Gender', 'Marital Status',
       'Home Owner Status', 'Number of Cars Owned',
       'Number of Children At Home', 'Total Number of Children',
       'Annual Income', 'Avg Monthly Spend', 'eBook Subscriber Flag', 'City',
       'ZipCode', 'State', 'Age'],
      dtype='object')

In [4]:
# Code snippet 4
# Import the PyCaret classification API; the cells below call setup,
# compare_models, create_model and predict_model without a module prefix.
from pycaret.classification import *


In [5]:
# Code snippet 5
# Name of the target column the classifier is trained to predict
y = 'eBook Subscriber Flag'


In [6]:
# Code snippet 6
# Columns passed to setup() as ignore_features (noise, redundant, or mostly
# missing fields). This is the same list of columns that snippet 2 drops
# from df.
ignored_cols = [
    'City-ZipCode-State',
    'Title',
    'Suffix',
    'Middle Name',
    'Street Address2',
    'Customer ID',
    'First Name',
    'Last Name',
    'Street Address1',
    'Birth Date',
]

In [7]:
# Code snippet 7
# Columns that setup() should treat as numeric features
num_cols = [
    'Number of Cars Owned',
    'Number of Children At Home',
    'Total Number of Children',
    'Annual Income',
    'Avg Monthly Spend',
    'Age',
    'Home Owner Status',
]

In [8]:
# Code snippet 8
# Columns that setup() should treat as categorical features
cat_cols = [
    'Education Level',
    'Occupation',
    'Gender',
    'City',
    'ZipCode',
    'State',
    'Marital Status',
]

In [9]:
# Code snippet 9
# Configure the PyCaret classification experiment: 70% of the rows go to the
# training set, and column roles are given explicitly.
# session_id fixes the random seed so reruns give the same results; 8652 is
# the session id recorded in this cell's saved output.
classification_setup = setup(df,
                        target = y,
                        ignore_features = ignored_cols,
                        categorical_features = cat_cols,
                        numeric_features = num_cols,
                        train_size = 0.7,
                        session_id = 8652)

Unnamed: 0,Description,Value
0,Session id,8652
1,Target,eBook Subscriber Flag
2,Target type,Binary
3,Original data shape,"(16519, 15)"
4,Transformed data shape,"(16519, 23)"
5,Transformed train set shape,"(11563, 23)"
6,Transformed test set shape,"(4956, 23)"
7,Ignore features,10
8,Numeric features,7
9,Categorical features,7


In [10]:
# Code snippet 10
# Train and cross-validate the candidate classifiers and show a comparison
# table of their metrics. The return value is not stored; the next snippet
# builds the chosen model by its id.
compare_models()


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.7986,0.8593,0.5872,0.7525,0.6594,0.5196,0.5278,1.937
lightgbm,Light Gradient Boosting Machine,0.7949,0.8553,0.5924,0.7389,0.6574,0.5135,0.5201,1.98
ada,Ada Boost Classifier,0.7902,0.852,0.6174,0.7128,0.6615,0.5107,0.5136,0.648
lda,Linear Discriminant Analysis,0.7851,0.8363,0.5351,0.7462,0.6232,0.4786,0.4915,0.204
xgboost,Extreme Gradient Boosting,0.7846,0.8439,0.5981,0.7085,0.6484,0.4948,0.4986,0.511
ridge,Ridge Classifier,0.7836,0.8365,0.5234,0.7498,0.6164,0.4723,0.4871,0.305
rf,Random Forest Classifier,0.7835,0.8382,0.5843,0.7132,0.6421,0.4891,0.4943,1.599
lr,Logistic Regression,0.7831,0.8344,0.5554,0.7274,0.6298,0.4803,0.4891,1.757
et,Extra Trees Classifier,0.7732,0.8245,0.5869,0.6859,0.6324,0.4699,0.473,1.446
nb,Naive Bayes,0.7645,0.802,0.4649,0.7285,0.5673,0.4163,0.4362,0.195


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [11]:
# Code snippet 11
# Build the model for the top row of the comparison table
# ('gbc' = Gradient Boosting Classifier). The id is hard-coded, so update it
# if the comparison ranking changes.
best_model = create_model('gbc')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8012,0.8521,0.5781,0.7655,0.6588,0.5223,0.5326
1,0.8038,0.8717,0.6,0.7599,0.6705,0.5336,0.5411
2,0.8038,0.8667,0.6052,0.7565,0.6724,0.5349,0.5416
3,0.7976,0.8556,0.5807,0.7534,0.6559,0.5159,0.5246
4,0.8106,0.8761,0.6016,0.7778,0.6784,0.5472,0.5563
5,0.7915,0.8632,0.6094,0.72,0.6601,0.5112,0.5149
6,0.7967,0.855,0.5807,0.7508,0.6549,0.5141,0.5227
7,0.7837,0.843,0.5391,0.7393,0.6235,0.477,0.4887
8,0.7933,0.8497,0.5885,0.7362,0.6541,0.5093,0.5158
9,0.8036,0.8604,0.5885,0.7661,0.6657,0.53,0.5393


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
# Code snippet 12
# Predicting 'eBook Subscriber Flag' on df, the same frame that was passed
# to setup(). It includes the rows used for training, so the metrics shown
# here are not a hold-out estimate.
predictions = predict_model(best_model, df)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.8074,0.8706,0.5976,0.7714,0.6734,0.54,0.5489


In [13]:
# Code snippet 13
# Export the predictions to a CSV file in the current working directory
# (the dataframe index is written as the first column)
predictions.to_csv('gbc Ebook predictions.csv')

In [19]:
# Code snippet 14
# Load the client data for which 'eBook Subscriber Flag' predictions are made
client_data = pd.read_csv('https://raw.githubusercontent.com/Imjuandiaz/Customer_data_Ebook/refs/heads/main/Customer%20Data%20Apr%202024%20-%20Customer%20Data.csv')

# Split 'City-ZipCode-State' into its three parts. Splitting from the right
# with at most two splits keeps hyphenated city names intact, instead of
# producing extra columns that would break the three-column assignment.
client_data[['City', 'ZipCode', 'State']] = client_data['City-ZipCode-State'].str.rsplit('-', n=2, expand=True)

# Calculate Age, similar to the training data
def calculate_age(born, today=None):
    """Return the age in whole years of someone born on ``born``.

    Parameters
    ----------
    born : object with ``year``, ``month`` and ``day`` attributes
        The birth date (e.g. a ``datetime`` or pandas ``Timestamp``).
    today : object with ``year``, ``month`` and ``day`` attributes, optional
        Reference date for the calculation. Defaults to ``datetime.today()``,
        which is what the original one-argument call did; pass a fixed date
        to get reproducible ages.

    Returns
    -------
    int
        Completed years between ``born`` and ``today``. A negative result
        (birth date in the future, as happens when a two-digit year is
        parsed into the wrong century) is shifted forward by 100 years.
    """
    if today is None:
        today = datetime.today()
    # Subtract one year if this year's birthday has not happened yet.
    age = today.year - born.year - ((today.month, today.day) < (born.month, born.day))
    if age < 0:  # Correct negative ages caused by 1900s vs 2000s
        age += 100
    return age


# Same cleaning as the training frame: parse 'Birth Date' (two-digit years),
# discard rows whose date did not parse, add an integer 'Age', and remove the
# helper column plus the identifier / address / raw columns.
client_data = (
    client_data.assign(
        birth_date_dt=pd.to_datetime(client_data['Birth Date'], format='%m/%d/%y', errors='coerce')
    )
    .dropna(subset=['birth_date_dt'])
    .assign(Age=lambda frame: frame['birth_date_dt'].apply(calculate_age).astype(int))
    .drop(columns=[
        'birth_date_dt',
        'City-ZipCode-State',
        'Title',
        'Suffix',
        'Middle Name',
        'Street Address2',
        'Customer ID',
        'First Name',
        'Last Name',
        'Street Address1',
        'Birth Date',
    ])
)

# Show the cleaned client dataframe
client_data

Unnamed: 0,Education Level,Occupation,Gender,Marital Status,Home Owner Status,Number of Cars Owned,Number of Children At Home,Total Number of Children,Annual Income,Avg Monthly Spend,eBook Subscriber Flag,City,ZipCode,State,Age
0,Bachelors,Professional,M,M,1,0,0,2,137947,89,0,Cleveland,44101,Ohio,59
1,Bachelors,Professional,M,S,0,1,3,3,101141,117,1,Seattle,98101,Washington,60
2,Bachelors,Professional,M,M,1,1,3,3,91945,123,0,Omaha,68101,Nebraska,60
3,Bachelors,Professional,F,S,0,1,0,0,86688,50,0,Fort Worth,76101,Texas,57
4,Bachelors,Professional,F,S,1,4,5,5,92771,95,1,Oakland,94601,California,57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16514,Bachelors,Professional,F,M,1,4,5,5,101542,101,0,San Antonio,78201,Texas,60
16515,Partial College,Professional,F,S,1,2,0,3,46549,46,0,Pittsburgh,15201,Pennsylvania,89
16516,Bachelors,Management,M,M,1,2,0,5,133053,79,0,Honolulu,96801,Hawaii,85
16517,High School,Skilled Manual,M,M,1,2,0,4,31930,65,0,Anaheim,92801,California,78


In [20]:
# Code snippet 15
# Predicting 'eBook Subscriber Flag' for the client dataset.
# NOTE(review): client_data is read from the same URL as the training frame
# df, so these are not predictions on unseen customers.
client_predictions = predict_model(best_model, client_data)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.8074,0.8706,0.5976,0.7714,0.6734,0.54,0.5489


In [21]:
# Code snippet 16
# Predicting 'eBook Subscriber Flag' for the client dataset
client_predictions = predict_model(best_model, client_data)


# Rename the prediction column and round numeric columns to 2 decimals.
# PyCaret 3.x calls the column 'prediction_label' (2.x used 'Label'), so both
# names are mapped; rename() ignores whichever one is absent. The new name is
# kept distinct from the actual 'eBook Subscriber Flag' column that is already
# present in client_data, so the export does not have two columns with the
# same header.
client_predictions = client_predictions.rename(
    columns = {
        'prediction_label': 'Predicted eBook Subscriber Flag',
        'Label': 'Predicted eBook Subscriber Flag',
    }
).round(2)


# Exporting 'eBook Subscriber Flag' predictions to a csv file
client_predictions.to_csv('Final gbc Subcription Ebook Prediction.csv')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.8074,0.8706,0.5976,0.7714,0.6734,0.54,0.5489


In [22]:
# Code snippet 17
# Get the hyperparameters of the trained classification model as a dict
best_model_params = best_model.get_params()

In [23]:
# Code snippet 18
# Pretty-print the dictionary of model parameters
import pprint
pprint.pprint(best_model_params)

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 8652,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}
