In [37]:
#code snippet 1
#installing pycaret package for ML model prediction
!pip install pycaret



In [38]:
# Code snippet updated 2
import pandas as pd
from datetime import datetime

# Importing the data for ML modeling predictor on 'Avg Monthly Spend'
df = pd.read_csv('https://raw.githubusercontent.com/Imjuandiaz/Customer_data_Ebook/refs/heads/main/Customer%20Data%20Apr%202024%20-%20Customer%20Data.csv')

# Split the combined 'City-ZipCode-State' feature into three separate columns.
# Splitting from the right with at most two splits keeps a hyphenated city name
# (e.g. 'Winston-Salem-27101-North Carolina') in one piece; a plain split('-')
# would expand such a row to four columns and the three-column assignment would raise.
df[['City', 'ZipCode', 'State']] = df['City-ZipCode-State'].str.rsplit('-', n=2, expand=True)

# Calculate Age, correcting two-digit years
def calculate_age(born, today=None):
    """Return the age in whole years for a birth date.

    Parameters
    ----------
    born : datetime-like
        Birth date; only its ``year``, ``month`` and ``day`` attributes are read.
    today : datetime-like, optional
        Reference date the age is measured against. Defaults to
        ``datetime.today()``; pass a fixed date to get reproducible ages.

    Returns
    -------
    int
        Completed years between ``born`` and ``today``. A negative result
        (birth year parsed into the future because the source only had a
        two-digit year) is shifted by one century.
    """
    if today is None:
        today = datetime.today()
    # The tuple comparison is True (counts as 1) when this year's birthday has not happened yet
    age = today.year - born.year - ((today.month, today.day) < (born.month, born.day))
    if age < 0:  # Correct negative ages caused by 1900s vs 2000s
        age += 100
    return age

# Parse 'Birth Date' (m/d/two-digit-year), discard rows that fail to parse,
# derive an integer Age, then remove the helper column together with the raw
# identifier/address columns that are not used for training.
df = (
    df
    .assign(birth_date_dt=lambda frame: pd.to_datetime(frame['Birth Date'], format='%m/%d/%y', errors='coerce'))
    .dropna(subset=['birth_date_dt'])
    .assign(Age=lambda frame: frame['birth_date_dt'].apply(calculate_age).astype(int))
    .drop(columns=['birth_date_dt'])
    .drop(columns=[
        'City-ZipCode-State',
        'Title',
        'Suffix',
        'Middle Name',
        'Street Address2',
        'Customer ID',
        'First Name',
        'Last Name',
        'Street Address1',
        'Birth Date',
    ])
)

# Check final cleaned dataframe
df



Unnamed: 0,Education Level,Occupation,Gender,Marital Status,Home Owner Status,Number of Cars Owned,Number of Children At Home,Total Number of Children,Annual Income,Avg Monthly Spend,eBook Subscriber Flag,City,ZipCode,State,Age
0,Bachelors,Professional,M,M,1,0,0,2,137947,89,0,Cleveland,44101,Ohio,59
1,Bachelors,Professional,M,S,0,1,3,3,101141,117,1,Seattle,98101,Washington,60
2,Bachelors,Professional,M,M,1,1,3,3,91945,123,0,Omaha,68101,Nebraska,60
3,Bachelors,Professional,F,S,0,1,0,0,86688,50,0,Fort Worth,76101,Texas,57
4,Bachelors,Professional,F,S,1,4,5,5,92771,95,1,Oakland,94601,California,57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16514,Bachelors,Professional,F,M,1,4,5,5,101542,101,0,San Antonio,78201,Texas,60
16515,Partial College,Professional,F,S,1,2,0,3,46549,46,0,Pittsburgh,15201,Pennsylvania,89
16516,Bachelors,Management,M,M,1,2,0,5,133053,79,0,Honolulu,96801,Hawaii,85
16517,High School,Skilled Manual,M,M,1,2,0,4,31930,65,0,Anaheim,92801,California,78


In [39]:
#code snippet 3
# Display the column labels of the cleaned frame as a quick check of which
# features are available for modelling
df.columns

Index(['Education Level', 'Occupation', 'Gender', 'Marital Status',
       'Home Owner Status', 'Number of Cars Owned',
       'Number of Children At Home', 'Total Number of Children',
       'Annual Income', 'Avg Monthly Spend', 'eBook Subscriber Flag', 'City',
       'ZipCode', 'State', 'Age'],
      dtype='object')

In [40]:
#code snippet 4
# Importing PyCaret's regression module
from pycaret.regression import *

In [41]:
# Code snippet 5
# Name of the column the regression model is trained to predict
y = "Avg Monthly Spend"


In [42]:
# Code snippet 6
# Columns the model should ignore (noise variables, redundant variables,
# variables that are missing more than 20% data etc)
ignored_cols = [
    "City-ZipCode-State",
    "Title",
    "Suffix",
    "Middle Name",
    "Street Address2",
    "Customer ID",
    "First Name",
    "Last Name",
    "Street Address1",
    "Birth Date",
]

In [43]:
# Code snippet 7
# Features to be treated as numeric by the regression setup
num_cols = [
    "Number of Cars Owned",
    "Number of Children At Home",
    "Total Number of Children",
    "Annual Income",
    "Age",
    "Home Owner Status",
]

In [44]:
# Code snippet 8
# Features to be treated as categorical by the regression setup
cat_cols = [
    "Education Level",
    "Occupation",
    "Gender",
    "City",
    "ZipCode",
    "State",
    "Marital Status",
    "eBook Subscriber Flag",
]

In [45]:
# Code snippet 9
# Configuring the PyCaret experiment for regression modeling:
# 70% of the rows are used for training, the remainder is held out for testing.
regression_setup = setup(
    data=df,
    target=y,
    ignore_features=ignored_cols,
    categorical_features=cat_cols,
    numeric_features=num_cols,
    train_size=0.7,
    # Fixed seed so the train/test split and model training are repeatable
    # across re-runs; 7771 is the session id reported by the recorded run.
    session_id=7771,
)

Unnamed: 0,Description,Value
0,Session id,7771
1,Target,Avg Monthly Spend
2,Target type,Regression
3,Original data shape,"(16519, 15)"
4,Transformed data shape,"(16519, 23)"
5,Transformed train set shape,"(11563, 23)"
6,Transformed test set shape,"(4956, 23)"
7,Ignore features,10
8,Numeric features,6
9,Categorical features,8


In [46]:
#code snippet 10
# Run PyCaret's model comparison on the experiment configured above; the cell
# output is a grid of candidate regressors with their error metrics.
# The return value is not stored here.
compare_models()


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,2.5059,10.0135,3.1636,0.9865,0.0507,0.0389,0.91
gbr,Gradient Boosting Regressor,2.5373,10.2129,3.1953,0.9862,0.0508,0.0391,1.584
xgboost,Extreme Gradient Boosting,2.6259,10.9349,3.3061,0.9853,0.0524,0.0406,0.518
rf,Random Forest Regressor,2.6458,11.2167,3.3484,0.9849,0.0532,0.041,6.117
et,Extra Trees Regressor,2.7454,12.0652,3.4727,0.9838,0.0553,0.0427,4.406
dt,Decision Tree Regressor,3.5551,20.3084,4.5061,0.9726,0.0712,0.0549,0.276
lr,Linear Regression,4.8354,40.6043,6.3687,0.9454,0.0913,0.0701,0.548
ridge,Ridge Regression,4.8346,40.6044,6.3687,0.9454,0.0912,0.0701,0.241
br,Bayesian Ridge,4.8348,40.6044,6.3687,0.9454,0.0912,0.0701,0.206
lar,Least Angle Regression,4.8689,40.9963,6.3981,0.9449,0.0918,0.0706,0.207


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

In [47]:
# Code snippet 11
# Fit the top-ranked algorithm from the comparison (LightGBM) and keep the
# resulting model for the prediction cells that follow
best_model = create_model(estimator="lightgbm")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.4969,9.7349,3.1201,0.9863,0.0505,0.0388
1,2.5556,10.2031,3.1942,0.9877,0.0499,0.0392
2,2.4792,9.7074,3.1157,0.987,0.0501,0.0384
3,2.5562,10.5038,3.241,0.9854,0.0505,0.0395
4,2.5501,10.4793,3.2372,0.9862,0.0548,0.0408
5,2.5672,10.4856,3.2382,0.9859,0.0507,0.0394
6,2.4829,9.9182,3.1493,0.9872,0.049,0.0376
7,2.4745,9.7201,3.1177,0.9857,0.0484,0.0378
8,2.5118,10.3402,3.2156,0.9867,0.0537,0.0399
9,2.3852,9.0423,3.007,0.9869,0.0496,0.0379


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [48]:
# Code snippet 12
# Score the cleaned frame with the trained model to obtain
# 'Avg Monthly Spend' predictions
predictions = predict_model(estimator=best_model, data=df)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,2.3201,8.5809,2.9293,0.9885,0.0473,0.0363


In [49]:
# Code snippet 13
# Write the scored frame to a CSV file in the working directory
predictions.to_csv(path_or_buf="Ebook predictions.csv")

In [55]:
# Code snippet 14
# Importing the client data (for which the 'Avg Month Spend' predictions need to be made)
# NOTE(review): this URL is identical to the one used to load the training data,
# so the "client" rows are the rows the model was trained on — confirm the intended source.
client_data = pd.read_csv('https://raw.githubusercontent.com/Imjuandiaz/Customer_data_Ebook/refs/heads/main/Customer%20Data%20Apr%202024%20-%20Customer%20Data.csv')

# Split the combined 'City-ZipCode-State' feature into three separate columns.
# Splitting from the right with at most two splits keeps a hyphenated city name
# (e.g. 'Winston-Salem-27101-North Carolina') in one piece; a plain split('-')
# would expand such a row to four columns and the three-column assignment would raise.
client_data[['City', 'ZipCode', 'State']] = client_data['City-ZipCode-State'].str.rsplit('-', n=2, expand=True)

# Calculate Age, similar to the training data
def calculate_age(born, today=None):
    """Return the age in whole years for a birth date.

    Parameters
    ----------
    born : datetime-like
        Birth date; only its ``year``, ``month`` and ``day`` attributes are read.
    today : datetime-like, optional
        Reference date the age is measured against. Defaults to
        ``datetime.today()``; pass a fixed date to get reproducible ages.

    Returns
    -------
    int
        Completed years between ``born`` and ``today``. A negative result
        (birth year parsed into the future because the source only had a
        two-digit year) is shifted by one century.
    """
    if today is None:
        today = datetime.today()
    # The tuple comparison is True (counts as 1) when this year's birthday has not happened yet
    age = today.year - born.year - ((today.month, today.day) < (born.month, born.day))
    if age < 0:  # Correct negative ages caused by 1900s vs 2000s
        age += 100
    return age


# Parse 'Birth Date' (m/d/two-digit-year), discard rows that fail to parse,
# derive an integer Age, then remove the helper column together with the raw
# identifier/address columns, following the same steps applied to the training frame.
client_data = (
    client_data
    .assign(birth_date_dt=lambda frame: pd.to_datetime(frame['Birth Date'], format='%m/%d/%y', errors='coerce'))
    .dropna(subset=['birth_date_dt'])
    .assign(Age=lambda frame: frame['birth_date_dt'].apply(calculate_age).astype(int))
    .drop(columns=['birth_date_dt'])
    .drop(columns=[
        'City-ZipCode-State',
        'Title',
        'Suffix',
        'Middle Name',
        'Street Address2',
        'Customer ID',
        'First Name',
        'Last Name',
        'Street Address1',
        'Birth Date',
    ])
)


client_data

Unnamed: 0,Education Level,Occupation,Gender,Marital Status,Home Owner Status,Number of Cars Owned,Number of Children At Home,Total Number of Children,Annual Income,Avg Monthly Spend,eBook Subscriber Flag,City,ZipCode,State,Age
0,Bachelors,Professional,M,M,1,0,0,2,137947,89,0,Cleveland,44101,Ohio,59
1,Bachelors,Professional,M,S,0,1,3,3,101141,117,1,Seattle,98101,Washington,60
2,Bachelors,Professional,M,M,1,1,3,3,91945,123,0,Omaha,68101,Nebraska,60
3,Bachelors,Professional,F,S,0,1,0,0,86688,50,0,Fort Worth,76101,Texas,57
4,Bachelors,Professional,F,S,1,4,5,5,92771,95,1,Oakland,94601,California,57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16514,Bachelors,Professional,F,M,1,4,5,5,101542,101,0,San Antonio,78201,Texas,60
16515,Partial College,Professional,F,S,1,2,0,3,46549,46,0,Pittsburgh,15201,Pennsylvania,89
16516,Bachelors,Management,M,M,1,2,0,5,133053,79,0,Honolulu,96801,Hawaii,85
16517,High School,Skilled Manual,M,M,1,2,0,4,31930,65,0,Anaheim,92801,California,78


In [56]:
# Code snippet 15
# Score the prepared client frame with the trained model to obtain
# 'Avg Monthly Spend' predictions
client_predictions = predict_model(estimator=best_model, data=client_data)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,2.3201,8.5809,2.9293,0.9885,0.0473,0.0363


In [58]:

# Code snippet 16
# Predicting 'Avg Monthly Spend' for the client dataset
client_predictions = predict_model(best_model, client_data)


# Rename the prediction column to 'Predicted Avg Monthly Spend' and round to 2 decimals.
# The previous mapping targeted 'eBook Subscriber Flag', which is the name of an
# existing feature column rather than the predicted spend.
# Both candidate source names are listed because the prediction column name differs
# between PyCaret releases ('prediction_label' vs. the older 'Label'); rename()
# ignores keys that are not present, so only the existing one is renamed.
client_predictions = client_predictions.rename(
    columns={
        'prediction_label': 'Predicted Avg Monthly Spend',
        'Label': 'Predicted Avg Monthly Spend',
    }
).round(2)


# Exporting 'Avg Monthly Spend' predictions to a csv file
client_predictions.to_csv('lightgbm Final Avg Monthly Spend Ebook Prediction.csv')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,2.3201,8.5809,2.9293,0.9885,0.0473,0.0363


In [53]:
#code snippet 17
# Retrieve the hyperparameters of the trained regression model
# (printed as a dictionary in the next cell)
best_model_params = best_model.get_params()

In [54]:
#code snippet 18
# Pretty-print the dictionary of model parameters, one entry per line
import pprint
pprint.pprint(best_model_params)

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': 7771,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}
