## build classification model

In [1]:
import pandas as pd
import numpy as np
import pathlib
import json

In [2]:
import model
from sklearn.model_selection import train_test_split
from sklearn import set_config

### load data

In [3]:
# Relative path of the raw marketing CSV that the next cell reads.
csv_filename = pathlib.Path("input") / "marketing.csv"

In [4]:
# Load the raw data; the string '?' in the file is parsed as a missing value (NaN).
# Every later cell needs `df`, so a failed load must stop the notebook here:
# the error is still printed, then re-raised instead of being swallowed
# (otherwise the run continues and dies later with an unrelated NameError).
try:
    df = pd.read_csv(csv_filename, na_values='?')
except Exception as e:
    print(f'error: {e}')
    raise

### prepare data

#### a) create binary target from income (1 = high, 0 = low, where low means income < 5)

In [5]:
# Rows with a missing income cannot be labelled: `NaN < 5` evaluates to False,
# so np.where would silently place them in the "high" class. Drop them first
# (a no-op when income has no missing values).
df = df.dropna(subset=['income'])
# Binary target: 0 when income < 5, otherwise 1. The raw income column is then
# removed so it cannot leak into the features. Chained instead of inplace=True.
df = df.assign(target=np.where(df.income < 5, 0, 1)).drop(columns='income')

In [6]:
#### create categorical and numerical features

In [7]:
# Columns handled as categorical features (cast to str in a later cell).
cat_cols = [
    'sex',
    'marital_status',
    'education',
    'occupation',
    'dual_income',
    'household_status',
    'type_of_home',
    'ethnic_class',
    'language',
]
# Columns handled as numeric features (cast to float in a later cell).
num_cols = [
    'age',
    'years_in_sf',
    'household_members',
    'under_18',
]

In [8]:
# Cast all categorical columns to str in one call.
# Note: missing values become the literal string 'nan', which is why the
# transformed feature names further down contain entries such as 'marital_status_nan'.
df = df.astype({col: str for col in cat_cols})

In [9]:
# Cast all numeric feature columns to float in one call.
df = df.astype({col: float for col in num_cols})

In [10]:
# Preview the first five rows to sanity-check the casts above.
df.head(n=5)

Unnamed: 0,sex,marital_status,age,education,occupation,years_in_sf,dual_income,household_members,under_18,household_status,type_of_home,ethnic_class,language,target
0,2,1.0,5.0,4.0,5.0,5.0,3,3.0,0.0,1.0,1.0,7.0,,1
1,1,1.0,5.0,5.0,5.0,5.0,3,5.0,2.0,1.0,1.0,7.0,1.0,1
2,2,1.0,3.0,5.0,1.0,5.0,2,3.0,1.0,2.0,3.0,7.0,1.0,1
3,2,5.0,1.0,2.0,6.0,5.0,1,4.0,2.0,3.0,1.0,7.0,1.0,0
4,2,5.0,1.0,2.0,6.0,3.0,1,4.0,2.0,3.0,1.0,7.0,1.0,0


#### b) split data into train and test

In [11]:
# Hold out 20% of the rows as unseen test data; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[cat_cols + num_cols],
    df.target,
    test_size=0.2,
    random_state=42,
)
# Report the (rows, columns) shape of each split.
print(f'Data for Modeling: {X_train.shape}')
print(f'Test / Unseen Data For Predictions: {X_test.shape}')

Data for Modeling: (7194, 13)
Test / Unseen Data For Predictions: (1799, 13)


#### c) recombine target and features

In [12]:
# Put the target column back next to its features for each split; the frames
# passed to fit/evaluate below contain both.
data_train_df, data_test_df = (
    pd.concat([features, labels], axis=1)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

### model (xgb classifier)

In [13]:
# Name the numeric / categorical feature groups and the target column.
model_features = dict(
    numeric_features=num_cols,
    categorical_features=cat_cols,
    target="target",
)

In [14]:
# XGBoost hyper-parameters, merged into the training config in the next cell.
xgb_model_params = {
    'colsample_bytree': 1,
    'learning_rate': 0.3,
    'max_delta_step': 0,
    'max_depth': 3,
    'min_child_weight': 1,
    'n_estimators': 100,
    'subsample': 1,
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'random_state': 40,
    'scale_pos_weight': 1,
}

In [15]:
# Merge the feature spec and the XGBoost parameters into one config dict.
# On a key clash the XGBoost value would win (it is applied last).
training_config = dict(model_features)
training_config.update(xgb_model_params)

In [16]:
# Build the project's ClassificationModel from the merged config
# (feature lists, target name and XGBoost hyper-parameters).
pipe_final = model.ClassificationModel(training_config)

In [17]:
# Train on the training split; data_train_df holds the features plus the 'target' column.
pipe_final.fit(data_train_df)

[32m2023-09-11 20:19:49.708[0m | [1mINFO    [0m | [36mmodel[0m:[36mfit[0m:[36m117[0m - [1mStarting training[0m
[32m2023-09-11 20:19:51.439[0m | [1mINFO    [0m | [36mmodel[0m:[36mfit[0m:[36m138[0m - [1mTraining complete for all models.[0m


In [18]:
# check accuracy on unseen data
# The recorded output shows a 2-column float array followed by a dict of metrics
# (f1_score, roc_auc, precision, recall, balanced_acc, accuracy).
# NOTE(review): the array is presumably per-class probabilities — confirm in model.evaluate.
pipe_final.evaluate(data_test_df)

[32m2023-09-11 20:19:51.491[0m | [1mINFO    [0m | [36mmodel[0m:[36mevaluate[0m:[36m202[0m - [1mComputed metrics...[0m


(array([[0.7716086 , 0.22839142],
        [0.1419295 , 0.8580705 ],
        [0.5132678 , 0.48673218],
        ...,
        [0.9978479 , 0.0021521 ],
        [0.02820557, 0.9717944 ],
        [0.3578518 , 0.6421482 ]], dtype=float32),
 {'f1_score': 0.8084042021010506,
  'roc_auc': 0.8683116704748349,
  'precision': 0.800792864222002,
  'recall': 0.8161616161616162,
  'balanced_acc': 0.7838533667952704,
  'accuracy': 0.7871039466370205})

### save model

In [19]:
# Persist the fitted model. Per the recorded log output, the training parameters
# are also written alongside it as ./output/marketing_model.json.
pipe_final.save_model("./output/marketing_model")

[32m2023-09-11 20:19:51.526[0m | [1mINFO    [0m | [36mmodel[0m:[36msave_model[0m:[36m226[0m - [1mSaved model to ./output/marketing_model.[0m
[32m2023-09-11 20:19:51.528[0m | [1mINFO    [0m | [36mmodel[0m:[36msave_model[0m:[36m231[0m - [1mSaved training parameters to ./output/marketing_model.json.[0m


### save data (use parquet because its files are smaller and it preserves column data types)

In [20]:
# Write both splits to parquet without the index.
for frame, parquet_path in (
    (data_train_df, "./output/data_train_df.parquet"),
    (data_test_df, "./output/data_test_df.parquet"),
):
    frame.to_parquet(parquet_path, index=False)

#### show pipeline transformation

In [21]:
# Switch scikit-learn's estimator display to 'diagram', then show the wrapper's
# model_instance as the cell output.
# NOTE(review): model_instance is presumably the fitted sklearn pipeline — confirm in model.py.
set_config(display='diagram')
pipe_final.model_instance

#### show 'transformed' feature names

In [22]:
# List the feature names after the pipeline's transformation. In the recorded
# output the numeric columns come first, followed by one entry per categorical level.
pipe_final.get_transformed_feature_names()

['age',
 'years_in_sf',
 'household_members',
 'under_18',
 'sex_1',
 'sex_2',
 'marital_status_1.0',
 'marital_status_2.0',
 'marital_status_3.0',
 'marital_status_4.0',
 'marital_status_5.0',
 'marital_status_nan',
 'education_1.0',
 'education_2.0',
 'education_3.0',
 'education_4.0',
 'education_5.0',
 'education_6.0',
 'education_nan',
 'occupation_1.0',
 'occupation_2.0',
 'occupation_3.0',
 'occupation_4.0',
 'occupation_5.0',
 'occupation_6.0',
 'occupation_7.0',
 'occupation_8.0',
 'occupation_9.0',
 'occupation_nan',
 'dual_income_1',
 'dual_income_2',
 'dual_income_3',
 'household_status_1.0',
 'household_status_2.0',
 'household_status_3.0',
 'household_status_nan',
 'type_of_home_1.0',
 'type_of_home_2.0',
 'type_of_home_3.0',
 'type_of_home_4.0',
 'type_of_home_5.0',
 'type_of_home_nan',
 'ethnic_class_1.0',
 'ethnic_class_2.0',
 'ethnic_class_3.0',
 'ethnic_class_4.0',
 'ethnic_class_5.0',
 'ethnic_class_6.0',
 'ethnic_class_7.0',
 'ethnic_class_8.0',
 'ethnic_class_nan'