In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_validate
import warnings 
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
warnings.filterwarnings('ignore')   

  from pandas.core import (


In [2]:
try:
    df = pd.read_csv('https://raw.githubusercontent.com/aladelca/machine_learning_model/main/archivos_trabajo/bank-full.csv', sep=';')
except:
    df = pd.read_csv('/Users/aladelca/Library/CloudStorage/OneDrive-McGillUniversity/MMA/Enterprise_Architecture/targeted_marketing/Dataset/bank-full.csv', sep=';')
df['y'] = df['y'].map({'yes': 1, 'no': 0})

### Delete columns leading to data leakage

df = df.drop(['contact','poutcome', 'duration'], axis=1)


### First experiment, catboost handling categorical variables

In [5]:
mlflow.set_experiment("targeted_marketing")
with mlflow.start_run(run_name = 'catboost'):

    VARS = [
        'age', 
        'job', 
        'marital', 
        'education', 
        'balance', 
        'housing', 
        'loan', 
        'day', 
        'month', 
        'campaign', 
        ]
    TARGET = ['y']
    CAT_VARS = [
        'job',
        'marital',
        'education',
        'month',
        ]
    x = df[VARS]
    y = df[TARGET]

    x['housing'] = x['housing'].map({'yes': 1, 'no': 0})
    x['loan'] = x['loan'].map({'yes': 1, 'no': 0})

    ### Split the data into training and testing
    x_old, x_new, y_old, y_new = train_test_split(x, y, test_size=0.2, random_state=123)

    x_train, x_test, y_train, y_test = train_test_split(x_old, y_old, test_size=0.2, random_state=123)

    x_fit, x_val, y_fit, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=123)
    print('Training dataset:', x_fit.shape)
    print('Validation dataset:', x_val.shape)
    print('Test dataset:', x_test.shape)

    model = CatBoostClassifier(
        random_state=123, 
        cat_features = CAT_VARS, 
        verbose=0,
        eval_metric='AUC'
        )

    model.fit(x_fit, y_fit, early_stopping_rounds=10, eval_set=(x_val, y_val))
    mlflow.log_param('early_stopping_rounds', 10)
    mlflow.log_param('eval_metric', 'AUC')
    mlflow.log_metric('auc_validation', roc_auc_score(y_val, model.predict_proba(x_val)[:,1]))
    mlflow.log_metric('auc_test', roc_auc_score(y_test, model.predict_proba(x_test)[:,1]))
    mlflow.catboost.log_model(model, 'model_1')   
    x_fit.to_csv("training_data.csv")
    mlflow.log_artifact("training_data.csv")
    

mlflow.end_run()

Training dataset: (23147, 10)
Validation dataset: (5787, 10)
Test dataset: (7234, 10)


PermissionError: [Errno 13] Permission denied: '/Users/aladelca'

In [None]:
mlflow.set_experiment("targeted_marketing")
with mlflow.start_run(run_name = 'lgbm categorical features'):

    VARS = [
        'age', 
        'job', 
        'marital', 
        'education', 
        'balance', 
        'housing', 
        'loan', 
        'day', 
        'month', 
        'campaign', 
        ]
    TARGET = ['y']
    CAT_VARS = [
        'job',
        'marital',
        'education',
        'month',
        ]
    x = df[VARS]
    y = df[TARGET]

    x['housing'] = x['housing'].map({'yes': 1, 'no': 0})
    x['loan'] = x['loan'].map({'yes': 1, 'no': 0})

    x[CAT_VARS] = x[CAT_VARS].astype('category')
    ### Split the data into training and testing
    x_old, x_new, y_old, y_new = train_test_split(x, y, test_size=0.2, random_state=123)

    x_train, x_test, y_train, y_test = train_test_split(x_old, y_old, test_size=0.2, random_state=123)

    x_fit, x_val, y_fit, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=123)
    print('Training dataset:', x_fit.shape)
    print('Validation dataset:', x_val.shape)
    print('Test dataset:', x_test.shape)

    model = LGBMClassifier(random_state=123, enable_categorical=True, eval_metric='AUC')

    model.fit(x_fit, y_fit)
    mlflow.log_param('early_stopping_rounds', 10)
    mlflow.log_param('eval_metric', 'AUC')
    mlflow.log_metric('auc_validation', roc_auc_score(y_val, model.predict_proba(x_val)[:,1]))
    mlflow.log_metric('auc_test', roc_auc_score(y_test, model.predict_proba(x_test)[:,1]))
    mlflow.lightgbm.log_model(model, 'model_2')   
    x_fit.to_csv("training_data.csv")
    mlflow.log_artifact("training_data.csv")
    

mlflow.end_run()

Training dataset: (23147, 10)
Validation dataset: (5787, 10)
Test dataset: (7234, 10)
[LightGBM] [Info] Number of positive: 2664, number of negative: 20483
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000448 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 434
[LightGBM] [Info] Number of data points in the train set: 23147, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.115091 -> initscore=-2.039767
[LightGBM] [Info] Start training from score -2.039767


In [4]:
mlflow.set_experiment("targeted_marketing")
with mlflow.start_run(run_name = 'lgbm onehot encoding'):

    VARS = [
        'age', 
        'job', 
        'marital', 
        'education', 
        'balance', 
        'housing', 
        'loan', 
        'day', 
        'month', 
        'campaign', 
        ]
    TARGET = ['y']
    CAT_VARS = [
        'job',
        'marital',
        'education',
        'month',
        ]
    x = df[VARS]
    y = df[TARGET]

    x['housing'] = x['housing'].map({'yes': 1, 'no': 0})
    x['loan'] = x['loan'].map({'yes': 1, 'no': 0})

    #x[CAT_VARS] = x[CAT_VARS].astype('category')
    ### Split the data into training and testing
    x_old, x_new, y_old, y_new = train_test_split(x, y, test_size=0.2, random_state=123)

    x_train, x_test, y_train, y_test = train_test_split(x_old, y_old, test_size=0.2, random_state=123)

    x_fit, x_val, y_fit, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=123)
    print('Training dataset:', x_fit.shape)
    print('Validation dataset:', x_val.shape)
    print('Test dataset:', x_test.shape)
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, CAT_VARS)])
    model = Pipeline(steps=[('preprocessor', preprocessor),
                  ('classifier', LGBMClassifier(random_state=123, enable_categorical=True, eval_metric='AUC'))])  
    

    model.fit(x_fit, y_fit)
    mlflow.log_param('early_stopping_rounds', 10)
    mlflow.log_param('eval_metric', 'AUC')
    mlflow.log_metric('auc_validation', roc_auc_score(y_val, model.predict_proba(x_val)[:,1]))
    mlflow.log_metric('auc_test', roc_auc_score(y_test, model.predict_proba(x_test)[:,1]))
    mlflow.lightgbm.log_model(model, 'model_2')   
    x_fit.to_csv("training_data.csv")
    mlflow.log_artifact("training_data.csv")
    

mlflow.end_run()

Training dataset: (23147, 10)
Validation dataset: (5787, 10)
Test dataset: (7234, 10)
[LightGBM] [Info] Number of positive: 2664, number of negative: 20483
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 62
[LightGBM] [Info] Number of data points in the train set: 23147, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.115091 -> initscore=-2.039767
[LightGBM] [Info] Start training from score -2.039767


PermissionError: [Errno 13] Permission denied: '/Users/aladelca'

In [6]:
mlflow.set_experiment("targeted_marketing")

# Start the MLflow run
with mlflow.start_run(run_name='lgbm ordinal encoding'):
    # Define variables
    VARS = [
        'age', 
        'job', 
        'marital', 
        'education', 
        'balance', 
        'housing', 
        'loan', 
        'day', 
        'month', 
        'campaign',
    ]
    TARGET = ['y']
    CAT_VARS = ['job', 'marital', 'education', 'month']

    x = df[VARS]
    y = df[TARGET].values.ravel()  # Flatten the array for model compatibility

    # Map yes/no to 1/0
    x['housing'] = x['housing'].map({'yes': 1, 'no': 0})
    x['loan'] = x['loan'].map({'yes': 1, 'no': 0})

    # Data splitting
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123)

    # Define the ordinal encoder transformation for categorical variables
    categorical_transformer = Pipeline(steps=[
        ('ordinal', OrdinalEncoder())
    ])

    # Column transformer for applying transformations to the specified columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, CAT_VARS)
        ],
        remainder='passthrough'
    )

    # Define the complete pipeline
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LGBMClassifier(random_state=123, eval_metric='auc', verbose = -1))
    ])

    # Fit the model
    model.fit(x_train, y_train)
    
    # Evaluate the model
    y_val_pred_proba = model.predict_proba(x_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_val_pred_proba)

    # Log parameters, metrics, and model
    mlflow.log_param('early_stopping_rounds', 10)
    mlflow.log_param('eval_metric', 'auc')
    mlflow.log_metric('auc_validation', roc_auc_score(y_val, model.predict_proba(x_val)[:,1]))
    mlflow.log_metric('auc_test', auc_score)
    mlflow.lightgbm.log_model(model, 'model_2')

    # Save and log training data
    x_train.to_csv("training_data.csv")
    mlflow.log_artifact("training_data.csv")

# End the MLflow run
mlflow.end_run()

In [7]:
mlflow.set_experiment("targeted_marketing")

# Start the MLflow run
with mlflow.start_run(run_name='lgbm ordinal encoding with parameters'):
    # Define variables
    VARS = [
        'age', 
        'job', 
        'marital', 
        'education', 
        'balance', 
        'housing', 
        'loan', 
        'day', 
        'month', 
        'campaign',
    ]
    TARGET = ['y']
    CAT_VARS = ['job', 'marital', 'education', 'month']

    x = df[VARS]
    y = df[TARGET].values.ravel()  # Flatten the array for model compatibility

    # Map yes/no to 1/0
    x['housing'] = x['housing'].map({'yes': 1, 'no': 0})
    x['loan'] = x['loan'].map({'yes': 1, 'no': 0})

    # Data splitting
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123)

    # Define the ordinal encoder transformation for categorical variables
    categorical_transformer = Pipeline(steps=[
        ('ordinal', OrdinalEncoder())
    ])

    # Column transformer for applying transformations to the specified columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, CAT_VARS)
        ],
        remainder='passthrough'
    )

    # Define the complete pipeline
    params = {
        'max_depth':10,
        'learning_rate':0.1,
        'n_estimators':2000,
        'num_leaves':5,
              }
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LGBMClassifier(random_state=123, eval_metric='auc', verbose = -1, **params))
    ])

    # Fit the model
    model.fit(x_train, y_train)
    
    # Evaluate the model
    y_val_pred_proba = model.predict_proba(x_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_val_pred_proba)

    # Log parameters, metrics, and model
    mlflow.log_param('early_stopping_rounds', 10)
    mlflow.log_param('eval_metric', 'auc')
    mlflow.log_metric('auc_validation', roc_auc_score(y_val, model.predict_proba(x_val)[:,1]))
    mlflow.log_metric('auc_test', auc_score)
    mlflow.log_params(params)
    mlflow.lightgbm.log_model(model, 'model_2')

    # Save and log training data
    x_train.to_csv("training_data.csv")
    mlflow.log_artifact("training_data.csv")

# End the MLflow run
mlflow.end_run()