In [14]:
import pickle

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from joblib import dump
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, FunctionTransformer, LabelBinarizer, OneHotEncoder
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier

# Data loading: the file is semicolon-separated, so the delimiter is passed
# explicitly. Point `path` at your own copy of the CSV.
path = "bank-additional/bank-additional.csv"
bank = pd.read_csv(path, delimiter=';')


In [15]:
# Preview the first five rows of the raw frame
bank.head(n=5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,blue-collar,married,basic.9y,no,yes,no,cellular,may,fri,...,2,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,no
1,39,services,single,high.school,no,no,no,telephone,may,fri,...,4,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no
2,25,services,married,high.school,no,yes,no,telephone,jun,wed,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1,no
3,38,services,married,basic.9y,no,unknown,unknown,telephone,jun,fri,...,3,999,0,nonexistent,1.4,94.465,-41.8,4.959,5228.1,no
4,47,admin.,married,university.degree,no,yes,no,cellular,nov,mon,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no


In [16]:
# Names of every numeric column in the raw frame.
# NOTE(review): the printed list includes 'duration'; the public description of
# this dataset warns that call duration is only known once the call has ended --
# confirm whether it should really be a model input.
numeric_features_add = list(bank.select_dtypes(include='number').columns)
print(numeric_features_add)

# Names of every string (object-dtype) column, with the target "y" taken out
categorical_features_add = list(bank.select_dtypes(include='object').columns)
categorical_features_add.remove("y")
print(categorical_features_add)

# Columns that contain at least one null. The printed list is empty, so no
# column holds NaN; missing answers show up as the literal string 'unknown'
# in the preview above instead.
null_mask = bank.isnull().any()
null_columns_add = null_mask[null_mask].index.tolist()
print(null_columns_add)

['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
[]


In [17]:

# Cyclic encoding for month
month_mapping = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
                 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
bank['month'] = bank['month'].map(month_mapping)
bank['month_sin'] = np.sin(2 * np.pi * bank['month'] / 12)
bank['month_cos'] = np.cos(2 * np.pi * bank['month'] / 12)
bank.drop('month', axis=1, inplace=True)

# Cyclic encoding for day_of_week
"""
day_of_week_mapping = {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5}
bank['day_of_week'] = bank['day_of_week'].map(day_of_week_mapping)
bank['day_of_week_sin'] = np.sin(2 * np.pi * bank['day_of_week'] / 5)
bank['day_of_week_cos'] = np.cos(2 * np.pi * bank['day_of_week'] / 5)
bank.drop('day_of_week', axis=1, inplace=True)
"""
bank = pd.get_dummies(bank, columns=['day_of_week'], prefix='day_of_week')

# Ordinal encoding for education
education_mapping = [['illiterate', 'unknown', 'basic.4y', 'basic.6y', 'basic.9y',
                    'high.school', 'university.degree', 'professional.course']]
education_encoder = OrdinalEncoder(categories=education_mapping)
bank['education'] = education_encoder.fit_transform(bank[['education']])

# Ordinal encoding for default, housing, and loan
has_loan_order = [['no', 'unknown', 'yes']]
loan_encoder = OrdinalEncoder(categories=has_loan_order * 3)  # Repeat for three columns
bank[['default', 'housing', 'loan']] = loan_encoder.fit_transform(bank[['default', 'housing', 'loan']])

# Ordinal encoding for poutcome
poutcome_mapping = [['failure', 'nonexistent', 'success']]  # Ordered: worst to best
poutcome_encoder = OrdinalEncoder(categories=poutcome_mapping)
bank['poutcome'] = poutcome_encoder.fit_transform(bank[['poutcome']])

# Create a OneHotEncoder instance
encoder = OneHotEncoder(sparse_output=False)

# Fit and transform the categorical columns
encoded_data = encoder.fit_transform(bank[['contact', 'job', 'marital']])
# Convert the encoded data to a DataFrame
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out())

# Concatenate the encoded DataFrame with the original DataFrame
bank = pd.concat([bank.drop(['contact', 'job', 'marital'], axis=1), encoded_df], axis=1)


In [18]:
bank.head()

Unnamed: 0,age,education,default,housing,loan,duration,campaign,pdays,previous,poutcome,...,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,marital_unknown
0,30,4.0,0.0,2.0,0.0,487,2,999,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,39,5.0,0.0,0.0,0.0,346,4,999,0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,25,5.0,0.0,2.0,0.0,227,1,999,0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,38,4.0,0.0,1.0,1.0,17,3,999,0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,47,6.0,0.0,2.0,0.0,58,1,999,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:

def _encode_ordinal(column, ordered_categories):
    """Return float codes giving each value's position in ``ordered_categories``.

    Raises ValueError if the column holds a value (a missing value included)
    that is not listed in ``ordered_categories``.
    """
    positions = {category: float(rank) for rank, category in enumerate(ordered_categories)}
    codes = column.map(positions)
    unknown = column[codes.isna()].unique().tolist()
    if unknown:
        raise ValueError(f"Found unknown categories {unknown} in column {column.name!r}")
    return codes.astype(float)


def preprocess_bank_data(bank):
    """Encode the calendar and ordinal columns of a bank-marketing frame.

    The input frame is never modified; a new frame is returned.

    Parameters
    ----------
    bank : pandas.DataFrame
        Must contain the columns 'month', 'day_of_week', 'education',
        'default', 'housing', 'loan' and 'poutcome'. Any other columns are
        passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A copy of ``bank`` in which
        * 'month' is replaced by 'month_sin' / 'month_cos' (appended at the end;
          a month name missing from the mapping yields NaN in both),
        * 'day_of_week' is replaced by the five indicator columns
          'day_of_week_fri', '_mon', '_thu', '_tue', '_wed' (appended at the end),
        * 'education', 'default', 'housing', 'loan' and 'poutcome' hold float
          codes that give each value's position in its fixed category order.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If an ordinal column contains a value outside its category list.
    """
    # Copy first so the caller's frame is left exactly as it was passed in.
    encoded = bank.copy()

    # Cyclic encoding for month: place each month on the unit circle.
    month_mapping = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
                     'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
    month_number = encoded['month'].map(month_mapping).astype(float)
    encoded['month_sin'] = np.sin(2 * np.pi * month_number / 12)
    encoded['month_cos'] = np.cos(2 * np.pi * month_number / 12)
    encoded = encoded.drop(columns='month')

    # One-hot encoding for day_of_week. Declaring the categories up front makes
    # get_dummies emit the same five columns for every input, even a single
    # row; without it the column set depends on which days the batch happens to
    # contain. The list is alphabetical, which is the order get_dummies itself
    # produces when all five days are present. A value outside the list gets
    # all-zero indicators.
    day_categories = ['fri', 'mon', 'thu', 'tue', 'wed']
    encoded['day_of_week'] = pd.Categorical(encoded['day_of_week'], categories=day_categories)
    encoded = pd.get_dummies(encoded, columns=['day_of_week'], prefix='day_of_week')

    # Ordinal encoding for education
    education_order = ['illiterate', 'unknown', 'basic.4y', 'basic.6y', 'basic.9y',
                       'high.school', 'university.degree', 'professional.course']
    encoded['education'] = _encode_ordinal(encoded['education'], education_order)

    # Ordinal encoding for default, housing, and loan (shared three-level order)
    has_loan_order = ['no', 'unknown', 'yes']
    for loan_column in ('default', 'housing', 'loan'):
        encoded[loan_column] = _encode_ordinal(encoded[loan_column], has_loan_order)

    # Ordinal encoding for poutcome, ordered worst to best
    poutcome_order = ['failure', 'nonexistent', 'success']
    encoded['poutcome'] = _encode_ordinal(encoded['poutcome'], poutcome_order)

    return encoded


In [None]:
# Extract target and features
a = bank.drop(['y'], axis=1)
b = bank['y'].map({'no': 0, 'yes': 1})  # Convert 'y' to binary


# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        # Apply Preprocessing Encoding
        ('cyclic', FunctionTransformer(preprocess_bank_data, validate=False),
         ['month','day_of_week','education','poutcome','default', 'housing', 'loan']),

        # Apply OneHotEncoder for categorical columns
        ("ohe", OneHotEncoder(drop='first'), ['contact', 'job', 'marital']),

        # Scale numerical features
        ('num_scaler', StandardScaler(), numeric_features_add)
    ]
)


# Fit the preprocessor on the feature frame `a` and keep the transformed
# matrix under its own name. Assigning it back to `bank` would replace the
# DataFrame (and its 'y' column) that the next cell reads with
# `bank.drop(['y'], axis=1)`.
bank_transformed = preprocessor.fit_transform(a)

In [None]:
# Separate the features from the binary target ('no' -> 0, 'yes' -> 1)
X = bank.drop(columns='y')
y = bank['y'].map({'no': 0, 'yes': 1})

# Hold out 20% of the rows for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

# Oversample the training rows to balance the two classes.
# NOTE(review): if these resampled arrays are what gets cross-validated,
# duplicated minority rows can land on both sides of a fold split and inflate
# the CV scores; resampling inside the cross-validated pipeline avoids that.
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Column-wise preprocessing: custom encodings, one-hot columns, scaled numerics
preprocessing_steps = [
    # Month / weekday / ordinal encodings handled by preprocess_bank_data
    (
        'cyclic',
        FunctionTransformer(preprocess_bank_data, validate=False),
        ['month','day_of_week','education','poutcome','default', 'housing', 'loan'],
    ),
    # One-hot encode the nominal columns, dropping the first level of each
    ("ohe", OneHotEncoder(drop='first'), ['contact', 'job', 'marital']),
    # Standardise the numeric columns
    ('num_scaler', StandardScaler(), numeric_features_add),
]
preprocessor = ColumnTransformer(transformers=preprocessing_steps)

# Candidate classifiers, keyed by display name
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=1000),
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
    'XGBoost': XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42),
    'Bagging': BaggingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'MLP': MLPClassifier(max_iter=1000, random_state=42),
    'Linear SVM': LinearSVC(max_iter=1000, random_state=42),
    'Kernel SVM': SVC(kernel='rbf', probability=True, random_state=42),
}

# Search space per classifier, written with the estimator's own parameter names
classifier_grids = {
    'Logistic Regression': {
        'C': [0.1, 1, 10],
    },
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [3, 5, 7],
    },
    'XGBoost': {
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
    },
    'Bagging': {
        'n_estimators': [10, 50, 100],
        'max_samples': [0.5, 0.75, 1.0],
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1],
    },
    'Gradient Boosting': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
    },
    'MLP': {
        'hidden_layer_sizes': [(50,), (100,), (100, 50)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001, 0.01],
    },
    'Linear SVM': {
        'C': [0.1, 1, 10],
    },
    'Kernel SVM': {
        'C': [0.1, 1, 10],
        'gamma': ['scale', 0.1, 0.01],
    },
}

# Hyperparameter grids: prefix every name with the pipeline step ('classifier')
# so the keys address the estimator inside the pipeline.
param_grids = {
    grid_name: {f'classifier__{param}': values for param, values in grid.items()}
    for grid_name, grid in classifier_grids.items()
}


In [None]:
# Dictionary to store model results
model_results = {}

for model_name, model in models.items():
    # The oversampler is a pipeline step rather than a one-off applied before
    # the search. An imbalanced-learn pipeline runs samplers only while
    # fitting, so within each CV split only the training part is oversampled
    # and the validation part keeps its original rows. Oversampling before the
    # split would let copies of the same minority row appear on both sides of
    # a fold and inflate the cross-validated F1.
    pipeline = ImbPipeline([
        ('preprocessor', preprocessor),
        ('oversampler', RandomOverSampler(random_state=42)),
        ('classifier', model)
    ])
    grid_search = GridSearchCV(pipeline, param_grids[model_name], cv=3, scoring='f1', n_jobs=-1)
    # Fit on the un-resampled training split; resampling happens inside the pipeline.
    grid_search.fit(X_train, y_train)

    # Get best estimator (refit on the full training split by GridSearchCV)
    best_model = grid_search.best_estimator_

    # Evaluate on test data
    y_pred = best_model.predict(X_test)
    f1 = f1_score(y_test, y_pred)

    # Store results
    model_results[model_name] = {
        'model': best_model,
        'f1_score': f1,
        'accuracy': accuracy_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred)
    }

    print(f"Results for {model_name}:")
    print(f"Accuracy: {model_results[model_name]['accuracy']}")
    print(f"Recall: {model_results[model_name]['recall']}")
    print(f"Precision: {model_results[model_name]['precision']}")
    print(f"F1 Score: {model_results[model_name]['f1_score']}")
    print(classification_report(y_test, y_pred))

In [None]:
# Pick the entry whose stored test-set F1 is highest (first one wins on ties).
# NOTE(review): the winner is chosen on the same test split whose score is
# reported afterwards, so that reported F1 is likely optimistic.
f1_by_model = {name: result['f1_score'] for name, result in model_results.items()}
best_model_name = max(f1_by_model, key=f1_by_model.get)
best_model = model_results[best_model_name]['model']

In [None]:
# Save the best model
filename = 'best_model_pipeline.sav'
# Open the file in a `with` block so the handle is flushed and closed as soon
# as the dump finishes, even if pickling raises part-way through.
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)
print(
    f"Best model saved as {filename}: {best_model_name} with F1 score of {model_results[best_model_name]['f1_score']:.4f}")