In [3]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import matthews_corrcoef, accuracy_score
from sklearn.model_selection import KFold
import pandas as pd
from sklearn.model_selection import train_test_split
from time import strptime
from sklearn.preprocessing import StandardScaler

# Load the bank-marketing data set; the file is semicolon-delimited.
data = pd.read_csv("bank-full.csv", sep=";")

# change month to numeric: parse the month abbreviation and keep its month number
data['month'] = [strptime(str(x), '%b').tm_mon for x in data['month']]
# change all object data type to categorical
list_str_obj_cols = data.columns[data.dtypes == "object"].tolist()
for str_obj_col in list_str_obj_cols:
    data[str_obj_col] = data[str_obj_col].astype("category")

# encode all categorical data (one-hot); the target column "y" is not in this
# list and is converted separately further down
categorical_cols = ['job', 'marital', 'education', 'default',
                    'housing', 'loan', 'contact', 'poutcome']
df_encoded = pd.get_dummies(data, columns=categorical_cols)

# standardize all numeric data
numeric_cols = ["age", "balance", "day", "duration", "campaign", "pdays", "previous", "month"]
data_numeric = data[numeric_cols]

# NOTE(review): the scaler is fitted on every row before the train/test split
# below (and before any cross-validation that consumes X), so held-out rows
# contribute to the scaling statistics. Consider fitting it per training fold,
# e.g. inside a Pipeline.
std_scaler = StandardScaler()
df_scaled = pd.DataFrame(std_scaler.fit_transform(data_numeric.to_numpy()),
                         columns=numeric_cols, index=data.index)

# combine both datasets: overwrite the raw numeric columns with their scaled
# versions. Plain column assignment replaces each column together with its
# dtype, instead of relying on DataFrame.update() to push float values into
# integer columns in place (a silent upcast that newer pandas deprecates).
df_encoded[numeric_cols] = df_scaled

# change class label to 0 and 1 (integer codes of the label's categories)
df_encoded["y"] = pd.Categorical(df_encoded["y"]).codes
newdata = df_encoded

# Feature matrix and target vector used by the modelling cells.
X = newdata.drop(columns=["y"])
y = newdata["y"]

#used for parameter tuning
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=.8)


In [2]:
# Base learners with their tuned hyper-parameters.
# NOTE(review): the random forest uses n_estimators=100 here while the weighted
# ensembles further down use 200 -- confirm which value is the intended one.
rf = RandomForestClassifier(
    n_estimators=100, max_depth=60, min_samples_split=2, min_samples_leaf=1, random_state=0
)
logreg = LogisticRegression(penalty='l2', C=1, solver='liblinear', random_state=0)
nn = MLPClassifier(
    hidden_layer_sizes=(50, 50), activation='relu', alpha=0.0001,
    learning_rate_init=0.001, max_iter=500, random_state=0,
)
dt = DecisionTreeClassifier(max_depth=20, min_samples_split=5, min_samples_leaf=4, random_state=0)

# Unweighted majority ("hard") vote over the four base learners.
ensemble_model = VotingClassifier(
    estimators=[('rf', rf), ('logreg', logreg), ('nn', nn), ('dt', dt)],
    voting='hard',
)

# Shuffled 5-fold split with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Per-fold metrics, one entry per fold in split order.
mcc_scores = []
accuracy_scores = []

for train_index, val_index in kf.split(X):
    # Refit the ensemble on this fold's training rows.
    ensemble_model.fit(X.iloc[train_index], y.iloc[train_index])

    # Score the held-out rows of this fold.
    y_val_fold = y.iloc[val_index]
    y_pred = ensemble_model.predict(X.iloc[val_index])
    mcc_scores.append(matthews_corrcoef(y_val_fold, y_pred))
    accuracy_scores.append(accuracy_score(y_val_fold, y_pred))

print("MCC Scores:", mcc_scores)
print("Accuracy Scores:", accuracy_scores)




MCC Scores: [0.40976983875814915, 0.3959852559899423, 0.4011848992019024, 0.41621544752997264, 0.4016352213461504]
Accuracy Scores: [0.8995908437465443, 0.9000221190002212, 0.9037823490378235, 0.9021234240212342, 0.900464499004645]


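# Weighted (加权) Soft Voting: Neural Network + Decision Tree + Logistic Regression + Random Forest, single train/test split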

In [3]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import matthews_corrcoef, accuracy_score
from sklearn.model_selection import train_test_split

# Per-model MCC scores used as raw voting weights.
# NOTE(review): these values are hard-coded; presumably they come from the
# individual tuning runs -- confirm their source.
mcc_values = {
    'logreg': 0.43491332157004836,
    'rf': 0.5147329105011038,
    'nn': 0.4531938207274481,
    'dt': 0.4505434003255415
}

# Normalise so that the weights sum to one.
total_mcc = sum(mcc_values.values())
weights = {}
for model_name, model_mcc in mcc_values.items():
    weights[model_name] = model_mcc / total_mcc

# Base learners with their tuned hyper-parameters.
rf = RandomForestClassifier(
    n_estimators=200, max_depth=60, min_samples_split=2, min_samples_leaf=1, random_state=0
)
logreg = LogisticRegression(penalty='l2', C=1, solver='liblinear', random_state=0)
nn = MLPClassifier(
    hidden_layer_sizes=(50, 50), activation='relu', alpha=0.0001,
    learning_rate_init=0.001, max_iter=500, random_state=0,
)
dt = DecisionTreeClassifier(max_depth=20, min_samples_split=5, min_samples_leaf=4, random_state=0)

# Soft-voting ensemble; each weight is looked up by estimator name so the
# weight list always follows the estimator order.
estimators = [('rf', rf), ('logreg', logreg), ('nn', nn), ('dt', dt)]
ensemble_weighted_model = VotingClassifier(
    estimators=estimators,
    voting='soft',
    weights=[weights[name] for name, _ in estimators],
)

# Train on the training split and evaluate once on the held-out test split.
ensemble_weighted_model.fit(X_train, y_train)
y_pred = ensemble_weighted_model.predict(X_test)

mcc_weighted = matthews_corrcoef(y_test, y_pred)
accuracy_weighted = accuracy_score(y_test, y_pred)

print("Weighted Ensemble MCC:", mcc_weighted)
print("Weighted Ensemble Accuracy:", accuracy_weighted)


Weighted Ensemble MCC: 0.4448151709012132
Weighted Ensemble Accuracy: 0.8991485126617274


# 5CV Neural Network + Decision Tree + Logistic Regression + Random Forest Soft Voting

In [5]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import matthews_corrcoef, accuracy_score
from sklearn.model_selection import KFold

# Per-model MCC scores used as raw voting weights.
# NOTE(review): these values are hard-coded; presumably they come from the
# individual tuning runs -- confirm their source.
mcc_values = {
    'logreg': 0.43491332157004836,
    'rf': 0.5147329105011038,
    'nn': 0.4531938207274481,
    'dt': 0.4505434003255415
}

# Normalise so that the weights sum to one.
total_mcc = sum(mcc_values.values())
weights = {}
for model_name, model_mcc in mcc_values.items():
    weights[model_name] = model_mcc / total_mcc

# Base learners with their tuned hyper-parameters.
rf = RandomForestClassifier(
    n_estimators=200, max_depth=60, min_samples_split=2, min_samples_leaf=1, random_state=0
)
logreg = LogisticRegression(penalty='l2', C=1, solver='liblinear', random_state=0)
nn = MLPClassifier(
    hidden_layer_sizes=(50, 50), activation='relu', alpha=0.0001,
    learning_rate_init=0.001, max_iter=500, random_state=0,
)
dt = DecisionTreeClassifier(max_depth=20, min_samples_split=5, min_samples_leaf=4, random_state=0)

# Soft-voting ensemble; each weight is looked up by estimator name so the
# weight list always follows the estimator order.
estimators = [('rf', rf), ('logreg', logreg), ('nn', nn), ('dt', dt)]
ensemble_weighted_model = VotingClassifier(
    estimators=estimators,
    voting='soft',
    weights=[weights[name] for name, _ in estimators],
)

# Shuffled 5-fold split with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Per-fold metrics, one entry per fold in split order.
mcc_scores_weighted = []
accuracy_scores_weighted = []

for train_index, val_index in kf.split(X):
    # Refit the ensemble on this fold's training rows.
    ensemble_weighted_model.fit(X.iloc[train_index], y.iloc[train_index])

    # Score the held-out rows of this fold.
    y_val_fold = y.iloc[val_index]
    y_pred_fold = ensemble_weighted_model.predict(X.iloc[val_index])
    mcc_scores_weighted.append(matthews_corrcoef(y_val_fold, y_pred_fold))
    accuracy_scores_weighted.append(accuracy_score(y_val_fold, y_pred_fold))

# Average each metric across the folds.
mean_mcc_weighted = sum(mcc_scores_weighted) / len(mcc_scores_weighted)
mean_accuracy_weighted = sum(accuracy_scores_weighted) / len(accuracy_scores_weighted)

print("Weighted Ensemble MCC:", mcc_scores_weighted)
print("Weighted Ensemble Accuracy:", accuracy_scores_weighted)
print("Mean MCC for Weighted Ensemble:", mean_mcc_weighted)
print("Mean Accuracy for Weighted Ensemble:", mean_accuracy_weighted)



Weighted Ensemble MCC: [0.46634579584638075, 0.4501952289260701, 0.4539465434753073, 0.47410171172875626, 0.47053221957267233]
Weighted Ensemble Accuracy: [0.900807254229791, 0.9020128290201282, 0.9057730590577306, 0.9046671090466711, 0.9044459190444591]
Mean MCC for Weighted Ensemble: 0.4630242999098373
Mean Accuracy for Weighted Ensemble: 0.903541234079756


# 5CV Neural Network + Decision Tree + Logistic Regression + Random Forest Hard Voting

In [6]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import matthews_corrcoef, accuracy_score
from sklearn.model_selection import KFold

# Per-model MCC scores used as raw voting weights.
# NOTE(review): these values are hard-coded; presumably they come from the
# individual tuning runs -- confirm their source.
mcc_values = {
    'logreg': 0.43491332157004836,
    'rf': 0.5147329105011038,
    'nn': 0.4531938207274481,
    'dt': 0.4505434003255415
}

# Normalise so that the weights sum to one.
total_mcc = sum(mcc_values.values())
weights = {}
for model_name, model_mcc in mcc_values.items():
    weights[model_name] = model_mcc / total_mcc

# Base learners with their tuned hyper-parameters.
rf = RandomForestClassifier(
    n_estimators=200, max_depth=60, min_samples_split=2, min_samples_leaf=1, random_state=0
)
logreg = LogisticRegression(penalty='l2', C=1, solver='liblinear', random_state=0)
nn = MLPClassifier(
    hidden_layer_sizes=(50, 50), activation='relu', alpha=0.0001,
    learning_rate_init=0.001, max_iter=500, random_state=0,
)
dt = DecisionTreeClassifier(max_depth=20, min_samples_split=5, min_samples_leaf=4, random_state=0)

# Hard-voting ensemble with per-model weights; each weight is looked up by
# estimator name so the weight list always follows the estimator order.
estimators = [('rf', rf), ('logreg', logreg), ('nn', nn), ('dt', dt)]
ensemble_weighted_model = VotingClassifier(
    estimators=estimators,
    voting='hard',
    weights=[weights[name] for name, _ in estimators],
)

# Shuffled 5-fold split with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Per-fold metrics, one entry per fold in split order.
mcc_scores_weighted = []
accuracy_scores_weighted = []

for train_index, val_index in kf.split(X):
    # Refit the ensemble on this fold's training rows.
    ensemble_weighted_model.fit(X.iloc[train_index], y.iloc[train_index])

    # Score the held-out rows of this fold.
    y_val_fold = y.iloc[val_index]
    y_pred_fold = ensemble_weighted_model.predict(X.iloc[val_index])
    mcc_scores_weighted.append(matthews_corrcoef(y_val_fold, y_pred_fold))
    accuracy_scores_weighted.append(accuracy_score(y_val_fold, y_pred_fold))

# Average each metric across the folds.
mean_mcc_weighted = sum(mcc_scores_weighted) / len(mcc_scores_weighted)
mean_accuracy_weighted = sum(accuracy_scores_weighted) / len(accuracy_scores_weighted)

print("Weighted Ensemble MCC:", mcc_scores_weighted)
print("Weighted Ensemble Accuracy:", accuracy_scores_weighted)
print("Mean MCC for Weighted Ensemble:", mean_mcc_weighted)
print("Mean Accuracy for Weighted Ensemble:", mean_accuracy_weighted)



Weighted Ensemble MCC: [0.44129601011216607, 0.43790521259678256, 0.43722909579898145, 0.4611350725988718, 0.4493776563247105]
Weighted Ensemble Accuracy: [0.900364923144974, 0.9017916390179164, 0.9046671090466711, 0.9056624640566246, 0.9038929440389294]
Mean MCC for Weighted Ensemble: 0.4453886094863025
Mean Accuracy for Weighted Ensemble: 0.9032758158610232


# 5CV Neural Network + Decision Tree + Logistic Regression Soft Voting

In [13]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import matthews_corrcoef, accuracy_score
from sklearn.model_selection import KFold

# Per-model MCC scores used as raw voting weights (random forest excluded).
# NOTE(review): these values are hard-coded; presumably they come from the
# individual tuning runs -- confirm their source.
mcc_values = {
    'logreg': 0.43491332157004836,
    'nn': 0.4531938207274481,
    'dt': 0.4505434003255415
}

# Normalise so that the weights sum to one.
total_mcc = sum(mcc_values.values())
weights = {}
for model_name, model_mcc in mcc_values.items():
    weights[model_name] = model_mcc / total_mcc

# Base learners with their tuned hyper-parameters.
logreg = LogisticRegression(penalty='l2', C=1, solver='liblinear', random_state=0)
nn = MLPClassifier(
    hidden_layer_sizes=(50, 50), activation='relu', alpha=0.0001,
    learning_rate_init=0.001, max_iter=500, random_state=0,
)
dt = DecisionTreeClassifier(max_depth=20, min_samples_split=5, min_samples_leaf=4, random_state=0)

# Soft-voting ensemble; each weight is looked up by estimator name so the
# weight list always follows the estimator order.
estimators = [('logreg', logreg), ('nn', nn), ('dt', dt)]
ensemble_weighted_model = VotingClassifier(
    estimators=estimators,
    voting='soft',
    weights=[weights[name] for name, _ in estimators],
)

# Shuffled 5-fold split with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Per-fold metrics, one entry per fold in split order.
mcc_scores_weighted = []
accuracy_scores_weighted = []

for train_index, val_index in kf.split(X):
    # Refit the ensemble on this fold's training rows.
    ensemble_weighted_model.fit(X.iloc[train_index], y.iloc[train_index])

    # Score the held-out rows of this fold.
    y_val_fold = y.iloc[val_index]
    y_pred_fold = ensemble_weighted_model.predict(X.iloc[val_index])
    mcc_scores_weighted.append(matthews_corrcoef(y_val_fold, y_pred_fold))
    accuracy_scores_weighted.append(accuracy_score(y_val_fold, y_pred_fold))

# Average each metric across the folds.
mean_mcc_weighted = sum(mcc_scores_weighted) / len(mcc_scores_weighted)
mean_accuracy_weighted = sum(accuracy_scores_weighted) / len(accuracy_scores_weighted)

print("Weighted Ensemble MCC:", mcc_scores_weighted)
print("Weighted Ensemble Accuracy:", accuracy_scores_weighted)
print("Mean MCC for Weighted Ensemble:", mean_mcc_weighted)
print("Mean Accuracy for Weighted Ensemble:", mean_accuracy_weighted)



Weighted Ensemble MCC: [0.4631143936218434, 0.4530689130929212, 0.4552452624433544, 0.4747581723397219, 0.4731540300055983]
Weighted Ensemble Accuracy: [0.8983744332632976, 0.9017916390179164, 0.9051094890510949, 0.9037823490378235, 0.9034505640345056]
Mean MCC for Weighted Ensemble: 0.46386815430068784
Mean Accuracy for Weighted Ensemble: 0.9025016948809277


# 5CV Neural Network + Decision Tree Soft Voting

In [14]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import matthews_corrcoef, accuracy_score
from sklearn.model_selection import KFold

# Per-model MCC scores used as raw voting weights (neural network and
# decision tree only).
# NOTE(review): these values are hard-coded; presumably they come from the
# individual tuning runs -- confirm their source.
mcc_values = {
    'nn': 0.4531938207274481,
    'dt': 0.4505434003255415
}

# Normalise so that the weights sum to one.
total_mcc = sum(mcc_values.values())
weights = {model: mcc/total_mcc for model, mcc in mcc_values.items()}

# Define individual models with best parameters.
# Only the two estimators that take part in this ensemble are built; the
# logistic regression that used to be constructed here was never used.
nn = MLPClassifier(activation='relu', alpha=0.0001, hidden_layer_sizes=(50, 50), learning_rate_init=0.001, max_iter=500, random_state=0)
dt = DecisionTreeClassifier(max_depth=20, min_samples_leaf=4, min_samples_split=5, random_state=0)

# Create a weighted soft-voting ensemble of the two models
ensemble_weighted_model = VotingClassifier(estimators=[('nn', nn), ('dt', dt)], voting='soft', weights=[weights['nn'], weights['dt']])


# Define 5-fold cross-validation (shuffled, fixed seed)
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Per-fold metrics, one entry per fold in split order.
mcc_scores_weighted = []
accuracy_scores_weighted = []

# Perform 5-fold cross-validation
for train_index, val_index in kf.split(X):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

    # Fit the ensemble model on the training fold
    ensemble_weighted_model.fit(X_train_fold, y_train_fold)

    # Predict on the validation fold
    y_pred_fold = ensemble_weighted_model.predict(X_val_fold)

    # Compute MCC and accuracy for the current fold
    mcc_fold = matthews_corrcoef(y_val_fold, y_pred_fold)
    accuracy_fold = accuracy_score(y_val_fold, y_pred_fold)

    mcc_scores_weighted.append(mcc_fold)
    accuracy_scores_weighted.append(accuracy_fold)

# Compute mean MCC and accuracy over all folds
mean_mcc_weighted = sum(mcc_scores_weighted) / len(mcc_scores_weighted)
mean_accuracy_weighted = sum(accuracy_scores_weighted) / len(accuracy_scores_weighted)

print("Weighted Ensemble MCC:", mcc_scores_weighted)
print("Weighted Ensemble Accuracy:", accuracy_scores_weighted)
print("Mean MCC for Weighted Ensemble:", mean_mcc_weighted)
print("Mean Accuracy for Weighted Ensemble:", mean_accuracy_weighted)



Weighted Ensemble MCC: [0.4567033901198766, 0.45217076570372233, 0.46374782622112637, 0.4788808183358292, 0.4780032078726325]
Weighted Ensemble Accuracy: [0.8896383943381622, 0.8963724839637248, 0.9015704490157045, 0.8989161689891617, 0.898363193983632]
Mean MCC for Weighted Ensemble: 0.46590120165063736
Mean Accuracy for Weighted Ensemble: 0.8969721380580771
