### The Imports

In [None]:
#Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import balanced_accuracy_score, f1_score

### Hard Example Mining (HEM) algorithm

In [None]:
# Hard Example Mining (HEM) algorithm
def hard_example_mining(clf, X_train, y_train, n_hard_examples):
    """Return the training samples with the largest prediction error.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict(X)``.
    X_train : 2-D array of shape (n_samples, n_features).
    y_train : binary labels, either 1-D (n_samples,) or a column (n_samples, 1).
    n_hard_examples : maximum number of samples to return; values <= 0
        return empty arrays.

    Returns
    -------
    (X_hard, y_hard) : arrays of shape (k, n_features) and (k, 1), with
        k = min(n_hard_examples, n_samples), ordered by increasing error.
        Labels are returned in their original encoding.
    """
    X_train = np.asarray(X_train)
    n_features = X_train.shape[1]

    # Work on flat vectors: subtracting an (n, 1) column from an (n,) vector
    # broadcasts to (n, n), and argsort over an (n, 1) column sorts each
    # one-element row, returning index 0 for every sample.
    y_true = np.asarray(y_train).reshape(-1)
    y_pred = np.asarray(clf.predict(X_train)).reshape(-1)

    # Put labels and predictions on the same 0/1 scale before comparing, so a
    # correctly classified -1 sample is not counted as an error.
    errors = np.abs(np.where(y_true == -1, 0, y_true)
                    - np.where(y_pred == -1, 0, y_pred))

    n_hard_examples = int(n_hard_examples)
    if n_hard_examples <= 0:
        # A slice of [-0:] would select every sample instead of none.
        hard_idx = np.empty(0, dtype=int)
    else:
        # Stable sort keeps tie-breaking deterministic (later samples win).
        hard_idx = np.argsort(errors, kind="stable")[-n_hard_examples:]

    n_selected = len(hard_idx)
    return (X_train[hard_idx].reshape(n_selected, n_features),
            y_true[hard_idx].reshape(n_selected, 1))

### Soft Example Mining (SEM) algorithm


In [None]:
# Soft Example Mining (SEM) algorithm
def soft_example_mining(clf, X_train, y_train, n_soft_examples):
    """Return the training samples with the smallest soft-score.

    Each sample is scored as ``|y - min(class probabilities)|`` and the
    ``n_soft_examples`` lowest-scoring samples are returned.
    NOTE(review): the score mixes a label with the probability of the less
    likely class, so which samples rank lowest depends on the label encoding
    (0/1 vs -1/1) - confirm this is the intended definition of "soft".

    Parameters
    ----------
    clf : fitted classifier exposing ``predict_proba(X)``.
    X_train : 2-D array of shape (n_samples, n_features).
    y_train : labels, either 1-D (n_samples,) or a column (n_samples, 1).
    n_soft_examples : maximum number of samples to return; values <= 0
        return empty arrays.

    Returns
    -------
    (X_soft, y_soft) : arrays of shape (k, n_features) and (k, 1), with
        k = min(n_soft_examples, n_samples), ordered by increasing score.
    """
    X_train = np.asarray(X_train)
    n_features = X_train.shape[1]

    # Keep everything 1-D: an (n, 1) column minus an (n,) vector broadcasts to
    # (n, n), and argsort over an (n, 1) column returns index 0 for every row.
    y_true = np.asarray(y_train).reshape(-1)
    min_proba = np.min(clf.predict_proba(X_train), axis=1)
    scores = np.abs(y_true - min_proba)

    # Clamp so a negative count cannot turn into an "all but the last k" slice.
    n_keep = max(int(n_soft_examples), 0)
    soft_idx = np.argsort(scores, kind="stable")[:n_keep]

    n_selected = len(soft_idx)
    return (X_train[soft_idx].reshape(n_selected, n_features),
            y_true[soft_idx].reshape(n_selected, 1))


### Balanced Cascade with Filters (BCWF) algorithm


In [None]:
# Balanced Cascade with Filters (BCWF) algorithm
def bcwf(X, y, T, filter_type='hard', n_hard_examples=10, n_soft_examples=10):
    """Train a cascade of ``T`` AdaBoost classifiers with example filtering.

    Each round fits a fresh ``AdaBoostClassifier`` on the current training
    set, then appends the samples chosen by the selected filter
    (``hard_example_mining`` or ``soft_example_mining``) to the training set
    used by the next round.

    Parameters
    ----------
    X : 2-D feature array of shape (n_samples, n_features).
    y : labels, 1-D or a column of shape (n_samples, 1).
    T : number of cascade rounds, i.e. number of classifiers returned.
    filter_type : 'hard' or 'soft'.
    n_hard_examples : samples mined per round when ``filter_type == 'hard'``.
    n_soft_examples : samples mined per round when ``filter_type == 'soft'``.

    Returns
    -------
    list of the ``T`` fitted classifiers, in training order (empty if T <= 0).

    Raises
    ------
    ValueError : if ``filter_type`` is neither 'hard' nor 'soft'.
    """
    # Validate before any (expensive) fitting happens.
    if filter_type not in ('hard', 'soft'):
        raise ValueError("Unknown filter type. Please choose 'hard' or 'soft'")

    # A 20% split is held out, as before, but is not used by this function;
    # the split is unseeded, so results vary between calls.
    X_train, _X_valid, y_train, _y_valid = train_test_split(X, y, test_size=0.2)
    X_train = np.asarray(X_train)
    # Keep labels 1-D for fitting regardless of how the caller shaped them.
    y_train = np.asarray(y_train).ravel()

    # The per-class confidence rankings (NHE/PHE) and the n1/n2/p1 counts that
    # used to be computed here were never used. They indexed X_train with a
    # label mask built before the loop, which stops matching X_train as soon
    # as mined samples are appended, so they have been removed.

    classifiers = []
    for round_idx in range(T):
        clf = AdaBoostClassifier(n_estimators=50)
        clf.fit(X_train, y_train)
        classifiers.append(clf)

        if round_idx == T - 1:
            # Samples mined after the last fit would never be trained on.
            break

        # Hand labels over as a column so they line up with the column-shaped
        # predictions the mining helpers build internally.
        y_column = y_train.reshape(-1, 1)
        if filter_type == 'hard':
            X_mined, y_mined = hard_example_mining(clf, X_train, y_column, n_hard_examples)
        else:
            X_mined, y_mined = soft_example_mining(clf, X_train, y_column, n_soft_examples)

        # Grow the training set with the mined samples for the next round.
        X_train = np.vstack((X_train, X_mined))
        y_train = np.concatenate((y_train, np.asarray(y_mined).ravel()))

    # Return only after all T rounds; returning inside the loop meant a single
    # classifier was trained no matter what T was.
    return classifiers

### Ensemble voting strategies

In [None]:
# Ensemble voting strategies
def ensemble_predict(classifiers, X, strategy='majority_vote'):
    """Combine the predictions of several binary classifiers.

    Parameters
    ----------
    classifiers : non-empty sequence of fitted classifiers exposing
        ``predict`` (for 'majority_vote') or ``predict_proba`` (for
        'average_probability').
    X : 2-D feature array of shape (n_samples, n_features).
    strategy : 'majority_vote' or 'average_probability'.

    Returns
    -------
    (preds, k)
        'majority_vote': ``preds`` is an (n_samples, 1) array of labels in
        {-1, 1}, decided per sample by a strict majority of the classifiers
        (ties go to -1); ``k`` is the most common label, encoded 0/1, over all
        individual votes.
        'average_probability': ``preds`` is an (n_samples, 1) array holding
        the positive-class probability (column 1 of ``predict_proba``)
        averaged over the classifiers; ``k`` is the mean of those values.

    Raises
    ------
    ValueError : on an unknown strategy or an empty classifier list.
    """
    if strategy not in ('majority_vote', 'average_probability'):
        raise ValueError("Invalid prediction strategy: %s" % strategy)
    classifiers = list(classifiers)
    if not classifiers:
        raise ValueError("ensemble_predict requires at least one classifier")

    if strategy == 'average_probability':
        # Rows are classifiers, columns are samples.
        pos_proba = np.stack(
            [np.asarray(clf.predict_proba(X))[:, 1] for clf in classifiers]
        )
        # Average across classifiers; probabilities are returned untouched
        # (no label remapping applies to them).
        mean_proba = pos_proba.mean(axis=0)
        return mean_proba.reshape(-1, 1), np.mean(mean_proba)

    # Majority vote: stack hard predictions as (n_classifiers, n_samples) and
    # encode them 0/1 so they can be counted.
    votes = np.stack(
        [np.asarray(clf.predict(X)).reshape(-1) for clf in classifiers]
    )
    votes = np.where(votes == -1, 0, votes).astype(int)

    # Most frequent label over every individual vote.
    k = np.argmax(np.bincount(votes.ravel()))

    # Vote per sample (column), not over the flattened vote matrix, so the
    # result has one prediction per sample however many classifiers vote.
    positive = 2 * votes.sum(axis=0) > len(classifiers)
    preds = np.where(positive, 1, -1)
    return preds.reshape(-1, 1), k

# To calculate the accuracy
def accuracy(preds, y_test):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are flattened first, so an (n, 1) column and an (n,) vector
    are compared element by element instead of broadcasting to an (n, n)
    matrix (which inflated the count of matches).

    Parameters
    ----------
    preds : predicted labels, any shape holding n values.
    y_test : true labels, any shape holding n values.

    Raises
    ------
    ValueError : if the two inputs do not hold the same number of values.
    """
    preds = np.asarray(preds).reshape(-1)
    y_test = np.asarray(y_test).reshape(-1)
    if preds.shape != y_test.shape:
        raise ValueError(
            "preds and y_test must hold the same number of values, got %d and %d"
            % (preds.shape[0], y_test.shape[0])
        )
    return (preds == y_test).sum() / y_test.shape[0]

### Loading the data from the dataset CSV files

In [None]:
# Read the train and test splits from CSV files in the working directory.
df_train = pd.read_csv('train_hou.csv')
df_test = pd.read_csv('test_hou.csv')

# Features are every column except the last one; labels are the 'TARGET'
# column, kept as an (n_samples, 1) integer column.
# NOTE(review): this assumes 'TARGET' is the last column - otherwise it would
# also end up among the features. Confirm against the CSV layout.
X_train = np.array(df_train[df_train.columns[:-1]])
y_train = np.array(df_train[['TARGET']]).astype(int)

X_test = np.array(df_test[df_test.columns[:-1]])
y_test = np.array(df_test[['TARGET']]).astype(int)

In [None]:
# Preview the first ten rows of the training frame.
df_train.head(n=10)

### Evaluate the algorithms on benchmark datasets and a real-world peer-to-peer lending dataset


In [None]:
# Evaluate the algorithms on benchmark datasets and a real-world peer-to-peer lending dataset
# Plotting
max_T = 50

# Per-T results, one entry for each cascade size T = 1..max_T.
classifiers_hard = []
classifiers_soft = []
p_hard_all = []      # majority-vote predictions, hard filter
p_soft_all = []      # majority-vote predictions, soft filter
k_hard_mv = []       # second return value of ensemble_predict (majority vote)
k_hard_ap = []       # second return value of ensemble_predict (average probability)
k_soft_mv = []
k_soft_ap = []
accuracy_hard = []   # majority-vote accuracy on the test set
accuracy_soft = []

for T in range(1, max_T+1):
    # Train one cascade per filter type for this T.
    classifiers_hard_t = bcwf(X_train, y_train, T=T, filter_type='hard', n_hard_examples=100, n_soft_examples=100)
    classifiers_soft_t = bcwf(X_train, y_train, T=T, filter_type='soft', n_hard_examples=100, n_soft_examples=100)

    # Predict on the test set with both voting strategies.
    p_hard_t_mv, k_hard_mv_t = ensemble_predict(classifiers_hard_t, X_test, strategy='majority_vote')
    p_soft_t_mv, k_soft_mv_t = ensemble_predict(classifiers_soft_t, X_test, strategy='majority_vote')
    p_hard_t_ap, k_hard_ap_t = ensemble_predict(classifiers_hard_t, X_test, strategy='average_probability')
    p_soft_t_ap, k_soft_ap_t = ensemble_predict(classifiers_soft_t, X_test, strategy='average_probability')

    # Accuracy is measured on the majority-vote predictions only.
    accuracy_hard_t = accuracy(p_hard_t_mv, y_test)
    accuracy_soft_t = accuracy(p_soft_t_mv, y_test)

    classifiers_hard.append(classifiers_hard_t)
    classifiers_soft.append(classifiers_soft_t)
    p_hard_all.append(p_hard_t_mv)
    p_soft_all.append(p_soft_t_mv)
    k_hard_mv.append(k_hard_mv_t)
    k_soft_mv.append(k_soft_mv_t)
    k_hard_ap.append(k_hard_ap_t)
    k_soft_ap.append(k_soft_ap_t)
    accuracy_hard.append(accuracy_hard_t)
    accuracy_soft.append(accuracy_soft_t)

    # The scatter plots are drawn for the final cascade only, so the index
    # arrays they need are computed just once, on the last iteration.
    if T == max_T:
        # NOTE(review): np.round yields 0/1, while ensemble_predict's hard
        # labels use -1/1. If y_test is -1/1 encoded, a negative sample can
        # never land in the "correct" group here - confirm the encoding.
        # NOTE(review): "normal" selects samples whose rounded prediction is
        # more than 1 away from the label; the intended meaning is unclear.

        # Indices of correct, wrong, and normal predictions for the hard filter
        correct_hard_idx = np.where(np.round(p_hard_t_ap) == y_test)[0]
        wrong_hard_idx = np.where(np.round(p_hard_t_ap) != y_test)[0]
        normal_hard_idx = np.where(np.abs(np.round(p_hard_t_ap) - y_test) > 1)[0]

        # Indices of correct, wrong, and normal predictions for the soft filter
        correct_soft_idx = np.where(np.round(p_soft_t_ap) == y_test)[0]
        wrong_soft_idx = np.where(np.round(p_soft_t_ap) != y_test)[0]
        normal_soft_idx = np.where(np.abs(np.round(p_soft_t_ap) - y_test) > 1)[0]

        # Scatter plot - Hard Filter
        plt.scatter(y_test[correct_hard_idx], p_hard_t_ap[correct_hard_idx], c='g', label='Correct Predictions - Hard')
        plt.scatter(y_test[wrong_hard_idx], p_hard_t_ap[wrong_hard_idx], c='r', label='Wrong Predictions - Hard')
        plt.scatter(y_test[normal_hard_idx], p_hard_t_ap[normal_hard_idx], c='b', label='Normal Predictions - Hard')
        plt.xlabel("True Values")
        plt.ylabel("Predictions")
        # Derive the title from max_T so it stays right if max_T changes.
        plt.title(f"Scatter plot for Hard Filter Predictions (T={max_T})")
        plt.legend()
        plt.show()

        # Scatter plot - Soft Filter
        plt.scatter(y_test[correct_soft_idx], p_soft_t_ap[correct_soft_idx], marker='s', c='y', label='Correct Predictions - Soft')
        plt.scatter(y_test[wrong_soft_idx], p_soft_t_ap[wrong_soft_idx], marker='s', c='m', label='Wrong Predictions - Soft')
        plt.scatter(y_test[normal_soft_idx], p_soft_t_ap[normal_soft_idx], marker='s', c='c', label='Normal Predictions - Soft')
        plt.xlabel("True Values")
        plt.ylabel("Predictions")
        plt.title(f"Scatter plot for Soft Filter Predictions (T={max_T})")
        plt.legend()
        plt.show()

## Making the graphs for different parameters
### The output graphs are:

In [None]:
# Summary plots over all cascade sizes T = 1..max_T.
t_values = range(1, max_T+1)

# Test accuracy of each filter as a function of T.
for accuracies, series_label in (
    (accuracy_hard, 'Hard Filter'),
    (accuracy_soft, 'Soft Filter'),
):
    plt.scatter(t_values, accuracies, label=series_label)
plt.xlabel("T (Number of Iterations)")
plt.ylabel("Accuracy")
plt.title("Scatter plot of Accuracy vs T")
plt.legend()
plt.show()

# One figure per filter, showing the k value of both voting strategies vs T.
# NOTE(review): the y-axis label reads "Number of Examples Filtered", but the
# k_* lists hold the second value returned by ensemble_predict - confirm that
# the label describes what is actually plotted.
k_figures = (
    ("k vs T for Hard Filter", (
        (k_hard_mv, 'Hard Filter Majority Vote'),
        (k_hard_ap, 'Hard Filter Average Probability'),
    )),
    ("k vs T for Soft Filter", (
        (k_soft_mv, 'Soft Filter Majority Vote'),
        (k_soft_ap, 'Soft Filter Average Probability'),
    )),
)
for figure_title, k_series in k_figures:
    for k_values, series_label in k_series:
        plt.plot(t_values, k_values, label=series_label)
    plt.xlabel("T (Number of Iterations)")
    plt.ylabel("k (Number of Examples Filtered)")
    plt.title(figure_title)
    plt.legend()
    plt.show()