<a href="https://colab.research.google.com/github/Marcusleeleelee/FTEC4998-4999/blob/main/FTEC4998_4999.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
# Import necessary packages
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from google.colab import drive
from tqdm import tqdm
import inspect
# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
# Step 1: Utils - ok
def uni_list(input): return list(set(input))

In [36]:
class Dataset():
    def __init__(self, file_path):
        self.dataset = pd.read_feather(file_path)
        self.X_train, self.y_train = None, None
        self.X_test, self.y_test = None, None
        self.scalers = None
        self.pca = None
        self.label = 'loan_condition_cat'
        self.original_columns = None

    def show(self, rows=10):
        return self.dataset.head(rows)

    def basic_processing(self):
        temp_func_1 = lambda x: '<=2009' if str(x) in ['2007', '2008', '2009'] else ("[2010, 2012]" if str(x) in ['2010', '2011', '2012'] else '>=2013')
        columns_to_delete = [
            'id', 'issue_d', 'home_ownership_cat', 'income_category', 'income_cat', 'term_cat', 'application_type_cat',
            'purpose_cat', 'interest_payment_cat', 'loan_condition'
        ]
        self.dataset.drop(columns=columns_to_delete, inplace=True)
        self.dataset['grade'] = self.dataset['grade'].apply(temp_func_1)
        self.dataset['final_d'] = self.dataset['final_d'].apply(lambda x: str(x)[-4:]).apply(temp_func_1)
        self.dataset = pd.get_dummies(self.dataset, columns=['year', 'final_d', 'home_ownership', 'term', 'application_type',
                                                             'purpose', 'interest_payments', 'grade', 'region'], dtype=int)

    def train_test_split(self, test_size=0.2, random_state=42):
        X = self.dataset.drop(columns=[self.label])
        y = self.dataset[self.label]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
        self.original_columns = X.columns

        # Sort by index
        self.X_train.sort_index(inplace=True)
        self.X_train.reset_index(drop=True, inplace=True)
        self.X_test.sort_index(inplace=True)
        self.X_test.reset_index(drop=True, inplace=True)
        self.y_train.sort_index(inplace=True)
        self.y_train.reset_index(drop=True, inplace=True)
        self.y_test.sort_index(inplace=True)
        self.y_test.reset_index(drop=True, inplace=True)

    def preprocessing_train(self, exclude_columns=None):
        if exclude_columns is None:
            exclude_columns = []

        # Separate columns to scale and exclude
        columns_to_scale = [col for col in self.X_train.columns if col not in exclude_columns]

        # Scale only the specified columns
        scaler = StandardScaler()
        self.X_train[columns_to_scale] = scaler.fit_transform(self.X_train[columns_to_scale])
        self.scalers = scaler

        # # Perform PCA
        self.pca = PCA(n_components=30)
        pca_components = self.pca.fit_transform(self.X_train)
        self.X_train = pd.DataFrame(pca_components, columns=self.original_columns[:pca_components.shape[1]])

    def preprocessing_test(self):
        # Apply stored scalers
        self.X_test = pd.DataFrame(self.scalers.transform(self.X_test), columns=self.original_columns)

        # # Apply PCA
        pca_components = self.pca.transform(self.X_test)
        self.X_test = pd.DataFrame(pca_components, columns=self.original_columns[:pca_components.shape[1]])

In [37]:
# Calculating # ok
data = Dataset('/content/drive/My Drive/Colab Notebooks/FTEC4998_9/loan_final313_processed.feather')
data.basic_processing()
data.train_test_split()
data.preprocessing_train()
data.preprocessing_test()

In [38]:
# Data conversion # ok
train_x, train_y = data.X_train, data.y_train
test_x, test_y = data.X_test, data.y_test
counts = np.mean(train_y == 1) * 100
print(counts)
print(train_x.shape, train_y.shape)
print(test_x.shape, test_y.shape)
# Ensure y_train is binary
assert set(train_y).issubset({0, 1}), "Target values must be 0 or 1 for binary classification."
# Move to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

7.5910370853482805
(709903, 30) (709903,)
(177476, 30) (177476,)


In [39]:
print(train_x.shape)
print(train_y.shape)

(709903, 30)
(709903,)


In [40]:
# Train, predict, and accuracy functions
def train_model_pt(model): # ok
    model.train()
    for epoch in range(model.epochs):
        model.optimizer.zero_grad()
        outputs = model(model.train_x)
        loss = model.criterion(outputs, model.train_y)
        loss.backward()
        model.optimizer.step()
        if epoch % 5 == 0:
            print(f'Epoch {epoch}, Loss: {loss.item()}')

def predict_model_pt(model, X):
    model.eval()
    with torch.no_grad():
        X = X.to(next(model.parameters()).device)
        outputs = model(X).squeeze()
        return (outputs > 0.5).float().cpu().numpy()  # Convert to numpy array
def calculate_accuracy_pt(model, X, y, pred=None):
    # Ensure X and y are on the correct device
    X = X.to(next(model.parameters()).device)
    y = y.to(next(model.parameters()).device)

    # Get predictions
    predictions = predict_model_pt(model, X) if pred is None else pred

    # Ensure predictions and labels are tensors and the same shape
    predictions = predictions.squeeze()
    y = y.squeeze()

    # Convert to tensors if necessary
    if not isinstance(predictions, torch.Tensor):
        predictions = torch.tensor(predictions)
    if not isinstance(y, torch.Tensor):
        y = torch.tensor(y)

    # Calculate accuracy
    correct = (predictions == y).sum().item()
    accuracy = correct / len(y)
    return accuracy

def df_to_tensor(x, y):
    assert isinstance(x, pd.DataFrame) and isinstance(y, pd.Series)
    return torch.tensor(x.to_numpy(), dtype=torch.float32).to(device), torch.tensor(y.values.ravel(), dtype=torch.float32).unsqueeze(1).to(device)
test_x, test_y = data.X_test, data.y_test
test_x_tensor, test_y_tensor = df_to_tensor(data.X_test, data.y_test)

In [41]:
# MLP model
class ANN(nn.Module):
    def __init__(self, train_x, train_y, lr=0.001):
        super(ANN, self).__init__()
        self.train_y = torch.tensor(train_y.values.ravel(), dtype=torch.float32).unsqueeze(1).to(device)
        self.train_x = torch.tensor(train_x.to_numpy(), dtype=torch.float32).to(device)
        self.input_dim = self.train_x.shape[1]
        self.net = nn.Sequential(
            nn.Linear(self.input_dim, 64),
            nn.LeakyReLU(0.1),
            nn.BatchNorm1d(64),
            nn.Dropout(0.3),
            nn.Linear(64, 128),
            nn.LeakyReLU(0.1),
            nn.BatchNorm1d(128),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.1),
            nn.BatchNorm1d(64),
            nn.Dropout(0.3),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )
        self._initialize_weights()
        self.criterion = nn.BCELoss()
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.epochs = 500

    def forward(self, x):
        return self.net(x)
    def _initialize_weights(self):
        for m in self.net:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

# Logistic Regression as a neural network
class LogisticRegressionModel(nn.Module):
    def __init__(self, train_x, train_y, lr=0.001):
        super(LogisticRegressionModel, self).__init__()
        self.train_y = torch.tensor(train_y.values.ravel(), dtype=torch.float32).unsqueeze(1).to(device)
        self.train_x = torch.tensor(train_x.to_numpy(), dtype=torch.float32).to(device)
        self.input_dim = self.train_x.shape[1]
        self.net = nn.Sequential(
            nn.Linear(self.input_dim, 1),
            nn.Sigmoid()
        )
        self.criterion = nn.BCELoss()
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.epochs = 1200

    def forward(self, x):
        return self.net(x)

# SVM
class SVMClassifier():
    def __init__(self, train_x, train_y, fraction=0.1):
        # Sample a fraction of the data
        self.train_x = train_x
        self.train_y = train_y
        self.model = None
        self.prediction = None

    def fit(self):
        # Use Bagging with SVM
        self.model = BaggingClassifier(
            estimator=SVC(C=0.1, kernel='poly', degree=5, gamma='scale'),
            n_estimators=6,
            random_state=42,
            max_samples=0.001
        )
        self.model.fit(self.train_x, self.train_y)

    def predict(self, X):
        return self.model.predict(X).astype(float)
    def calculate_accuracy(self, X, y, pred=None):
        predictions = self.model.predict(X).astype(float) if pred is None else pred
        accuracy = accuracy_score(y, predictions)
        return accuracy
# NB
class NaiveBayesClassifier():
    def __init__(self, train_x, train_y, priors=None, var_smoothing=1e-9):
        self.model = GaussianNB(priors=priors, var_smoothing=var_smoothing)
        self.train_x = train_x
        self.train_y = train_y
        self.prediction = None

    def fit(self):
        self.model.fit(self.train_x, self.train_y)
    def predict(self, X):
        return self.model.predict(X).astype(float)
    def calculate_accuracy(self, X, y, pred=None):
        predictions = self.model.predict(X).astype(float) if pred is None else pred
        accuracy = accuracy_score(y, predictions)
        return accuracy
# RF
class RandomForestModel():
    def __init__(self, train_x, train_y, n_estimators=10, max_depth=None, random_state=42):
        self.model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state, max_samples = 0.05)
        self.train_x = train_x
        self.train_y = train_y
        self.prediction = None

    def fit(self):
        self.model.fit(self.train_x, self.train_y)
    def predict(self, X):
        return self.model.predict(X).astype(float)
    def calculate_accuracy(self, X, y, pred=None):
        predictions = self.model.predict(X).astype(float) if pred is None else pred
        accuracy = accuracy_score(y, predictions)
        return accuracy

In [18]:
ann_model = ANN(train_x, train_y)
train_model_pt(ann_model)
acc = predict_model_pt(ann_model, ann_model.train_x)
print(type(acc))
print(np.unique(acc))
print(calculate_accuracy_pt(ann_model, ann_model.train_x, ann_model.train_y, acc))

Epoch 0, Loss: 1.073890209197998
Epoch 5, Loss: 0.9966623187065125
Epoch 10, Loss: 0.9242585897445679
Epoch 15, Loss: 0.8569503426551819
Epoch 20, Loss: 0.7945494651794434
Epoch 25, Loss: 0.737194836139679


KeyboardInterrupt: 

In [None]:
lr = LogisticRegressionModel(train_x, train_y)
train_model_pt(lr)
acc = predict_model_pt(lr, lr.train_x)
print(type(acc))
print(np.unique(acc))
print(calculate_accuracy_pt(lr, lr.train_x, lr.train_y, acc))

In [None]:
svm = SVMClassifier(train_x, train_y)
svm.fit()
acc = svm.predict(svm.train_x)
print(type(acc))
print(np.unique(acc))
print(svm.calculate_accuracy(svm.train_x, svm.train_y, acc))

In [None]:
nb = NaiveBayesClassifier(train_x, train_y)
nb.fit()
acc = nb.predict(nb.train_x)
print(type(acc))
print(np.unique(acc))
print(nb.calculate_accuracy(nb.train_x, nb.train_y, acc))

In [None]:
rf = RandomForestModel(train_x, train_y)
rf.fit()
acc = rf.predict(rf.train_x)
print(type(acc))
print(np.unique(acc))
print(rf.calculate_accuracy(rf.train_x, rf.train_y, acc))

In [42]:
import numpy as np
import pandas as pd
import inspect

class Adaboost:
    def __init__(self, classes_dict, train_x, train_y):
        self.classes_dict = classes_dict
        self.model_order = list(classes_dict.keys())
        self.train_x = train_x
        self.train_y = train_y
        self.trained_model = {}
        self.training_data_history = {'base': {'X': self.train_x, 'y': self.train_y}}
        self.current_weight = None
        self.weight_history = {}
        self.restart = False
        self.predictions = None

    def weight_init(self): self.current_weight = pd.Series(np.ones(len(self.train_y)) / len(self.train_y))

    def weight_calculate(self, predictions, labels):
        incorrect = predictions != labels.to_numpy()
        error_rate = self.current_weight[incorrect].sum()

        print('Error rate is:', error_rate)

        if error_rate > 0.5:
            self.weight_init()
            self.restart = True
            return

        alpha = 0.5 * np.log((1 - error_rate) / error_rate)

        # Update weights
        self.current_weight[incorrect] *= np.exp(alpha)
        self.current_weight[~incorrect] *= np.exp(-alpha)

        # Normalize weights
        self.current_weight /= self.current_weight.sum()

    def training(self):
        self.weight_init()
        for model in self.model_order:
            while True:
                self.restart = False
                self.train_x['weight'] = self.current_weight
                self.train_y = self.train_y.to_frame()
                self.train_y['weight'] = self.current_weight
                sampled_train_x = self.train_x.sample(n=len(self.train_x), replace=True, weights='weight', random_state=42)
                del self.train_x['weight']
                del sampled_train_x['weight']
                sampled_train_x.sort_index(inplace=True)
                sampled_train_x.reset_index(drop=True, inplace=True)
                sampled_train_y = self.train_y.sample(n=len(self.train_y), replace=True, weights='weight', random_state=42)
                del self.train_y['weight']
                self.train_y = self.train_y.iloc[:, 0]
                del sampled_train_y['weight']
                sampled_train_y.sort_index(inplace=True)
                sampled_train_y.reset_index(drop=True, inplace=True)
                sampled_train_y = sampled_train_y.iloc[:, 0]

                print("*" * 37)
                print(f'Training --------------------- {model}')
                current_model = self.classes_dict[model](sampled_train_x, sampled_train_y)
                methods = inspect.getmembers(current_model, predicate=inspect.ismethod)

                if 'fit' in [z for z, _ in methods]:
                    current_model.fit()
                    print('Finish training.\nStart predicting.')
                    current_prediction = current_model.predict(current_model.train_x)
                    train_accuracy = current_model.calculate_accuracy(current_model.train_x , current_model.train_y, current_prediction)
                else:
                    current_model.to(device)
                    train_model_pt(current_model)
                    print('Finish training.\nStart predicting.')
                    current_prediction = predict_model_pt(current_model, current_model.train_x)
                    train_accuracy = calculate_accuracy_pt(current_model, current_model.train_x, current_model.train_y, current_prediction)

                print(f'{model} training accuracy:', train_accuracy)
                self.weight_calculate(current_prediction, sampled_train_y)

                if not self.restart:
                    self.train_x, self.train_y = sampled_train_x, sampled_train_y
                    self.training_data_history[model] = {'X': sampled_train_x, 'y': sampled_train_y}
                    self.trained_model[model] = current_model
                    break
    def reorder(self, order_list): self.model_order = order_list
    def predict(self, X):
        # Ensure X is a pandas DataFrame
        assert isinstance(X, pd.DataFrame), "Input X should be a pandas DataFrame"

        # Collect predictions from each trained model
        model_predictions = {}
        for model_name, model in self.trained_model.items():
            methods = inspect.getmembers(model, predicate=inspect.ismethod)

            # Check if the model has predict or use predict_model_pt
            if 'predict' in [z for z, _ in methods]: preds = model.predict(X)
            else: preds = predict_model_pt(model, torch.tensor(X.to_numpy(), dtype=torch.float32).to(device))

            # Ensure predictions are in numpy array format
            assert isinstance(preds, np.ndarray), f"Predictions from {model_name} should be a numpy array"

            # Only convert if preds contain values other than 0 and 1
            if not np.array_equal(np.unique(preds), [0, 1]):
                print('Alert --- ', np.unique(preds))
                preds = np.where(preds > 0.5, 1, 0)

            model_predictions[model_name] = preds

        # Voting mechanism
        predictions = np.zeros(len(X))
        for i in range(len(X)):
            votes = {}
            for model_name, preds in model_predictions.items():
                pred = preds[i]
                if pred in votes:
                    votes[pred] += 1
                else:
                    votes[pred] = 1
            # Choose the class with the most votes
            predictions[i] = max(votes, key=votes.get)

        # Ensure predictions are in numpy array format
        assert isinstance(predictions, np.ndarray), "Final predictions should be a numpy array"

        self.predictions = predictions
        return predictions

    def calculate_accuracy(self, y):
        # Ensure predictions is a numpy array
        assert isinstance(self.predictions, np.ndarray), "Predictions should be a numpy array"

        # Ensure y is a pandas Series
        assert isinstance(y, pd.Series), "y should be a pandas Series"

        # Ensure both have the same length
        assert len(self.predictions) == len(y), "Predictions and y should have the same length"

        accuracy = np.mean(self.predictions == y.to_numpy())
        return accuracy


In [43]:
adModel = Adaboost(classes_dict={"LR": LogisticRegressionModel, "SVM": SVMClassifier, "ANN": ANN, "NB": NaiveBayesClassifier, "RF": RandomForestModel}, train_x=train_x, train_y=train_y)
# adModel = Adaboost(classes_dict={"ANN": ANN, "LR": LogisticRegressionModel, "NB": NaiveBayesClassifier, "RF": RandomForestModel}, train_x=train_x, train_y=train_y)
adModel.reorder(['SVM', 'LR', 'ANN', 'RF'])
adModel.training()
print("*" * 37)
print("Prediction: ", adModel.predict(train_x))
print("Training_accuracy: ", adModel.calculate_accuracy(train_y))

*************************************
Training --------------------- SVM
Finish training.
Start predicting.
SVM training accuracy: 0.9282747079530583
Error rate is: 0.07172529204694164
*************************************
Training --------------------- LR
Epoch 0, Loss: 0.7821888327598572
Epoch 5, Loss: 0.772625207901001
Epoch 10, Loss: 0.7634242177009583
Epoch 15, Loss: 0.7546104192733765
Epoch 20, Loss: 0.7461987137794495
Epoch 25, Loss: 0.7381922006607056
Epoch 30, Loss: 0.730582594871521
Epoch 35, Loss: 0.7233538627624512
Epoch 40, Loss: 0.7164844870567322
Epoch 45, Loss: 0.7099528312683105
Epoch 50, Loss: 0.7037379741668701
Epoch 55, Loss: 0.697821319103241
Epoch 60, Loss: 0.6921857595443726
Epoch 65, Loss: 0.686815083026886
Epoch 70, Loss: 0.6816944479942322
Epoch 75, Loss: 0.6768091320991516
Epoch 80, Loss: 0.6721455454826355
Epoch 85, Loss: 0.6676915884017944
Epoch 90, Loss: 0.663434624671936
Epoch 95, Loss: 0.6593639254570007
Epoch 100, Loss: 0.6554701924324036
Epoch 105, Los

(709903, 30)
(709903, 30)
