<a href="https://colab.research.google.com/github/Marcusleeleelee/FTEC4998-4999/blob/main/FTEC4998_4999.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
# Step 0: Import the packages - ok
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from time import sleep
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Step 1: Utils - ok
def uni_list(input): return list(set(input))
def perform_pca(df, n_components):

    # Performing PCA
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(df)

    # Creating a DataFrame with the top 15 components
    pca_df = pd.DataFrame(data=principal_components, index=df.index)

    # Retaining the original column names for the new DataFrame
    retained_columns = df.columns[:n_components]
    pca_df.columns = retained_columns

    return pca_df

In [4]:
# Step 2: Dataset
class Dataset:
    def __init__(self, file_path): # ok
        self.dataset = pd.read_feather(file_path)
        self.train_dict, self.test_dict = {}, {}
        self.scalers = None
        self.pca = None
        self.label = 'loan_condition_cat'

    def show(self, rows=10): # ok
        return self.dataset.head(rows)

    def get(self, type, key): # ok
        if type == 'test': return self.test_dict[key]['x'], self.test_dict[key]['y']
        elif type == 'train': return self.train_dict[key]['x'], self.train_dict[key]['y']
        else: raise Exception('The type must be either "test" or "train"!')

    def update(self, type, key, x, y): # ok
        if type == 'test': self.test_dict.update({key: {'x': x, 'y': y}})
        elif type == 'train': self.train_dict.update({key: {'x': x, 'y': y}})
        else: raise Exception('The type must be either "test" or "train"!')

    def basic_processing(self): # ok
        temp_func_1 = lambda x: '<=2009' if str(x) in ['2007', '2008', '2009'] else ("[2010, 2012]" if str(x) in ['2010', '2011', '2012'] else '>=2013')
        columns_to_delete = [
            'id', 'issue_d', 'home_ownership_cat', 'income_category', 'income_cat', 'term_cat', 'application_type_cat',
            'purpose_cat', 'interest_payment_cat', 'loan_condition'
        ]
        self.dataset.drop(columns=columns_to_delete, inplace=True)
        self.dataset['grade'] = self.dataset['grade'].apply(temp_func_1)
        self.dataset['final_d'] = self.dataset['final_d'].apply(lambda x: str(x)[-4:]).apply(temp_func_1)
        self.dataset = pd.get_dummies(self.dataset, columns=['year', 'final_d', 'home_ownership', 'term', 'application_type',
                                                             'purpose', 'interest_payments', 'grade', 'region'], dtype=int)

    def train_test_split(self, percentage=0.8): # ok
        self.dataset = self.dataset.sample(frac=1).reset_index(drop=True)
        train_size = int(len(self.dataset) * percentage)
        temp1 = self.dataset.iloc[:train_size].copy()
        temp2 = self.dataset.iloc[train_size:].copy()

        y_train = temp1[[self.label]]
        x_train = temp1.drop(columns=[self.label])
        self.update('train', 'w0', x_train, y_train)

        y_test = temp2[[self.label]]
        x_test = temp2.drop(columns=[self.label])
        self.update('test', 'w0', x_test, y_test)

    def preprocessing_train(self): # ok
        temp_train_x, temp_train_y = self.get('train', 'w0')
        scaler = StandardScaler()
        temp_train_x = pd.DataFrame(scaler.fit_transform(temp_train_x), columns=temp_train_x.columns)
        self.scalers = scaler
        temp_train_x = perform_pca(temp_train_x, n_components=30)
        self.update('train', 'w0', temp_train_x, temp_train_y)

    def preprocessing_test(self): # Not ok
        temp_test_x, temp_test_y = self.get('test', 'w0')
        temp_train_x, _ = self.get('train', 'w0')
        print(temp_train_x)

        # Apply stored scalers
        temp_test_x = pd.DataFrame(self.scalers.transform(temp_test_x), columns=temp_train_x.columns)

        assert set(temp_test_x.columns) == set(temp_train_x.columns)
        self.update('test', 'w0', temp_test_x, temp_test_y)

In [5]:
# Calculating # ok
data = Dataset('/content/drive/My Drive/Colab Notebooks/FTEC4998_9/loan_final313_processed.feather')
data.basic_processing()
data.train_test_split()
data.preprocessing_train()

In [8]:
# Data conversion # ok
train_x, train_y = data.get('train', 'w0')
X_train, y_train = train_x.to_numpy(), train_y.values.ravel()
counts = np.mean(y_train == 1) * 100
print(counts)
print(X_train.shape, y_train.shape)
# Ensure y_train is binary
assert set(y_train).issubset({0, 1}), "Target values must be 0 or 1 for binary classification."

# Convert to tensors and move to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1).to(device)

7.616956119357152
(709903, 30) (709903,)


In [30]:
# Train, predict, and accuracy functions
def train_model(model): # ok
    model.train()
    for epoch in range(model.epochs):
        model.optimizer.zero_grad()
        outputs = model(model.X_train_tensor)
        loss = model.criterion(outputs, model.y_train_tensor)
        loss.backward()
        model.optimizer.step()
        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {loss.item()}')


def predict_model(model, X_tensor):
    model.eval()
    with torch.no_grad():
        outputs = model(X_tensor).squeeze()
        return (outputs > 0.5).float()
        if isinstance(model, SVM): return (outputs > 0).float()  # For SVM
        else: return (outputs > 0.5).float()  # For Logistic Regression and ANN

def calculate_accuracy(model, X_tensor, y_tensor):
    X_tensor = X_tensor.to(next(model.parameters()).device)
    y_tensor = y_tensor.to(next(model.parameters()).device)
    predictions = predict_model(model, X_tensor)


    # Ensure predictions and labels are the same shape
    predictions = predictions.squeeze()
    y_tensor = y_tensor.squeeze()

    correct = (predictions == y_tensor).sum().item()
    accuracy = correct / y_tensor.size(0)
    return accuracy

In [None]:
# Step 3: Model training

class NaiveBayes:
    def __init__(self, device, X_train_tensor, y_train_tensor):
        self.classes = None
        self.class_priors = None
        self.feature_probs = None
        self.device = device
        self.X_train_tensor = X_train_tensor
        self.y_train_tensor = y_train_tensor

    def fit(self):
        y_train_tensor = self.y_train_tensor.squeeze().long()
        self.classes, class_counts = torch.unique(y_train_tensor, return_counts=True)
        self.class_priors = class_counts.float() / y_train_tensor.size(0)
        self.feature_probs = []
        for c in self.classes:
            X_c = self.X_train_tensor[y_train_tensor == c]
            class_feature_prob = (X_c.sum(dim=0) + 1) / (X_c.sum() + X_c.size(1))
            self.feature_probs.append(class_feature_prob)
        self.feature_probs = torch.stack(self.feature_probs).to(self.device)

    def predict(self, X_tensor):
        log_probs = []
        for i, c in enumerate(self.classes):
            log_prior = torch.log(self.class_priors[i])
            log_likelihood = (
                X_tensor * torch.log(self.feature_probs[i]) +
                (1 - X_tensor) * torch.log(1 - self.feature_probs[i])
            )
            log_probs.append(log_prior + log_likelihood.sum(dim=1))
        log_probs = torch.stack(log_probs).T
        return self.classes[log_probs.argmax(dim=1)].cpu().numpy().astype(float)

# Initialize PyTorch models and move to GPU
svm_model = SVM(X_train_tensor, y_train_tensor).to(device)
nb_model = NaiveBayes(device, X_train_tensor, y_train_tensor)
print(log_reg_model.criterion)
print(svm_model.criterion)

BCELoss()
BCELoss()
HingeEmbeddingLoss()


In [31]:
# Train ANN
# MLP model
class ANN(nn.Module):
    def __init__(self, X_train_tensor, y_train_tensor, lr=0.001):
        super(ANN, self).__init__()
        self.input_dim = X_train_tensor.shape[1]
        self.net = nn.Sequential(
            nn.Linear(self.input_dim, 128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )
        self.criterion = nn.BCELoss()
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.epochs = 100
        self.y_train_tensor = y_train_tensor
        self.X_train_tensor = X_train_tensor

    def forward(self, x):
        return self.net(x)
ann_model = ANN(X_train_tensor, y_train_tensor).to(device)
train_model(ann_model)
ann_predictions = predict_model(ann_model, X_train_tensor).cpu().numpy()
print('ANN:', calculate_accuracy(ann_model, ann_model.X_train_tensor, ann_model.y_train_tensor))

Epoch 0, Loss: 0.6712297797203064
Epoch 10, Loss: 0.567823052406311
Epoch 20, Loss: 0.5015900135040283
Epoch 30, Loss: 0.4576817750930786
Epoch 40, Loss: 0.4214310646057129
Epoch 50, Loss: 0.38850027322769165
Epoch 60, Loss: 0.35972169041633606
Epoch 70, Loss: 0.33442872762680054
Epoch 80, Loss: 0.3118170201778412
Epoch 90, Loss: 0.29266512393951416
ANN: 0.9464997330621225


In [42]:
print(ann_predictions)
print(type(ann_predictions))
print(np.unique(ann_predictions))

[0. 0. 0. ... 0. 0. 0.]
<class 'numpy.ndarray'>
[0. 1.]


In [36]:
# Logistic Regression as a neural network
class LogisticRegressionModel(nn.Module):
    def __init__(self, X_train_tensor, y_train_tensor, lr=0.01):
        super(LogisticRegressionModel, self).__init__()
        self.input_dim = X_train_tensor.shape[1]
        self.net = nn.Sequential(
            nn.Linear(self.input_dim, 1),
            nn.Sigmoid()
        )
        self.criterion = nn.BCELoss()
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.epochs = 1200
        self.y_train_tensor = y_train_tensor
        self.X_train_tensor = X_train_tensor

    def forward(self, x):
        return self.net(x)
log_reg = LogisticRegressionModel(X_train_tensor, y_train_tensor).to(device)
train_model(log_reg)
log_reg_predictions = predict_model(log_reg, X_train_tensor).cpu().numpy()
print('Logistic Regression:', calculate_accuracy(log_reg, log_reg.X_train_tensor, log_reg.y_train_tensor))

Epoch 0, Loss: 0.7840869426727295
Epoch 10, Loss: 0.6773155927658081
Epoch 20, Loss: 0.6177142262458801
Epoch 30, Loss: 0.5693557262420654
Epoch 40, Loss: 0.5292746424674988
Epoch 50, Loss: 0.49515870213508606
Epoch 60, Loss: 0.4653887450695038
Epoch 70, Loss: 0.4393005669116974
Epoch 80, Loss: 0.41650691628456116
Epoch 90, Loss: 0.3959510624408722
Epoch 100, Loss: 0.37775304913520813
Epoch 110, Loss: 0.36138424277305603
Epoch 120, Loss: 0.34694811701774597
Epoch 130, Loss: 0.33390283584594727
Epoch 140, Loss: 0.32222434878349304
Epoch 150, Loss: 0.31162574887275696
Epoch 160, Loss: 0.302066832780838
Epoch 170, Loss: 0.29342836141586304
Epoch 180, Loss: 0.2856084108352661
Epoch 190, Loss: 0.27851423621177673
Epoch 200, Loss: 0.2720675468444824
Epoch 210, Loss: 0.26619964838027954
Epoch 220, Loss: 0.2608502209186554
Epoch 230, Loss: 0.2559664845466614
Epoch 240, Loss: 0.2515016496181488
Epoch 250, Loss: 0.24741584062576294
Epoch 260, Loss: 0.24367083609104156
Epoch 270, Loss: 0.24023665

In [41]:
print(log_reg_predictions)
print(type(log_reg_predictions))
print(np.unique(log_reg_predictions))

[0. 0. 0. ... 0. 0. 0.]
<class 'numpy.ndarray'>
[0. 1.]


In [None]:
class SVMClassifier():
    def __init__(self, train_x, train_y):
        self.pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('svc', SVC())
        ])

        self.param_grid = {
            'svc__C': [0.1, 1, 10, 100],
            'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'svc__degree': [2, 3, 4],
            'svc__gamma': ['scale', 'auto'],
            'svc__coef0': [0.0, 0.1, 0.5, 1.0],
            'svc__class_weight': [None, 'balanced']
        }
        self.X_train = X_train
        self.y_train = y_train
        self.grid_search = None

    def fit(self, cv=5):
        self.grid_search = GridSearchCV(self.pipeline, self.param_grid, cv=cv, verbose=1, n_jobs=-1)
        self.grid_search.fit(self.X_train, self.y_train)
        print("Best Parameters:", self.grid_search.best_params_)

    def predict(self, X_test):
        return self.grid_search.best_estimator_.predict(X_test)

    def calculate_accuracy(self, X, y):
        predictions = self.predict(X)
        accuracy = accuracy_score(y, predictions)
        return accuracy
SVM_model = SVMClassifier(train_x, train_y)
SVM_model.fit()
SVM_predictions = SVM_model.predict(SVM_model.X_train, SVM_model.y_train)
print('SVM: ', SVM_model.calculate_accuracy(SVM_model.X_train, SVM_model.y_train))

Fitting 5 folds for each of 768 candidates, totalling 3840 fits


In [None]:
print(SVM_predictions)
print(type(SVM_predictions))
print(np.unique(SVM_predictions))

In [None]:
# Predict with PyTorch models and move predictions to CPU
svm_predictions = predict_model(svm_model, X_train_tensor).cpu().numpy()
nb_predictions = nb_model.predict(nb_model.X_train_tensor)

In [None]:
# Model Accuracy:

In [None]:
# Collect predictions for stacking
predictions = {
    "log_reg": log_reg_predictions,
    "ANN": ann_predictions,
    'svm':svm_predictions,
    'nb':nb_predictions,
}

In [None]:
print('Log_reg:', calculate_accuracy(log_reg_model, log_reg_model.X_train_tensor, log_reg_model.y_train_tensor))

Log_reg: 0.594665750109522


In [None]:
print('SVM:', calculate_accuracy(svm_model, svm_model.X_train_tensor, svm_model.y_train_tensor))

SVM: 0.043458049902592326


In [None]:
print('NB:', calculate_accuracy(nb_model, nb_model.X_train_tensor, nb_model.y_train_tensor))

TypeError: 'NoneType' object is not iterable

In [None]:
# Stack predictions as features for AdaBoost
stacked_features = np.column_stack(list(predictions.values()))

# Initialize and train AdaBoost
ada_boost = AdaBoostClassifier(n_estimators=30, random_state=29)
ada_boost.fit(stacked_features, y_train)

# Evaluate train
stacked_accuracy = ada_boost.score(stacked_features, y_train)
print("Stacked model accuracy:", stacked_accuracy)