<a href="https://colab.research.google.com/github/Marcusleeleelee/FTEC4998-4999/blob/main/FTEC4998_4999.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 0: Import the packages - ok
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from time import sleep
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only); the dataset path used
# further down lives under this mount point.
drive.mount('/content/drive')

In [None]:
# Step 1: Utils - ok
def uni_list(input):
    """Return the distinct elements of ``input`` as a list.

    The elements pass through a ``set``, so they must be hashable and the
    order of the returned list is not guaranteed.
    """
    distinct = set(input)
    return [*distinct]
def perform_pca(df, n_components):
    """Fit a PCA on ``df`` and return the projected data as a DataFrame.

    Args:
        df: DataFrame of numeric features; its index is carried over to the
            result.
        n_components: Forwarded unchanged to ``PCA(n_components=...)``.

    Returns:
        A DataFrame with one column per principal component and the same
        index as ``df``. The column labels are the first ``k`` labels of
        ``df`` (``k`` = number of components actually produced). They are
        positional placeholders only: a component is a combination of the
        input columns, not the original feature of the same name.

    Note:
        The fitted PCA object is local to this call and is not returned, so
        the same projection cannot be re-applied to other data from here.
    """
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(df)

    # Size the labels from the actual output width rather than from
    # ``n_components``, which is not necessarily an integer.
    n_kept = principal_components.shape[1]
    return pd.DataFrame(
        principal_components,
        index=df.index,
        columns=df.columns[:n_kept],
    )

In [None]:
# Step 2: Dataset
class Dataset:
    """Loan dataset plus versioned train/test splits.

    Splits are stored in ``train_dict`` / ``test_dict`` under string keys of
    the form ``'w<N>'`` (``'w0'`` is the initial split; each call to
    ``resample_with_weights`` writes the next ``N``). Every entry is a dict
    ``{'x': features DataFrame, 'y': single-column label DataFrame}``.
    """

    def __init__(self, file_path):
        """Load the feather file at ``file_path`` and initialise empty state."""
        self.dataset = pd.read_feather(file_path)
        self.train_dict, self.test_dict = {}, {}
        # Fitted on the training split by preprocessing_train() and re-used
        # by preprocessing_test().
        self.scalers = None
        self.pca = None
        self.label = 'loan_condition_cat'

    def show(self):
        """Return the first 10 rows of the current dataset."""
        return self.dataset.head(10)

    def _store(self, type):
        """Return the split dictionary selected by ``type`` ('test'/'train')."""
        if type == 'test':
            return self.test_dict
        if type == 'train':
            return self.train_dict
        # ValueError is a subclass of Exception, so existing handlers that
        # catch Exception keep working.
        raise ValueError('The type must be either "test" or "train"!!')

    def get(self, type, key):
        """Return ``(x, y)`` stored under ``key`` in the given split.

        Raises:
            ValueError: if ``type`` is neither 'test' nor 'train'.
            KeyError: if ``key`` has not been stored for that split.
        """
        entry = self._store(type)[key]
        return entry['x'], entry['y']

    def update(self, type, key, x, y):
        """Store ``x`` and ``y`` under ``key`` in the given split.

        Raises:
            ValueError: if ``type`` is neither 'test' nor 'train'.
        """
        self._store(type)[key] = {'x': x, 'y': y}

    @staticmethod
    def _year_bucket(x):
        """Map a year (any value whose ``str`` is the year) to a coarse bucket."""
        year = str(x)
        if year in ('2007', '2008', '2009'):
            return '<=2009'
        if year in ('2010', '2011', '2012'):
            return '[2010, 2012]'
        return '>=2013'

    def basic_processing(self):
        """Drop unused columns, bucket the year columns, one-hot encode, drop NaNs.

        Mutates ``self.dataset`` in place (and rebinds it to the encoded
        frame).
        """
        columns_to_delete = [
            'id', 'issue_d', 'home_ownership_cat', 'income_category', 'income_cat', 'term_cat', 'application_type_cat',
            'purpose_cat', 'interest_payment_cat', 'loan_condition'
        ]
        self.dataset.drop(columns=columns_to_delete, inplace=True)
        # The year bucketing belongs on 'year'. It was previously applied to
        # 'grade', where no value matches a year string, so every grade
        # collapsed into the single bucket '>=2013'.
        self.dataset['year'] = self.dataset['year'].apply(self._year_bucket)
        # final_d: the last four characters of the string form are taken as
        # the year before bucketing.
        self.dataset['final_d'] = self.dataset['final_d'].apply(lambda x: self._year_bucket(str(x)[-4:]))
        self.dataset = pd.get_dummies(self.dataset, columns=['year', 'final_d', 'home_ownership', 'term', 'application_type',
                                                             'purpose', 'interest_payments', 'grade', 'region'], dtype=int)
        self.dataset.dropna(inplace=True)

    def train_test_split(self, percentage=0.8, random_state=None):
        """Shuffle the dataset and store the initial split under key 'w0'.

        Args:
            percentage: Fraction of rows assigned to the training split.
            random_state: Optional seed forwarded to ``DataFrame.sample`` to
                make the shuffle reproducible. ``None`` keeps the previous
                unseeded behaviour.
        """
        self.dataset = self.dataset.sample(frac=1, random_state=random_state).reset_index(drop=True)
        train_size = int(len(self.dataset) * percentage)
        train_part = self.dataset.iloc[:train_size].copy()
        test_part = self.dataset.iloc[train_size:].copy()

        for split_name, part in (('train', train_part), ('test', test_part)):
            # y is kept as a single-column DataFrame (double brackets).
            y = part[[self.label]]
            x = part.drop(columns=[self.label])
            self.update(split_name, 'w0', x, y)

    def preprocessing_train(self, n_components=30):
        """Standardise and PCA-project the 'w0' training features.

        The fitted scaler and PCA are kept in ``self.scalers`` / ``self.pca``
        so ``preprocessing_test`` can apply the identical transformation.
        Calling this twice would transform already-transformed data.

        Args:
            n_components: Forwarded to ``PCA(n_components=...)``.
        """
        train_x, train_y = self.get('train', 'w0')

        scaler = StandardScaler()
        scaled = pd.DataFrame(scaler.fit_transform(train_x), columns=train_x.columns, index=train_x.index)

        pca = PCA(n_components=n_components)
        components = pca.fit_transform(scaled)
        # Component columns re-use the first k original column labels as
        # positional placeholders; they are not the original features.
        reduced = pd.DataFrame(components, index=scaled.index, columns=scaled.columns[:components.shape[1]])

        self.scalers, self.pca = scaler, pca
        self.update('train', 'w0', reduced, train_y)

    def preprocessing_test(self):
        """Apply the scaler and PCA fitted on the training split to 'w0' test features.

        Must run after ``preprocessing_train`` and should run once only.

        Raises:
            RuntimeError: if ``preprocessing_train`` has not been run yet.
        """
        if self.scalers is None or self.pca is None:
            raise RuntimeError('preprocessing_train() must be called before preprocessing_test()')

        test_x, test_y = self.get('test', 'w0')
        train_x, _ = self.get('train', 'w0')

        # Scale with the raw feature labels (what the scaler was fitted on)
        # and keep the test index so x stays aligned with y.
        scaled = pd.DataFrame(self.scalers.transform(test_x), columns=test_x.columns, index=test_x.index)
        # Project with the PCA fitted on the training data; only now do the
        # columns match the processed training frame.
        projected = pd.DataFrame(self.pca.transform(scaled), columns=train_x.columns, index=test_x.index)

        self.update('test', 'w0', projected, test_y)

    def resample_with_weights(self, model, weight):
        """Bootstrap a new training set that balances hits and misses of ``model``.

        Rows misclassified by ``model`` share half of the sampling
        probability and correctly classified rows share the other half (if
        one group is empty the other is sampled uniformly). The resampled
        set has the same size as the source set and is stored under the next
        key, e.g. 'w0' -> 'w1'.

        Args:
            model: Object with ``predict(x)`` returning one label per row.
            weight: Key of the source training split, of the form 'w<N>'.
        """
        temp_train_x, temp_train_y = self.get('train', weight)
        y_pred = np.asarray(model.predict(temp_train_x)).ravel()

        misclassified = (temp_train_y[self.label].values != y_pred)
        weights = np.ones(len(temp_train_y))

        if misclassified.any():
            weights[misclassified] = 1.0 / misclassified.sum()
        if (~misclassified).any():
            weights[~misclassified] = 1.0 / (~misclassified).sum()

        weights /= weights.sum()

        # Sample row positions, not index labels: sampling with replacement
        # yields duplicate labels, and a label-based lookup on a later round
        # would return every row sharing a label and inflate the sample.
        positions = np.random.choice(len(temp_train_x), size=len(temp_train_x), replace=True, p=weights)
        temp_x = temp_train_x.iloc[positions].reset_index(drop=True)
        temp_y = temp_train_y.iloc[positions].reset_index(drop=True)
        self.update('train', 'w' + str(int(weight[1:]) + 1), temp_x, temp_y)

In [None]:
# Calculating
# Load the feather file from the mounted Drive and run the pipeline stages in
# order: cleaning/encoding -> shuffle + split -> scaling + PCA on the train split.
# NOTE(review): preprocessing_test() is not called in this cell, so the test
# split stored under 'w0' is still the raw, unscaled data at this point.
data = Dataset('/content/drive/My Drive/Colab Notebooks/FTEC4998_9/loan_final313_processed.feather')
data.basic_processing()
data.train_test_split()
data.preprocessing_train()

In [None]:
# Testing: inspect the processed training split
train_x, train_y = data.get('train', 'w0')
# y is stored as a single-column DataFrame; flatten it to a 1-D array.
train_y = train_y.values.ravel()
# Despite its name, `counts` is the percentage of rows whose label equals 1.
counts = np.mean(train_y == 1) * 100
print(counts)
# Convert to NumPy arrays
X_train = train_x.to_numpy()
y_train = train_y
print(X_train.shape, y_train.shape)

In [None]:
# Step 3: Model training
# Check target values: the PyTorch models below are trained with BCELoss on
# these labels, so they must be binary.
assert set(y_train).issubset({0, 1}), "Target values must be 0 or 1 for binary classification."

# Set CUDA launch blocking for debugging
# NOTE(review): this is set after torch has been imported; presumably it still
# takes effect as long as CUDA has not been initialised yet -- confirm.
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Convert to float32 tensors and move them to the GPU when one is available,
# otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
try:
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)
# Log the failure for context, then re-raise so the cell still stops.
except RuntimeError as e: print("Tensor conversion error:", e); raise

# Logistic Regression as a neural network
class LogisticRegressionModel(nn.Module):
    """Logistic regression expressed as one linear layer plus a sigmoid.

    ``forward`` maps an input whose last dimension is ``input_dim`` to a
    tensor with a last dimension of 1 holding values in (0, 1).
    """

    def __init__(self, input_dim):
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        logits = self.linear(x)
        return logits.sigmoid()

# MLP model
class MLPModel(nn.Module):
    """One-hidden-layer perceptron with a sigmoid output.

    Architecture: Linear(input_dim, 64) -> ReLU -> Linear(64, 1) -> Sigmoid,
    held in ``self.net`` as an ``nn.Sequential``.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_units = 64
        layers = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        probabilities = self.net(x)
        return probabilities

# Train and predict function for models
def train_model(model, criterion, optimizer, X_train, y_train, num_epochs=100):
    """Train ``model`` with full-batch gradient steps.

    Each epoch runs one forward/backward pass over the whole of ``X_train``
    followed by a single optimizer step.

    Args:
        model: ``nn.Module`` to train (put into training mode here).
        criterion: Loss callable invoked as ``criterion(outputs, y_train)``.
        optimizer: Optimizer built over ``model``'s parameters.
        X_train: Input tensor passed to ``model``.
        y_train: Target tensor.
        num_epochs: Number of full-batch updates.

    Returns:
        List with the loss value of every epoch, in order (callers that
        ignored the previous ``None`` return are unaffected).
    """
    model.train()
    losses = []
    for _ in range(num_epochs):
        optimizer.zero_grad()
        outputs = model(X_train)
        # Drop only the trailing singleton dimension. A bare squeeze() would
        # also remove the batch dimension for a single-sample batch, leaving
        # a 0-d tensor whose shape no longer matches the target.
        if outputs.dim() > 1:
            outputs = outputs.squeeze(-1)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    return losses

def predict_model(model, X):
    """Return hard 0/1 predictions of ``model`` for ``X``.

    The model is put into eval mode and run without gradient tracking; its
    outputs are thresholded at 0.5.

    Args:
        model: ``nn.Module`` whose outputs are compared against 0.5.
        X: Input tensor passed to ``model``.

    Returns:
        Float tensor of 0.0/1.0 values with the trailing singleton dimension
        removed, so a single-sample batch yields shape ``(1,)``.
    """
    model.eval()
    with torch.no_grad():
        outputs = model(X)
    # Drop only the trailing singleton dimension; a bare squeeze() would
    # turn a single-sample batch into a 0-d tensor.
    if outputs.dim() > 1:
        outputs = outputs.squeeze(-1)
    return (outputs > 0.5).float()

# Initialize PyTorch models and move them to the selected device
# (input width = number of feature columns in the training tensor)
log_reg_model = LogisticRegressionModel(X_train_tensor.shape[1]).to(device)
mlp_model = MLPModel(X_train_tensor.shape[1]).to(device)

# Define loss function and optimizers
# BCELoss is applied to probabilities; both models end in a sigmoid.
criterion = nn.BCELoss()
optimizer_log_reg = optim.SGD(log_reg_model.parameters(), lr=0.01)
optimizer_mlp = optim.Adam(mlp_model.parameters(), lr=0.001)

# Train PyTorch models (default num_epochs of train_model, full batch)
train_model(log_reg_model, criterion, optimizer_log_reg, X_train_tensor, y_train_tensor)
train_model(mlp_model, criterion, optimizer_mlp, X_train_tensor, y_train_tensor)

# Predict with PyTorch models and move predictions to CPU
# NOTE(review): these are predictions on the same data the models were trained on.
log_reg_predictions = predict_model(log_reg_model, X_train_tensor).cpu().numpy()
mlp_predictions = predict_model(mlp_model, X_train_tensor).cpu().numpy()

# Scikit-learn models
# NOTE(review): probability=True is requested, but only predict() is used on
# the SVC in this cell -- confirm whether probabilities are needed elsewhere.
svm = SVC(probability=True)
naive_bayes = GaussianNB()
random_forest = RandomForestClassifier()

# Train Scikit-learn models
svm.fit(X_train, y_train)
naive_bayes.fit(X_train, y_train)
random_forest.fit(X_train, y_train)

# Predict with Scikit-learn models (on the training data itself)
svm_predictions = svm.predict(X_train)
naive_bayes_predictions = naive_bayes.predict(X_train)
rf_predictions = random_forest.predict(X_train)

# Collect predictions for stacking
# Insertion order of this dict fixes the column order of the stacked matrix.
predictions = {
    "log_reg": log_reg_predictions,
    "mlp": mlp_predictions,
    "svm": svm_predictions,
    "naive_bayes": naive_bayes_predictions,
    "random_forest": rf_predictions,
}

# Stack predictions as features for AdaBoost: one column per base model,
# one row per training sample, each entry a hard class prediction.
stacked_features = np.column_stack(list(predictions.values()))

# Initialize and train AdaBoost
ada_boost = AdaBoostClassifier(n_estimators=10, random_state=42)
ada_boost.fit(stacked_features, y_train)

# Evaluate
# NOTE(review): the score is computed on the features the meta-model was just
# fitted on, so this is training accuracy, not a held-out estimate.
stacked_accuracy = ada_boost.score(stacked_features, y_train)
print("Stacked model accuracy:", stacked_accuracy)