<a href="https://colab.research.google.com/github/Marcusleeleelee/FTEC4998-4999/blob/main/FTEC4998_4999.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# Step 0: Import the packages - ok
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [21]:
# Step 1: Utils - ok
def uni_list(input):
    """Return the distinct elements of ``input`` as a list.

    The elements pass through a ``set``, so they must be hashable and the
    order of the returned list is not guaranteed to follow the input order.
    """
    distinct = set(input)
    return list(distinct)
def perform_pca(df, n_components, return_model=False):
    """Fit a PCA on ``df`` and return the projected data as a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix to decompose. Its index is carried over to the result.
    n_components : int, float, str or None
        Forwarded unchanged to ``PCA(n_components=...)``.
    return_model : bool, default False
        When True, the fitted ``PCA`` object is returned as well so the same
        projection can later be applied to other data with ``.transform``.

    Returns
    -------
    pandas.DataFrame
        One column per principal component, indexed like ``df``.
    (pandas.DataFrame, PCA)
        Only when ``return_model`` is True.

    Notes
    -----
    The output columns reuse the first ``k`` column names of ``df`` (``k`` being
    the number of components produced). These labels are positional only: a
    component named after a feature is not that feature.
    """
    # Performing PCA
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(df)

    # Size the labels from what PCA actually produced, so non-integer
    # `n_components` values do not break the column slice.
    n_kept = principal_components.shape[1]
    retained_columns = df.columns[:n_kept]

    pca_df = pd.DataFrame(data=principal_components, index=df.index, columns=retained_columns)

    if return_model:
        return pca_df, pca
    return pca_df

In [22]:
class Dataset:
    """Loan dataset wrapper: load, encode, split, scale/project and resample.

    Splits are stored per round key ('w0', 'w1', ...) in ``train_dict`` and
    ``test_dict`` as ``{'x': DataFrame, 'y': DataFrame}``.
    """

    def __init__(self, file_path):
        """Read the CSV at ``file_path`` and initialise empty split/transformer state."""
        self.dataset = pd.read_csv(file_path, low_memory=False)
        self.train_dict, self.test_dict = {}, {}
        self.scalers = None      # StandardScaler fitted by preprocessing_train
        self.fill_values = None  # per-column means of the scaled training data (imputation values)
        self.pca = None          # PCA fitted by preprocessing_train
        self.label = 'loan_condition_cat'

    def show(self):
        """Return the first 10 rows of the current dataset."""
        return self.dataset.head(10)

    def _store(self, type):
        """Return the dict backing the requested split, or raise ValueError."""
        if type == 'test':
            return self.test_dict
        if type == 'train':
            return self.train_dict
        # ValueError is still an Exception, so existing `except Exception` callers keep working.
        raise ValueError('The type must be either "test" or "train"!!')

    def get(self, type, key):
        """Return ``(x, y)`` stored under ``key`` for the 'train' or 'test' split."""
        entry = self._store(type)[key]
        return entry['x'], entry['y']

    def update(self, type, key, x, y):
        """Store ``x`` and ``y`` under ``key`` for the 'train' or 'test' split."""
        self._store(type)[key] = {'x': x, 'y': y}

    def basic_processing(self):
        """Drop unused columns, bucket the year columns and one-hot encode categoricals.

        Replaces ``self.dataset`` with the encoded frame. Not idempotent: a second
        call fails because the dropped columns are already gone.
        """
        def year_bucket(value):
            year = str(value)
            if year in ('2007', '2008', '2009'):
                return '<=2009'
            if year in ('2010', '2011', '2012'):
                return '[2010, 2012]'
            return '>=2013'

        columns_to_delete = [
            'id', 'issue_d', 'home_ownership_cat', 'income_category', 'income_cat', 'term_cat', 'application_type_cat',
            'purpose_cat', 'interest_payment_cat', 'loan_condition'
        ]
        dataset = self.dataset.drop(columns=columns_to_delete)
        # Bucket the `year` column; `grade` is left untouched so its own
        # categories survive the one-hot encoding below.
        dataset['year'] = dataset['year'].apply(year_bucket)
        # The last four characters of `final_d` are used as the year.
        dataset['final_d'] = dataset['final_d'].apply(lambda x: str(x)[-4:]).apply(year_bucket)
        self.dataset = pd.get_dummies(dataset, columns=['year', 'final_d', 'home_ownership', 'term', 'application_type',
                                                        'purpose', 'interest_payments', 'grade', 'region'], dtype=int)

    def train_test_split(self, percentage=0.8, random_state=None):
        """Shuffle the dataset and store the train/test split under key 'w0'.

        Parameters
        ----------
        percentage : float, default 0.8
            Fraction of rows assigned to the training split (0 to 1).
        random_state : int or None, default None
            Seed forwarded to ``DataFrame.sample``; None keeps the unseeded behaviour.

        Raises
        ------
        ValueError
            If ``percentage`` is outside ``[0, 1]``.
        """
        if not 0 <= percentage <= 1:
            raise ValueError('percentage must be between 0 and 1')

        self.dataset = self.dataset.sample(frac=1, random_state=random_state).reset_index(drop=True)
        train_size = int(len(self.dataset) * percentage)
        train_part = self.dataset.iloc[:train_size].copy()
        test_part = self.dataset.iloc[train_size:].copy()

        # Labels are kept as single-column DataFrames.
        self.update('train', 'w0', train_part.drop(columns=[self.label]), train_part[[self.label]])
        self.update('test', 'w0', test_part.drop(columns=[self.label]), test_part[[self.label]])

    def preprocessing_train(self, n_components=30):
        """Standardise, impute and PCA-project the 'w0' training features.

        The fitted scaler, imputation values and PCA are kept on the instance so
        ``preprocessing_test`` can apply the identical transformation. The
        training features under 'w0' are replaced by the projected ones, with
        columns named ``PC1 .. PCk``.
        """
        train_x, train_y = self.get('train', 'w0')

        scaler = StandardScaler()
        scaled = pd.DataFrame(scaler.fit_transform(train_x), columns=train_x.columns, index=train_x.index)

        # Impute with the training means; a column with no observed value has a
        # NaN mean, so fall back to 0.0 to keep PCA from seeing NaNs.
        fill_values = scaled.mean().fillna(0.0)
        scaled = scaled.fillna(fill_values)

        pca = PCA(n_components=n_components)
        components = pca.fit_transform(scaled)
        component_names = ['PC' + str(i + 1) for i in range(components.shape[1])]
        reduced = pd.DataFrame(components, columns=component_names, index=train_x.index)

        self.scalers, self.fill_values, self.pca = scaler, fill_values, pca
        self.update('train', 'w0', reduced, train_y)

    def preprocessing_test(self):
        """Apply the transformers fitted on the training split to the 'w0' test features.

        Raises
        ------
        RuntimeError
            If ``preprocessing_train`` has not been run yet.
        """
        if self.scalers is None or self.fill_values is None or self.pca is None:
            raise RuntimeError('preprocessing_train() must be called before preprocessing_test()')

        test_x, test_y = self.get('test', 'w0')
        train_x, _ = self.get('train', 'w0')

        # Scale in the original feature space and keep the test index so x and y stay aligned.
        scaled = pd.DataFrame(self.scalers.transform(test_x), columns=test_x.columns, index=test_x.index)
        scaled = scaled.fillna(self.fill_values)

        # Project with the training PCA; the result uses the training column names.
        reduced = pd.DataFrame(self.pca.transform(scaled), columns=train_x.columns, index=test_x.index)
        self.update('test', 'w0', reduced, test_y)

    def resample_with_weights(self, model, weight, random_state=None):
        """Draw a weighted bootstrap of the training round ``weight`` and store it as the next round.

        Misclassified rows share half of the sampling probability and correctly
        classified rows share the other half (when both groups are non-empty).

        Parameters
        ----------
        model : object
            Anything with ``predict(X)`` returning one label per row.
        weight : str
            Current round key such as 'w0'; the result is stored under 'w1', etc.
        random_state : int or None, default None
            Seed for the draw; None uses NumPy's global random state.
        """
        train_x, train_y = self.get('train', weight)
        y_true = train_y[self.label].to_numpy()
        # ravel() guards against (n, 1) predictions broadcasting into an (n, n) comparison.
        y_pred = np.asarray(model.predict(train_x)).ravel()
        if y_pred.shape != y_true.shape:
            raise ValueError('model.predict must return one label per training row')

        misclassified = y_true != y_pred
        n_rows = len(train_y)
        weights = np.ones(n_rows)

        if misclassified.any():
            weights[misclassified] = 1.0 / misclassified.sum()
        if (~misclassified).any():
            weights[~misclassified] = 1.0 / (~misclassified).sum()

        weights /= weights.sum()

        # Sample row positions, not index labels: labels repeat after the first
        # resampling round and label-based lookup would then return extra rows.
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        positions = rng.choice(n_rows, size=n_rows, replace=True, p=weights)
        new_x = train_x.iloc[positions].reset_index(drop=True)
        new_y = train_y.iloc[positions].reset_index(drop=True)
        self.update('train', 'w' + str(int(weight[1:]) + 1), new_x, new_y)

In [23]:
# Calculating
# Seed NumPy's global random state before the shuffle so the train/test split,
# and therefore every number computed from it, is the same on each Run All.
SEED = 42
np.random.seed(SEED)

data = Dataset('loan_final313.csv')
data.basic_processing()
data.train_test_split()
data.preprocessing_train()

In [24]:
# Testing
# Percentage of training rows whose label equals 1.
train_x, train_y = data.get('train', 'w0')
train_y = train_y.to_numpy().ravel()  # flatten the single-column label frame
counts = 100 * (train_y == 1).mean()
print(counts)

13.83496222258365


In [None]:
# Step 1: Install RAPIDS in Colab
!apt-get install -y python3-dev libnvvm3
!pip install --upgrade pip
!pip install cupy-cuda11x
!pip install cuml-cu11 -f https://rapidsai.github.io/rapidsai-csp-utils/cu11

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
E: Unable to locate package libnvvm3
Collecting pip
  Using cached pip-24.2-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-24.2-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.2
Collecting cupy-cuda11x
  Downloading cupy_cuda11x-13.2.0-cp310-cp310-manylinux2014_x86_64.whl.metadata (2.7 kB)
Downloading cupy_cuda11x-13.2.0-cp310-cp310-manylinux2014_x86_64.whl (95.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.4/95.4 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cupy-cuda11x
Successfully installed cupy-cuda11x-13.2.0
Looking in links: https://rapidsai.github.io/rapidsai-csp-utils/cu11
Collecting cuml-cu11
  Downloading cuml_cu11-24.8.0.tar.gz (2

In [None]:
import cupy as cp
import numpy as np
from cuml.ensemble import RandomForestClassifier as cuRandomForestClassifier
from cuml.linear_model import LogisticRegression as cuLogisticRegression
from cuml.naive_bayes import GaussianNB as cuGaussianNB
from cuml.naive_bayes import MultinomialNB as cuMultinomialNB
from cuml.svm import SVC as cuSVC
from sklearn.ensemble import AdaBoostClassifier

try:
    from cuml.neural_network import MLPClassifier as cuMLPClassifier
except ImportError:
    # NOTE(review): cuML's documented API does not appear to include a
    # `neural_network` module. Fall back to scikit-learn's CPU MLP under the
    # same alias so the cell still imports; confirm the intended estimator.
    from sklearn.neural_network import MLPClassifier as cuMLPClassifier

# Step 2: Model definition
class CustomLogisticRegression:
    """Minimal train/predict facade over the estimator bound to ``cuLogisticRegression``."""

    def __init__(self):
        # The iteration cap is handed straight to the underlying estimator.
        self.model = cuLogisticRegression(max_iter=1000)

    def train(self, X_train, y_train):
        """Fit the wrapped estimator on ``X_train`` / ``y_train``; returns nothing."""
        estimator = self.model
        estimator.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the wrapped estimator's predictions for ``X_test``."""
        predicted = self.model.predict(X_test)
        return predicted

class CustomSVM:
    """Minimal train/predict facade over the estimator bound to ``cuSVC``."""

    def __init__(self):
        # The estimator is created with probability estimates switched on.
        self.model = cuSVC(probability=True)

    def train(self, X_train, y_train):
        """Fit the wrapped classifier on ``X_train`` / ``y_train``; returns nothing."""
        classifier = self.model
        classifier.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the wrapped classifier's predictions for ``X_test``."""
        labels = self.model.predict(X_test)
        return labels

class CustomNaiveBayes:
    """Train/predict wrapper around a Gaussian naive Bayes classifier.

    A Gaussian model is used because the features fed to these wrappers are
    continuous and can be negative; a multinomial model is meant for
    non-negative count features.
    """

    def __init__(self):
        self.model = cuGaussianNB()

    def train(self, X_train, y_train):
        """Fit the wrapped estimator on ``X_train`` / ``y_train``; returns nothing."""
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the wrapped estimator's predictions for ``X_test``."""
        return self.model.predict(X_test)

class CustomRandomForest:
    """Minimal train/predict facade over the estimator bound to ``cuRandomForestClassifier``."""

    def __init__(self):
        # Built with the estimator's default settings.
        self.model = cuRandomForestClassifier()

    def train(self, X_train, y_train):
        """Fit the wrapped forest on ``X_train`` / ``y_train``; returns nothing."""
        forest = self.model
        forest.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the wrapped forest's predictions for ``X_test``."""
        votes = self.model.predict(X_test)
        return votes

class CustomMLP:
    """Minimal train/predict facade over the estimator bound to ``cuMLPClassifier``."""

    def __init__(self):
        # The iteration cap is handed straight to the underlying estimator.
        self.model = cuMLPClassifier(max_iter=1000)

    def train(self, X_train, y_train):
        """Fit the wrapped network on ``X_train`` / ``y_train``; returns nothing."""
        network = self.model
        network.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the wrapped network's predictions for ``X_test``."""
        outputs = self.model.predict(X_test)
        return outputs

# NOTE(review): the earlier "Testing" cell leaves `train_x` as a pandas DataFrame
# and `train_y` as a flattened NumPy array, not CuPy arrays - confirm that the
# estimators used here accept those inputs.

# Initialize models (insertion order fixes the column order of the stacked features)
base_models = {
    "log_reg": CustomLogisticRegression(),
    "svm": CustomSVM(),
    "naive_bayes": CustomNaiveBayes(),
    "random_forest": CustomRandomForest(),
    "mlp": CustomMLP(),
}
# Keep the individual names available for later cells.
log_reg, svm, naive_bayes, random_forest, mlp = base_models.values()

# Train models
for base_model in base_models.values():
    base_model.train(train_x, train_y)

# Collect predictions (using train_x as a placeholder for validation data,
# so everything measured below is in-sample)
predictions = {name: base_model.predict(train_x) for name, base_model in base_models.items()}

# Convert predictions to NumPy for AdaBoost: one column per base model
stacked_features = np.column_stack([cp.asnumpy(column) for column in predictions.values()])
train_y_host = cp.asnumpy(train_y)

# Initialize and train AdaBoost on the stacked base-model predictions
ada_boost = AdaBoostClassifier(n_estimators=10, random_state=42)
ada_boost.fit(stacked_features, train_y_host)

# Evaluate (on the same rows the meta-model was fitted on)
stacked_accuracy = ada_boost.score(stacked_features, train_y_host)
print("Stacked model accuracy:", stacked_accuracy)