In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.linear_model import SGDClassifier
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Data Preperation

In [3]:
def load_data(file_path, chunk_size=100000):
    chunks = []
    dtype_dict = {
        'LOAN_ID': str  # Ensure LOAN_ID is read as string
        # Add other columns if specific types are known in advance
    }
    for chunk in pd.read_csv(file_path, chunksize=chunk_size, dtype=dtype_dict):
        # Convert data types to reduce memory usage
        for col in chunk.columns:
            if chunk[col].dtype == 'float64':
                chunk[col] = chunk[col].astype('float32')
            elif chunk[col].dtype == 'int64':
                chunk[col] = chunk[col].astype('int32')
        chunks.append(chunk)
    return pd.concat(chunks)

In [5]:
# train_file_path = "/content/drive/MyDrive/Data/train.csv"
# test_file_path = "/content/drive/MyDrive/Data/test.csv"
# train_df = load_data(train_file_path)
# test_df = load_data(test_file_path)

In [7]:
df = load_data('/content/drive/MyDrive/cleaned/2015Q1.csv8_quarters/part-00000-edea666e-1b0c-41d4-9cfd-857b9b575a4b-c000.csv')
df = df.dropna()

# Define categorical columns to encode
categorical_columns = [
    'seller_type', 'servicer_type', 'channel_type', 'num_borrowers',
    'purpose', 'property_type', 'occupancy_status', 'state',
    'default_status', 'high_balance_loan_indicator', 'mod_indicator',
    'homeready_indicator', 'relocation_mortgage_indicator', 'htlv_indicator',
    'payment_deferral'
]

# Initialize LabelEncoder for categorical columns
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# Select feature columns (including indexed categorical columns and numeric columns)
feature_columns = [
    'original_rate', 'orginal_credit_score', 'original_loan_to_value', 'curr_unpaid', 'adjusted_remaining_time',
    'seller_type', 'servicer_type', 'channel_type', 'num_borrowers',
    'purpose', 'property_type', 'occupancy_status', 'state',
    'high_balance_loan_indicator', 'mod_indicator',
    'homeready_indicator', 'relocation_mortgage_indicator', 'htlv_indicator',
    'payment_deferral'
]

df['ACT_PERIOD'] = pd.to_datetime(df['ACT_PERIOD'], errors='coerce')
train_start_date = pd.to_datetime('2015-01-01')
train_end_date = pd.to_datetime('2019-01-01')
test_start_date = pd.to_datetime('2021-01-01')
test_end_date = pd.to_datetime('2022-01-01')
train_df = df[(df['ACT_PERIOD'] >= train_start_date) & (df['ACT_PERIOD'] < train_end_date)]
test_df = df[(df['ACT_PERIOD'] >= test_start_date) & (df['ACT_PERIOD'] < test_end_date)]

# Prepare the features (X) and target (y) for train and test datasets
X_train = train_df[feature_columns].values
y_train = train_df['y_label'].values
X_test = test_df[feature_columns].values
y_test = test_df['y_label'].values

# Define the target variable
target_column = 'y_label'

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)


In [30]:
df.columns

Index(['LOAN_ID', 'ACT_PERIOD', 'original_rate', 'orginal_credit_score',
       'original_loan_to_value', 'curr_unpaid', 'seller_type', 'servicer_type',
       'channel_type', 'adjusted_remaining_time', 'num_borrowers', 'purpose',
       'property_type', 'occupancy_status', 'state', 'default_status',
       'mod_indicator', 'homeready_indicator', 'relocation_mortgage_indicator',
       'high_balance_loan_indicator', 'htlv_indicator', 'payment_deferral',
       'y_label'],
      dtype='object')

# Logistic Regression

In [19]:
# Initialize Logistic Regression model
log_reg = LogisticRegression(max_iter=1000)

# Train the model on the training data
log_reg.fit(X_train_smote, y_train_smote)

# Make predictions on the test data
y_pred = log_reg.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Print metrics
print(f"Test Accuracy: {accuracy}")
print(f"Test Precision: {precision}")
print(f"Test Recall: {recall}")
print(f"Test F1 Score: {f1}")
print(f"Test ROC AUC: {roc_auc}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Test Accuracy: 0.6984406325028056
Test Precision: 0.006615373140322793
Test Recall: 0.7215269086357947
Test F1 Score: 0.013110541307301764
Test ROC AUC: 0.7099516359143574


# XGBoost

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from xgboost import XGBClassifier

In [8]:
print("Distribution of classes in train set:", np.bincount(y_train))
print("Distribution of classes in test set:", np.bincount(y_test))

Distribution of classes in train set: [16759907   104766]
Distribution of classes in test set: [1722060    4794]


In [27]:
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    max_depth = 6,
    learning_rate=0.03,
    n_jobs=-1,        # Uses all available threads, similar to nthread=-1
    scale_pos_weight=160,  # Handling imbalance in the dataset
    use_label_encoder=False  # To avoid warning, as the label encoder is deprecated
)

# Train the XGBClassifier
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
preds = xgb_model.predict_proba(X_test)[:, 1]
predictions = np.round(preds)  # Convert probabilities to binary (0 or 1)

# Calculate performance metrics
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, preds)  # Use probability for ROC AUC

# Print performance metrics
print(f"Test Accuracy: {accuracy}")
print(f"Test Precision: {precision}")
print(f"Test Recall: {recall}")
print(f"Test F1 Score: {f1}")
print(f"Test ROC AUC: {roc_auc}")


Parameters: { "use_label_encoder" } are not used.



Test Accuracy: 0.6692499771260338
Test Precision: 0.006471019834817329
Test Recall: 0.7745098039215687
Test F1 Score: 0.012834805032294417
Test ROC AUC: 0.789064722722891


In [None]:
thresholds = np.arange(0.1, 0.91, 0.05)

best_threshold = 0
best_roc_auc = 0
results_dict = {}

for threshold in thresholds:
    # Convert probabilities to binary predictions based on the current threshold
    predictions = (preds >= threshold).astype(int)
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, preds)  # ROC AUC is calculated on probabilities

    # Save metrics for each threshold in the dictionary
    results_dict[threshold] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }
print("\nAll Results:")
for thresh, metrics in results_dict.items():
    print(f"Threshold {thresh}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

# LSTM

In [None]:
X_train_smote = X_train_smote.reshape((X_train_smote.shape[0], 1, X_train_smote.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
# Define the LSTM model
model = Sequential()
model.add(LSTM(50, input_shape=(X_train_smote.shape[1], X_train_smote.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_smote, y_train_smote, epochs=10, batch_size=128, validation_split=0.2, verbose=1)

# Make predictions on the test data
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f"Test Accuracy: {accuracy}")
print(f"Test Precision: {precision}")
print(f"Test Recall: {recall}")
print(f"Test F1 Score: {f1}")
print(f"Test ROC AUC: {roc_auc}")
