# Logistic Regression

In [13]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load the CSV file using pandas
file_path = "../data/processed/part-00000-0115a5ba-1db7-421c-866c-7f1446998bf2-c000.csv"
df = pd.read_csv(file_path)
df = df.ffill()
# Define categorical columns to encode
categorical_columns = [
    'seller_type', 'servicer_type', 'channel_type',
    'purpose', 'property_type', 'occupancy_status', 'state', 
    'default_status', 'high_balance_loan_indicator', 'mod_indicator', 
    'homeready_indicator', 'relocation_mortgage_indicator', 'htlv_indicator', 
    'payment_deferral'
]

# Initialize LabelEncoder for categorical columns
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# Select feature columns (including indexed categorical columns and numeric columns)
feature_columns = [
    'adjusted_remaining_time', 'num_borrowers', 
    'seller_type', 'servicer_type', 'channel_type', 
    'purpose', 'property_type', 'occupancy_status', 
    'state', 'high_balance_loan_indicator', 'mod_indicator', 
    'homeready_indicator', 'relocation_mortgage_indicator', 
    'htlv_indicator', 'payment_deferral'
]

# Define the target variable
target_column = 'y_label'

# Prepare the features (X) and target (y)
X = df[feature_columns].values
y = df[target_column].values

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Initialize Logistic Regression model
log_reg = LogisticRegression(max_iter=100)

# Train the model on the training data
log_reg.fit(X_train, y_train)

# Make predictions on the test data
y_pred = log_reg.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
presicion = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")
print(f"Test Precision: {presicion}")
print(f"Test Recall: {recall}")
print(f"Test F1 Score: {f1}")
print(f"Test ROC AUC: {roc_auc}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Test Accuracy: 0.6568202175504984
Test Precision: 0.01190073431875324
Test Recall: 0.5824820561694016
Test F1 Score: 0.02332491554885048
Test ROC AUC: 0.6199144830893135


# XGBoost

In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load the CSV file using pandas
file_path = "../data/processed/part-00000-0115a5ba-1db7-421c-866c-7f1446998bf2-c000.csv"
df = pd.read_csv(file_path)
df = df.ffill()


# Define categorical columns to encode
categorical_columns = [
    'seller_type', 'servicer_type', 'channel_type',
    'purpose', 'property_type', 'occupancy_status', 'state', 
    'default_status', 'high_balance_loan_indicator', 'mod_indicator', 
    'homeready_indicator', 'relocation_mortgage_indicator', 'htlv_indicator', 
    'payment_deferral'
]

# Initialize LabelEncoder for categorical columns
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# Select feature columns (including indexed categorical columns and numeric columns)
feature_columns = [
    'adjusted_remaining_time', 'num_borrowers', 
    'seller_type', 'servicer_type', 'channel_type', 
    'purpose', 'property_type', 'occupancy_status', 
    'state', 'high_balance_loan_indicator', 'mod_indicator', 
    'homeready_indicator', 'relocation_mortgage_indicator', 
    'htlv_indicator', 'payment_deferral'
]

# Define the target variable
target_column = 'y_label'

# Prepare the features (X) and target (y)
X = df[feature_columns].values
y = df[target_column].values

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Create DMatrix for XGBoost
train_dmatrix = xgb.DMatrix(X_train, label=y_train)
test_dmatrix = xgb.DMatrix(X_test, label=y_test)

# Set up XGBoost parameters
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 6,
    'eta': 0.3,
    'silent': 1,
    'nthread': 4
}

# Train the XGBoost model
num_round = 100
bst = xgb.train(params, train_dmatrix, num_round)

# Make predictions on the test set
preds = bst.predict(test_dmatrix)

# Convert the predictions to binary (0 or 1)
predictions = np.round(preds)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, predictions)

print(f"Test Accuracy: {accuracy}")
print(f"Test Precision: {precision}")
print(f"Test Recall: {recall}")
print(f"Test F1 Score: {f1}")
print(f"Test ROC AUC: {roc_auc}")


# Optional: save the model for later use
# bst.save_model("xgboost_model.json")

Parameters: { "silent" } are not used.



Test Accuracy: 0.780021099234498
Test Precision: 0.02357037782921883
Test Recall: 0.7487268070405337
Test F1 Score: 0.045702027868420335
Test ROC AUC: 0.7644848145537356


# LSTM

In [14]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Load the CSV file using pandas
file_path = "../data/processed/part-00000-0115a5ba-1db7-421c-866c-7f1446998bf2-c000.csv"
df = pd.read_csv(file_path)
df = df.ffill()

# Define categorical columns to encode
categorical_columns = [
    'seller_type', 'servicer_type', 'channel_type',
    'purpose', 'property_type', 'occupancy_status', 'state', 
    'default_status', 'high_balance_loan_indicator', 'mod_indicator', 
    'homeready_indicator', 'relocation_mortgage_indicator', 'htlv_indicator', 
    'payment_deferral'
]

# Initialize LabelEncoder for categorical columns
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# Select feature columns (including indexed categorical columns and numeric columns)
feature_columns = [
    'adjusted_remaining_time', 'num_borrowers', 
    'seller_type', 'servicer_type', 'channel_type', 
    'purpose', 'property_type', 'occupancy_status', 
    'state', 'high_balance_loan_indicator', 'mod_indicator', 
    'homeready_indicator', 'relocation_mortgage_indicator', 
    'htlv_indicator', 'payment_deferral'
]

# Define the target variable
target_column = 'y_label'

# Prepare the features (X) and target (y)
X = df[feature_columns].values
y = df[target_column].values

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Reshape the data to be suitable for LSTM (samples, timesteps, features)
X = X.reshape((X.shape[0], 1, X.shape[1]))

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train.reshape(X_train.shape[0], -1), y_train)
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))

# Define the LSTM model
model = Sequential()
model.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)

# Make predictions on the test data
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f"Test Accuracy: {accuracy}")
print(f"Test Precision: {precision}")
print(f"Test Recall: {recall}")
print(f"Test F1 Score: {f1}")
print(f"Test ROC AUC: {roc_auc}")

  super().__init__(**kwargs)


Epoch 1/10
[1m220093/473921[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m7:39[0m 2ms/step - accuracy: 0.7440 - loss: 0.5175

KeyboardInterrupt: 