# Logistic Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Logistic-regression baseline: load the processed loan file, encode the
# categorical columns, train on an 80/20 split and print the test accuracy.

# Imported here because this cell is the only user of the scaler.
from sklearn.preprocessing import StandardScaler

# Load the CSV file using pandas and drop every row that has a missing value
# in any column.
file_path = "../data/processed/2016Q1_ylabel.csv/part-00000-13809b61-c147-4eff-a074-11e0e068d0be-c000.csv"
df = pd.read_csv(file_path)
df = df.dropna()

# Categorical columns that are integer-encoded in place.
# Note: 'default_status' is encoded but is not part of feature_columns below.
categorical_columns = [
    'seller_type', 'servicer_type', 'channel_type',
    'purpose', 'property_type', 'occupancy_status', 'state',
    'default_status', 'high_balance_loan_indicator', 'mod_indicator',
    'homeready_indicator', 'relocation_mortgage_indicator', 'htlv_indicator',
    'payment_deferral'
]

# Fit one LabelEncoder per categorical column; the fitted encoders are kept
# in `label_encoders` so the integer codes can be mapped back to labels.
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# Columns fed to the model (two numeric columns plus encoded categoricals).
feature_columns = [
    'adjusted_remaining_time', 'num_borrowers',
    'seller_type', 'servicer_type', 'channel_type',
    'purpose', 'property_type', 'occupancy_status',
    'state', 'high_balance_loan_indicator', 'mod_indicator',
    'homeready_indicator', 'relocation_mortgage_indicator',
    'htlv_indicator', 'payment_deferral'
]

# Define the target variable
target_column = 'y_label'

# The integer codes produced by LabelEncoder carry an arbitrary order
# (alphabetical position of the label), which a linear model would read as a
# magnitude. Expand every categorical feature into one indicator column per
# category so each category gets its own coefficient.
categorical_features = [col for col in feature_columns if col in categorical_columns]
X = pd.get_dummies(
    df[feature_columns], columns=categorical_features, dtype=float
).to_numpy(dtype=float)
y = df[target_column].values

# Split the data into training and test sets (same seed and row order as
# before, so the held-out rows are unchanged).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise features using statistics from the training split only, so no
# information from the test rows leaks into the model. Unscaled inputs make
# the default lbfgs solver slow to converge.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Give the solver enough iterations to converge; 100 is frequently too few
# and results in a ConvergenceWarning and a partially-fitted model.
log_reg = LogisticRegression(max_iter=1000)

# Train the model on the training data
log_reg.fit(X_train, y_train)

# Make predictions on the test data
y_pred = log_reg.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

# XGBoost

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# XGBoost baseline: load the processed loan file, integer-encode the
# categorical columns, train a boosted-tree classifier on an 80/20 split and
# print the test accuracy.

# Load the CSV file using pandas
# NOTE(review): rows with missing values are not dropped in this cell, so the
# evaluated rows may differ from any run that calls df.dropna() first --
# confirm this is intended before comparing accuracies across models.
file_path = "../data/processed/2016Q1_ylabel.csv/part-00000-13809b61-c147-4eff-a074-11e0e068d0be-c000.csv"
df = pd.read_csv(file_path)


# Categorical columns that are integer-encoded in place.
# Note: 'default_status' is encoded but is not part of feature_columns below.
categorical_columns = [
    'seller_type', 'servicer_type', 'channel_type',
    'purpose', 'property_type', 'occupancy_status', 'state',
    'default_status', 'high_balance_loan_indicator', 'mod_indicator',
    'homeready_indicator', 'relocation_mortgage_indicator', 'htlv_indicator',
    'payment_deferral'
]

# Fit one LabelEncoder per categorical column; the fitted encoders are kept
# in `label_encoders` so the integer codes can be mapped back to labels.
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# Columns fed to the model (two numeric columns plus encoded categoricals).
feature_columns = [
    'adjusted_remaining_time', 'num_borrowers',
    'seller_type', 'servicer_type', 'channel_type',
    'purpose', 'property_type', 'occupancy_status',
    'state', 'high_balance_loan_indicator', 'mod_indicator',
    'homeready_indicator', 'relocation_mortgage_indicator',
    'htlv_indicator', 'payment_deferral'
]

# Define the target variable
target_column = 'y_label'

# Prepare the features (X) and target (y)
X = df[feature_columns].values
y = df[target_column].values

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create DMatrix for XGBoost
train_dmatrix = xgb.DMatrix(X_train, label=y_train)
test_dmatrix = xgb.DMatrix(X_test, label=y_test)

# Set up XGBoost parameters
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 6,
    'eta': 0.3,
    # 'silent' was removed from XGBoost; 'verbosity': 0 is the supported way
    # to suppress training output.
    'verbosity': 0,
    'nthread': 4
}

# Train the XGBoost model
num_round = 100
bst = xgb.train(params, train_dmatrix, num_round)

# With the binary:logistic objective, predict() returns the probability of
# the positive class for each test row.
preds = bst.predict(test_dmatrix)

# Convert the probabilities to integer class labels (1 when p > 0.5).
predictions = (preds > 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Test Accuracy: {accuracy}")

# Optional: save the model for later use
# bst.save_model("xgboost_model.json")