In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
sampled_data = pd.read_csv("final 2024 Q1.csv", nrows=20000)
sampled_data.drop(columns=['Number of Units', 'Prepayment Penalty Mortgage (PPM) Flag','Amortization Type (Formerly Product Type)','Property State','Postal Code','Loan Sequence Number','Seller Name','Servicer Name','Super Conforming Flag','Pre-HARP Loan Sequence Number','Program Indicator','HARP Indicator','Interest Only (I/O) Indicator'], inplace=True)
sampled_data['Original Interest Rate'].fillna(sampled_data['Original Interest Rate'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sampled_data['Original Interest Rate'].fillna(sampled_data['Original Interest Rate'].mean(), inplace=True)


Label encode binary variables

In [3]:
for col in ['First Time Homebuyer Flag', 'Occupancy Status', 'Channel','Mortgage Insurance Cancellation Indicator']:
    le = LabelEncoder()
    sampled_data[col] = le.fit_transform(sampled_data[col])

Feature engineering

Define independent variables and target

In [5]:
X = sampled_data.drop(columns=['Original Interest Rate'])
y = sampled_data['Original Interest Rate'].values

One-hot encode multi-category categorical columns

In [6]:
categorical_columns = ['Property Type', 'Loan Purpose']
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(sparse_output=True), categorical_columns)],
    remainder='passthrough'
)
X = ct.fit_transform(X)

In [7]:
print(X)

[[0. 0. 0. ... 1. 2. 0.]
 [0. 0. 0. ... 2. 2. 0.]
 [0. 0. 0. ... 1. 2. 0.]
 ...
 [0. 0. 0. ... 2. 2. 0.]
 [0. 0. 0. ... 1. 2. 0.]
 [1. 0. 0. ... 3. 2. 1.]]


In [8]:
# Basic Data Overview and Missing Value Check

print("üìä Dataset Shape:", sampled_data.shape)

# Summary of non-null values and data types
print("\nüîç Dataset Info:")
sampled_data.info()

# Count of missing values in each column
print("\n‚ùó Missing Values per Column:")
missing_counts = sampled_data.isnull().sum()
print(missing_counts[missing_counts > 0].sort_values(ascending=False))

# Optional: Percentage of missing values (to help with decision-making)
print("\nüìâ Percentage of Missing Values:")
missing_percentage = (sampled_data.isnull().sum() / len(sampled_data)) * 100
print(missing_percentage[missing_percentage > 0].sort_values(ascending=False))


üìä Dataset Shape: (20000, 19)

üîç Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 19 columns):
 #   Column                                                        Non-Null Count  Dtype  
---  ------                                                        --------------  -----  
 0   Credit Score                                                  20000 non-null  int64  
 1   First Payment Date                                            20000 non-null  int64  
 2   First Time Homebuyer Flag                                     20000 non-null  int64  
 3   Maturity Date                                                 20000 non-null  int64  
 4   Metropolitan Statistical Area (MSA) Or Metropolitan Division  17223 non-null  float64
 5   Mortgage Insurance Percentage (MI %)                          20000 non-null  int64  
 6   Occupancy Status                                              20000 non-null  int64  
 7   Original Combin

In [9]:
sampled_data.dropna(subset=['Metropolitan Statistical Area (MSA) Or Metropolitan Division'], inplace=True)

In [28]:
print(sampled_data['Metropolitan Statistical Area (MSA) Or Metropolitan Division'].isnull().sum())

0


Train-test split

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16000, 24), (4000, 24), (16000,), (4000,))

In [22]:
from sklearn.impute import SimpleImputer

# Impute missing values in features
imputer = SimpleImputer(strategy='mean')  # or 'median'
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)



# Train XGBoost model

In [14]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Train XGBoost
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)

# Predict and evaluate
xgb_preds = xgb_model.predict(X_test)
xgb_mse = mean_squared_error(y_test, xgb_preds)
xgb_r2 = r2_score(y_test, xgb_preds)

print("üìà XGBoost Regressor Performance:")
print("MSE:", round(xgb_mse, 4))
print("R¬≤ Score:", round(xgb_r2, 4))

üìà XGBoost Regressor Performance:
MSE: 0.2037
R¬≤ Score: 0.2958


In [15]:
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

xgb = XGBRegressor(random_state=42)

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=15,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

# Best model and its performance
best_xgb = random_search.best_estimator_
y_pred = best_xgb.predict(X_test)

print("üìä Best Parameters:", random_search.best_params_)
print("Tuned MSE:", mean_squared_error(y_test, y_pred))
print("Tuned R¬≤:", r2_score(y_test, y_pred))


Fitting 3 folds for each of 15 candidates, totalling 45 fits
üìä Best Parameters: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.05, 'colsample_bytree': 0.7}
Tuned MSE: 0.20395621717910847
Tuned R¬≤: 0.29483763588978695


# Train Linear Regression Model

In [18]:
from sklearn.linear_model import LinearRegression

# Train Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict and evaluate
lr_preds = lr_model.predict(X_test)
lr_mse = mean_squared_error(y_test, lr_preds)
lr_r2 = r2_score(y_test, lr_preds)

print("üìà Linear Regressor Performance:")
print("MSE:", round(lr_mse, 4))
print("R¬≤ Score:", round(lr_r2, 4))


üìà Linear Regressor Performance:
MSE: 0.2345
R¬≤ Score: 0.1892


# Train Polynomial Regression

In [24]:
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 2)
X_poly = poly_reg.fit_transform(X_train)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly , y_train)

In [25]:
# Step 1: Transform X_test into polynomial features
X_test_poly = poly_reg.transform(X_test)

# Step 2: Predict using the trained regression model
lr2_preds = lin_reg_2.predict(X_test_poly)

# Step 3: Evaluate
from sklearn.metrics import mean_squared_error, r2_score

lr2_mse = mean_squared_error(y_test, lr2_preds)
lr2_r2 = r2_score(y_test, lr2_preds)

print("üìà Polynomial Regressor Performance:")
print("MSE:", round(lr2_mse, 4))
print("R¬≤ Score:", round(lr2_r2, 4))

üìà Polynomial Regressor Performance:
MSE: 0.2808
R¬≤ Score: 0.0293


# Train SVM Regression

In [26]:
from sklearn.svm import SVR
svm_regressor = SVR(kernel = 'rbf')
svm_regressor.fit(X_train, y_train)

svm_preds = svm_regressor.predict(X_test)
svm_mse = mean_squared_error(y_test, svm_preds)
svm_r2 = r2_score(y_test, svm_preds)

print("üìà SVM Regressor Performance:")
print("MSE:", round(svm_mse, 4))
print("R¬≤ Score:", round(svm_r2, 4))

üìà SVM Regressor Performance:
MSE: 0.2852
R¬≤ Score: 0.0138


# Train Random Forest Regression Model

In [27]:
from sklearn.ensemble import RandomForestRegressor
rf_regressor =  RandomForestRegressor(n_estimators = 10 , random_state = 0)
rf_regressor.fit(X_train ,y_train)

rf_preds = rf_regressor.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_preds)
rf_r2 = r2_score(y_test, rf_preds)

print("üìà Random Forest Regressor Performance:")
print("MSE:", round(rf_mse, 4))
print("R¬≤ Score:", round(rf_r2, 4))

üìà Random Forest Regressor Performance:
MSE: 0.2294
R¬≤ Score: 0.2067
