In [3]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 435.7 kB/s eta 0:04:47
   ---------------------------------------- 0.3/124.9 MB 2.5 MB/s eta 0:00:50
   ---------------------------------------- 0.6/124.9 MB 4.3 MB/s eta 0:00:29
   ---------------------------------------- 0.7/124.9 MB 3.3 MB/s eta 0:00:38
   ---------------------------------------- 0.9/124.9 MB 3.6 MB/s eta 0:00:35
   ---------------------------------------- 1.5/124.9 MB 5.0 MB/s eta 0:00:25
    --------------------------------------- 2.0/124.9 MB 5.7 MB/s eta 0:00:22
    --------------------------------------- 2.5/124.9 MB 6.4 MB/s eta 0:00:20
    --------------------------------------- 2.9/124.9 MB 6.5 MB/s eta 0:00:19
    --


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Load the engineered loan dataset; `data` itself is never modified below.
data = pd.read_csv('interest_rate_df_engineered.csv')

# Target ('int_rate') plus the predictors kept according to the
# feature-importance graph (produced outside this cell).
# NOTE(review): 'installment' and 'total_rec_int' look like quantities that
# depend on the interest rate itself (possible target leakage) -- confirm they
# are known at prediction time before trusting the reported scores.
selected_vars = [
    'int_rate', 'loan_amnt', 'installment', 'annual_inc', 'revol_util',
    'total_rec_int', 'inq_last_6mths', 'term', 'purpose',
    'verification_status'
]

# Copy only the selected columns. The original frame is still preserved, but we
# no longer duplicate every column of `data` just to drop the unselected ones
# in the very next statement.
encoded_data = data[selected_vars].copy()

# Nominal categorical features, expanded into one indicator column per category
one_hot_encode_cols = ['term', 'purpose', 'verification_status']

# Continuous and discrete numeric features among the selected variables
numerical_cols = [
    'loan_amnt',
    'installment',
    'annual_inc',
    'revol_util',
    'total_rec_int',
    'inq_last_6mths',
]

# One-hot encode the nominal columns; all other columns are carried over as-is
encoded_data = pd.get_dummies(data=encoded_data, columns=one_hot_encode_cols)

# Report the processed frame's dimensions and preview its first rows
print("Dimensions of the Processed Data:", encoded_data.shape)
display(encoded_data.head())

# Separate the target (interest rate) from the feature matrix
y = encoded_data['int_rate']
X = encoded_data.drop(columns=['int_rate'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the numeric columns with statistics learned from the training
# rows only, then apply that same fitted transform to the test rows
scaler = StandardScaler().fit(X_train[numerical_cols])
X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Summary statistics of the scaled training columns
display(X_train[numerical_cols].describe())

# Gradient-boosted regressor with a fixed seed; all other settings are left at
# the library defaults
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)

# Predictions for the rows the model was fitted on and for the held-out rows
y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)

# Score both splits the same way: RMSE (square root of the MSE) and R²
scored_splits = ((y_train, y_train_pred), (y_test, y_test_pred))
train_rmse, test_rmse = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in scored_splits
)
train_r2, test_r2 = (
    r2_score(actual, predicted)
    for actual, predicted in scored_splits
)

print("\nTraining and Testing Scores:")
print(f"Train RMSE: {train_rmse:.4f}, Train R²: {train_r2:.4f}")
print(f"Test RMSE: {test_rmse:.4f}, Test R²: {test_r2:.4f}")

# Persist the fitted scaler alongside the model: the regressor was trained on
# standardised numeric columns, so new inputs must pass through this exact
# scaler before being handed to predict().
joblib.dump(scaler, 'xgb_selected_features_scaler.pkl')

# Save the trained model (kept as the last expression so the cell's displayed
# output is unchanged)
joblib.dump(xgb_model, 'xgb_selected_features_model.pkl')

Dimensions of the Processed Data: (757494, 26)


Unnamed: 0,int_rate,loan_amnt,installment,annual_inc,revol_util,total_rec_int,inq_last_6mths,term_36 months,term_60 months,purpose_car,...,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified
0,10.99,27050.0,885.46,55000.0,61.2,4219.94,0,True,False,False,...,False,False,False,False,False,False,False,False,False,True
1,13.98,9750.0,333.14,26000.0,52.8,1955.79,0,True,False,False,...,False,False,False,False,False,False,False,True,False,False
2,6.62,12000.0,368.45,105000.0,21.6,1109.42,1,True,False,False,...,False,False,False,False,False,False,False,True,False,False
3,13.53,12000.0,407.4,40000.0,68.8,1359.78,0,True,False,False,...,False,False,False,False,False,False,False,False,True,False
4,8.9,15000.0,476.3,63000.0,74.2,1880.47,0,True,False,False,...,False,False,False,False,False,False,False,True,False,False


Unnamed: 0,loan_amnt,installment,annual_inc,revol_util,total_rec_int,inq_last_6mths
count,605995.0,605995.0,605995.0,605995.0,605995.0,605995.0
mean,2.5232680000000003e-17,4.449957e-16,-9.571066e-16,1.955767e-16,1.378886e-16,7.926252e-18
std,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001
min,-1.675105,-1.74275,-1.930431,-2.367863,-0.8479672,-0.6989991
25%,-0.7773707,-0.7232971,-0.7204962,-0.7325055,-0.6347185,-0.6989991
50%,-0.1669114,-0.2201218,-0.2140119,0.03179969,-0.3255272,-0.6989991
75%,0.5991552,0.5623643,0.4894386,0.7704858,0.2418165,0.3486455
max,2.394624,4.172898,4.175519,35.73211,10.04566,7.682158



Training and Testing Scores:
Train RMSE: 0.9571, Train R²: 0.9528
Test RMSE: 0.9795, Test R²: 0.9506


['xgb_selected_features_model.pkl']