In [1]:
# loan_age_model_pipeline.py
import pickle
from pathlib import Path

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor

In [2]:
# Read the Lending Club extract; the CSV's first column becomes the row index.
df = pd.read_csv(
    filepath_or_buffer="../data/data_pipeline.csv",
    index_col=0,
)
# Preview the top of the frame as the cell output.
df.head()

Unnamed: 0,funded_amnt,int_rate,installment,home_ownership,annual_inc,purpose,open_acc,total_pymnt,last_pymnt_amnt,loan_age
0,20000,7.56,622.68,MORTGAGE,100000.0,credit_card,9.0,20215.79243,20228.39,2.036794
1,4500,11.31,147.99,RENT,38500.0,credit_card,12.0,4549.217149,4553.46,2.036794
2,20000,17.97,507.55,RENT,57000.0,debt_consolidation,10.0,20013.577333,20043.53,1.018397
3,6600,11.31,217.05,RENT,45000.0,credit_card,6.0,6622.809,6629.03,1.018397
4,2500,13.56,84.92,RENT,42000.0,other,3.0,2501.285667,2504.11,1.018397


In [3]:
# Separate the regression target (loan_age) from the remaining columns.
y = df['loan_age']
X = df.drop(columns=['loan_age'])

In [4]:

# Column groups routed to each preprocessing branch.
numeric_features = [
    'funded_amnt',
    'int_rate',
    'installment',
    'annual_inc',
    'open_acc',
    'total_pymnt',
    'last_pymnt_amnt',
]
categorical_features = [
    'home_ownership',
    'purpose',
]

# Numeric columns are standardized; categorical columns are one-hot encoded
# with handle_unknown='ignore' so unseen categories do not raise at transform.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Apply each transformer to its own column group.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [5]:
# Define model: XGBoost regressor with fixed hyperparameters and a fixed seed.
xgb_model = XGBRegressor(
    # Boosting / tree shape
    learning_rate=0.1,
    max_depth=10,
    min_child_weight=1,
    max_delta_step=20,
    gamma=0.01,
    base_score=0.7,
    # Row and column subsampling
    subsample=0.5,
    colsample_bytree=0.5,
    colsample_bylevel=0.7,
    colsample_bynode=1.0,
    # Regularization
    reg_alpha=0.1,
    reg_lambda=0.1,
    # Reproducibility
    random_state=42,
)

# Chain preprocessing and the regressor so both are fitted and applied together.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', xgb_model),
])

In [6]:
# Split data into train and test sets (test_size=0.2, seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Make sure the output directory exists *before* training: otherwise a missing
# '../models' folder would raise FileNotFoundError only after the fit has run.
model_path = Path('../models/pipeline.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)

# Train model: the whole pipeline is fitted on the training split only.
pipeline.fit(X_train, y_train)

# Save the fitted pipeline as a single pickle artifact.
# NOTE: only unpickle this file from a trusted location; pickle.load can
# execute arbitrary code.
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)