In [12]:
import pandas as pd
import timeit
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

In [2]:
# Load the raw table and discard the two columns that are not used downstream.
data = pd.read_csv("./CreditPrediction.csv").drop(columns=['Unnamed: 19', 'CLIENTNUM'])
# Currently disabled: removing rows without a target and exact duplicate rows.
#data = data.dropna(subset=['Credit_Limit'])
#data = data.drop_duplicates()

# Target is the credit limit; every remaining column is a feature.
y = data['Credit_Limit']
X = data.drop(columns=['Credit_Limit'])

print(data.shape)
data.head()

(10132, 18)


Unnamed: 0,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,45.0,M,3,High School,Married,$60K - $80K,Blue,39.0,5.0,1,3,12691.0,777,1.335,1144,42,1.625,0.061
1,49.0,F,5,Graduate,,Less than $40K,Blue,44.0,6.0,1,2,8256.0,864,1.541,1291,33,3.714,0.105
2,51.0,M,3,Graduate,Married,$80K - $120K,Blue,36.0,4.0,1,0,3418.0,0,2.594,1887,20,2.333,0.0
3,40.0,F,4,High School,,Less than $40K,Blue,34.0,3.0,4,1,3313.0,2517,1.405,1171,20,2.333,0.76
4,40.0,M,3,Uneducated,Married,$60K - $80K,,21.0,5.0,1,0,4716.0,0,2.175,816,28,2.5,0.0


In [17]:
# Hold out 20% of the rows for testing (fixed seed so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Column groups are derived from the dtypes of the training frame.
categorical_cols = X_train.select_dtypes(include=['object']).columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

# Numeric features: median imputation, then standardisation.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fitting are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Fit on the training split only, then apply the fitted transformer to the test split.
# From here on X_train / X_test refer to the transformed matrices.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [18]:
def run(model, rep=1):
    """Fit ``model`` ``rep`` times and print its averaged test-set metrics.

    Every repetition refits ``model`` on the notebook-level ``X_train`` /
    ``y_train`` and scores its predictions for ``X_test`` against ``y_test``.
    Only the ``fit`` call is timed; prediction and scoring are excluded.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    rep : int, default=1
        Number of fit/evaluate repetitions to average over. Must be >= 1.

    Returns
    -------
    None
        The averaged fit time, MAE, MSE and R2 are printed, not returned.

    Raises
    ------
    ValueError
        If ``rep`` is smaller than 1. Without this guard the loop would be
        skipped and the averaging step would fail with ``ZeroDivisionError``.
    """
    if rep < 1:
        raise ValueError(f'rep must be a positive integer, got {rep!r}')

    # Running totals; divided by ``rep`` after the loop to obtain the means.
    fit_seconds = mse = mae = r2 = 0

    for _ in range(rep):
        # Time the training step only.
        start = timeit.default_timer()
        model.fit(X_train, y_train)
        end = timeit.default_timer()

        y_pred = model.predict(X_test)

        mae += mean_absolute_error(y_test, y_pred)
        mse += mean_squared_error(y_test, y_pred)
        r2 += r2_score(y_test, y_pred)
        fit_seconds += end - start

    fit_seconds /= rep
    mse /= rep
    mae /= rep
    r2 /= rep

    print(f'Time: {round(fit_seconds, 6)} seconds')
    print('MAE: ', round(mae, 6))
    print('MSE: ', round(mse, 6))
    print('R2:  ',round(r2, 6))
    print('--------------------')

In [19]:
# Single decision tree with a fixed seed.
model = DecisionTreeRegressor(
    random_state=10,
)
run(model)

Time: 0.069845 seconds
MAE:  1574.163274
MSE:  18878646.16734
R2:   0.779419
--------------------


In [20]:
# Random forest: 100 trees, depth capped at 10, with minimum leaf/split sizes set.
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=5,
    random_state=10,
)
run(model)

Time: 2.572544 seconds
MAE:  1179.254407
MSE:  9669592.638143
R2:   0.887019
--------------------


In [21]:
# Gradient boosting with 400 boosting stages and a fixed seed.
model = GradientBoostingRegressor(
    n_estimators=400,
    random_state=10,
)
run(model)

Time: 4.482876 seconds
MAE:  1448.510291
MSE:  9262677.419402
R2:   0.891774
--------------------


In [22]:
# XGBoost: 100 rounds, depth 6, learning rate 0.1, no row subsampling, fixed seed.
model = XGBRegressor(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=1,
    random_state=10,
)
run(model)

Time: 0.152485 seconds
MAE:  1200.312853
MSE:  8790874.388138
R2:   0.897286
--------------------


In [9]:
# Candidate regressors for the repeated-run comparison. They use the same
# hyperparameters as the individually evaluated models, but no random_state
# is set here, so results can vary between runs.
models = [
    DecisionTreeRegressor(),
    RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_leaf=2, min_samples_split=5),
    GradientBoostingRegressor(n_estimators=400),
    # `learning_rate` is the scikit-learn-API name for the booster's `eta`;
    # using it keeps this entry consistent with the single-model XGBoost cell.
    XGBRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, subsample=1),
]

In [10]:
# Repeated-run benchmark over `models`. The `run` call is commented out, so the
# loop currently does nothing except rebind `model` to each list entry in turn.
# NOTE(review): the saved output under this cell (a 3-fold grid search over 54
# candidates) cannot have been produced by this loop; it looks stale, so
# confirm and re-run.
for model in models:
    pass
    #run(model, 20)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best Parameters: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 6, 'regressor__n_estimators': 100, 'regressor__subsample': 1.0}
Test MSE: 8790874.38813802
Test R²: 0.8972861512403716
