In [391]:
import pandas as pd
import timeit
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

In [392]:
# Load the raw table and clean it in a single chain:
#   * drop the 'Unnamed: 19' and 'CLIENTNUM' columns,
#   * drop rows without a target value (Credit_Limit),
#   * keep only rows with Customer_Age <= 100. The comparison is False for
#     NaN, so rows with a missing age are removed as well.
# Duplicate rows are not removed.
# The trailing .copy() makes `data` an independent frame, so the column
# assignments done in later cells do not raise SettingWithCopyWarning.
data = (
    pd.read_csv("./CreditPrediction.csv")
    .drop(columns=['Unnamed: 19', 'CLIENTNUM'])
    .dropna(subset=['Credit_Limit'])
    .loc[lambda frame: frame['Customer_Age'] <= 100]
    .copy()
)

print(data.shape)
data.head()

(10150, 18)


Unnamed: 0,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,45.0,M,3,High School,Married,$60K - $80K,Blue,39.0,5.0,1,3,12691.0,777,1.335,1144,42,1.625,0.061
1,49.0,F,5,Graduate,,Less than $40K,Blue,44.0,6.0,1,2,8256.0,864,1.541,1291,33,3.714,0.105
2,51.0,M,3,Graduate,Married,$80K - $120K,Blue,36.0,4.0,1,0,3418.0,0,2.594,1887,20,2.333,0.0
3,40.0,F,4,High School,,Less than $40K,Blue,34.0,3.0,4,1,3313.0,2517,1.405,1171,20,2.333,0.76
4,40.0,M,3,Uneducated,Married,$60K - $80K,,21.0,5.0,1,0,4716.0,0,2.175,816,28,2.5,0.0


In [393]:
# Ordinal encodings: each dictionary assigns increasing integers to the
# categories of one column. Values that are not keys of the dictionary
# (including NaN) become NaN after `.map`.
education_order = {
    'Uneducated': 1,
    'High School': 2,
    'College': 3,
    'Graduate': 4,
    'Post-Graduate': 5,
    'Doctorate': 6
}

income_order = {
    'Less than $40K': 1,
    '$40K - $60K': 2,
    '$60K - $80K': 3,
    '$80K - $120K': 4,
    '$120K +': 5
}

card_category_order = {
    'Blue': 1,
    'Silver': 2,
    'Gold': 3,
    'Platinum': 4
}

# Map the ordered numbers to the respective columns.
# The dtype guard makes this cell safe to re-run: mapping an already-encoded
# (numeric) column a second time would replace every value with NaN, because
# the integers are not keys of the dictionaries above.
for _column, _order in (
    ('Education_Level', education_order),
    ('Income_Category', income_order),
    ('Card_Category', card_category_order),
):
    if not pd.api.types.is_numeric_dtype(data[_column]):
        data[_column] = data[_column].map(_order)

In [394]:
# Separate the regression target (Credit_Limit) from the feature columns.
y = data['Credit_Limit']
X = data.drop(columns=['Credit_Limit'])

def preprocess(random):
    """Split ``X``/``y`` 70/30, transform the features and publish the result.

    Reads the module-level ``X`` and ``y`` and (re)binds the module-level
    ``X_train``, ``X_test``, ``y_train`` and ``y_test`` that ``run`` reads.
    Without the ``global`` declaration these names would stay local to this
    function and ``run`` would fail on a fresh kernel.

    Parameters
    ----------
    random : int or None
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    None
        The outputs are exposed through the four globals listed above.
    """
    global X_train, X_test, y_train, y_test

    train_features, test_features, train_target, test_target = train_test_split(
        X, y, test_size=0.3, random_state=random
    )
    # Work on explicit copies: columns are overwritten below, and assigning
    # into the frames returned by the split can raise SettingWithCopyWarning.
    train_features = train_features.copy()
    test_features = test_features.copy()

    # Column groups are determined BEFORE label encoding, so the integer codes
    # produced below are still routed to the categorical branch.
    numerical_cols = train_features.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = train_features.select_dtypes(include=['object']).columns

    # Encode string categories as integers; encoders are fitted on the
    # training split only and then applied to the test split.
    # NOTE(review): astype(str) turns missing values into the literal string
    # 'nan', so they become a category of their own and the KNN imputer in the
    # categorical branch presumably has nothing left to impute - confirm this
    # is intended.
    # NOTE(review): LabelEncoder.transform is expected to raise on a test
    # label that was not seen during fit - verify this cannot happen here.
    encoders = {col: LabelEncoder() for col in categorical_cols}
    for col in categorical_cols:
        train_features[col] = encoders[col].fit_transform(train_features[col].astype(str))
        test_features[col] = encoders[col].transform(test_features[col].astype(str))

    # Numerical branch: KNN imputation (13 neighbours) followed by standardisation.
    numerical_transformer = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=13)),
        ('scaler', StandardScaler())
    ])

    # Categorical branch: KNN imputation on the integer codes, then one-hot
    # encoding that ignores categories unseen during fit.
    categorical_transformer = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=13)),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Fit on the training split only; the test split is transformed with the
    # statistics learned from training data.
    X_train = preprocessor.fit_transform(train_features)
    X_test = preprocessor.transform(test_features)
    y_train, y_test = train_target, test_target

# Build the train/test split with a fixed seed (42).
preprocess(random=42)

In [395]:
def run(model, rep=1):
    """Fit ``model`` ``rep`` times and print the averaged fit time and metrics.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Only the ``fit`` call is timed; prediction and scoring are excluded.
    Prints the mean fit time, MAE, MSE and R2 (rounded to 6 decimals) and
    returns nothing.
    """
    fit_seconds, abs_errors, sq_errors, r2_values = [], [], [], []

    for _ in range(rep):
        started = timeit.default_timer()
        model.fit(X_train, y_train)
        stopped = timeit.default_timer()

        predictions = model.predict(X_test)

        abs_errors.append(mean_absolute_error(y_test, predictions))
        sq_errors.append(mean_squared_error(y_test, predictions))
        r2_values.append(r2_score(y_test, predictions))
        fit_seconds.append(stopped - started)

    # Average each series over the number of repetitions.
    avg_time = sum(fit_seconds) / rep
    avg_mse = sum(sq_errors) / rep
    avg_mae = sum(abs_errors) / rep
    avg_r2 = sum(r2_values) / rep

    print(f'Time: {round(avg_time, 6)} seconds')
    print('MAE: ', round(avg_mae, 6))
    print('MSE: ', round(avg_mse, 6))
    print('R2:  ',round(avg_r2, 6))
    print('--------------------')

In [396]:
# Decision tree with default hyperparameters apart from the fixed seed.
model = DecisionTreeRegressor(
    random_state=10,
)
run(model, rep=1)

Time: 0.062029 seconds
MAE:  1517.649754
MSE:  17613070.538325
R2:   0.789922
--------------------


In [397]:
# Random forest: 100 trees, depth capped at 10, fixed seed.
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=2,
    random_state=10,
)
run(model, rep=1)

Time: 2.559696 seconds
MAE:  1134.986212
MSE:  8687802.579343
R2:   0.896377
--------------------


In [398]:
# Gradient boosting with 400 boosting stages and a fixed seed.
model = GradientBoostingRegressor(
    n_estimators=400,
    random_state=10,
)
run(model, rep=1)

Time: 4.559552 seconds
MAE:  1371.464677
MSE:  8744171.79077
R2:   0.895705
--------------------


In [399]:
# XGBoost: 500 trees of depth 6, learning rate 0.02, 62% row subsampling,
# fixed seed.
model = XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.02,
    subsample=0.62,
    random_state=10,
)
run(model, rep=1)

Time: 1.074593 seconds
MAE:  1061.139458
MSE:  7856852.984316
R2:   0.906288
--------------------


In [400]:
# Candidate models for the repeated benchmark. No random_state is set here, so
# each fit uses a fresh seed.
# XGBRegressor is configured through its documented `learning_rate` argument
# (the same spelling used in the single-model cell) and not through the `eta`
# alias, which is only forwarded via **kwargs.
# NOTE(review): min_samples_split is 5 here but 10 in the single
# RandomForestRegressor cell - confirm which value is intended.
models = [
    DecisionTreeRegressor(),
    RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_leaf=2, min_samples_split=5),
    GradientBoostingRegressor(n_estimators=400),
    XGBRegressor(n_estimators=500, max_depth=6, learning_rate=0.02, subsample=0.62),
]

In [401]:
# Repeated benchmark: for every entry of `models`, re-run preprocess with an
# unseeded split and report metrics averaged over 20 fits.
# The loop used to be switched off by an unconditional `break`, which left its
# body unreachable; the explicit flag keeps it disabled by default while making
# the intent visible. Set it to True to run the (slow) benchmark.
RUN_REPEATED_BENCHMARK = False

if RUN_REPEATED_BENCHMARK:
    for model in models:
        preprocess(None)
        run(model, 20)

[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, n_estimators=200, subsample=0.7; total time=   1.8s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=6, n_estimators=200, subsample=0.62; total time=   0.9s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=10, n_estimators=100, subsample=1.0; total time=   6.5s
[CV] END colsample_bytree=0.6, learning_rate=0.1, max_depth=6, n_estimators=100, subsample=0.8; total time=   0.7s
[CV] END colsample_bytree=0.6, learning_rate=0.1, max_depth=6, n_estimators=200, subsample=0.7; total time=   1.0s
[CV] END colsample_bytree=0.6, learning_rate=0.1, max_depth=10, n_estimators=100, subsample=0.62; total time=   4.4s
[CV] END colsample_bytree=0.6, learning_rate=0.1, max_depth=10, n_estimators=200, subsample=0.7; total time=  10.0s
[CV] END colsample_bytree=0.6, learning_rate=0.2, max_depth=10, n_estimators=200, subsample=0.62; total time=   9.5s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=10, n_est