In [61]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

import pandas as pd
from imblearn.over_sampling import SMOTEN, RandomOverSampler, SMOTE

import re

In [62]:
# define a function to drop words from feature
def drop_words(s, words):
    for word in words:
        s = s.replace(word, '')
    return s.strip()

In [63]:
df = pd.read_csv('credit_customers.csv')
df.drop('num_dependents', inplace=True, axis=1)

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   checking_status         1000 non-null   object 
 1   duration                1000 non-null   float64
 2   credit_history          1000 non-null   object 
 3   purpose                 1000 non-null   object 
 4   credit_amount           1000 non-null   float64
 5   savings_status          1000 non-null   object 
 6   employment              1000 non-null   object 
 7   installment_commitment  1000 non-null   float64
 8   personal_status         1000 non-null   object 
 9   other_parties           1000 non-null   object 
 10  residence_since         1000 non-null   float64
 11  property_magnitude      1000 non-null   object 
 12  age                     1000 non-null   float64
 13  other_payment_plans     1000 non-null   object 
 14  housing                 1000 non-null   o

In [65]:
# create a new column
df['credit_history_new'] = 'existing_paid'
# define a regular expression pattern
credit_pattern = re.compile(r'(all paid|delayed previously)', flags=re.IGNORECASE)
# loop through each row in the dataframe
for index, row in df.iterrows():
    match = credit_pattern.search(row['credit_history'])
    if match:
        df.at[index, 'credit_history_new'] = match.group()
        row['credit_history'] = credit_pattern.sub('', row['credit_history'])
# drop the words
words_to_drop = ['all paid', 'delayed previously']
df['credit_history'] = df['credit_history'].apply(lambda x: drop_words(x, words_to_drop))
#df.drop('personal_status', inplace=True, axis=1)
del df['credit_history']

In [66]:
# create a new column
df['housing_new'] = 'rent'
# define a regular expression pattern
gender_pattern = re.compile(r'(own)', flags=re.IGNORECASE)
# loop through each row in the dataframe
for index, row in df.iterrows():
    match = gender_pattern.search(row['housing'])
    if match:
        df.at[index, 'housing_new'] = match.group()
        row['housing'] = gender_pattern.sub('', row['housing'])
# drop the words
words_to_drop = ['rent', 'own', 'free']
df['housing'] = df['housing'].apply(lambda x: drop_words(x, words_to_drop))
#df.drop('personal_status', inplace=True, axis=1)
del df['housing']

In [67]:
# del df['foreign_worker']
# del df['own_telephone']
# del df['num_dependents']
del df['residence_since']
del df['other_parties']
del df['installment_commitment']
del df['property_magnitude']
del df['other_payment_plans']
del df['existing_credits']
del df['credit_amount']

In [68]:
# Age Group
bins = [0, 30, 40, 50, 60, 70, 120] # Define age groups
labels = ['0-30', '31-40', '41-50', '51-60', '61-70', '70+']
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, include_lowest=True)
df['age_group']=df['age_group'].astype(object)
del df['age']

# Duration Group
bins = [0, 12, 24, 36, 48, 60, 72] # Define duration groups
labels = ['0-12', '13-24', '25-36', '37-48', '49-60', '61-72']
df['duration_group'] = pd.cut(df['duration'], bins=bins, labels=labels, include_lowest=True)
df['duration_group']=df['duration_group'].astype(object)
del df['duration']

In [69]:
df.head()

Unnamed: 0,checking_status,purpose,savings_status,employment,personal_status,job,own_telephone,foreign_worker,class,credit_history_new,housing_new,age_group,duration_group
0,<0,radio/tv,no known savings,>=7,male single,skilled,yes,yes,good,existing_paid,own,61-70,0-12
1,0<=X<200,radio/tv,<100,1<=X<4,female div/dep/mar,skilled,none,yes,bad,existing_paid,own,0-30,37-48
2,no checking,education,<100,4<=X<7,male single,unskilled resident,none,yes,good,existing_paid,own,41-50,0-12
3,<0,furniture/equipment,<100,4<=X<7,male single,skilled,none,yes,good,existing_paid,rent,41-50,37-48
4,<0,new car,<100,1<=X<4,male single,skilled,none,yes,bad,delayed previously,rent,51-60,13-24


In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   checking_status     1000 non-null   object
 1   purpose             1000 non-null   object
 2   savings_status      1000 non-null   object
 3   employment          1000 non-null   object
 4   personal_status     1000 non-null   object
 5   job                 1000 non-null   object
 6   own_telephone       1000 non-null   object
 7   foreign_worker      1000 non-null   object
 8   class               1000 non-null   object
 9   credit_history_new  1000 non-null   object
 10  housing_new         1000 non-null   object
 11  age_group           1000 non-null   object
 12  duration_group      1000 non-null   object
dtypes: object(13)
memory usage: 101.7+ KB


In [71]:
del df['employment']
del df['personal_status']
del df['job']
del df['own_telephone']
del df['foreign_worker']

In [72]:
df.drop(df[df.duration_group == '61-72'].index, inplace=True)

In [73]:
def Label_Encoder(df):
    object_cols = df.select_dtypes(include='object').columns
    for col in object_cols:
        df[col] = LabelEncoder().fit_transform(df[col])
    return df
df = Label_Encoder(df)

In [74]:
smote = SMOTE()
X = df.drop('class', axis=1)
y = df['class']
X, y = smote.fit_resample(X, y)
df = pd.concat([X, y], axis=1)

In [75]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('class', axis=1), df['class'], test_size=0.3, random_state=42)
ros = RandomOverSampler(random_state=0)

X_train, y_train = ros.fit_resample(X_train, y_train)

In [76]:
pipeline = Pipeline(
    steps=[
        ('scaling', MinMaxScaler()),
        ('classifier', XGBClassifier()),
    ]
)

param_grid = {
    'classifier__max_depth': [3, 5, 7, 9],
    'classifier__learning_rate': [0.1, 0.01, 0.001],
    'classifier__n_estimators': [100, 500, 1000]
}



stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(pipeline, param_grid, cv=stratified_kfold)
grid_search.fit(X_train, y_train)

In [77]:
print(f"Best Result: {grid_search.best_score_}\nBest Parameters: {grid_search.best_params_}")


Best Result: 0.7379067052433886
Best Parameters: {'classifier__learning_rate': 0.01, 'classifier__max_depth': 7, 'classifier__n_estimators': 100}


In [78]:
# %%timeit
final_params = {
    'learning_rate' : grid_search.best_params_['classifier__learning_rate'],
    'max_depth' : grid_search.best_params_['classifier__max_depth'],
    'n_estimators' : grid_search.best_params_['classifier__n_estimators']
}


xgb_pipeline = Pipeline(
    steps=[
        ('scaling', MinMaxScaler()),
        ('classifier', XGBClassifier(**final_params)),
    ]
)

xgb_pipeline.fit(X_train, y_train)
xgb_pipeline.score(X_test, y_test)

0.75

In [81]:

%%timeit
xgb_pipeline.fit(X_train, y_train)
xgb_pipeline.score(X_test, y_test)

153 ms ± 2.85 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
