# Importing Data and Library

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,FunctionTransformer,LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.impute import SimpleImputer
import wolta.model_tools as wltm 
from sklearn.pipeline import Pipeline

In [2]:
# Load the raw loan records and discard the customer identifier column.
df = pd.read_csv("LoanDataset.csv").drop(columns=["customer_id"])

In [3]:
# Preview one randomly selected row (no random_state is set, so the row differs between runs).
df.sample()

Unnamed: 0,customer_age,customer_income,home_ownership,employment_duration,loan_intent,loan_grade,loan_amnt,loan_int_rate,term_years,historical_default,cred_hist_length,Current_loan_status
1854,24,102000,RENT,8.0,HOMEIMPROVEMENT,B,"£15,000.00",14.22,8,,4,NO DEFAULT


In [51]:
# Summarise dtypes and non-null counts to spot missing values and columns stored as text.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32586 entries, 0 to 32585
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   customer_age         32586 non-null  int64  
 1   customer_income      32586 non-null  object 
 2   home_ownership       32586 non-null  object 
 3   employment_duration  31691 non-null  float64
 4   loan_intent          32586 non-null  object 
 5   loan_grade           32586 non-null  object 
 6   loan_amnt            32585 non-null  object 
 7   loan_int_rate        29470 non-null  float64
 8   term_years           32586 non-null  int64  
 9   historical_default   11849 non-null  object 
 10  cred_hist_length     32586 non-null  int64  
 11  Current_loan_status  32582 non-null  object 
dtypes: float64(2), int64(3), object(7)
memory usage: 3.0+ MB


# Data Preprocessing

In [56]:
# Discard the rows whose target label is missing.
df = df.dropna(subset=["Current_loan_status"])

In [57]:
# Feature order matters: the ColumnTransformers below address columns by position.
# Numeric features come first, followed by every remaining column except the target.
numeric_cols = ['customer_age','customer_income','employment_duration','loan_amnt','term_years','cred_hist_length','loan_int_rate']
target_col = 'Current_loan_status'
col = numeric_cols + [c for c in df.columns if c not in numeric_cols and c != target_col]
col

['customer_age',
 'customer_income',
 'employment_duration',
 'loan_amnt',
 'term_years',
 'cred_hist_length',
 'loan_int_rate',
 'home_ownership',
 'loan_intent',
 'loan_grade',
 'historical_default']

In [117]:
# Integer-encode the target labels, then hold out 20% of the rows for testing.
# NOTE(review): LabelEncoder numbers classes in sorted order, so check
# lab.classes_ to confirm which class becomes 1 (the positive class that
# binary metrics such as f1_score report on by default).
lab = LabelEncoder()
encoded_target = lab.fit_transform(df['Current_loan_status'])
df['Current_loan_status'] = encoded_target
X_train, X_test, y_train, y_test = train_test_split(
    df[col],
    df['Current_loan_status'],
    test_size=0.2,
    random_state=42,
)

In [159]:
# def remove_comma_euro_and_convert_to_float(col):
#     col = col.str.replace(',', '').str.replace('£', '')
#     return col.astype(float)

# # Create a FunctionTransformer
# # remove_comma_euro_transformer = FunctionTransformer(lambda x: x.apply(remove_comma_euro_and_convert_to_float))
# remove_comma_euro_transformer = FunctionTransformer(remove_comma_euro_and_convert_to_float)

def remove_comma_euro_and_convert_to_float(df):
    """Convert currency-formatted text columns to floats.

    Despite the name, the currency symbol removed is the pound sign ('£'),
    together with the thousands separator (',').

    Parameters
    ----------
    df : pandas.DataFrame or 2-D array-like
        Columns holding values such as "£10,000.00" or "50000". Columns that
        are already numeric are accepted and simply cast to float.

    Returns
    -------
    pandas.DataFrame
        Same shape as the input with every column as float; missing values
        stay NaN.
    """
    def _to_float(col):
        # Numeric columns have no .str accessor, so cast them directly.
        if pd.api.types.is_numeric_dtype(col):
            return col.astype(float)
        # regex=False treats ',' and '£' as literals on every pandas version.
        cleaned = col.str.replace(',', '', regex=False).str.replace('£', '', regex=False)
        return cleaned.astype(float)

    # Wrapping in DataFrame lets plain 2-D arrays through as well as DataFrames.
    return pd.DataFrame(df).apply(_to_float)

# Wrap the cleaning function in a FunctionTransformer so it can be used as a
# step inside the ColumnTransformer / Pipeline defined below.
remove_comma_euro_transformer = FunctionTransformer(remove_comma_euro_and_convert_to_float)

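- **Inside a `ColumnTransformer`, a column selected with a list (e.g. `[1]`) is passed to the `FunctionTransformer` as a one-column DataFrame rather than a Series, so the string clean-up is applied column by column using `DataFrame.apply` with a lambda. Note that the symbol being stripped is the pound sign (`£`), despite the "euro" in the function name.**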

In [60]:
# Count missing values per feature in the training split to decide what needs imputing.
X_train.isnull().sum()

customer_age               0
customer_income            0
employment_duration      723
loan_amnt                  1
term_years                 0
cred_hist_length           0
loan_int_rate           2479
home_ownership             0
loan_intent                0
loan_grade                 0
historical_default     16574
dtype: int64

In [61]:
# Display the training features (column order follows `col`).
X_train

Unnamed: 0,customer_age,customer_income,employment_duration,loan_amnt,term_years,cred_hist_length,loan_int_rate,home_ownership,loan_intent,loan_grade,historical_default
24390,31,50000,1.0,"£10,000.00",1,8,12.69,RENT,MEDICAL,B,
13321,25,44000,1.0,"£2,800.00",5,3,6.92,RENT,EDUCATION,A,
1338,26,25000,1.0,"£1,000.00",6,3,14.65,RENT,MEDICAL,B,Y
24084,27,70000,9.0,"£16,000.00",7,7,7.90,MORTGAGE,EDUCATION,A,
11963,22,40000,6.0,"£5,000.00",2,4,16.29,OWN,EDUCATION,C,Y
...,...,...,...,...,...,...,...,...,...,...,...
29806,39,38500,7.0,"£3,500.00",9,17,13.98,MORTGAGE,MEDICAL,C,Y
5390,25,45000,2.0,"£3,000.00",5,3,6.92,MORTGAGE,MEDICAL,A,
860,24,138000,2.0,"£20,000.00",9,3,15.99,RENT,VENTURE,C,Y
15795,25,175000,9.0,"£7,550.00",5,4,11.49,MORTGAGE,PERSONAL,A,


## ColumnTransformers for Data Cleaning and Imputation

###  Column transformer for Clean and clip

In [167]:
from sklearn.impute import KNNImputer
# clip the age to 100 and clean values in customer_income and loan_amount
def clip(df):
    """Cap every value in ``df`` at 100 (used here to limit customer_age).

    Uses the vectorized ``DataFrame.clip`` instead of an element-wise
    ``applymap``; values at or below 100 and missing values are unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to cap.

    Returns
    -------
    pandas.DataFrame
        A new frame with values above 100 replaced by 100.
    """
    return df.clip(upper=100)
# First preprocessing stage. Column positions refer to the order of `col`:
#   1 -> customer_income, 3 -> loan_amnt, 0 -> customer_age.
# ColumnTransformer emits the transformer outputs in list order and appends the
# passthrough columns on the right, so the output starts with customer_income,
# loan_amnt, customer_age and continues with the untouched columns in their
# original relative order. Later stages index into that new order.
clean_clip = ColumnTransformer(
    transformers=[
        ("clean1",remove_comma_euro_transformer,[1]),
        ("clean2",remove_comma_euro_transformer,[3]),
#         ("clipping",FunctionTransformer(lambda x: x.clip(upper=100)),[0]),
        ("clipping",FunctionTransformer(clip),[0]),
         ],remainder='passthrough'
)

### Column transformer for Imputing

In [77]:
# Impute employment_duration, which sits at position 3 in the output of clean_clip.
# KNN imputation was chosen because SimpleImputer was affecting the data distribution.
# NOTE(review): this KNNImputer receives a single column, so there are no other
# features to measure neighbour distance with; per the scikit-learn docs it then
# falls back to the training-set mean of that column - confirm this is intended.
interest_imputer = ColumnTransformer(
    transformers=[
        ("num_imputerr",KNNImputer(n_neighbors=5),[3]),
         ],remainder='passthrough'
)
# Positions below refer to the output of interest_imputer (imputed column first,
# then the passthrough columns): 2 -> loan_amnt, 6 -> loan_int_rate, 10 -> historical_default.
# Missing historical_default values become their own "missing" category.

imputer = ColumnTransformer(
    transformers=[
        ("near_imputer",KNNImputer(n_neighbors=5),[2,6]),
        ("cat_imputer",SimpleImputer(strategy="constant",fill_value="missing"),[10]),
        
         ],remainder='passthrough'
)

###  Column transformer for Scaling and Encoding

In [78]:
# Final preprocessing stage. Positions refer to the output of `imputer`.
# Standardised numeric columns:
#     0 -> loan_amnt, 1 -> loan_int_rate, 3 -> employment_duration,
#     4 -> customer_income, 5 -> customer_age, 6 -> term_years, 7 -> cred_hist_length
# One-hot encoded categorical columns:
#     2 -> historical_default, 8 -> home_ownership, 9 -> loan_intent, 10 -> loan_grade
# handle_unknown="ignore" makes a category that was not seen during fit encode
# as all zeros at transform time instead of raising, so prediction on new data
# does not fail; the encoding of the training data is unchanged.
encoding = ColumnTransformer(
    transformers=[
        ("Normalizing", StandardScaler(), [0, 1, 3, 4, 5, 6, 7]),
        ("encoding", OneHotEncoder(handle_unknown="ignore"), [2, 8, 9, 10]),
    ],
    remainder='passthrough',
)

### Testing Column Transformers

In [168]:
# Run the four stages one after another by hand to check that each stage's
# positional column indices line up with the output of the previous stage.
X_train_transfromed = X_train
for stage in (clean_clip, interest_imputer, imputer, encoding):
    X_train_transfromed = stage.fit_transform(X_train_transfromed)
pd.DataFrame(X_train_transfromed)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.010009,0.534976,-0.934634,-0.252835,0.512079,-1.523317,0.539707,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.298400,-1.309202,-0.934634,-0.347290,-0.433446,0.096612,-0.689730,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.375503,1.161421,-0.934634,-0.646399,-0.275858,0.501594,-0.689730,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.267018,-0.995980,1.034214,0.062016,-0.118271,0.906576,0.293820,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,-0.204164,1.685590,0.295896,-0.410261,-0.906208,-1.118335,-0.443843,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26060,-0.268416,0.947279,0.542002,-0.433874,1.772779,1.716540,2.752695,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
26061,-0.289833,-1.309202,-0.688528,-0.331548,-0.433446,0.096612,-0.689730,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
26062,0.438356,1.589706,-0.688528,1.132510,-0.591033,1.716540,-0.689730,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
26063,-0.094935,0.151438,1.034214,1.714985,-0.433446,0.096612,-0.443843,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


# Data Preprocessing Pipeline with Scikit-learn

In [170]:
# Chain the four preprocessing stages; each stage's positional column indices
# assume the column order produced by the stage before it.
preprocessing_steps = [
    ("clean_n_clip", clean_clip),
    ("loan_interest_imputer", interest_imputer),
    ("imputer_othercolumns", imputer),
    ("encoding", encoding),
]
pipeline = Pipeline(steps=preprocessing_steps)

In [171]:
# Fit the preprocessing pipeline on the training split only, then verify that
# no missing values survive the transformation.
X_train_trans = pd.DataFrame(pipeline.fit_transform(X_train))
X_train_trans.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
dtype: int64

In [172]:
# Apply the already-fitted pipeline to the test split (transform only, no
# refitting) and verify that no missing values remain.
X_test_trans = pd.DataFrame(pipeline.transform(X_test))
X_test_trans.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
dtype: int64

# Model Training

In [68]:
from wolta.model_tools import compare_models

# Benchmark several classifiers on the transformed data and report accuracy,
# precision and F1 on the test split. Judging by the printed report, the keys
# map to AdaBoost, CatBoost, Random Forest, Decision Tree, Logistic Regression
# and LightGBM respectively.
compare_models('clf',
              ['ada', 'cat', 'raf', 'dtr', 'log', 'lbm'],
              ['acc', 'precision','f1'],
              X_train_trans, y_train, X_test_trans, y_test)

AdaBoost
Accuracy Score: 0.9469080865428878
Precision Score: 0.9470562861692814
F1 Score (weighted): 0.9469790305477506
***
CatBoost
Accuracy Score: 0.9759091606567439
Precision Score: 0.9757794969019704
F1 Score (weighted): 0.9756774461560641
***
Random Forest
Accuracy Score: 0.9711523707227252
Precision Score: 0.9709802446688498
F1 Score (weighted): 0.9707948557860894
***
Decision Tree
Accuracy Score: 0.955040662881694
Precision Score: 0.9549589653730773
F1 Score (weighted): 0.9549982263643408
***


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression
Accuracy Score: 0.9370876170016879
Precision Score: 0.9365233120921962
F1 Score (weighted): 0.9367598687721653
***
LightGBM
Accuracy Score: 0.9742212674543501
Precision Score: 0.9740715273482174
F1 Score (weighted): 0.9739546337545095
***


**CatBoost** delivers the best results of all these models, with a weighted `f1 score` of about `97.57%` (`0.9757`).

In [92]:
# Show the signature of compare_models for reference.
compare_models

<function wolta.model_tools.compare_models(algo_type, algorithms, metrics, X_train, y_train, X_test, y_test)>

In [120]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
# CatBoost trial: 1000 iterations, depth 7, learning rate 0.01.
model = CatBoostClassifier(
    iterations=1000, depth=7, learning_rate=0.01, verbose=False, eval_metric='F1'
)
model.fit(X_train_trans, y_train)
# Predict once and reuse the predictions for scoring.
test_preds = model.predict(X_test_trans)
f1_score(y_test, test_preds)

0.9840261739799846

In [121]:
print((model.predict(X_test_trans)==y_test).mean())

0.9745281571275126


In [122]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
# CatBoost trial: 1000 iterations, depth 7, learning rate 0.03.
model = CatBoostClassifier(
    iterations=1000, depth=7, learning_rate=0.03, verbose=False, eval_metric='F1'
)
model.fit(X_train_trans, y_train)
# Predict once and reuse the predictions for both metrics.
test_preds = model.predict(X_test_trans)
print((test_preds == y_test).mean())
f1_score(y_test, test_preds)

0.9752953813104189


0.9844759425320606

In [123]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
# CatBoost trial: 1000 iterations, depth 7, learning rate 0.04.
model = CatBoostClassifier(
    iterations=1000, depth=7, learning_rate=0.04, verbose=False, eval_metric='F1'
)
model.fit(X_train_trans, y_train)
# Predict once and reuse the predictions for both metrics.
test_preds = model.predict(X_test_trans)
print((test_preds == y_test).mean())
f1_score(y_test, test_preds)

0.9748350468006751


0.98419734052804

In [124]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
# CatBoost trial: 1000 iterations, depth 7, learning rate 0.05.
model = CatBoostClassifier(
    iterations=1000, depth=7, learning_rate=0.05, verbose=False, eval_metric='F1'
)
model.fit(X_train_trans, y_train)
# Predict once and reuse the predictions for both metrics.
test_preds = model.predict(X_test_trans)
print((test_preds == y_test).mean())
f1_score(y_test, test_preds)

0.9734540432714439


0.9833156524254991

In [125]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
# CatBoost trial: 1000 iterations, depth 8, learning rate 0.03.
model = CatBoostClassifier(
    iterations=1000, depth=8, learning_rate=0.03, verbose=False, eval_metric='F1'
)
model.fit(X_train_trans, y_train)
# Predict once and reuse the predictions for both metrics.
test_preds = model.predict(X_test_trans)
print((test_preds == y_test).mean())
f1_score(y_test, test_preds)

0.9762160503299064


0.9850544788352136

In [144]:
# Exhaustive search over CatBoost iterations / depth / learning rate.
# Results are collected as plain records and turned into a DataFrame once at
# the end, instead of growing a DataFrame with pd.concat on every iteration.
# NOTE(review): candidates are scored on the held-out test split, so the best
# score is an optimistic estimate; consider a validation split or cross-validation.
records = []
for itr in [1000,1200,1500]:
    for dep in [3,5,8,10]:
        for lr in [0.001,0.005,0.01,0.3]:
            model = CatBoostClassifier(iterations=itr,depth=dep,learning_rate=lr,verbose=False,eval_metric='F1')
            model.fit(X_train_trans,y_train)
            # Predict once and reuse the predictions for both metrics.
            preds = model.predict(X_test_trans)
            dic = {
                "iteration": itr,
                "depth": dep,
                "lr": lr,
                "accuracy": (preds == y_test).mean(),
                "f1_score": f1_score(y_test, preds),
            }
            print(dic)
            records.append(dic)

# One row per candidate, in evaluation order, with a 0..n-1 index.
scores = pd.DataFrame(records, columns=['iteration','depth','lr','accuracy','f1_score'])

{'iteration': 1000, 'depth': 3, 'lr': 0.001, 'accuracy': 0.9373945066748504, 'f1_score': 0.9615094339622641}
{'iteration': 1000, 'depth': 3, 'lr': 0.005, 'accuracy': 0.9525855454963941, 'f1_score': 0.9705293276108727}
{'iteration': 1000, 'depth': 3, 'lr': 0.01, 'accuracy': 0.9620991253644315, 'f1_score': 0.976334195650091}
{'iteration': 1000, 'depth': 3, 'lr': 0.3, 'accuracy': 0.9736074881080251, 'f1_score': 0.9834168916313151}
{'iteration': 1000, 'depth': 5, 'lr': 0.001, 'accuracy': 0.9455270830136566, 'f1_score': 0.9663793919878777}
{'iteration': 1000, 'depth': 5, 'lr': 0.005, 'accuracy': 0.9639404634034064, 'f1_score': 0.9775098095511533}
{'iteration': 1000, 'depth': 5, 'lr': 0.01, 'accuracy': 0.9702317017032377, 'f1_score': 0.9813569094753027}
{'iteration': 1000, 'depth': 5, 'lr': 0.3, 'accuracy': 0.9740678226177689, 'f1_score': 0.9837014176873373}
{'iteration': 1000, 'depth': 8, 'lr': 0.001, 'accuracy': 0.9573423354304128, 'f1_score': 0.9734479465138491}
{'iteration': 1000, 'depth

In [None]:
# model = CatBoostClassifier(iterations=1000,depth=8,learning_rate=0.03,verbose=False,eval_metric='F1')

In [145]:
# Show the three hyper-parameter combinations with the highest F1 score.
scores.sort_values(by="f1_score",ascending=False).head(3)

Unnamed: 0,iteration,depth,lr,accuracy,f1_score
42,1500,8,0.01,0.975295,0.984491
10,1000,8,0.01,0.975142,0.984402
26,1200,8,0.01,0.975142,0.984399


## Training final model

In [173]:
# Refit the configuration that ranked first in the grid search
# (iterations=1500, depth=8, learning_rate=0.01) and report its test metrics.
model = CatBoostClassifier(iterations=1500,depth=8,learning_rate=0.01,verbose=False,eval_metric='F1')
model.fit(X_train_trans,y_train)
# Predict once and reuse the predictions for both metrics.
test_preds = model.predict(X_test_trans)
print("Accuracy :",(test_preds==y_test).mean())
print("F1 Score :",f1_score(y_test,test_preds))

Accuacy : 0.9752953813104189
F1 Score : 0.9844908968307484


In [174]:
# Wrap the fitted CatBoost model so it can be used as a pipeline stage.
pipeline2 = Pipeline(steps=[("model", model)])

# End-to-end estimator: raw features -> preprocessing pipeline -> fitted model.
pipline_final = Pipeline(
    steps=[
        ("transformation", pipeline),
        ("model", pipeline2),
    ]
)

In [175]:
# End-to-end check on the raw test features: the accuracy should match the
# value obtained from the pre-transformed test set.
(pipline_final.predict(X_test)==y_test).mean()

0.9752953813104189

In [176]:
import pickle
# Persist the fitted pipeline to disk.
# NOTE(review): this saves `pipeline` (preprocessing only), not `pipline_final`,
# so the trained model is not part of pipeline.pkl - confirm that is intended.
# NOTE(review): the pipeline references the notebook-defined functions
# remove_comma_euro_and_convert_to_float and clip; pickle stores functions by
# reference, so they presumably must be defined wherever the file is loaded.
with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [178]:
# Reload the pickled pipeline to confirm it can be read back.
# Only unpickle files from a trusted source: pickle.load can execute arbitrary code.
with open('pipeline.pkl', 'rb') as file:
    loaded_pipeline = pickle.load(file)

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import __version__ as sklearn_version
import matplotlib as mpl

# Report the installed version of each library so the environment can be reproduced.
installed_versions = {
    "pandas version:": pd.__version__,
    "numpy version:": np.__version__,
    "seaborn version:": sns.__version__,
    "scikit-learn version:": sklearn_version,
    "matplotlib version:": mpl.__version__,
}
for label, version in installed_versions.items():
    print(label, version)
# No version is looked up for wolta in this cell.
print("wolta version: (not available)")


pandas version: 2.0.3
numpy version: 1.24.3
seaborn version: 0.12.2
scikit-learn version: 1.3.0
matplotlib version: 3.7.2
wolta version: (not available)


In [3]:
# Write the libraries this notebook imports to requirements.txt.
# seaborn and catboost are imported by the notebook and are therefore listed
# too; seaborn is pinned to the version printed by the version-report cell,
# while catboost and wolta stay unpinned because their versions are not recorded.
requirements = [
    "pandas==2.0.3",
    "numpy==1.24.3",
    "seaborn==0.12.2",
    "scikit-learn==1.3.0",
    "matplotlib==3.7.2",
    "catboost",
    "wolta",
]
with open("requirements.txt", "w") as file:
    file.write("\n".join(requirements) + "\n")