In [1]:
#general imports
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import sklearn
import scipy.optimize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC 
import random
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier

#fmclp imports
from final_fmclp_26_09_2022 import fmclp, cuae, synthetic_dataset

In [4]:
# Load the loan table from a CSV located in the notebook's working directory.
# NOTE(review): the status mapping in the next cell has no case for statuses such as
# 'Current', so this file is presumably already filtered — confirm.
loan = pd.read_csv('loan_new.csv')

In [12]:
#create risk groups 0 - good, 1 - bad, 2  - dubious
_LOAN_STATUS_GROUPS = {
    'Fully Paid': 0,
    'Charged Off': 1,
    'Default': 1,
    'Does not meet the credit policy. Status:Charged Off': 1,
    'Late (31-120 days)': 2,
    'Issued': 2,
    'In Grace Period': 2,
    'Late (16-30 days)': 2,
    'Does not meet the credit policy. Status:Fully Paid': 2,
}


def loan_grouper(x):
    """Map a raw ``loan_status`` string to its risk group.

    Parameters
    ----------
    x : str
        One of the status labels listed in ``_LOAN_STATUS_GROUPS``.

    Returns
    -------
    int
        0 (good), 1 (bad) or 2 (dubious).

    Raises
    ------
    ValueError
        If ``x`` is not a known status (for example ``'Current'`` or a missing
        value). The previous if/elif chain fell through in that case and failed
        with an ``UnboundLocalError`` that did not name the offending value.
    """
    try:
        return _LOAN_STATUS_GROUPS[x]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs; both mean "not a status we know".
        raise ValueError(f"Unknown loan_status value: {x!r}") from None
# Encode the textual status as the numeric risk group, then discard the raw
# status together with the location/date columns.
loan['target'] = loan['loan_status'].apply(loan_grouper)
for unused_column in ('loan_status', 'zip_code', 'issue_d', 'addr_state'):
    del loan[unused_column]

# Keep only the columns used downstream (the target plus the chosen predictors).
selected_columns = [
    'loan_amnt', 'term', 'int_rate', 'verification_status', 'initial_list_status',
    'target', 'sub_grade', 'home_ownership', 'purpose', 'dti', 'revol_bal',
    'total_pymnt', 'total_rec_prncp',
]
loan = loan[selected_columns]

# One-hot encode the categorical columns (first level of each dropped) and
# expose the initial_list_status dummy under the name 'attr'.
# NOTE(review): 'attr' is presumably the sensitive-attribute column that fmclp
# looks for — confirm against the fmclp module.
loan = pd.get_dummies(loan, drop_first=True).rename(
    columns={'initial_list_status_w': 'attr'}
)



In [13]:
# Preview the encoded frame; pandas abbreviates the rows and columns in the rendered output.
loan

Unnamed: 0,loan_amnt,int_rate,target,dti,revol_bal,total_pymnt,total_rec_prncp,term_ 60 months,verification_status_Source Verified,verification_status_Verified,...,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding
0,5000.0,10.65,0,27.65,13648.0,5861.071414,5000.00,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2500.0,15.27,1,1.00,1687.0,1008.710000,456.46,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2400.0,15.96,0,8.72,2956.0,3003.653644,2400.00,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,10000.0,13.49,0,20.00,5598.0,12226.302212,10000.00,0,1,0,...,0,0,0,0,0,1,0,0,0,0
5,5000.0,7.90,0,11.20,7963.0,5631.377753,5000.00,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887351,4200.0,15.99,1,36.93,12943.0,1026.020000,667.96,0,0,1,...,0,0,0,1,0,0,0,0,0,0
887364,10775.0,6.03,0,13.22,10776.0,11071.870000,10775.00,0,0,0,...,0,0,0,0,0,0,0,0,0,0
887366,6225.0,16.49,0,18.58,1756.0,7050.460000,6225.00,0,1,0,...,0,0,0,0,0,0,0,0,0,0
887369,4000.0,8.67,0,12.63,1700.0,4158.020000,4000.00,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Baseline: accuracy of an unconstrained LightGBM classifier on a held-out split.
# The original cell named the features `y` and the target `x`, overwrote the fitted
# model with its own predictions, and passed accuracy_score its arguments in
# (y_pred, y_true) order. New, unambiguous names are used here on purpose so that
# any stale reference to the old swapped names fails loudly instead of silently
# picking up the wrong object.
# NOTE(review): total_pymnt / total_rec_prncp look like repayment figures and may
# leak the target — confirm they are intended as features.
SPLIT_SEED = 42  # fixed so the displayed accuracy is reproducible on re-run

features = loan.drop('target', axis=1)
target = loan['target']
features_train, features_test, target_train, target_test = train_test_split(
    features, target, random_state=SPLIT_SEED
)

baseline_model = LGBMClassifier()
baseline_model.fit(features_train, target_train)
baseline_predictions = baseline_model.predict(features_test)
accuracy_score(target_test, baseline_predictions)

0.937843137254902

In [5]:
%%time

fair_diff = []
fair_ratio = []
fair_variation = []
fair_accuracy = []
unfair_diff = []
unfair_ratio = []
unfair_variation = []
unfair_accuracy = []
results = []
for i in range(100):
    main_state = random.choice(range(1000))
    res = fmclp(dataset = loan, 
               estimator = LGBMClassifier(), 
               number_iterations = 20, 
               prefit = False, 
               interior_classifier = 'rf',
               verbose = False, 
               multiplier=20, 
               random_state = main_state)
    results.append(res)
    unfair_diff.append(res['fairness_of_initial_classifier']['diff'])
    unfair_ratio.append(res['fairness_of_initial_classifier']['ratio'])
    unfair_variation.append(res['fairness_of_initial_classifier']['variation'])
    unfair_accuracy.append(res['accuracy_of_initial_classifier'])
    fair_diff.append(res['fairness_of_fair_classifier']['diff'])
    fair_ratio.append(res['fairness_of_fair_classifier']['ratio'])
    fair_variation.append(res['fairness_of_fair_classifier']['variation'])
    fair_accuracy.append(res['accuracy_of_fair_classifier'])
    
    name = f"loan_trials/loan_trial_№'{i+1}.txt"
    file = open(name,'w')
    file.write(f"""unfair_total_diff: {res['fairness_of_initial_classifier']['diff']}
unfair_ratio: {res['fairness_of_initial_classifier']['ratio']}
unfair_variation: {res['fairness_of_initial_classifier']['variation']}
unfair_accuracy: {res['accuracy_of_initial_classifier']}
fair_diff: {res['fairness_of_fair_classifier']['diff']}
fair_ratio: {res['fairness_of_fair_classifier']['ratio']}
fair_variation: {res['fairness_of_fair_classifier']['variation']}
fair_accuracy: {res['accuracy_of_fair_classifier']}
interior_classifier: rf 
multiplier: 20 
main_state: {main_state}
    """)
    file.close()
    res['fairness_of_fair_classifier']['df'].to_csv(f"loan_trials/loan_trial_№{i+1} cuae-metric-fair.csv")
    res['fairness_of_initial_classifier']['df'].to_csv(f"loan_trials/loan_trial_№{i+1} cuae-metric-unfair.csv")
    print(i+1)  

fair_diff = np.array(fair_diff)
fair_ratio = np.array(fair_ratio)
fair_variation = np.array(fair_variation)
fair_accuracy = np.array(fair_accuracy)
unfair_diff = np.array(unfair_diff)
unfair_ratio = np.array(unfair_ratio)
unfair_variation = np.array(unfair_variation)
unfair_accuracy = np.array(unfair_accuracy)

file = open('loan_trials/loan_trials.txt','w')
file.write(
f"""dataset for initial classifier training: 200 
classifier: LGBMClassifier()
number_iterations: 10
multiplier:25
interior_classifier: knn
fair_diff: {fair_diff}
fair_ratio: {fair_ratio}
fair_variation: {fair_variation}
fair_accuracy: {fair_accuracy}
unfair_diff: {unfair_diff}
unfair_ratio: {unfair_ratio}
unfair_variation: {unfair_variation}
unfair_accuracy: {unfair_accuracy}""")
file.close()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
CPU times: user 6h 3min 38s, sys: 1h 4min 19s, total: 7h 7min 58s
Wall time: 3h 58min 9s
