In [1]:
import pandas as pd
import wittgenstein as lw
print('Importing Wittgenstein Version', lw.__version__)
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
import time

def convert_label(label):
    return (label == ' false') # ' false', ' true'

Importing Wittgenstein Version 0.3.2


In [2]:
df = pd.read_csv('../data/miniloan-decisions-100K.csv') # miniloan-decisions-100K.csv, miniloan-adjusted-and-enriched.csv
# del df['yearlyReimbursement'] # Is this derived from the input?
df['approval'] = df['approval'].map(convert_label) # label needs Boolean or {0,1}
print('Label distribution')
print(df['approval'].value_counts())
df['ratio'] = df['yearlyReimbursement'] / df['income']
# df['ratio'] = round(df['yearlyReimbursement'] / df['income'], 2)
# df['ratio'] = 100 * round(df['yearlyReimbursement'] / df['income'], 2)
# df['ratio'] = df['ratio'].map(int)
# df['ratio_print'] = df['ratio'].map(str)

# df = df[df['monthDuration'] > 0]
df.info()

# Check hypothesis that data is generated from the following two rules
df_from_rule = df[((df['ratio'] < 0.3) & (df['loanAmount'] < 1000000)) == (df['approval'])]
df_from_rule.info()
# df_residue = df[(df['ratio'] < 0.3) & (df['loanAmount'] < 1000000) & ~df['approval']]
# df_residue.info()
# df_residue = df[(df['loanAmount'] >= 1000000) & df['approval']]
# df_residue.info()
# df_residue = df[(df['ratio'] >= 0.3) & df['approval']]
# df_residue.info()
# df_residue.head(5)
# print(df['ratio'].value_counts())
# df.to_csv('out_miniloan.csv', index=False)

Label distribution
True     78046
False    21954
Name: approval, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   name                 100000 non-null  object 
 1   creditScore          100000 non-null  int64  
 2   income               100000 non-null  int64  
 3   loanAmount           100000 non-null  int64  
 4   monthDuration        100000 non-null  int64  
 5   rate                 100000 non-null  float64
 6   approval             100000 non-null  bool   
 7   yearlyReimbursement  100000 non-null  int64  
 8   ratio                100000 non-null  float64
dtypes: bool(1), float64(2), int64(5), object(1)
memory usage: 6.2+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               -------------

In [3]:
# IGNORE THIS
# df2 = df[(df['ratio'] >= 0.29994492856911464) & (df['ratio'] <= 0.2999544476383457)]
# df2 = df[(df['ratio'] >= 0.299944928569114) & (df['ratio'] <= 0.29995444763835)]
# df2
# df3 = df[(df['loanAmount'] >= 999609) & df['approval']]
# df3

In [4]:
train, test = train_test_split(df, test_size=.33, random_state=42)

In [5]:
for BINS in [10,20,40]:
    print('BINS:', BINS)

    clf = lw.RIPPER(k=3, prune_size=0.33, dl_allowance=32, n_discretize_bins=BINS, verbosity=0) # n_discretize_bins=None, max_rules=2
    # dir(ripper_clf)

    start_time = time.time()
    print('Job started at {}.'.format(time.ctime()))

    clf.fit(train, class_feat='approval') # Or pass X and y data to .fit , initial_model='[ratio=0.13-0.3]'
    end_time = time.time()
    elapsed = end_time - start_time
    print('Job finished at {} in {} seconds.'.format(time.ctime(), round(elapsed, 1)))

    X_test = test.drop('approval', axis=1)
    y_test = test['approval']
    print('accuracy: ', clf.score(X_test, y_test))

    precision = clf.score(X_test, y_test, precision_score)
    recall = clf.score(X_test, y_test, recall_score)
    print(f'precision: {precision} recall: {recall}')

    clf.out_model()
    print('')
    clf

BINS: 10
Job started at Tue Feb 15 11:32:37 2022.
Job finished at Tue Feb 15 11:32:57 2022 in 20.2 seconds.
accuracy:  0.9188787878787879
precision: 0.9977594898530743 recall: 0.898184780079125
[[ratio=0.39-0.54] V
[ratio=>5.96] V
[ratio=1.02-1.5] V
[ratio=2.52-5.96] V
[ratio=0.54-0.73] V
[ratio=0.73-1.02] V
[ratio=1.5-2.52] V
[ratio=0.26-0.39 ^ yearlyReimbursement=78749.0-98638.5 ^ income=240359.6-270200.4 ^ loanAmount=1198082.8-1397676.2] V
[ratio=0.26-0.39 ^ yearlyReimbursement=78749.0-98638.5 ^ loanAmount=997885.0-1198082.8] V
[ratio=0.26-0.39 ^ yearlyReimbursement=78749.0-98638.5 ^ loanAmount=1198082.8-1397676.2] V
[ratio=0.26-0.39 ^ loanAmount=997885.0-1198082.8] V
[ratio=0.26-0.39 ^ yearlyReimbursement=78749.0-98638.5 ^ income=240359.6-270200.4 ^ loanAmount=1397676.2-1596548.2] V
[ratio=0.26-0.39 ^ yearlyReimbursement=78749.0-98638.5 ^ income=240359.6-270200.4] V
[ratio=0.26-0.39 ^ yearlyReimbursement=98638.5-119304.8] V
[ratio=0.26-0.39 ^ yearlyReimbursement=78749.0-98638.5 ^ i