In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [17]:
data = pd.read_csv('data_final.csv')
data = data.drop('record', axis = 1)
oot = pd.read_csv('data_final_OOT.csv')
oot = oot.drop('Unnamed: 0',axis = 1)
label = pd.read_csv('fraud_label.csv')
data_label = label[label.record < 833508].drop(['Unnamed: 0', 'record'], axis = 1)
oot_label = label[label.record >= 833508].drop(['Unnamed: 0', 'record'], axis = 1)
oot_label = oot_label.reset_index()

In [18]:
train_x, test_x, train_y, test_y = train_test_split(data, data_label, test_size = 0.2, random_state = 42)

In [19]:
scaler = StandardScaler()

In [20]:
train_x1 = scaler.fit_transform(train_x.to_numpy())
train_x1 = pd.DataFrame(train_x1, columns = train_x.columns)
test_x1 = scaler.fit_transform(test_x.to_numpy())
test_x1 = pd.DataFrame(test_x1, columns = test_x.columns)
oot1 = scaler.fit_transform(oot.to_numpy())
oot1 = pd.DataFrame(oot1, columns = oot.columns)

In [21]:
train_x1.to_csv('train_x.csv')
test_x1.to_csv('test_x.csv')
oot1.to_csv('oot_x.csv')

In [22]:
train_y.to_csv('train_y.csv')
test_y.to_csv('test_y.csv')
oot_label.to_csv('oot_y.csv')

### Trial

In [23]:
train_x = pd.read_csv('train_x.csv')
train_y = pd.read_csv('train_y.csv')
test_x = pd.read_csv('test_x.csv')
test_y = pd.read_csv('test_y.csv')
oot_x = pd.read_csv('oot_x.csv')
oot_y = pd.read_csv('oot_y.csv')

In [24]:
%%time
rf = RandomForestRegressor(n_estimators = 50, max_depth = 10, min_samples_leaf = 5, random_state = 42)
rf.fit(train_x, train_y)

CPU times: user 40.3 s, sys: 645 ms, total: 40.9 s
Wall time: 45.8 s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=5, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=50,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [25]:
train_pred = rf.predict(train_x)
train_data = train_x
train_data['fraud_label'] = train_y['fraud_label']
train_data['pred'] = train_pred
topRows = int(round(len(train_data)*0.03)) # 3% of population
temp = train_data.sort_values('pred', ascending = False).head(topRows)
needed = temp.loc[:,'fraud_label']
FDR_train = sum(needed)/sum(train_data['fraud_label'])
FDR_train

0.5395474361635872

In [26]:
test_pred = rf.predict(test_x)
test_data = test_x
test_data['fraud_label'] = test_y['fraud_label']
test_data['pred'] = test_pred
topRows = int(round(len(test_data)*0.03)) # 3% of population
temp = test_data.sort_values('pred', ascending = False).head(topRows)
needed = temp.loc[:,'fraud_label']
FDR_test = sum(needed)/sum(test_data['fraud_label'])
FDR_test

0.53771597134429

In [27]:
oot_pred = rf.predict(oot_x)
oot_data = oot_x
oot_data['fraud_label'] = oot_y['fraud_label']
oot_data['pred'] = oot_pred
topRows = int(round(len(oot_data)*0.03)) # 3% of population
temp = oot_data.sort_values('pred', ascending = False).head(topRows)
needed = temp.loc[:,'fraud_label']
FDR_oot = sum(needed)/sum(oot_data['fraud_label'])
FDR_oot

0.46563285834031853

### Random Forest

In [28]:
%%time
rf_FDR = []

estimators = [20,50,80]
maxdepth = [10,20]
minsamplesleaf = [5,10]

for e in estimators:
    for d in maxdepth:
        for l in minsamplesleaf:
            
            train_x = pd.read_csv('train_x.csv')
            train_y = pd.read_csv('train_y.csv')
            test_x = pd.read_csv('test_x.csv')
            test_y = pd.read_csv('test_y.csv')
            oot_x = pd.read_csv('oot_x.csv')
            oot_y = pd.read_csv('oot_y.csv')
            
            rf = RandomForestRegressor(n_estimators = e, max_depth = d, min_samples_leaf = l, random_state = 42)
            rf.fit(train_x, train_y)
            train_pred = rf.predict(train_x)
            test_pred = rf.predict(test_x)
            oot_pred = rf.predict(oot_x)
            
            # FDR train
            train_data = train_x
            train_data['fraud_label'] = train_y['fraud_label']
            train_data['pred'] = train_pred
            topRows = int(round(len(train_data)*0.03)) # 3% of population
            temp = train_data.sort_values('pred', ascending = False).head(topRows)
            needed = temp.loc[:,'fraud_label']
            FDR_train = sum(needed)/sum(train_data['fraud_label'])

            # FDR test
            test_data = test_x
            test_data['fraud_label'] = test_y['fraud_label']
            test_data['pred'] = test_pred
            topRows = int(round(len(test_data)*0.03)) # 3% of population
            temp = test_data.sort_values('pred', ascending = False).head(topRows)
            needed = temp.loc[:,'fraud_label']
            FDR_test = sum(needed)/sum(test_data['fraud_label'])
            
            # FDR oot
            oot_data = oot_x
            oot_data['fraud_label'] = oot_y['fraud_label']
            oot_data['pred'] = oot_pred
            topRows = int(round(len(oot_data)*0.03)) # 3% of population
            temp = oot_data.sort_values('pred', ascending = False).head(topRows)
            needed = temp.loc[:,'fraud_label']
            FDR_oot = sum(needed)/sum(oot_data['fraud_label'])

            rf_FDR.append([e, d, l, FDR_train, FDR_test, FDR_oot])

CPU times: user 10min 50s, sys: 12.5 s, total: 11min 3s
Wall time: 10min 59s


In [29]:
rf_FDR = pd.DataFrame(rf_FDR, columns = ['n_estimators', 'max_depth', 'min_samples_leaf', 'train', 'test', 'oot'])
#FDR.drop([0,1],axis = 0, inplace = True)
#FDR.loc['mean',:] = FDR.mean(axis = 0)
rf_FDR

Unnamed: 0,n_estimators,max_depth,min_samples_leaf,train,test,oot
0,20,10,5,0.538925,0.536452,0.512992
1,20,10,10,0.539028,0.536452,0.523051
2,20,20,5,0.551381,0.528445,0.506287
3,20,20,10,0.550446,0.531816,0.515926
4,50,10,5,0.539547,0.537716,0.521375
5,50,10,10,0.54017,0.537295,0.525985
6,50,20,5,0.551692,0.530552,0.517184
7,50,20,10,0.550758,0.532238,0.517603
8,80,10,5,0.539547,0.537295,0.522213
9,80,10,10,0.539547,0.536452,0.522632


In [30]:
rf_FDR.to_excel('rf_FDR1.xlsx', index=True, header=True)

### Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression 
x_train,x_test,y_train,y_test = train_test_split(df,v['fraud_label'],test_size=0.2)
LogisticRegression
solver=['newton-cg','lbfgs','liblinear','sag','saga']
param_grid = dict(solver=solver)
kfold=StratifiedKFold(n_splits=5,shuffle=True)
model_LR=LogisticRegression()
grid_search=GridSearchCV(model_LR,param_grid,scoring='accuracy',cv=kfold)
grid_result=grid_search.fit(x_train,y_train)
means=grid_result.cv_results_['mean_test_score']
params=grid_result.cv_results_['params']
df=pd.DataFrame()
df['params']=params
df['mean test score']=means
df

In [13]:
%%time
lr_FDR = []

solver=['newton-cg','lbfgs','liblinear','sag','saga']

for s in solver:
    train_x = pd.read_csv('train_x.csv')
    train_y = pd.read_csv('train_y.csv')
    test_x = pd.read_csv('test_x.csv')
    test_y = pd.read_csv('test_y.csv')
    oot_x = pd.read_csv('oot_x.csv')
    oot_y = pd.read_csv('oot_y.csv')
            
    lr = LogisticRegression(solver = s)
    lr.fit(train_x, train_y)
    train_pred = lr.predict(train_x)
    test_pred = lr.predict(test_x)
    oot_pred = lr.predict(oot_x)
            
    # FDR train
    train_data = train_x
    train_data['fraud_label'] = train_y['fraud_label']
    train_data['pred'] = train_pred
    topRows = int(round(len(train_data)*0.03)) # 3% of population
    temp = train_data.sort_values('pred', ascending = False).head(topRows)
    needed = temp.loc[:,'fraud_label']
    FDR_train = sum(needed)/sum(train_data['fraud_label'])

    # FDR test
    test_data = test_x
    test_data['fraud_label'] = test_y['fraud_label']
    test_data['pred'] = test_pred
    topRows = int(round(len(test_data)*0.03)) # 3% of population
    temp = test_data.sort_values('pred', ascending = False).head(topRows)
    needed = temp.loc[:,'fraud_label']
    FDR_test = sum(needed)/sum(test_data['fraud_label'])
            
    # FDR oot
    oot_data = oot_x
    oot_data['fraud_label'] = oot_y['fraud_label']
    oot_data['pred'] = oot_pred
    topRows = int(round(len(oot_data)*0.03)) # 3% of population
    temp = oot_data.sort_values('pred', ascending = False).head(topRows)
    needed = temp.loc[:,'fraud_label']
    FDR_oot = sum(needed)/sum(oot_data['fraud_label'])

    lr_FDR.append([s, FDR_train, FDR_test, FDR_oot])

CPU times: user 2min 36s, sys: 5.27 s, total: 2min 41s
Wall time: 2min 32s


In [14]:
lr_FDR = pd.DataFrame(lr_FDR, columns = ['solver', 'train', 'test', 'oot'])
#FDR.drop([0,1],axis = 0, inplace = True)
#FDR.loc['mean',:] = FDR.mean(axis = 0)
lr_FDR

Unnamed: 0,solver,train,test,oot
0,newton-cg,0.4071,0.400759,0.383068
1,lbfgs,0.405854,0.400759,0.383487
2,liblinear,0.4071,0.400759,0.383068
3,sag,0.40928,0.400759,0.383068
4,saga,0.404816,0.400337,0.383068


In [35]:
lr_FDR.to_excel('lr_FDR1.xlsx', index=True, header=True)

### XGBoost

In [61]:
train_x = pd.read_csv('train_x.csv')
train_y = pd.read_csv('train_y.csv')
test_x = pd.read_csv('test_x.csv')
test_y = pd.read_csv('test_y.csv')
oot_x = pd.read_csv('oot_x.csv')
oot_y = pd.read_csv('oot_y.csv')

In [62]:
%%time
xgb =XGBClassifier(max_depth=2, learning_rate=1, n_estimators=100)
xgb.fit(train_x, train_y)

CPU times: user 1min 19s, sys: 1.62 s, total: 1min 20s
Wall time: 1min 34s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=1,
              max_delta_step=0, max_depth=2, min_child_weight=1, missing=None,
              n_estimators=100, n_jobs=1, nthread=None,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
              subsample=1, verbosity=1)

In [63]:
train_pred = xgb.predict(train_x)
train_data = train_x
train_data['fraud_label'] = train_y['fraud_label']
train_data['pred'] = train_pred
topRows = int(round(len(train_data)*0.03)) # 3% of population
temp = train_data.sort_values('pred', ascending = False).head(topRows)
needed = temp.loc[:,'fraud_label']
FDR_train = sum(needed)/sum(train_data['fraud_label'])
FDR_train

0.4916232109175635

In [64]:
test_pred = xgb.predict(test_x)
test_data = test_x
test_data['fraud_label'] = test_y['fraud_label']
test_data['pred'] = test_pred
topRows = int(round(len(test_data)*0.03)) # 3% of population
temp = test_data.sort_values('pred', ascending = False).head(topRows)
needed = temp.loc[:,'fraud_label']
FDR_test = sum(needed)/sum(test_data['fraud_label'])
FDR_test

0.4926519706078824

In [65]:
oot_pred = xgb.predict(oot_x)
oot_data = oot_x
oot_data['fraud_label'] = oot_y['fraud_label']
oot_data['pred'] = oot_pred
topRows = int(round(len(oot_data)*0.03)) # 3% of population
temp = oot_data.sort_values('pred', ascending = False).head(topRows)
needed = temp.loc[:,'fraud_label']
FDR_oot = sum(needed)/sum(oot_data['fraud_label'])
FDR_oot

0.4731768650461023

In [36]:
%%time
xgb_FDR = []

maxdepth = [2, 5]
learningrate = [0.1, 0.01, 0.001]
estimators = [100, 800]

for d in maxdepth:
    for l in learningrate:
        for e in estimators:
            
            train_x = pd.read_csv('train_x.csv')
            train_y = pd.read_csv('train_y.csv')
            test_x = pd.read_csv('test_x.csv')
            test_y = pd.read_csv('test_y.csv')
            oot_x = pd.read_csv('oot_x.csv')
            oot_y = pd.read_csv('oot_y.csv')
            
            xgb =XGBClassifier(max_depth=d, learning_rate=l, n_estimators=e)
            xgb.fit(train_x, train_y)
            train_pred =xgb.predict(train_x)
            test_pred = xgb.predict(test_x)
            oot_pred = xgb.predict(oot_x)
            
            # FDR train
            train_data = train_x
            train_data['fraud_label'] = train_y['fraud_label']
            train_data['pred'] = train_pred
            topRows = int(round(len(train_data)*0.03)) # 3% of population
            temp = train_data.sort_values('pred', ascending = False).head(topRows)
            needed = temp.loc[:,'fraud_label']
            FDR_train = sum(needed)/sum(train_data['fraud_label'])

            # FDR test
            test_data = test_x
            test_data['fraud_label'] = test_y['fraud_label']
            test_data['pred'] = test_pred
            topRows = int(round(len(test_data)*0.03)) # 3% of population
            temp = test_data.sort_values('pred', ascending = False).head(topRows)
            needed = temp.loc[:,'fraud_label']
            FDR_test = sum(needed)/sum(test_data['fraud_label'])
            
            # FDR oot
            oot_data = oot_x
            oot_data['fraud_label'] = oot_y['fraud_label']
            oot_data['pred'] = oot_pred
            topRows = int(round(len(oot_data)*0.03)) # 3% of population
            temp = oot_data.sort_values('pred', ascending = False).head(topRows)
            needed = temp.loc[:,'fraud_label']
            FDR_oot = sum(needed)/sum(oot_data['fraud_label'])

            xgb_FDR.append([e, d, l, FDR_train, FDR_test, FDR_oot])

CPU times: user 1h 29min 23s, sys: 32.2 s, total: 1h 29min 55s
Wall time: 1h 32min 17s


In [37]:
xgb_FDR = pd.DataFrame(xgb_FDR, columns = ['n_estimators', 'max_depth', 'learning_rate', 'train', 'test', 'oot'])
xgb_FDR

Unnamed: 0,n_estimators,max_depth,learning_rate,train,test,oot
0,100,2,0.1,0.478306,0.480826,0.346186
1,800,2,0.1,0.492942,0.49389,0.304694
2,100,2,0.01,0.480071,0.483776,0.46689
3,800,2,0.01,0.478202,0.480826,0.369656
4,100,2,0.001,0.480071,0.483776,0.46689
5,800,2,0.001,0.480071,0.483776,0.46689
6,100,5,0.1,0.500623,0.500632,0.315172
7,800,5,0.1,0.503322,0.493468,0.295474
8,100,5,0.01,0.493564,0.498104,0.341576
9,800,5,0.01,0.500727,0.501896,0.312657


In [38]:
xgb_FDR.to_excel('xgb_FDR1.xlsx', index=True, header=True)

### Final Model & Table

In [3]:
train_x = pd.read_csv('train_x.csv')
train_y = pd.read_csv('train_y.csv')
test_x = pd.read_csv('test_x.csv')
test_y = pd.read_csv('test_y.csv')
oot_x = pd.read_csv('oot_x.csv')
oot_y = pd.read_csv('oot_y.csv')

In [4]:
%%time
rf = RandomForestRegressor(n_estimators = 50, max_depth = 10, min_samples_leaf = 10)
rf.fit(train_x, train_y)

CPU times: user 39.7 s, sys: 423 ms, total: 40.1 s
Wall time: 41.6 s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=10, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=50,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

#### importance

In [15]:
rf.feature_importances_

array([4.22505249e-04, 6.99689298e-04, 1.80278969e-03, 6.69748082e-04,
       4.20467885e-03, 2.24593926e-03, 1.72037597e-03, 1.98146890e-03,
       5.99635483e-01, 1.92559132e-02, 3.13043007e-01, 6.63994708e-03,
       1.94883907e-03, 9.93915202e-03, 2.63647744e-03, 8.04816718e-03,
       8.03817023e-03, 7.25102457e-03, 9.81662442e-03])

In [18]:
train_x.columns

Index(['address-homephone-zip5_count180_date',
       'address-homephone-zip5_count7_date', 'address-homephone-zip5_pastday',
       'address-homephone_count180_date', 'address-homephone_pastday',
       'address-zip5_0_count180_count_ratio',
       'address-zip5_0_count30_count_ratio',
       'address-zip5_0_count7_count_ratio', 'address-zip5_count30_date',
       'address-zip5_pastday', 'dob-name_count30_date', 'dob-name_pastday',
       'homephone-zip5_count30_date', 'ssn-dob_pastday',
       'ssn-name_count180_date', 'ssn-name_count7_date', 'ssn-name_pastday',
       'ssn_count180_date', 'ssn_unique_address-zip5', 'fraud_label', 'pred'],
      dtype='object')

In [20]:
importance = pd.DataFrame({'Variable': ['address-homephone-zip5_count180_date',
       'address-homephone-zip5_count7_date', 'address-homephone-zip5_pastday',
       'address-homephone_count180_date', 'address-homephone_pastday',
       'address-zip5_0_count180_count_ratio',
       'address-zip5_0_count30_count_ratio',
       'address-zip5_0_count7_count_ratio', 'address-zip5_count30_date',
       'address-zip5_pastday', 'dob-name_count30_date', 'dob-name_pastday',
       'homephone-zip5_count30_date', 'ssn-dob_pastday',
       'ssn-name_count180_date', 'ssn-name_count7_date', 'ssn-name_pastday',
       'ssn_count180_date', 'ssn_unique_address-zip5'], 'importance': [4.22505249e-04, 6.99689298e-04, 1.80278969e-03, 6.69748082e-04,
       4.20467885e-03, 2.24593926e-03, 1.72037597e-03, 1.98146890e-03,
       5.99635483e-01, 1.92559132e-02, 3.13043007e-01, 6.63994708e-03,
       1.94883907e-03, 9.93915202e-03, 2.63647744e-03, 8.04816718e-03,
       8.03817023e-03, 7.25102457e-03, 9.81662442e-03]})

In [21]:
importance

Unnamed: 0,Variable,importance
0,address-homephone-zip5_count180_date,0.000423
1,address-homephone-zip5_count7_date,0.0007
2,address-homephone-zip5_pastday,0.001803
3,address-homephone_count180_date,0.00067
4,address-homephone_pastday,0.004205
5,address-zip5_0_count180_count_ratio,0.002246
6,address-zip5_0_count30_count_ratio,0.00172
7,address-zip5_0_count7_count_ratio,0.001981
8,address-zip5_count30_date,0.599635
9,address-zip5_pastday,0.019256


In [22]:
importance.sort_values('importance', ascending = False, inplace = True)

In [23]:
importance

Unnamed: 0,Variable,importance
8,address-zip5_count30_date,0.599635
10,dob-name_count30_date,0.313043
9,address-zip5_pastday,0.019256
13,ssn-dob_pastday,0.009939
18,ssn_unique_address-zip5,0.009817
15,ssn-name_count7_date,0.008048
16,ssn-name_pastday,0.008038
17,ssn_count180_date,0.007251
11,dob-name_pastday,0.00664
4,address-homephone_pastday,0.004205


In [24]:
importance.to_csv('importance.csv')

In [53]:
train_pred = rf.predict(train_x)
train_data = train_x
train_data['fraud_label'] = train_y['fraud_label']
train_data['pred'] = train_pred
topRows = int(round(len(train_data)*0.03)) # 3% of population
temp = train_data.sort_values('pred', ascending = False).head(topRows)
needed = temp.loc[:,'fraud_label']
FDR_train = sum(needed)/sum(train_data['fraud_label'])
FDR_train

0.5406892256591239

In [54]:
test_pred = rf.predict(test_x)
test_data = test_x
test_data['fraud_label'] = test_y['fraud_label']
test_data['pred'] = test_pred
topRows = int(round(len(test_data)*0.03)) # 3% of population
temp = test_data.sort_values('pred', ascending = False).head(topRows)
needed = temp.loc[:,'fraud_label']
FDR_test = sum(needed)/sum(test_data['fraud_label'])
FDR_test

0.5364517488411293

In [55]:
oot_pred = rf.predict(oot_x)
oot_data = oot_x
oot_data['fraud_label'] = oot_y['fraud_label']
oot_data['pred'] = oot_pred
topRows = int(round(len(oot_data)*0.03)) # 3% of population
temp = oot_data.sort_values('pred', ascending = False).head(topRows)
needed = temp.loc[:,'fraud_label']
FDR_oot = sum(needed)/sum(oot_data['fraud_label'])
FDR_oot

0.5247275775356245

#### Three Tables

In [5]:
train_pred = rf.predict(train_x)
train_data = train_x
train_data['fraud_label'] = train_y['fraud_label']
train_data['pred'] = train_pred
train_data.sort_values('pred', ascending = False, inplace = True)

GoodTest=sum(train_data['fraud_label']==0)
BadTest=sum(train_data['fraud_label']==1)

r=0
l=[]
culgood=0
culbad=0

for i in range(100):
    data=train_data.iloc[r:min(int((i+1)*len(train_data)/100),len(train_data)),]
    populationbin=i+ 1
    numofrecord=len(data)
    numofgood=sum(data['fraud_label']==0)
    numofbads=sum(data['fraud_label']==1)
    percentagegood=numofgood/GoodTest*100
    percentagebad=numofbads/BadTest*100
    numofrecord=min(int((i+1)*len(train_data)/100),len(train_data))-r
    totalrecord=min(int((i+1)*len(train_data)/100),len(train_data))
    culgood+=numofgood
    culbad+=numofbads
    culpergood=culgood/GoodTest*100
    culperbad=culbad/BadTest*100
    KS=culperbad-culpergood
    FPR=culpergood/culperbad
    r= min(int((i+1)*len(train_data)/100),len(train_data))
    l.append([populationbin,numofrecord,numofgood,numofbads,percentagegood,percentagebad,totalrecord,culgood,culbad,culpergood,culperbad,KS,FPR])
    
l
l=pd.DataFrame(l,columns=['Population Bin%','#Records','#Goods','#Bads','%Goods','%Bads','Total#Records','Cumulative Goods','Cumulative Bads','%Goods','%Bads (FDR)','KS','FPR'])
l.to_excel('train-FDR.xlsx')

In [6]:
l.head(20)

Unnamed: 0,Population Bin%,#Records,#Goods,#Bads,%Goods,%Bads,Total#Records,Cumulative Goods,Cumulative Bads,%Goods.1,%Bads (FDR),KS,FPR
0,1,6668,1703,4965,0.259141,51.536226,6668,1703,4965,0.259141,51.536226,51.277085,0.005028
1,2,6668,6479,189,0.985893,1.961802,13336,8182,5154,1.245034,53.498028,52.252994,0.023273
2,3,6668,6623,45,1.007805,0.467096,20004,14805,5199,2.252838,53.965124,51.712285,0.041746
3,4,6668,6625,43,1.008109,0.446336,26672,21430,5242,3.260947,54.411459,51.150512,0.059931
4,5,6668,6609,59,1.005674,0.612414,33340,28039,5301,4.266622,55.023874,50.757252,0.077541
5,6,6668,6622,46,1.007652,0.477476,40008,34661,5347,5.274274,55.501349,50.227075,0.09503
6,7,6668,6616,52,1.006739,0.539755,46676,41277,5399,6.281014,56.041104,49.760091,0.112079
7,8,6668,6626,42,1.008261,0.435956,53344,47903,5441,7.289275,56.47706,49.187786,0.129066
8,9,6668,6626,42,1.008261,0.435956,60012,54529,5483,8.297536,56.913016,48.61548,0.145793
9,10,6668,6630,38,1.00887,0.394436,66680,61159,5521,9.306406,57.307453,48.001047,0.162394


In [7]:
test_pred = rf.predict(test_x)
test_data = test_x
test_data['fraud_label'] = test_y['fraud_label']
test_data['pred'] = test_pred
test_data.sort_values('pred', ascending = False, inplace = True)

GoodTest=sum(test_data['fraud_label']==0)
BadTest=sum(test_data['fraud_label']==1)

r=0
l=[]
culgood=0
culbad=0


for i in range(100):
    data=test_data.iloc[r:min(int((i+1)*len(test_data)/100),len(test_data)),]
    populationbin=i+ 1
    numofrecord=len(data)
    numofgood=sum(data['fraud_label']==0)
    numofbads=sum(data['fraud_label']==1)
    percentagegood=numofgood/GoodTest*100
    percentagebad=numofbads/BadTest*100
    numofrecord=min(int((i+1)*len(test_data)/100),len(test_data))-r
    totalrecord=min(int((i+1)*len(test_data)/100),len(test_data))
    culgood+=numofgood
    culbad+=numofbads
    culpergood=culgood/GoodTest*100
    culperbad=culbad/BadTest*100
    KS=culperbad-culpergood
    FPR=culpergood/culperbad
    r= min(int((i+1)*len(test_data)/100),len(test_data))
    l.append([populationbin,numofrecord,numofgood,numofbads,percentagegood,percentagebad,totalrecord,culgood,culbad,culpergood,culperbad,KS,FPR])
    
l
l=pd.DataFrame(l,columns=['Population Bin%','#Records','#Goods','#Bads','%Goods','%Bads','Total#Records','Cumulative Goods','Cumulative Bads','%Goods','%Bads (FDR)','KS','FPR'])
l.to_excel('test-FDR.xlsx')

In [8]:
l.head(20)

Unnamed: 0,Population Bin%,#Records,#Goods,#Bads,%Goods,%Bads,Total#Records,Cumulative Goods,Cumulative Bads,%Goods.1,%Bads (FDR),KS,FPR
0,1,1667,446,1221,0.271407,51.453856,1667,446,1221,0.271407,51.453856,51.182449,0.005275
1,2,1667,1630,37,0.991913,1.559208,3334,2076,1258,1.263319,53.013064,51.749744,0.02383
2,3,1667,1654,13,1.006517,0.54783,5001,3730,1271,2.269837,53.560893,51.291057,0.042379
3,4,1667,1656,11,1.007734,0.463548,6668,5386,1282,3.277571,54.024442,50.74687,0.060668
4,5,1667,1656,11,1.007734,0.463548,8335,7042,1293,4.285306,54.48799,50.202684,0.078647
5,6,1667,1653,14,1.005909,0.589971,10002,8695,1307,5.291215,55.07796,49.786746,0.096068
6,7,1667,1651,16,1.004692,0.674252,11669,10346,1323,6.295906,55.752212,49.456306,0.112927
7,8,1667,1656,11,1.007734,0.463548,13336,12002,1334,7.303641,56.215761,48.91212,0.129922
8,9,1667,1659,8,1.00956,0.337126,15003,13661,1342,8.313201,56.552887,48.239686,0.146999
9,10,1667,1657,10,1.008343,0.421408,16670,15318,1352,9.321544,56.974294,47.65275,0.16361


In [9]:
oot_pred = rf.predict(oot_x)
oot_data = oot_x
oot_data['fraud_label'] = oot_y['fraud_label']
oot_data['pred'] = oot_pred
oot_data.sort_values('pred', ascending = False, inplace = True)

GoodTest=sum(oot_data['fraud_label']==0)
BadTest=sum(oot_data['fraud_label']==1)

r=0
l=[]
culgood=0
culbad=0


for i in range(100):
    data=oot_data.iloc[r:min(int((i+1)*len(oot_data)/100),len(oot_data)),]
    populationbin=i+ 1
    numofrecord=len(data)
    numofgood=sum(data['fraud_label']==0)
    numofbads=sum(data['fraud_label']==1)
    percentagegood=numofgood/GoodTest*100
    percentagebad=numofbads/BadTest*100
    numofrecord=min(int((i+1)*len(oot_data)/100),len(oot_data))-r
    totalrecord=min(int((i+1)*len(oot_data)/100),len(oot_data))
    culgood+=numofgood
    culbad+=numofbads
    culpergood=culgood/GoodTest*100
    culperbad=culbad/BadTest*100
    KS=culperbad-culpergood
    FPR=culpergood/culperbad
    r= min(int((i+1)*len(oot_data)/100),len(oot_data))
    l.append([populationbin,numofrecord,numofgood,numofbads,percentagegood,percentagebad,totalrecord,culgood,culbad,culpergood,culperbad,KS,FPR])
    
l
l=pd.DataFrame(l,columns=['Population Bin%','#Records','#Goods','#Bads','%Goods','%Bads','Total#Records','Cumulative Goods','Cumulative Bads','%Goods','%Bads (FDR)','KS','FPR'])
l.to_excel('oot-FDR.xlsx')

In [10]:
l.head(20)

Unnamed: 0,Population Bin%,#Records,#Goods,#Bads,%Goods,%Bads,Total#Records,Cumulative Goods,Cumulative Bads,%Goods.1,%Bads (FDR),KS,FPR
0,1,1664,545,1119,0.3321,46.898575,1664,545,1119,0.3321,46.898575,46.566475,0.007081
1,2,1665,1560,105,0.950599,4.400671,3329,2105,1224,1.2827,51.299246,50.016546,0.025004
2,3,1665,1640,25,0.999348,1.047779,4994,3745,1249,2.282048,52.347024,50.064977,0.043595
3,4,1665,1648,17,1.004223,0.71249,6659,5393,1266,3.286271,53.059514,49.773243,0.061936
4,5,1665,1653,12,1.00727,0.502934,8324,7046,1278,4.29354,53.562448,49.268907,0.08016
5,6,1665,1654,11,1.007879,0.461023,9989,8700,1289,5.301419,54.02347,48.722051,0.098132
6,7,1665,1651,14,1.006051,0.586756,11654,10351,1303,6.30747,54.610226,48.302756,0.1155
7,8,1665,1653,12,1.00727,0.502934,13319,12004,1315,7.31474,55.11316,47.79842,0.132722
8,9,1665,1657,8,1.009707,0.335289,14984,13661,1323,8.324447,55.448449,47.124002,0.150129
9,10,1665,1651,14,1.006051,0.586756,16649,15312,1337,9.330498,56.035205,46.704708,0.166511


In [11]:
finaldata = [train_data, test_data, oot_data]
summary = []
for set in finaldata:
    summary.append([len(set),(len(set)-sum(set['fraud_label'])),sum(set['fraud_label']),sum(set['fraud_label'])/len(set)])

summary = pd.DataFrame(summary, columns = ['#Records','#Goods','#Bads','Fraud Rate'], index = ['train','test','oot'])
summary

Unnamed: 0,#Records,#Goods,#Bads,Fraud Rate
train,666805,657171,9634,0.014448
test,166702,164329,2373,0.014235
oot,166493,164107,2386,0.014331


In [12]:
summary.to_excel('data_summary.xlsx')