## Fraud Analytics Project 2 - Fraud Detection in Credit Card Transaction

In [230]:
## Import relevant packages
import pandas as pd
import numpy as np
import scipy.stats as sps
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from math import log10
import random
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
%matplotlib inline

In [2]:
## Read data set
mydata = pd.read_excel('card transactions.xlsx', converters={'Merchnum': lambda x: str(x)})
mydata.shape

(96753, 10)

### Data Cleaning: fill in the missing fields

In [3]:
## Fill NA for Merch state
mydata['Merch state'] = mydata.groupby('Merch description')['Merch state'].apply(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))
mydata['Merch state'] = mydata['Merch state'].fillna(mydata['Merch state'].mode()[0])

In [4]:
## Fill NA for Merch zip
mydata['Merch zip'] = mydata.groupby('Merch description')['Merch zip'].apply(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))
mydata['Merch zip'] = mydata.groupby('Merch state')['Merch zip'].apply(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))
mydata['Merch zip'] = mydata['Merch zip'].fillna(mydata['Merch zip'].mode()[0])

In [5]:
## Fill NA for Merchnum
mydata['Merchnum'] = mydata.groupby('Merch description')['Merchnum'].apply(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))
mydata['Merchnum'] = mydata.groupby('Merch zip')['Merchnum'].apply(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))
mydata['Merchnum'] = mydata.groupby('Merch state')['Merchnum'].apply(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))

In [6]:
## Check whether there are any NAs for Merchnum
mydata['Merchnum'].isnull().sum()

0

In [7]:
## Check whether there are any NAs for Merch state
mydata['Merch state'].isnull().sum()

0

In [8]:
## Check whether there are any NAs for Merch zip
mydata['Merch zip'].isnull().sum()

0

### Create all (expert) variables

In [9]:
## Create card at this merchant - card_merch
mydata['card_merch'] = mydata.Cardnum.astype(str).str.cat(mydata.Merchnum)

## Create card in this zip code - card_zip
mydata['card_zip'] = mydata.Cardnum.astype(str).str.cat(mydata['Merch zip'].astype(str))

## Create card in this state - card_state
mydata['card_state'] = mydata.Cardnum.astype(str).str.cat(mydata['Merch state'])

In [10]:
## 5 entities
entities = ['Cardnum', 'Merchnum', 'card_merch', 'card_zip', 'card_state']

## Copy two columns Recnum -> check_record, Date -> check_date
mydata['check_record'] = mydata.Recnum
mydata['check_date'] = mydata.Date

In [11]:
for entity in entities:
    ## Days-since variables
    df_l = mydata[['Recnum', 'Date', entity]]
    df_r = mydata[['check_record', 'check_date', entity, 'Amount']]
    temp = pd.merge(df_l, df_r, left_on = entity, right_on = entity)
    temp1 = temp[temp.Recnum > temp.check_record][['Recnum', 'Date', 'check_date']].groupby('Recnum')[['Date', 'check_date']].last()
    mapper = (temp1.Date - temp1.check_date).dt.days
    mydata[entity + '_day_since'] = mydata.Recnum.map(mapper)
    mydata[entity + '_day_since'].fillna((mydata.Date - pd.to_datetime('2010-01-01')).dt.days, inplace = True)
    
    ## Frequency variables
    for time in [0, 1, 3, 7, 14, 30]:
        temp2 = temp[(temp.check_date >= (temp.Date - dt.timedelta(time))) & (temp.Recnum >= temp.check_record)][['Recnum', entity, 'Amount']]
        col_name = entity + '_count_' + str(time)
        mapper2 = temp2.groupby('Recnum')[entity].count()
        mydata[col_name] = mydata.Recnum.map(mapper2)
        
        ## Amount variables
        mydata[entity + '_avg_' + str(time)] = mydata.Recnum.map(temp2.groupby('Recnum')['Amount'].mean())
        mydata[entity + '_max_' + str(time)] = mydata.Recnum.map(temp2.groupby('Recnum')['Amount'].max())
        mydata[entity + '_med_' + str(time)] = mydata.Recnum.map(temp2.groupby('Recnum')['Amount'].median())
        mydata[entity + '_total_' + str(time)] = mydata.Recnum.map(temp2.groupby('Recnum')['Amount'].sum())
        mydata[entity + '_actual/avg_' + str(time)] = mydata['Amount'] / mydata[entity + '_avg_' + str(time)]
        mydata[entity + '_actual/max_' + str(time)] = mydata['Amount'] / mydata[entity + '_max_' + str(time)]
        mydata[entity + '_actual/med_' + str(time)] = mydata['Amount'] / mydata[entity + '_med_' + str(time)]
        mydata[entity + '_actual/total_' + str(time)] = mydata['Amount'] / mydata[entity + '_total_' + str(time)]

In [12]:
## 2 entities
entities2 = ['Cardnum', 'Merchnum']

## Velocity change variables
for entity in entities2:
    for time1 in [0, 1]:
        for time2 in [7, 14, 30]:
            mydata[entity + '_count_' + str(time1) + '/avg_count_' + str(time2)] = mydata[entity + '_count_' + str(time1)] / (mydata[entity + '_count_' + str(time2)] / time2)
            mydata[entity + '_actual_' + str(time1) + '/avg_' + str(time2)] = mydata[entity + '_total_' + str(time1)] / mydata[entity + '_avg_' + str(time2)]

In [13]:
## 2 risk table variables
c = 4
nmid = 20

## Get training_testing data
mydata1 = mydata[mydata.Date.dt.month.isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])]

## Likelihood of fraud for that day of the week
fraud_avg = mydata1['Fraud'].mean()
mydata1['Weekday'] = mydata1.Date.dt.weekday_name
fraud_weekday = mydata1.groupby('Weekday')['Fraud'].mean()
num_instances_weekday = mydata1.groupby('Weekday').size()
fraud_weekday_smooth = fraud_avg + (fraud_weekday - fraud_avg) / (1 + np.exp(-(num_instances_weekday - nmid) / c))
mydata1['weekday_risk'] = mydata1['Weekday'].map(fraud_weekday_smooth)

## Likelihood of fraud for that state
fraud_state = mydata1.groupby('Merch state')['Fraud'].mean()
num_instances_state = mydata1.groupby('Merch state').size()
fraud_state_smooth = fraud_avg + (fraud_state - fraud_avg) / (1 + np.exp(-(num_instances_state - nmid) / c))
mydata1['state_risk'] = mydata1['Merch state'].map(fraud_state_smooth)

## Only keep four columns
mydata2 = mydata1[['Weekday', 'weekday_risk']].drop_duplicates()
mydata3 = mydata1[['Merch state', 'state_risk']].drop_duplicates()

## Join to mydata
mydata['Weekday'] = mydata.Date.dt.weekday_name
mydata = mydata.merge(mydata2, how = 'left', on = 'Weekday')
mydata = mydata.merge(mydata3, how = 'left', on = 'Merch state')

## Fill NA for state_risk
mydata['state_risk'] = mydata.groupby('Merch zip')['state_risk'].apply(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Z Scale all variables

In [14]:
## Drop categorical variables
Date = mydata.Date.tolist()
Fraud = mydata.Fraud.tolist()
mydata_newV = mydata.drop(['Recnum', 'Cardnum', 'Date', 'Merchnum', 'Merch description', 'Merch state', 'Merch zip', 'Transtype', 'Amount', 'Fraud', 'card_merch', 'card_zip', 'card_state', 'check_record', 'check_date', 'Weekday'], axis = 1)
mydata_newV.shape

(96753, 301)

In [15]:
## Z scale each field so that the mean is about 0 and standard deviation is about 1
scaler = StandardScaler()
fieldName = mydata_newV.columns.values.tolist()
for i in fieldName:
    mydata_newV[i] = pd.DataFrame(scaler.fit_transform(pd.DataFrame(mydata_newV[i])))

In [17]:
## Check mean (close to 0) and standard deviation (close to 1)
mydata_newV.describe()

Unnamed: 0,Cardnum_day_since,Cardnum_count_0,Cardnum_avg_0,Cardnum_max_0,Cardnum_med_0,Cardnum_total_0,Cardnum_actual/avg_0,Cardnum_actual/max_0,Cardnum_actual/med_0,Cardnum_actual/total_0,...,Merchnum_count_0/avg_count_30,Merchnum_actual_0/avg_30,Merchnum_count_1/avg_count_7,Merchnum_actual_1/avg_7,Merchnum_count_1/avg_count_14,Merchnum_actual_1/avg_14,Merchnum_count_1/avg_count_30,Merchnum_actual_1/avg_30,weekday_risk,state_risk
count,96753.0,96753.0,96753.0,96753.0,96753.0,96753.0,96753.0,96753.0,96753.0,96753.0,...,96753.0,96753.0,96753.0,96753.0,96753.0,96753.0,96753.0,96753.0,96753.0,96753.0
mean,-1.4687770000000002e-17,-2.350043e-18,-2.423481e-18,-3.010992e-18,-7.417322e-18,-6.756373e-18,-9.561736000000001e-17,1.486402e-16,-2.691533e-17,6.268739e-16,...,-1.718469e-16,5.875107e-18,9.752677000000001e-17,7.520136000000001e-17,2.878802e-17,-1.6450300000000003e-17,-2.937553e-19,1.762532e-17,-2.632048e-16,-9.400171e-18
std,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,...,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005
min,-0.3148231,-0.2454078,-0.03860027,-0.03136488,-0.03841379,-0.0429533,-2.252805,-3.094904,-0.1427115,-2.207244,...,-1.006143,-0.1467511,-1.8533,-0.3417222,-1.403102,-0.2977406,-1.105737,-0.2137672,-0.7756625,-1.188224
25%,-0.3148231,-0.2454078,-0.03500883,-0.02894357,-0.03494167,-0.03998919,-0.004621614,0.4409888,-0.041488,-0.7793521,...,-0.8569465,-0.1260596,-0.8948995,-0.3098278,-0.9204107,-0.2692219,-0.8561634,-0.1925449,-0.4423785,-0.8690627
50%,-0.2565855,-0.2454078,-0.02517831,-0.02175375,-0.0250239,-0.03208358,-0.004621614,0.4409888,-0.041488,0.6485804,...,-0.4914377,-0.1217467,-0.123367,-0.3021021,-0.2610736,-0.2614013,-0.448117,-0.1867085,-0.2515032,-0.318343
75%,-0.02363501,-0.07850726,-0.0005374037,-0.003171004,-0.0001685965,-0.009080229,-0.004621614,0.4409888,-0.041488,0.6485804,...,1.573035,-0.08465255,1.033932,-0.2043561,1.265812,-0.1782802,1.529338,-0.1288748,-0.1566523,0.7584529
max,20.41777,23.95518,260.1998,155.3154,277.6275,153.3207,51.2364,0.4409888,66.45761,0.6485804,...,1.573035,31.52606,1.033932,10.94564,1.265812,16.63616,1.529338,26.67378,3.195323,8.31928


In [18]:
## Get Date column for data separation
mydata_newV['Date'] = Date

## Get Fraud as label (response variable)
mydata_newV['Fraud'] = Fraud
mydata_newV.shape

(96753, 303)

### Separate data into modeling and OOT

In [19]:
## Only use the records from 1/15 through 10/31 for feature selection
newdata = mydata_newV.loc[(mydata_newV['Date'] >= dt.datetime.strptime('2010-01-15', "%Y-%m-%d")) & (mydata_newV['Date'] <= dt.datetime.strptime('2010-10-31', "%Y-%m-%d"))]
newdata.shape

(80955, 303)

### Use the modeling data (trn, tst) for feature selection

In [20]:
## Drop Date
newdata = newdata.drop(['Date'], axis = 1)
newdata.shape

(80955, 302)

In [21]:
## Add a random number for process validation checks (fraud label already in newdata)
newdata['randNum'] = random.sample(range(80955), len(newdata))
newdata.shape

(80955, 303)

In [22]:
## Get label distribution
newdata.Fraud.value_counts()

0    80087
1      868
Name: Fraud, dtype: int64

In [24]:
## Check missing value
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
newdata.isnull().sum()

Cardnum_day_since                0
Cardnum_count_0                  0
Cardnum_avg_0                    0
Cardnum_max_0                    0
Cardnum_med_0                    0
Cardnum_total_0                  0
Cardnum_actual/avg_0             0
Cardnum_actual/max_0             0
Cardnum_actual/med_0             0
Cardnum_actual/total_0           0
Cardnum_count_1                  0
Cardnum_avg_1                    0
Cardnum_max_1                    0
Cardnum_med_1                    0
Cardnum_total_1                  0
Cardnum_actual/avg_1             0
Cardnum_actual/max_1             0
Cardnum_actual/med_1             0
Cardnum_actual/total_1           0
Cardnum_count_3                  0
Cardnum_avg_3                    0
Cardnum_max_3                    0
Cardnum_med_3                    0
Cardnum_total_3                  0
Cardnum_actual/avg_3             0
Cardnum_actual/max_3             0
Cardnum_actual/med_3             0
Cardnum_actual/total_3           0
Cardnum_count_7     

### Select 30 best variables with filter and wrapper

In [25]:
## Run a filter: Sort by the average of FDR @ 3% rank and univariate KS rank, keep about 80 variables
## Get goods and bads
goods = newdata.loc[(newdata.Fraud == 0)]
bads = newdata.loc[(newdata.Fraud == 1)]

## Get KS measurement
KSFDR = pd.DataFrame({'Variable':newdata.columns})
ks = []
for column in newdata:
    ks.append(stats.ks_2samp(goods[column], bads[column])[0])
KSFDR['ks'] = ks

## Get FDR @ 3% (upper and lower)
topRows = int(round(len(newdata) * 0.03))
j = 0
for column in newdata:
    temp = newdata.sort_values(column, ascending = False)
    temp1 = temp.head(topRows)
    temp2 = temp.tail(topRows)
    needed1 = temp1.loc[:, 'Fraud']
    needed2 = temp2.loc[:, 'Fraud']
    FDR1 = sum(needed1) / bads.shape[0]
    FDR2 = sum(needed2) / bads.shape[0]
    FDRate = np.maximum(FDR1, FDR2)
    KSFDR.loc[j, 'FDR'] = FDRate
    j = j + 1

## Rank of ks
KSFDR['rank_ks'] = KSFDR['ks'].rank(ascending = True)

## Rank of FDR @ 3%
KSFDR['rank_FDR'] = KSFDR['FDR'].rank(ascending = True)

## Average rank of ks and FDR @ 3%
KSFDR['average_rank'] = (KSFDR['rank_ks'] + KSFDR['rank_FDR']) / 2

In [27]:
KSFDR.sort_values(by = ['average_rank'], ascending = False, inplace = True)
KSFDR

Unnamed: 0,Variable,ks,FDR,rank_ks,rank_FDR,average_rank
301,Fraud,1.0,1.0,303.0,303.0,303.0
197,card_zip_total_7,0.686097,0.639401,302.0,301.0,301.5
188,card_zip_total_3,0.679002,0.642857,300.0,302.0,301.0
142,card_merch_total_7,0.681855,0.633641,301.0,300.0,300.5
151,card_merch_total_14,0.676265,0.631336,299.0,298.5,298.75
133,card_merch_total_3,0.675786,0.631336,298.0,298.5,298.25
243,card_state_total_3,0.674461,0.630184,297.0,297.0,297.0
206,card_zip_total_14,0.673388,0.62788,296.0,296.0,296.0
252,card_state_total_7,0.669424,0.596774,295.0,292.0,293.5
179,card_zip_total_1,0.660988,0.597926,293.0,293.5,293.25


In [28]:
## Keep top 80 variables except Fraud
KSFDRtop80 = KSFDR.Variable.head(81).tolist()
wrapdata = newdata[KSFDRtop80]
Y = wrapdata.Fraud
wrapdata = wrapdata.drop(['Fraud'], axis = 1)

In [46]:
wrapdata.head()

Unnamed: 0,card_zip_total_7,card_zip_total_3,card_merch_total_7,card_merch_total_14,card_merch_total_3,card_state_total_3,card_zip_total_14,card_state_total_7,card_zip_total_1,card_state_total_1,card_merch_total_1,card_merch_total_30,card_state_total_14,card_zip_total_30,card_zip_max_14,card_state_total_0,card_merch_total_0,card_zip_total_0,card_zip_max_30,card_state_max_7,card_merch_max_14,Cardnum_total_3,card_state_max_14,card_zip_max_3,card_zip_max_7,card_merch_max_30,card_merch_max_3,card_merch_max_7,card_state_max_3,Cardnum_total_7,card_state_max_30,card_state_total_30,card_merch_max_1,card_zip_max_1,card_state_max_1,Cardnum_total_1,Cardnum_total_0,card_state_max_0,card_zip_max_0,card_merch_max_0,Merchnum_total_0,Merchnum_max_0,Cardnum_max_0,Merchnum_total_1,Cardnum_max_7,Cardnum_max_1,Cardnum_max_3,Cardnum_avg_1,Cardnum_total_14,card_state_avg_7,Merchnum_total_3,card_state_avg_3,Cardnum_avg_3,card_merch_avg_0,card_zip_avg_30,card_state_avg_0,card_zip_avg_0,Merchnum_max_1,card_state_avg_1,Merchnum_avg_0,Cardnum_avg_0,card_zip_avg_3,card_merch_avg_3,card_state_avg_14,card_zip_avg_7,card_zip_avg_1,card_merch_avg_30,card_state_avg_30,Cardnum_max_14,card_zip_avg_14,card_merch_avg_1,card_merch_avg_7,card_merch_avg_14,Cardnum_avg_7,Cardnum_med_1,Cardnum_avg_14,Merchnum_max_3,Cardnum_avg_30,card_state_med_0,card_merch_med_0
3344,0.112875,-0.0316,0.114971,0.1073,-0.030513,-0.040389,0.103796,0.094687,-0.028245,-0.033101,-0.027625,0.092679,0.071038,0.086288,0.045616,-0.024397,-0.021971,-0.022283,0.041519,0.040925,0.047012,0.010302,0.0344,-0.014476,0.048263,0.043745,-0.013962,0.049151,-0.018398,0.029839,0.023843,0.023048,-0.013076,-0.013364,-0.015564,0.009931,-0.026477,-0.013049,-0.012105,-0.01196,-0.034782,-0.031554,-0.014666,-0.03969,-0.008867,0.005197,-9.2e-05,0.009414,0.13081,0.022171,-0.042657,-0.009612,-0.034008,-0.00943,0.021718,-0.009377,-0.009413,-0.03317,-0.009535,-0.050635,-0.010625,-0.009662,-0.009666,0.021988,0.022148,-0.009569,0.021754,0.02188,0.002652,0.021928,-0.009571,0.022143,0.021953,-0.008515,-0.009277,-0.008777,-0.039859,-0.009776,-0.009043,-0.009197
3345,-0.037276,-0.046003,-0.03679,-0.02794,-0.046127,-0.054766,-0.0285,-0.054761,-0.058862,-0.063686,-0.058245,-0.04193,-0.059726,-0.045255,-0.050952,-0.056308,-0.053896,-0.054206,-0.055018,-0.055574,-0.049568,-0.034552,-0.062026,-0.046449,-0.04833,-0.052812,-0.046505,-0.047448,-0.05036,-0.054434,-0.072453,-0.104284,-0.045912,-0.0462,-0.048395,-0.025558,-0.042738,-0.045907,-0.044965,-0.044821,-0.0384,-0.035178,-0.031147,-0.043263,-0.01979,-0.006004,-0.011291,-0.016142,-0.088177,-0.042691,-0.046018,-0.042415,-0.092819,-0.04233,-0.043165,-0.042279,-0.042313,-0.036751,-0.042437,-0.190227,-0.038234,-0.042459,-0.042486,-0.042915,-0.042697,-0.042469,-0.04314,-0.043044,-0.027019,-0.042951,-0.04247,-0.042711,-0.042938,-0.093402,-0.00988,-0.107756,-0.042632,-0.175662,-0.041946,-0.042098
3346,-0.01486,-0.008426,-0.012807,0.139393,-0.007336,0.01724,0.135866,0.001894,-0.005037,-0.009916,-0.004414,0.124622,0.136753,0.118175,0.002802,-0.030364,-0.027941,-0.028252,-0.001281,-0.020029,0.004193,0.26816,-0.008351,-0.019937,-0.02206,0.000936,-0.019423,-0.021177,-0.01455,0.227464,-0.01885,0.087038,-0.018538,-0.018825,-0.021025,0.063876,-0.029518,-0.019193,-0.018249,-0.018105,-0.035458,-0.032231,-0.017747,-0.036982,0.040827,0.023602,0.062933,-0.004577,0.325432,-0.019297,-0.047441,-0.019083,0.0403,-0.015582,-0.017985,-0.01553,-0.015565,-0.033765,-0.023447,-0.076738,-0.015788,-0.023574,-0.023577,-0.01657,-0.023751,-0.023481,-0.017946,-0.01669,0.030477,-0.017772,-0.023482,-0.023755,-0.017746,0.02895,-0.021515,0.003608,-0.040354,0.01098,-0.015195,-0.015349
3347,-0.03694,-0.045667,-0.036454,-0.027605,-0.04579,-0.05443,-0.028166,-0.054427,-0.058526,-0.06335,-0.057908,-0.041596,-0.059395,-0.044922,-0.050952,-0.055957,-0.053545,-0.053855,-0.055018,-0.055574,-0.049568,-0.034408,-0.062026,-0.046449,-0.04833,-0.052812,-0.046505,-0.047448,-0.05036,-0.054316,-0.072453,-0.103961,-0.045912,-0.0462,-0.048395,-0.025413,-0.042559,-0.045907,-0.044965,-0.044821,-0.03836,-0.035178,-0.031147,-0.043223,-0.01979,-0.006004,-0.011291,-0.031064,-0.088067,-0.042693,-0.045985,-0.04242,-0.092956,-0.042367,-0.043166,-0.042317,-0.04235,-0.036751,-0.042475,-0.190386,-0.038266,-0.042464,-0.04249,-0.042916,-0.042699,-0.042506,-0.04314,-0.043044,-0.027019,-0.042951,-0.042507,-0.042712,-0.042939,-0.093433,-0.074722,-0.107769,-0.042632,-0.175684,-0.041984,-0.042135
3348,-0.067412,-0.061063,-0.065377,-0.072642,-0.059981,-0.069799,-0.076021,-0.084756,-0.057754,-0.06258,-0.057136,-0.086424,-0.106696,-0.092505,-0.050864,-0.055153,-0.05274,-0.053051,-0.05493,-0.055487,-0.04948,-0.067051,-0.061938,-0.046122,-0.048242,-0.052724,-0.045609,-0.04736,-0.050033,-0.086321,-0.072366,-0.150021,-0.044724,-0.045012,-0.047207,-0.051256,-0.042149,-0.044718,-0.043775,-0.043632,-0.038269,-0.035046,-0.03055,-0.043133,-0.041166,-0.032186,-0.037465,-0.059173,-0.123722,-0.041544,-0.051522,-0.041328,-0.094719,-0.041139,-0.041987,-0.041088,-0.041122,-0.036621,-0.041247,-0.185175,-0.037235,-0.041372,-0.041375,-0.041736,-0.04155,-0.041278,-0.041946,-0.041864,-0.047294,-0.041772,-0.041279,-0.041553,-0.041745,-0.092869,-0.072353,-0.106154,-0.041846,-0.172977,-0.040755,-0.040907


In [32]:
model = LogisticRegression()
rfecv = RFECV(estimator = model, step = 1, cv = 2, verbose = 2, n_jobs = -1, scoring = 'roc_auc')
rfecv.fit(wrapdata, Y)

Fitting estimator with 80 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 79 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 78 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 77 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 76 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 75 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 74 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 73 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 72 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 71 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 70 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 69 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 68 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 67 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 66 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 65 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 64 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 63 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 62 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 61 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 60 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 59 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 58 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 57 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 56 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 55 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 54 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 53 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 52 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 51 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 50 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 49 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 48 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 47 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 46 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 45 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 44 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 43 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 42 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 41 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 40 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 39 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 38 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 37 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 36 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 35 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 34 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 33 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 32 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 31 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 30 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 29 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 28 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 27 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 26 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

RFECV(cv=2,
      estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                   fit_intercept=True, intercept_scaling=1,
                                   l1_ratio=None, max_iter=100,
                                   multi_class='auto', n_jobs=None,
                                   penalty='l2', random_state=None,
                                   solver='lbfgs', tol=0.0001, verbose=0,
                                   warm_start=False),
      min_features_to_select=1, n_jobs=-1, scoring='roc_auc', step=1,
      verbose=2)

In [33]:
var_selected = pd.DataFrame(sorted(zip(map(lambda x: round(x), rfecv.ranking_), wrapdata.columns)), columns = ['ranking', 'variable'])
var_selected

Unnamed: 0,ranking,variable
0,1,Cardnum_avg_3
1,1,Cardnum_max_0
2,1,Cardnum_max_1
3,1,Cardnum_total_0
4,1,Cardnum_total_1
5,1,Merchnum_max_0
6,1,Merchnum_max_1
7,1,Merchnum_max_3
8,1,Merchnum_total_0
9,1,Merchnum_total_1


In [34]:
var_selected_top50 = var_selected.head(50)
var_selected_top50

Unnamed: 0,ranking,variable
0,1,Cardnum_avg_3
1,1,Cardnum_max_0
2,1,Cardnum_max_1
3,1,Cardnum_total_0
4,1,Cardnum_total_1
5,1,Merchnum_max_0
6,1,Merchnum_max_1
7,1,Merchnum_max_3
8,1,Merchnum_total_0
9,1,Merchnum_total_1


In [35]:
varTop50List = var_selected_top50['variable'].tolist()
wrapdata2nd = newdata[varTop50List]

In [36]:
rfecv.fit(wrapdata2nd, Y)

Fitting estimator with 50 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 49 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 48 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 47 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 46 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 45 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 44 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 43 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 42 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 41 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 40 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 39 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 38 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 37 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 36 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 35 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 34 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 33 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 32 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 31 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 30 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 29 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 28 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 27 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 26 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 25 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 24 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 23 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 22 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

RFECV(cv=2,
      estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                   fit_intercept=True, intercept_scaling=1,
                                   l1_ratio=None, max_iter=100,
                                   multi_class='auto', n_jobs=None,
                                   penalty='l2', random_state=None,
                                   solver='lbfgs', tol=0.0001, verbose=0,
                                   warm_start=False),
      min_features_to_select=1, n_jobs=-1, scoring='roc_auc', step=1,
      verbose=2)

In [38]:
var_selected_2nd = pd.DataFrame(sorted(zip(map(lambda x: round(x), rfecv.ranking_), wrapdata2nd.columns)), columns = ['ranking', 'variable'])
var_selected_2nd

Unnamed: 0,ranking,variable
0,1,Cardnum_avg_3
1,1,Cardnum_max_0
2,1,Cardnum_max_1
3,1,Cardnum_total_0
4,1,Merchnum_max_0
5,1,Merchnum_max_1
6,1,Merchnum_max_3
7,1,Merchnum_total_0
8,1,Merchnum_total_1
9,1,Merchnum_total_3


In [39]:
top30Var = var_selected_2nd.head(30)

In [40]:
top30Var

Unnamed: 0,ranking,variable
0,1,Cardnum_avg_3
1,1,Cardnum_max_0
2,1,Cardnum_max_1
3,1,Cardnum_total_0
4,1,Merchnum_max_0
5,1,Merchnum_max_1
6,1,Merchnum_max_3
7,1,Merchnum_total_0
8,1,Merchnum_total_1
9,1,Merchnum_total_3


### Build a baseline linear model (logistic regression)

In [110]:
top30VarList = top30Var['variable'].tolist()
x_trntst = newdata[top30VarList]

In [111]:
y_trntst = pd.DataFrame(Y)

In [141]:
ootDF = mydata_newV.loc[(mydata_newV['Date'] >= dt.datetime.strptime('2010-11-01', "%Y-%m-%d")) & (mydata_newV['Date'] <= dt.datetime.strptime('2010-12-31', "%Y-%m-%d"))]
X_oot = ootDF[top30VarList]
Y_oot = pd.DataFrame(ootDF.Fraud)

#### Trial 1: max_iter = 1000, other hyperparameters are default

In [145]:
nitermax = 10
FDR_trn_hyper1 = []
FDR_tst_hyper1 = []
FDR_oot_hyper1 = []
for niter in range(nitermax):
    X_trn, X_tst, Y_trn, Y_tst = train_test_split(x_trntst, y_trntst, test_size = 0.3)
    clf = LogisticRegression(max_iter = 1000).fit(X_trn, Y_trn)
    
    ## FDR on the training set
    probOf1_trn = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_trn)).loc[:, 1])
    probFraud_trn = pd.concat([probOf1_trn, Y_trn.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_trn = int(round(len(X_trn) * 0.03))
    temp_trn = probFraud_trn.head(topRows_trn)
    needed_trn = temp_trn.loc[:, 'Fraud']
    bads_trn = probFraud_trn.loc[(probFraud_trn.Fraud == 1)]
    FDR_trn = sum(needed_trn) / bads_trn.shape[0]
    FDR_trn_hyper1.append(FDR_trn)
    
    ## FDR on the test set
    probOf1_tst = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_tst)).loc[:, 1])
    probFraud_tst = pd.concat([probOf1_tst, Y_tst.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_tst = int(round(len(X_tst) * 0.03))
    temp_tst = probFraud_tst.head(topRows_tst)
    needed_tst = temp_tst.loc[:, 'Fraud']
    bads_tst = probFraud_tst.loc[(probFraud_tst.Fraud == 1)]
    FDR_tst = sum(needed_tst) / bads_tst.shape[0]
    FDR_tst_hyper1.append(FDR_tst)
    
    ## FDR on the OOT set
    probOf1_oot = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_oot)).loc[:, 1])
    probFraud_oot = pd.concat([probOf1_oot, Y_oot.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_oot = int(round(len(X_oot) * 0.03))
    temp_oot = probFraud_oot.head(topRows_oot)
    needed_oot = temp_oot.loc[:, 'Fraud']
    bads_oot = probFraud_oot.loc[(probFraud_oot.Fraud == 1)]
    FDR_oot = sum(needed_oot) / bads_oot.shape[0]
    FDR_oot_hyper1.append(FDR_oot)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [146]:
FDR_trn_hyper1

[0.6921824104234527,
 0.6728187919463087,
 0.6866666666666666,
 0.6704918032786885,
 0.6632825719120136,
 0.6710310965630114,
 0.6979522184300341,
 0.6683673469387755,
 0.6728499156829679,
 0.6700507614213198]

In [149]:
sum(FDR_trn_hyper1)/len(FDR_trn_hyper1)

0.6765693583263238

In [147]:
FDR_tst_hyper1

[0.6535433070866141,
 0.6727941176470589,
 0.6492537313432836,
 0.6666666666666666,
 0.7075812274368231,
 0.7042801556420234,
 0.6418439716312057,
 0.7,
 0.6690909090909091,
 0.7075812274368231]

In [150]:
sum(FDR_tst_hyper1)/len(FDR_tst_hyper1)

0.6772635313981408

In [148]:
FDR_oot_hyper1

[0.3575418994413408,
 0.329608938547486,
 0.3575418994413408,
 0.329608938547486,
 0.3407821229050279,
 0.3854748603351955,
 0.3407821229050279,
 0.3240223463687151,
 0.3743016759776536,
 0.329608938547486]

In [151]:
sum(FDR_oot_hyper1)/len(FDR_oot_hyper1)

0.3469273743016759

#### Trial 2: max_iter = 1000, class_weight = 'balanced', other hyperparameters are default

In [152]:
nitermax = 10
FDR_trn_hyper2 = []
FDR_tst_hyper2 = []
FDR_oot_hyper2 = []
for niter in range(nitermax):
    X_trn, X_tst, Y_trn, Y_tst = train_test_split(x_trntst, y_trntst, test_size = 0.3)
    clf = LogisticRegression(max_iter = 1000, class_weight = 'balanced').fit(X_trn, Y_trn)
    
    ## FDR on the training set
    probOf1_trn = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_trn)).loc[:, 1])
    probFraud_trn = pd.concat([probOf1_trn, Y_trn.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_trn = int(round(len(X_trn) * 0.03))
    temp_trn = probFraud_trn.head(topRows_trn)
    needed_trn = temp_trn.loc[:, 'Fraud']
    bads_trn = probFraud_trn.loc[(probFraud_trn.Fraud == 1)]
    FDR_trn = sum(needed_trn) / bads_trn.shape[0]
    FDR_trn_hyper2.append(FDR_trn)
    
    ## FDR on the test set
    probOf1_tst = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_tst)).loc[:, 1])
    probFraud_tst = pd.concat([probOf1_tst, Y_tst.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_tst = int(round(len(X_tst) * 0.03))
    temp_tst = probFraud_tst.head(topRows_tst)
    needed_tst = temp_tst.loc[:, 'Fraud']
    bads_tst = probFraud_tst.loc[(probFraud_tst.Fraud == 1)]
    FDR_tst = sum(needed_tst) / bads_tst.shape[0]
    FDR_tst_hyper2.append(FDR_tst)
    
    ## FDR on the OOT set
    probOf1_oot = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_oot)).loc[:, 1])
    probFraud_oot = pd.concat([probOf1_oot, Y_oot.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_oot = int(round(len(X_oot) * 0.03))
    temp_oot = probFraud_oot.head(topRows_oot)
    needed_oot = temp_oot.loc[:, 'Fraud']
    bads_oot = probFraud_oot.loc[(probFraud_oot.Fraud == 1)]
    FDR_oot = sum(needed_oot) / bads_oot.shape[0]
    FDR_oot_hyper2.append(FDR_oot)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [153]:
FDR_trn_hyper2

[0.6904761904761905,
 0.7051070840197694,
 0.6804635761589404,
 0.6823338735818476,
 0.6607431340872375,
 0.6863057324840764,
 0.6857638888888888,
 0.6868686868686869,
 0.7210440456769984,
 0.7054908485856906]

In [156]:
sum(FDR_trn_hyper2)/len(FDR_trn_hyper2)

0.6904597060828326

In [154]:
FDR_tst_hyper2

[0.6890756302521008,
 0.685823754789272,
 0.7159090909090909,
 0.7051792828685259,
 0.7349397590361446,
 0.6791666666666667,
 0.6952054794520548,
 0.6788321167883211,
 0.6196078431372549,
 0.6629213483146067]

In [157]:
sum(FDR_tst_hyper2)/len(FDR_tst_hyper2)

0.6866660972214038

In [155]:
FDR_oot_hyper2

[0.4245810055865922,
 0.4301675977653631,
 0.4301675977653631,
 0.4301675977653631,
 0.4134078212290503,
 0.441340782122905,
 0.4301675977653631,
 0.3854748603351955,
 0.41899441340782123,
 0.41899441340782123]

In [158]:
sum(FDR_oot_hyper2)/len(FDR_oot_hyper2)

0.4223463687150838

### Random Forest

#### Trial 1: # of trees = 150, max_depth = 30, max_features = 20

In [173]:
nitermax = 10
FDR_trn_RF1 = []
FDR_tst_RF1 = []
FDR_oot_RF1 = []
for niter in range(nitermax):
    X_trn, X_tst, Y_trn, Y_tst = train_test_split(x_trntst, y_trntst, test_size = 0.3)
    clf = RandomForestClassifier(n_estimators=150, max_depth=30, max_features=20).fit(X_trn, Y_trn)
    
    ## FDR on the training set
    probOf1_trn = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_trn)).loc[:, 1])
    probFraud_trn = pd.concat([probOf1_trn, Y_trn.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_trn = int(round(len(X_trn) * 0.03))
    temp_trn = probFraud_trn.head(topRows_trn)
    needed_trn = temp_trn.loc[:, 'Fraud']
    bads_trn = probFraud_trn.loc[(probFraud_trn.Fraud == 1)]
    FDR_trn = sum(needed_trn) / bads_trn.shape[0]
    FDR_trn_RF1.append(FDR_trn)
    
    ## FDR on the test set
    probOf1_tst = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_tst)).loc[:, 1])
    probFraud_tst = pd.concat([probOf1_tst, Y_tst.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_tst = int(round(len(X_tst) * 0.03))
    temp_tst = probFraud_tst.head(topRows_tst)
    needed_tst = temp_tst.loc[:, 'Fraud']
    bads_tst = probFraud_tst.loc[(probFraud_tst.Fraud == 1)]
    FDR_tst = sum(needed_tst) / bads_tst.shape[0]
    FDR_tst_RF1.append(FDR_tst)
    
    ## FDR on the OOT set
    probOf1_oot = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_oot)).loc[:, 1])
    probFraud_oot = pd.concat([probOf1_oot, Y_oot.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_oot = int(round(len(X_oot) * 0.03))
    temp_oot = probFraud_oot.head(topRows_oot)
    needed_oot = temp_oot.loc[:, 'Fraud']
    bads_oot = probFraud_oot.loc[(probFraud_oot.Fraud == 1)]
    FDR_oot = sum(needed_oot) / bads_oot.shape[0]
    FDR_oot_RF1.append(FDR_oot)

  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys


In [207]:
FDR_trn_RF1

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]

In [175]:
FDR_tst_RF1

[0.9053030303030303,
 0.8941605839416058,
 0.8912280701754386,
 0.8929889298892989,
 0.8773946360153256,
 0.8847736625514403,
 0.9047619047619048,
 0.9188191881918819,
 0.8645418326693227,
 0.8846153846153846]

In [177]:
sum(FDR_tst_RF1)/len(FDR_tst_RF1)

0.8918587223114633

In [176]:
FDR_oot_RF1

[0.5363128491620112,
 0.553072625698324,
 0.5139664804469274,
 0.5586592178770949,
 0.553072625698324,
 0.5586592178770949,
 0.5307262569832403,
 0.553072625698324,
 0.5418994413407822,
 0.5363128491620112]

In [178]:
sum(FDR_oot_RF1)/len(FDR_oot_RF1)

0.5435754189944134

#### Trial 2: # of trees = 200, max_depth = 8, max_features = 5

In [202]:
nitermax = 10
FDR_trn_RF2 = []
FDR_tst_RF2 = []
FDR_oot_RF2 = []
for niter in range(nitermax):
    X_trn, X_tst, Y_trn, Y_tst = train_test_split(x_trntst, y_trntst, test_size = 0.3)
    clf = RandomForestClassifier(n_estimators=200, max_depth=8, max_features=5).fit(X_trn, Y_trn)
    
    ## FDR on the training set
    probOf1_trn = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_trn)).loc[:, 1])
    probFraud_trn = pd.concat([probOf1_trn, Y_trn.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_trn = int(round(len(X_trn) * 0.03))
    temp_trn = probFraud_trn.head(topRows_trn)
    needed_trn = temp_trn.loc[:, 'Fraud']
    bads_trn = probFraud_trn.loc[(probFraud_trn.Fraud == 1)]
    FDR_trn = sum(needed_trn) / bads_trn.shape[0]
    FDR_trn_RF2.append(FDR_trn)
    
    ## FDR on the test set
    probOf1_tst = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_tst)).loc[:, 1])
    probFraud_tst = pd.concat([probOf1_tst, Y_tst.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_tst = int(round(len(X_tst) * 0.03))
    temp_tst = probFraud_tst.head(topRows_tst)
    needed_tst = temp_tst.loc[:, 'Fraud']
    bads_tst = probFraud_tst.loc[(probFraud_tst.Fraud == 1)]
    FDR_tst = sum(needed_tst) / bads_tst.shape[0]
    FDR_tst_RF2.append(FDR_tst)
    
    ## FDR on the OOT set
    probOf1_oot = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_oot)).loc[:, 1])
    probFraud_oot = pd.concat([probOf1_oot, Y_oot.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_oot = int(round(len(X_oot) * 0.03))
    temp_oot = probFraud_oot.head(topRows_oot)
    needed_oot = temp_oot.loc[:, 'Fraud']
    bads_oot = probFraud_oot.loc[(probFraud_oot.Fraud == 1)]
    FDR_oot = sum(needed_oot) / bads_oot.shape[0]
    FDR_oot_RF2.append(FDR_oot)

  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys


In [206]:
FDR_trn_RF2

[0.8615635179153095,
 0.8589951377633711,
 0.8640132669983416,
 0.8578512396694215,
 0.8481421647819063,
 0.8557213930348259,
 0.8656716417910447,
 0.8609715242881072,
 0.8644338118022329,
 0.8656716417910447]

In [208]:
sum(FDR_trn_RF2)/len(FDR_trn_RF2)

0.8603035339835605

In [209]:
FDR_tst_RF2

[0.8385826771653543,
 0.8446215139442231,
 0.8264150943396227,
 0.8174904942965779,
 0.8313253012048193,
 0.8264150943396227,
 0.8113207547169812,
 0.8007380073800738,
 0.7966804979253111,
 0.7849056603773585]

In [210]:
sum(FDR_tst_RF2)/len(FDR_tst_RF2)

0.8178495095689945

In [211]:
FDR_oot_RF2

[0.5642458100558659,
 0.5586592178770949,
 0.5251396648044693,
 0.553072625698324,
 0.5642458100558659,
 0.5418994413407822,
 0.5307262569832403,
 0.553072625698324,
 0.5586592178770949,
 0.5586592178770949]

In [203]:
sum(FDR_oot_RF2)/len(FDR_oot_RF2)

0.5508379888268156

#### Trial 3: # of trees = 200, max_depth = 8, max_features = 5, class_weight = 'balanced'

In [204]:
nitermax = 10
FDR_trn_RF3 = []
FDR_tst_RF3 = []
FDR_oot_RF3 = []
for niter in range(nitermax):
    X_trn, X_tst, Y_trn, Y_tst = train_test_split(x_trntst, y_trntst, test_size = 0.3)
    clf = RandomForestClassifier(n_estimators=200, max_depth=8, max_features=5, class_weight = 'balanced').fit(X_trn, Y_trn)
    
    ## FDR on the training set
    probOf1_trn = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_trn)).loc[:, 1])
    probFraud_trn = pd.concat([probOf1_trn, Y_trn.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_trn = int(round(len(X_trn) * 0.03))
    temp_trn = probFraud_trn.head(topRows_trn)
    needed_trn = temp_trn.loc[:, 'Fraud']
    bads_trn = probFraud_trn.loc[(probFraud_trn.Fraud == 1)]
    FDR_trn = sum(needed_trn) / bads_trn.shape[0]
    FDR_trn_RF3.append(FDR_trn)
    
    ## FDR on the test set
    probOf1_tst = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_tst)).loc[:, 1])
    probFraud_tst = pd.concat([probOf1_tst, Y_tst.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_tst = int(round(len(X_tst) * 0.03))
    temp_tst = probFraud_tst.head(topRows_tst)
    needed_tst = temp_tst.loc[:, 'Fraud']
    bads_tst = probFraud_tst.loc[(probFraud_tst.Fraud == 1)]
    FDR_tst = sum(needed_tst) / bads_tst.shape[0]
    FDR_tst_RF3.append(FDR_tst)
    
    ## FDR on the OOT set
    probOf1_oot = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_oot)).loc[:, 1])
    probFraud_oot = pd.concat([probOf1_oot, Y_oot.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_oot = int(round(len(X_oot) * 0.03))
    temp_oot = probFraud_oot.head(topRows_oot)
    needed_oot = temp_oot.loc[:, 'Fraud']
    bads_oot = probFraud_oot.loc[(probFraud_oot.Fraud == 1)]
    FDR_oot = sum(needed_oot) / bads_oot.shape[0]
    FDR_oot_RF3.append(FDR_oot)

  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys


In [212]:
FDR_trn_RF3

[0.9477124183006536,
 0.945273631840796,
 0.9515050167224081,
 0.9624183006535948,
 0.9492635024549918,
 0.9508474576271186,
 0.9420035149384886,
 0.9532258064516129,
 0.9596122778675282,
 0.9579124579124579]

In [213]:
sum(FDR_trn_RF3)/len(FDR_trn_RF3)

0.9519774384769653

In [214]:
FDR_tst_RF3

[0.80859375,
 0.7924528301886793,
 0.8185185185185185,
 0.78125,
 0.8404669260700389,
 0.7877697841726619,
 0.8662207357859532,
 0.8266129032258065,
 0.8112449799196787,
 0.8248175182481752]

In [215]:
sum(FDR_tst_RF3)/len(FDR_tst_RF3)

0.8157947946129512

In [216]:
FDR_oot_RF3

[0.2905027932960894,
 0.2905027932960894,
 0.2905027932960894,
 0.3128491620111732,
 0.2905027932960894,
 0.31843575418994413,
 0.30726256983240224,
 0.3016759776536313,
 0.3240223463687151,
 0.27932960893854747]

In [205]:
sum(FDR_oot_RF3)/len(FDR_oot_RF3)

0.3005586592178771

### Neural Network

#### Trial 1: one layer, 4 nodes

In [180]:
nitermax = 10
FDR_trn_NN1 = []
FDR_tst_NN1 = []
FDR_oot_NN1 = []
for niter in range(nitermax):
    X_trn, X_tst, Y_trn, Y_tst = train_test_split(x_trntst, y_trntst, test_size = 0.3)
    clf = MLPClassifier(hidden_layer_sizes=(4)).fit(X_trn, Y_trn)
    
    ## FDR on the training set
    probOf1_trn = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_trn)).loc[:, 1])
    probFraud_trn = pd.concat([probOf1_trn, Y_trn.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_trn = int(round(len(X_trn) * 0.03))
    temp_trn = probFraud_trn.head(topRows_trn)
    needed_trn = temp_trn.loc[:, 'Fraud']
    bads_trn = probFraud_trn.loc[(probFraud_trn.Fraud == 1)]
    FDR_trn = sum(needed_trn) / bads_trn.shape[0]
    FDR_trn_NN1.append(FDR_trn)
    
    ## FDR on the test set
    probOf1_tst = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_tst)).loc[:, 1])
    probFraud_tst = pd.concat([probOf1_tst, Y_tst.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_tst = int(round(len(X_tst) * 0.03))
    temp_tst = probFraud_tst.head(topRows_tst)
    needed_tst = temp_tst.loc[:, 'Fraud']
    bads_tst = probFraud_tst.loc[(probFraud_tst.Fraud == 1)]
    FDR_tst = sum(needed_tst) / bads_tst.shape[0]
    FDR_tst_NN1.append(FDR_tst)
    
    ## FDR on the OOT set
    probOf1_oot = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_oot)).loc[:, 1])
    probFraud_oot = pd.concat([probOf1_oot, Y_oot.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_oot = int(round(len(X_oot) * 0.03))
    temp_oot = probFraud_oot.head(topRows_oot)
    needed_oot = temp_oot.loc[:, 'Fraud']
    bads_oot = probFraud_oot.loc[(probFraud_oot.Fraud == 1)]
    FDR_oot = sum(needed_oot) / bads_oot.shape[0]
    FDR_oot_NN1.append(FDR_oot)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [181]:
FDR_trn_NN1

[0.6798107255520505,
 0.6908212560386473,
 0.7114754098360656,
 0.6688524590163935,
 0.6801948051948052,
 0.6553398058252428,
 0.7006920415224913,
 0.6731391585760518,
 0.6436974789915967,
 0.6444805194805194]

In [182]:
sum(FDR_trn_NN1)/len(FDR_trn_NN1)

0.6748503660033865

In [183]:
FDR_tst_NN1

[0.6581196581196581,
 0.7206477732793523,
 0.6627906976744186,
 0.7325581395348837,
 0.6904761904761905,
 0.676,
 0.6482758620689655,
 0.664,
 0.6483516483516484,
 0.6626984126984127]

In [184]:
sum(FDR_tst_NN1)/len(FDR_tst_NN1)

0.6763918382203531

In [185]:
FDR_oot_NN1

[0.3575418994413408,
 0.3575418994413408,
 0.441340782122905,
 0.3687150837988827,
 0.3575418994413408,
 0.4301675977653631,
 0.4245810055865922,
 0.35195530726256985,
 0.40782122905027934,
 0.3575418994413408]

In [186]:
sum(FDR_oot_NN1)/len(FDR_oot_NN1)

0.38547486033519557

#### Trial 2: one layer, 8 nodes

In [187]:
nitermax = 10
FDR_trn_NN2 = []
FDR_tst_NN2 = []
FDR_oot_NN2 = []
for niter in range(nitermax):
    X_trn, X_tst, Y_trn, Y_tst = train_test_split(x_trntst, y_trntst, test_size = 0.3)
    clf = MLPClassifier(hidden_layer_sizes=(8)).fit(X_trn, Y_trn)
    
    ## FDR on the training set
    probOf1_trn = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_trn)).loc[:, 1])
    probFraud_trn = pd.concat([probOf1_trn, Y_trn.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_trn = int(round(len(X_trn) * 0.03))
    temp_trn = probFraud_trn.head(topRows_trn)
    needed_trn = temp_trn.loc[:, 'Fraud']
    bads_trn = probFraud_trn.loc[(probFraud_trn.Fraud == 1)]
    FDR_trn = sum(needed_trn) / bads_trn.shape[0]
    FDR_trn_NN2.append(FDR_trn)
    
    ## FDR on the test set
    probOf1_tst = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_tst)).loc[:, 1])
    probFraud_tst = pd.concat([probOf1_tst, Y_tst.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_tst = int(round(len(X_tst) * 0.03))
    temp_tst = probFraud_tst.head(topRows_tst)
    needed_tst = temp_tst.loc[:, 'Fraud']
    bads_tst = probFraud_tst.loc[(probFraud_tst.Fraud == 1)]
    FDR_tst = sum(needed_tst) / bads_tst.shape[0]
    FDR_tst_NN2.append(FDR_tst)
    
    ## FDR on the OOT set
    probOf1_oot = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_oot)).loc[:, 1])
    probFraud_oot = pd.concat([probOf1_oot, Y_oot.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_oot = int(round(len(X_oot) * 0.03))
    temp_oot = probFraud_oot.head(topRows_oot)
    needed_oot = temp_oot.loc[:, 'Fraud']
    bads_oot = probFraud_oot.loc[(probFraud_oot.Fraud == 1)]
    FDR_oot = sum(needed_oot) / bads_oot.shape[0]
    FDR_oot_NN2.append(FDR_oot)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [188]:
FDR_trn_NN2

[0.70578231292517,
 0.6818181818181818,
 0.6915739268680445,
 0.7231270358306189,
 0.6948590381426202,
 0.7324185248713551,
 0.694078947368421,
 0.7050847457627119,
 0.7113564668769716,
 0.7471264367816092]

In [189]:
sum(FDR_trn_NN2)/len(FDR_trn_NN2)

0.7087225617245705

In [190]:
FDR_tst_NN2

[0.7178571428571429,
 0.7301587301587301,
 0.7071129707112971,
 0.6692913385826772,
 0.6830188679245283,
 0.7403508771929824,
 0.6961538461538461,
 0.7482014388489209,
 0.7521367521367521,
 0.7335907335907336]

In [191]:
sum(FDR_tst_NN2)/len(FDR_tst_NN2)

0.717787269815761

In [192]:
FDR_oot_NN2

[0.5251396648044693,
 0.39664804469273746,
 0.5307262569832403,
 0.5195530726256983,
 0.5251396648044693,
 0.547486033519553,
 0.5027932960893855,
 0.5307262569832403,
 0.547486033519553,
 0.547486033519553]

In [193]:
sum(FDR_oot_NN2)/len(FDR_oot_NN2)

0.5173184357541899

#### Trial 3: one layer, 12 nodes

In [194]:
nitermax = 10
FDR_trn_NN3 = []
FDR_tst_NN3 = []
FDR_oot_NN3 = []
for niter in range(nitermax):
    X_trn, X_tst, Y_trn, Y_tst = train_test_split(x_trntst, y_trntst, test_size = 0.3)
    clf = MLPClassifier(hidden_layer_sizes=(12)).fit(X_trn, Y_trn)
    
    ## FDR on the training set
    probOf1_trn = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_trn)).loc[:, 1])
    probFraud_trn = pd.concat([probOf1_trn, Y_trn.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_trn = int(round(len(X_trn) * 0.03))
    temp_trn = probFraud_trn.head(topRows_trn)
    needed_trn = temp_trn.loc[:, 'Fraud']
    bads_trn = probFraud_trn.loc[(probFraud_trn.Fraud == 1)]
    FDR_trn = sum(needed_trn) / bads_trn.shape[0]
    FDR_trn_NN3.append(FDR_trn)
    
    ## FDR on the test set
    probOf1_tst = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_tst)).loc[:, 1])
    probFraud_tst = pd.concat([probOf1_tst, Y_tst.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_tst = int(round(len(X_tst) * 0.03))
    temp_tst = probFraud_tst.head(topRows_tst)
    needed_tst = temp_tst.loc[:, 'Fraud']
    bads_tst = probFraud_tst.loc[(probFraud_tst.Fraud == 1)]
    FDR_tst = sum(needed_tst) / bads_tst.shape[0]
    FDR_tst_NN3.append(FDR_tst)
    
    ## FDR on the OOT set
    probOf1_oot = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_oot)).loc[:, 1])
    probFraud_oot = pd.concat([probOf1_oot, Y_oot.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_oot = int(round(len(X_oot) * 0.03))
    temp_oot = probFraud_oot.head(topRows_oot)
    needed_oot = temp_oot.loc[:, 'Fraud']
    bads_oot = probFraud_oot.loc[(probFraud_oot.Fraud == 1)]
    FDR_oot = sum(needed_oot) / bads_oot.shape[0]
    FDR_oot_NN3.append(FDR_oot)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [217]:
FDR_trn_NN3

[0.7487684729064039,
 0.7089430894308943,
 0.6694214876033058,
 0.7377049180327869,
 0.7203252032520325,
 0.7386363636363636,
 0.7512437810945274,
 0.7389240506329114,
 0.6939102564102564,
 0.7475247524752475]

In [218]:
sum(FDR_trn_NN3)/len(FDR_trn_NN3)

0.725540237547473

In [219]:
FDR_tst_NN3

[0.6795366795366795,
 0.6600790513833992,
 0.7300380228136882,
 0.7364341085271318,
 0.6837944664031621,
 0.7222222222222222,
 0.720754716981132,
 0.6991525423728814,
 0.7090163934426229,
 0.7404580152671756]

In [220]:
sum(FDR_tst_NN3)/len(FDR_tst_NN3)

0.7081486218950094

In [195]:
FDR_oot_NN3

[0.5307262569832403,
 0.44692737430167595,
 0.4581005586592179,
 0.5363128491620112,
 0.5083798882681564,
 0.5195530726256983,
 0.5418994413407822,
 0.5307262569832403,
 0.4748603351955307,
 0.5586592178770949]

In [196]:
sum(FDR_oot_NN3)/len(FDR_oot_NN3)

0.5106145251396648

#### Trial 4: two layers, 8 nodes

In [197]:
nitermax = 10
FDR_trn_NN4 = []
FDR_tst_NN4 = []
FDR_oot_NN4 = []
for niter in range(nitermax):
    X_trn, X_tst, Y_trn, Y_tst = train_test_split(x_trntst, y_trntst, test_size = 0.3)
    clf = MLPClassifier(hidden_layer_sizes=(8, 8)).fit(X_trn, Y_trn)
    
    ## FDR on the training set
    probOf1_trn = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_trn)).loc[:, 1])
    probFraud_trn = pd.concat([probOf1_trn, Y_trn.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_trn = int(round(len(X_trn) * 0.03))
    temp_trn = probFraud_trn.head(topRows_trn)
    needed_trn = temp_trn.loc[:, 'Fraud']
    bads_trn = probFraud_trn.loc[(probFraud_trn.Fraud == 1)]
    FDR_trn = sum(needed_trn) / bads_trn.shape[0]
    FDR_trn_NN4.append(FDR_trn)
    
    ## FDR on the test set
    probOf1_tst = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_tst)).loc[:, 1])
    probFraud_tst = pd.concat([probOf1_tst, Y_tst.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_tst = int(round(len(X_tst) * 0.03))
    temp_tst = probFraud_tst.head(topRows_tst)
    needed_tst = temp_tst.loc[:, 'Fraud']
    bads_tst = probFraud_tst.loc[(probFraud_tst.Fraud == 1)]
    FDR_tst = sum(needed_tst) / bads_tst.shape[0]
    FDR_tst_NN4.append(FDR_tst)
    
    ## FDR on the OOT set
    probOf1_oot = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_oot)).loc[:, 1])
    probFraud_oot = pd.concat([probOf1_oot, Y_oot.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_oot = int(round(len(X_oot) * 0.03))
    temp_oot = probFraud_oot.head(topRows_oot)
    needed_oot = temp_oot.loc[:, 'Fraud']
    bads_oot = probFraud_oot.loc[(probFraud_oot.Fraud == 1)]
    FDR_oot = sum(needed_oot) / bads_oot.shape[0]
    FDR_oot_NN4.append(FDR_oot)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [221]:
FDR_trn_NN4

[0.7654723127035831,
 0.7698541329011345,
 0.770764119601329,
 0.7602627257799671,
 0.7625418060200669,
 0.7934426229508197,
 0.7528089887640449,
 0.7766497461928934,
 0.7324414715719063,
 0.7854889589905363]

In [222]:
sum(FDR_trn_NN4)/len(FDR_trn_NN4)

0.7669726885476281

In [223]:
FDR_tst_NN4

[0.7677165354330708,
 0.7250996015936255,
 0.7631578947368421,
 0.7413127413127413,
 0.774074074074074,
 0.7558139534883721,
 0.7755102040816326,
 0.6931407942238267,
 0.774074074074074,
 0.7735042735042735]

In [224]:
sum(FDR_tst_NN4)/len(FDR_tst_NN4)

0.7543404146522532

In [198]:
FDR_oot_NN4

[0.5418994413407822,
 0.5363128491620112,
 0.5139664804469274,
 0.5307262569832403,
 0.5698324022346368,
 0.5810055865921788,
 0.5418994413407822,
 0.5307262569832403,
 0.5418994413407822,
 0.5698324022346368]

In [199]:
sum(FDR_oot_NN4)/len(FDR_oot_NN4)

0.5458100558659218

#### Trial 5: two layers, 8 nodes, activation = 'logistic'

In [200]:
nitermax = 10
FDR_trn_NN5 = []
FDR_tst_NN5 = []
FDR_oot_NN5 = []
for niter in range(nitermax):
    X_trn, X_tst, Y_trn, Y_tst = train_test_split(x_trntst, y_trntst, test_size = 0.3)
    clf = MLPClassifier(hidden_layer_sizes=(8, 8), activation = 'logistic').fit(X_trn, Y_trn)
    
    ## FDR on the training set
    probOf1_trn = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_trn)).loc[:, 1])
    probFraud_trn = pd.concat([probOf1_trn, Y_trn.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_trn = int(round(len(X_trn) * 0.03))
    temp_trn = probFraud_trn.head(topRows_trn)
    needed_trn = temp_trn.loc[:, 'Fraud']
    bads_trn = probFraud_trn.loc[(probFraud_trn.Fraud == 1)]
    FDR_trn = sum(needed_trn) / bads_trn.shape[0]
    FDR_trn_NN5.append(FDR_trn)
    
    ## FDR on the test set
    probOf1_tst = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_tst)).loc[:, 1])
    probFraud_tst = pd.concat([probOf1_tst, Y_tst.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_tst = int(round(len(X_tst) * 0.03))
    temp_tst = probFraud_tst.head(topRows_tst)
    needed_tst = temp_tst.loc[:, 'Fraud']
    bads_tst = probFraud_tst.loc[(probFraud_tst.Fraud == 1)]
    FDR_tst = sum(needed_tst) / bads_tst.shape[0]
    FDR_tst_NN5.append(FDR_tst)
    
    ## FDR on the OOT set
    probOf1_oot = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_oot)).loc[:, 1])
    probFraud_oot = pd.concat([probOf1_oot, Y_oot.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_oot = int(round(len(X_oot) * 0.03))
    temp_oot = probFraud_oot.head(topRows_oot)
    needed_oot = temp_oot.loc[:, 'Fraud']
    bads_oot = probFraud_oot.loc[(probFraud_oot.Fraud == 1)]
    FDR_oot = sum(needed_oot) / bads_oot.shape[0]
    FDR_oot_NN5.append(FDR_oot)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [225]:
FDR_trn_NN5

[0.6908517350157729,
 0.6599063962558502,
 0.6915584415584416,
 0.687603305785124,
 0.6859504132231405,
 0.6962233169129721,
 0.6913183279742765,
 0.6857142857142857,
 0.6981757877280266,
 0.6852791878172588]

In [226]:
sum(FDR_trn_NN5)/len(FDR_trn_NN5)

0.6872581197985149

In [227]:
FDR_tst_NN5

[0.6794871794871795,
 0.7224669603524229,
 0.6666666666666666,
 0.7034220532319392,
 0.6653992395437263,
 0.6602316602316602,
 0.6626016260162602,
 0.6886446886446886,
 0.6415094339622641,
 0.6750902527075813]

In [228]:
sum(FDR_tst_NN5)/len(FDR_tst_NN5)

0.676551976084439

In [229]:
FDR_oot_NN5

[0.4022346368715084,
 0.4301675977653631,
 0.4134078212290503,
 0.3743016759776536,
 0.44692737430167595,
 0.39664804469273746,
 0.41899441340782123,
 0.45251396648044695,
 0.40782122905027934,
 0.4301675977653631]

In [201]:
sum(FDR_oot_NN5)/len(FDR_oot_NN5)

0.4173184357541899

### Boosted Tree

#### Trial 1: # of trees = 800, max_depth = 3, learning_rate = 0.1

In [231]:
nitermax = 10
FDR_trn_xgb1 = []
FDR_tst_xgb1 = []
FDR_oot_xgb1 = []
for niter in range(nitermax):
    X_trn, X_tst, Y_trn, Y_tst = train_test_split(x_trntst, y_trntst, test_size = 0.3)
    clf = GradientBoostingClassifier(n_estimators = 800, max_depth = 3, learning_rate = 0.1).fit(X_trn, Y_trn)
    
    ## FDR on the training set
    probOf1_trn = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_trn)).loc[:, 1])
    probFraud_trn = pd.concat([probOf1_trn, Y_trn.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_trn = int(round(len(X_trn) * 0.03))
    temp_trn = probFraud_trn.head(topRows_trn)
    needed_trn = temp_trn.loc[:, 'Fraud']
    bads_trn = probFraud_trn.loc[(probFraud_trn.Fraud == 1)]
    FDR_trn = sum(needed_trn) / bads_trn.shape[0]
    FDR_trn_xgb1.append(FDR_trn)
    
    ## FDR on the test set
    probOf1_tst = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_tst)).loc[:, 1])
    probFraud_tst = pd.concat([probOf1_tst, Y_tst.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_tst = int(round(len(X_tst) * 0.03))
    temp_tst = probFraud_tst.head(topRows_tst)
    needed_tst = temp_tst.loc[:, 'Fraud']
    bads_tst = probFraud_tst.loc[(probFraud_tst.Fraud == 1)]
    FDR_tst = sum(needed_tst) / bads_tst.shape[0]
    FDR_tst_xgb1.append(FDR_tst)
    
    ## FDR on the OOT set
    probOf1_oot = pd.DataFrame(pd.DataFrame(clf.predict_proba(X_oot)).loc[:, 1])
    probFraud_oot = pd.concat([probOf1_oot, Y_oot.reset_index(drop=True)], axis=1).sort_values(1, ascending = False)
    topRows_oot = int(round(len(X_oot) * 0.03))
    temp_oot = probFraud_oot.head(topRows_oot)
    needed_oot = temp_oot.loc[:, 'Fraud']
    bads_oot = probFraud_oot.loc[(probFraud_oot.Fraud == 1)]
    FDR_oot = sum(needed_oot) / bads_oot.shape[0]
    FDR_oot_xgb1.append(FDR_oot)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [232]:
FDR_trn_xgb1

[0.9983606557377049,
 0.9982905982905983,
 0.9983221476510067,
 0.9983416252072969,
 1.0,
 1.0,
 1.0,
 0.9967845659163987,
 0.9983974358974359,
 1.0]

In [233]:
sum(FDR_trn_xgb1)/len(FDR_trn_xgb1)

0.9988497028700442

In [234]:
FDR_tst_xgb1

[0.9224806201550387,
 0.8657243816254417,
 0.8897058823529411,
 0.9132075471698113,
 0.8676470588235294,
 0.9087591240875912,
 0.8939929328621908,
 0.8861788617886179,
 0.9016393442622951,
 0.8683274021352313]

In [235]:
sum(FDR_tst_xgb1)/len(FDR_tst_xgb1)

0.8917663155262687

In [236]:
FDR_oot_xgb1

[0.5251396648044693,
 0.5195530726256983,
 0.5586592178770949,
 0.5307262569832403,
 0.4245810055865922,
 0.4748603351955307,
 0.5642458100558659,
 0.4972067039106145,
 0.5139664804469274,
 0.5418994413407822]

In [237]:
sum(FDR_oot_xgb1)/len(FDR_oot_xgb1)

0.5150837988826817