In [1]:
import pandas as pd
import numpy as np
import random
from scipy import stats

In [2]:
from sklearn.linear_model import LogisticRegression

In [3]:
from sklearn.feature_selection import RFECV

In [4]:
from sklearn.preprocessing import StandardScaler

In [5]:
v = pd.read_csv('data.csv')

In [6]:
v = v.drop(['Unnamed: 0','random'],axis = 1)

In [7]:
v = v[v.record < 833508]

In [8]:
v = v.set_index('record')

## KS FDR - Drop 50% Variables

In [9]:
KSFDR = pd.DataFrame(columns = ['ks'])

In [10]:
goods = v[v['fraud_label'] == 0]
bads = v[v['fraud_label'] == 1]

In [11]:
i = 0
for column in list(v.columns):
    KSFDR.loc[i,'ks'] = stats.ks_2samp(goods[column],bads[column])[0]
    i = i+1

In [12]:
KSFDR['variable'] = v.columns

In [13]:
numbads = len(bads)

In [14]:
topRows = int(round(len(v)*0.03))
j = 0
for column in v.columns:
    if column != 'fraud_label':
        temp1 = v[[column,'fraud_label']].sort_values(column,ascending=False).head(topRows)
        temp2 = v[[column,'fraud_label']].sort_values(column,ascending=True).head(topRows)
        needed1 = temp1.loc[:,'fraud_label']
        needed2 = temp2.loc[:,'fraud_label']
        FDR1 = sum(needed1)/numbads
        FDR2 = sum(needed2)/numbads
        FDRate = np.maximum(FDR1,FDR2)
        KSFDR.loc[j, 'FDR'] = FDRate
        j = j +1
    else:
        KSFDR.loc[j, 'FDR'] = 1
        j = j+1

In [15]:
KSFDR['KS_rank'] = KSFDR['ks'].rank(axis = 0)
KSFDR['FDR_rank'] = KSFDR['FDR'].rank(axis = 0)
KSFDR['Average_rank'] = (KSFDR['KS_rank'] + KSFDR['FDR_rank'])/2

In [16]:
KSFDR.tail()

Unnamed: 0,ks,variable,FDR,KS_rank,FDR_rank,Average_rank
601,0.289312,address-zip5_unique_dob,0.3149,593.0,593.0,593.0
602,0.295713,address-zip5_unique_name,0.317731,594.0,594.0,594.0
603,0.111611,address-zip5_unique_homephone,0.139669,440.0,443.0,441.5
604,0.0200206,day_week_risk,0.031731,155.0,1.0,78.0
605,1.0,fraud_label,1.0,606.0,606.0,606.0


In [17]:
keep_variable = KSFDR.sort_values('Average_rank', ascending = False).head(int(len(KSFDR)/6))

In [18]:
keep_variable.head()

Unnamed: 0,ks,variable,FDR,KS_rank,FDR_rank,Average_rank
605,1.0,fraud_label,1.0,606.0,606.0,606.0
561,0.328941,address-zip5_pastday,0.354127,604.0,605.0,604.5
547,0.329781,address_pastday,0.351628,605.0,604.0,604.5
11,0.328484,address_count30_date,0.34838,603.0,602.0,602.5
109,0.327827,address-zip5_count30_date,0.351045,602.0,603.0,602.5


In [19]:
keep_variable = list(keep_variable['variable'])

In [20]:
df = v[keep_variable]

In [21]:
df.head()

Unnamed: 0_level_0,fraud_label,address-zip5_pastday,address_pastday,address_count30_date,address-zip5_count30_date,address_count90_date,address-zip5_count90_date,address_count180_date,address-zip5_count180_date,address_count7_date,...,ssn-dob-name_0_count90_count_ratio,homephone-zip5_count3_date,address-homephone_count3_date,address-homephone-zip5_count3_date,name_count90_date,ssn_0_count180_count_ratio,dob-name_count3_date,ssn-name_0_count90_count_ratio,ssn_count3_date,ssn-dob_count3_date
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,365.0,365.0,1,1,1,1,1,1,1,...,0.011111,1,1,1,1,0.005556,1,0.011111,1,1
2,1,365.0,365.0,1,1,1,1,1,1,1,...,0.011111,1,1,1,1,0.005556,1,0.011111,1,1
3,0,365.0,365.0,1,1,1,1,1,1,1,...,0.011111,1,1,1,1,0.005556,1,0.011111,1,1
4,0,365.0,365.0,1,1,1,1,1,1,1,...,0.011111,1,1,1,1,0.005556,1,0.011111,1,1
5,0,365.0,365.0,1,1,1,1,1,1,1,...,0.011111,1,1,1,1,0.005556,1,0.011111,1,1


In [22]:
v = np.nan

## Correlated Variables

In [23]:
# correlation_matrix = pd.read_csv('variable_correlation.csv')

In [24]:
# correlation_matrix = correlation_matrix.set_index('Unnamed: 0')

In [25]:
# correlation_matrix.head()

In [26]:
# correlated_features = set()

In [27]:
# for i in range(len(correlation_matrix.columns)):
#     for j in range(i):
#         if abs(correlation_matrix.iloc[i, j]) > 0.90:
#             colname = correlation_matrix.columns[i]
#            correlated_features.add(colname)

In [28]:
#len(correlated_features)

## Cross Validation

In [29]:
lr = LogisticRegression(max_iter = 1000, solver = 'saga')

In [30]:
keep_variable.remove('fraud_label')

In [31]:
X = df[keep_variable]
Y = df['fraud_label']

In [32]:
scaler = StandardScaler()
data = scaler.fit_transform(X.to_numpy())
data = pd.DataFrame(data, columns = X.columns)

In [33]:
data.head()

Unnamed: 0,address-zip5_pastday,address_pastday,address_count30_date,address-zip5_count30_date,address_count90_date,address-zip5_count90_date,address_count180_date,address-zip5_count180_date,address_count7_date,address-zip5_count7_date,...,ssn-dob-name_0_count90_count_ratio,homephone-zip5_count3_date,address-homephone_count3_date,address-homephone-zip5_count3_date,name_count90_date,ssn_0_count180_count_ratio,dob-name_count3_date,ssn-name_0_count90_count_ratio,ssn_count3_date,ssn-dob_count3_date
0,0.378498,0.40316,-0.117445,-0.10186,-0.180323,-0.165566,-0.20655,-0.215037,-0.072873,-0.067463,...,0.256899,-0.048565,-0.048404,-0.04838,-0.245637,0.346685,-0.046068,0.271896,-0.047251,-0.046014
1,0.378498,0.40316,-0.117445,-0.10186,-0.180323,-0.165566,-0.20655,-0.215037,-0.072873,-0.067463,...,0.256899,-0.048565,-0.048404,-0.04838,-0.245637,0.346685,-0.046068,0.271896,-0.047251,-0.046014
2,0.378498,0.40316,-0.117445,-0.10186,-0.180323,-0.165566,-0.20655,-0.215037,-0.072873,-0.067463,...,0.256899,-0.048565,-0.048404,-0.04838,-0.245637,0.346685,-0.046068,0.271896,-0.047251,-0.046014
3,0.378498,0.40316,-0.117445,-0.10186,-0.180323,-0.165566,-0.20655,-0.215037,-0.072873,-0.067463,...,0.256899,-0.048565,-0.048404,-0.04838,-0.245637,0.346685,-0.046068,0.271896,-0.047251,-0.046014
4,0.378498,0.40316,-0.117445,-0.10186,-0.180323,-0.165566,-0.20655,-0.215037,-0.072873,-0.067463,...,0.256899,-0.048565,-0.048404,-0.04838,-0.245637,0.346685,-0.046068,0.271896,-0.047251,-0.046014


In [34]:
rfecv = RFECV(estimator=lr, step=2, cv=2, verbose=1, n_jobs=1, scoring='roc_auc',min_features_to_select = 20)
rfecv.fit(data, Y)

Fitting estimator with 100 features.




Fitting estimator with 98 features.




Fitting estimator with 96 features.




Fitting estimator with 94 features.




Fitting estimator with 92 features.




Fitting estimator with 90 features.




Fitting estimator with 88 features.




Fitting estimator with 86 features.




Fitting estimator with 84 features.




Fitting estimator with 82 features.




Fitting estimator with 80 features.




Fitting estimator with 78 features.




Fitting estimator with 76 features.




Fitting estimator with 74 features.




Fitting estimator with 72 features.




Fitting estimator with 70 features.




Fitting estimator with 68 features.




Fitting estimator with 66 features.




Fitting estimator with 64 features.




Fitting estimator with 62 features.




Fitting estimator with 60 features.




Fitting estimator with 58 features.




Fitting estimator with 56 features.




Fitting estimator with 54 features.




Fitting estimator with 52 features.




Fitting estimator with 50 features.




Fitting estimator with 48 features.




Fitting estimator with 46 features.




Fitting estimator with 44 features.




Fitting estimator with 42 features.




Fitting estimator with 40 features.




Fitting estimator with 38 features.




Fitting estimator with 36 features.




Fitting estimator with 34 features.




Fitting estimator with 32 features.




Fitting estimator with 30 features.




Fitting estimator with 28 features.




Fitting estimator with 26 features.




Fitting estimator with 24 features.




Fitting estimator with 22 features.




Fitting estimator with 100 features.




Fitting estimator with 98 features.




Fitting estimator with 96 features.




Fitting estimator with 94 features.




Fitting estimator with 92 features.




Fitting estimator with 90 features.




Fitting estimator with 88 features.




Fitting estimator with 86 features.




Fitting estimator with 84 features.




Fitting estimator with 82 features.




Fitting estimator with 80 features.




Fitting estimator with 78 features.




Fitting estimator with 76 features.




Fitting estimator with 74 features.




Fitting estimator with 72 features.




Fitting estimator with 70 features.




Fitting estimator with 68 features.




Fitting estimator with 66 features.




Fitting estimator with 64 features.




Fitting estimator with 62 features.




Fitting estimator with 60 features.




Fitting estimator with 58 features.




Fitting estimator with 56 features.




Fitting estimator with 54 features.




Fitting estimator with 52 features.




Fitting estimator with 50 features.




Fitting estimator with 48 features.




Fitting estimator with 46 features.




Fitting estimator with 44 features.




Fitting estimator with 42 features.




Fitting estimator with 40 features.




Fitting estimator with 38 features.




Fitting estimator with 36 features.




Fitting estimator with 34 features.




Fitting estimator with 32 features.




Fitting estimator with 30 features.




Fitting estimator with 28 features.




Fitting estimator with 26 features.




Fitting estimator with 24 features.




Fitting estimator with 22 features.




Fitting estimator with 100 features.




Fitting estimator with 98 features.




Fitting estimator with 96 features.




KeyboardInterrupt: 

In [None]:
var_selected = pd.DataFrame(sorted(zip(map(lambda x: round(x), rfecv.ranking_), X.columns)),
                            columns = ['ranking', 'variable'])
print(var_selected)