# Feature Selection for id-Features

## import libraries

In [1]:
import pandas as pd

## read data

In [2]:
# Load the identity table from the relative data/ directory and preview its first rows.
identity_dataframe = pd.read_csv(filepath_or_buffer='data/train_identity.csv')
identity_dataframe.head(n=5)

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0.0,70787.0,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


In [3]:
# Load the transaction table (it carries the isFraud label) and preview its first rows.
transaction_dataframe = pd.read_csv(filepath_or_buffer='data/train_transaction.csv')
transaction_dataframe.head(n=5)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Keep only the join key and the fraud label from the transaction table.
is_fraud_dataframe = transaction_dataframe.loc[:, ['TransactionID', 'isFraud']]
is_fraud_dataframe.head(n=5)

Unnamed: 0,TransactionID,isFraud
0,2987000,0
1,2987001,0
2,2987002,0
3,2987003,0
4,2987004,0


In [6]:
# Attach the fraud label to the identity features, joining on TransactionID.
# The join is an inner join (pandas' default, made explicit here), so only
# transactions that appear in both tables are kept.
dataframe = is_fraud_dataframe.merge(
    identity_dataframe,
    on='TransactionID',
    how='inner',
    # Fail loudly (pandas MergeError) if TransactionID is duplicated on either
    # side; a duplicated key would silently multiply rows and distort the
    # NaN percentages computed from this frame.
    validate='one_to_one',
)
dataframe.head()

Unnamed: 0,TransactionID,isFraud,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0,0.0,70787.0,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,0,-5.0,98945.0,,,0.0,-5.0,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,0,-5.0,191631.0,0.0,0.0,0.0,0.0,,,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,0,-5.0,221832.0,,,0.0,-6.0,,,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0,0.0,7460.0,0.0,0.0,1.0,0.0,,,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


## check the percentage of NaNs in each column

In [7]:
# Count the missing values of every column in one pass, then report each
# count as a percentage of the total number of rows.
nan_counts = dataframe.isna().sum()
total_rows = len(dataframe)
for column, num_nans in zip(nan_counts.index, nan_counts.to_numpy()):
    nan_percentage = num_nans / total_rows * 100
    print('percentage of NaNs for {}: {:.3f}'.format(column, nan_percentage))

percentage of NaNs for TransactionID: 0.000
percentage of NaNs for isFraud: 0.000
percentage of NaNs for id_01: 0.000
percentage of NaNs for id_02: 2.330
percentage of NaNs for id_03: 54.016
percentage of NaNs for id_04: 54.016
percentage of NaNs for id_05: 5.108
percentage of NaNs for id_06: 5.108
percentage of NaNs for id_07: 96.426
percentage of NaNs for id_08: 96.426
percentage of NaNs for id_09: 48.052
percentage of NaNs for id_10: 48.052
percentage of NaNs for id_11: 2.257
percentage of NaNs for id_12: 0.000
percentage of NaNs for id_13: 11.726
percentage of NaNs for id_14: 44.504
percentage of NaNs for id_15: 2.252
percentage of NaNs for id_16: 10.326
percentage of NaNs for id_17: 3.372
percentage of NaNs for id_18: 68.722
percentage of NaNs for id_19: 3.408
percentage of NaNs for id_20: 3.447
percentage of NaNs for id_21: 96.423
percentage of NaNs for id_22: 96.416
percentage of NaNs for id_23: 96.416
percentage of NaNs for id_24: 96.709
percentage of NaNs for id_25: 96.442
per

## remove columns with 90% or more NaNs

In [8]:
# Fraction of missing values per column (mean of the boolean NaN mask);
# only columns whose fraction is below 0.9 are kept.
nan_fraction = dataframe.isna().mean()
less_nan_dataframe = dataframe.loc[:, nan_fraction < 0.9]
less_nan_dataframe.head(n=5)

Unnamed: 0,TransactionID,isFraud,id_01,id_02,id_03,id_04,id_05,id_06,id_09,id_10,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0,0.0,70787.0,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,0,-5.0,98945.0,,,0.0,-5.0,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,0,-5.0,191631.0,0.0,0.0,0.0,0.0,0.0,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,0,-5.0,221832.0,,,0.0,-6.0,,,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0,0.0,7460.0,0.0,0.0,1.0,0.0,0.0,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


## check column datatypes

In [9]:
# Show the dtype pandas inferred for each remaining column.
less_nan_dataframe.dtypes

TransactionID      int64
isFraud            int64
id_01            float64
id_02            float64
id_03            float64
id_04            float64
id_05            float64
id_06            float64
id_09            float64
id_10            float64
id_11            float64
id_12             object
id_13            float64
id_14            float64
id_15             object
id_16             object
id_17            float64
id_18            float64
id_19            float64
id_20            float64
id_28             object
id_29             object
id_30             object
id_31             object
id_32            float64
id_33             object
id_34             object
id_35             object
id_36             object
id_37             object
id_38             object
DeviceType        object
DeviceInfo        object
dtype: object