In [55]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
import os,shutil
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, chi2

# Start every run from an empty '/content/data' folder: wipe whatever a
# previous run left behind, recreate the folder, then point file_path at it
# so the cells below read and write their intermediate CSVs there.
file_path = '/content'
data_dir = file_path + '/data'
if os.path.isdir(data_dir):
  shutil.rmtree(data_dir)
os.makedirs(data_dir)
file_path = data_dir

In [56]:
# Label column of the raw dataset.
y_column = 'FRAUD_NONFRAUD'

# Column groups used by the preprocessing cells below.
timestamp_attributes = [
    'PWD_UPDT_TS', 'PH_NUM_UPDT_TS', 'CUST_SINCE_DT',
    'TRAN_DT', 'TRAN_TS', 'ACTVY_DT',
]
# The label is listed here as well, so it is one-hot encoded together with
# the categorical features.
nominal_attributes = [
    'CARR_NAME', 'RGN_NAME', 'STATE_PRVNC_TXT', 'ALERT_TRGR_CD',
    'DVC_TYPE_TXT', 'AUTHC_PRIM_TYPE_CD', 'AUTHC_SCNDRY_STAT_TXT',
    'CUST_STATE', 'ACTN_CD', 'ACTN_INTNL_TXT', 'TRAN_TYPE_CD',
    'FRAUD_NONFRAUD',
]
numeric_attributes = [
    'TRAN_AMT', 'ACCT_PRE_TRAN_AVAIL_BAL', 'CUST_AGE',
    'OPEN_ACCT_CT', 'WF_dvc_age',
]
redundant_attributes = [
    'ACTVY_DT', 'TRAN_DT', 'TRAN_TYPE_CD', 'ACTN_INTNL_TXT', 'ACTN_CD',
]

# Load the raw training data; every column except the label is a feature.
df = pd.read_csv('/content/train dataset.csv')
x_columns = df.columns.tolist()
x_columns.remove(y_column)

In [57]:
# Discard every row whose WF_dvc_age is below zero
df = df.drop(index=df.index[df['WF_dvc_age'] < 0])

In [58]:
# Variant 1: drop every timestamp attribute outright.
drop_timestamp = df.drop(timestamp_attributes, axis='columns')
drop_timestamp.to_csv(file_path+'/drop_timestamp.csv', index=False)

# Variant 2: keep the timestamp information as "days before the transaction"
# features, after removing the redundant attributes.
retain_timestamp = df.drop(redundant_attributes, axis='columns')

# Convert timestamp columns from string to datetime type
retain_timestamp['TRAN_TS'] = pd.to_datetime(retain_timestamp['TRAN_TS'])
retain_timestamp['CUST_SINCE_DT'] = pd.to_datetime(retain_timestamp['CUST_SINCE_DT'])
for column in ('PWD_UPDT_TS', 'PH_NUM_UPDT_TS'):
  # Unparseable password / phone update timestamps become NaT ...
  parsed = pd.to_datetime(retain_timestamp[column], errors="coerce")
  # ... and missing or invalid ones fall back to the account creation date.
  # The result is assigned back to the frame: calling fillna(inplace=True) on
  # `frame.COLUMN` works on an intermediate Series, which pandas 2.x warns
  # about and which leaves the frame unchanged under Copy-on-Write.
  retain_timestamp[column] = parsed.fillna(retain_timestamp['CUST_SINCE_DT'])

# Create new columns for timestamp attributes: whole days between each
# timestamp and the transaction time.
day_columns = {
    'PWD_UPDT_DAYS': 'PWD_UPDT_TS',
    'PH_NUM_UPDT_DAYS': 'PH_NUM_UPDT_TS',
    'CUST_SINCE_DAYS': 'CUST_SINCE_DT',
}
for days_column, ts_column in day_columns.items():
  retain_timestamp[days_column] = (retain_timestamp['TRAN_TS'] - retain_timestamp[ts_column]).dt.days

# Drop rows where any of the day counts is negative. The test is
# "not (value < 0)" rather than "value >= 0" so rows whose day count is NaN
# are kept, exactly as with the three separate drops this replaces.
has_negative_days = (retain_timestamp[list(day_columns)] < 0).any(axis=1)
retain_timestamp = retain_timestamp[~has_negative_days]

retain_timestamp = retain_timestamp.drop(['TRAN_TS', 'PWD_UPDT_TS', 'PH_NUM_UPDT_TS', 'CUST_SINCE_DT'], axis='columns')
retain_timestamp.to_csv(file_path+'/retain_timestamp.csv', index=False)

In [59]:
# For every dataset written so far, produce two NaN-handling variants and
# delete the source file. os.listdir returns a list up front, so the files
# created inside the loop are not picked up by this same loop.
for filename in os.listdir(file_path):
  source = pd.read_csv(file_path + '/' + filename)

  # remove rows with NaN
  source.dropna().to_csv(file_path + '/remove_nan-' + filename, index=False)

  # replace NaN with mode: each nominal column is filled with ITS OWN most
  # frequent value. (A frame-wide fillna with one column's mode would write
  # that value into every other column that has missing entries.)
  filled = source.copy()
  for column in nominal_attributes:
    if column in filled.columns:
      modes = filled[column].mode()
      # mode() is empty when the column holds nothing but NaN; leave such a
      # column untouched instead of failing on the lookup.
      if not modes.empty:
        filled[column] = filled[column].fillna(modes.iloc[0])
  filled.to_csv(file_path + '/nan_with_mode-' + filename, index=False)
  os.remove(file_path + '/' + filename)


In [60]:
# One hot encode nominal attributes (each file is rewritten in place)
for filename in os.listdir(file_path):
  _df = pd.read_csv(file_path + '/' + filename)
  for column in nominal_attributes:
    if column in _df.columns:
      counts = _df[column].value_counts().sort_values(ascending=False)
      # Encode only the values that occur more than 100 times in the dataset;
      # rarer values get no indicator column at all.
      frequent_values = list(counts[counts > 100].keys())

      # dtype=int keeps the indicators as 0/1 in the CSV regardless of the
      # pandas version (newer releases default to booleans).
      encoding = pd.get_dummies(_df[column], prefix=column, dtype=int)
      # str() mirrors how get_dummies builds its column names, so category
      # values that are not strings (e.g. numeric codes) do not break the
      # concatenation.
      kept_columns = [column + '_' + str(value) for value in frequent_values]
      _df = _df.drop(column, axis='columns').join(encoding[kept_columns])
  # The label is binary, so one indicator is enough: keep
  # FRAUD_NONFRAUD_Fraud and drop its complement.
  _df = _df.drop(['FRAUD_NONFRAUD_Non-Fraud'], axis='columns')
  _df.to_csv(file_path + '/' + filename, index=False)

In [61]:
# Standardize and normalize numerical attributes
for filename in os.listdir(file_path):
  _df = pd.read_csv(file_path + '/' + filename)
  _df_normalize = _df.copy()
  _df_standardize = _df.copy()
  _df_normalize[numeric_attributes] = preprocessing.normalize(_df_normalize[numeric_attributes])
  _df_standardize[numeric_attributes] = preprocessing.normalize(_df_standardize[numeric_attributes])
  _df_normalize.to_csv(file_path + '/normalized-' + filename, index=False)
  _df_standardize.to_csv(file_path + '/standardized-' + filename, index=False)
  os.remove(file_path + '/' + filename)

In [62]:
import seaborn as sns
for filename in os.listdir(file_path):
  _df = pd.read_csv(file_path + '/' + filename)
  x_columns = list(_df.columns)
  y_column= 'FRAUD_NONFRAUD_Fraud'
  x_columns.remove(y_column)
  selector = SelectKBest(chi2, k=50)
  _df_new = selector.fit_transform(_df[x_columns], _df[y_column])
  new_features = np.array(x_columns)[selector.get_support()]
  _df[new_features].to_csv(file_path + '/' + filename, index=False)