In [1]:
%load_ext autoreload
%autoreload 2
%config IPCompleter.greedy=True

import sys, os, time, warnings, pdb, pickle, random, math, re, json
warnings.filterwarnings('ignore')
sys.path.insert(0, '../scripts')

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from datetime import datetime

# Single seed constant so every seeded step in the notebook uses the same value.
GLOBAL_SEED = 42

# Display preferences: 4 significant decimals for numpy arrays,
# 2 fixed decimals for floats rendered by pandas.
np.set_printoptions(precision=4)
pd.set_option('display.float_format', '{:.2f}'.format)

# Plot styling applied to every seaborn/matplotlib figure drawn afterwards.
sns.set_style("darkgrid")
%matplotlib inline

In [2]:
# Maps pandas' `dt.dayofweek` codes (0 = Monday ... 6 = Sunday) to readable names.
day_map = dict(enumerate(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']))

In [3]:
# --- Load: stack the two raw CSV files into a single frame with a fresh index.
raw_train = pd.read_csv('../data/fraudTrain.csv')
raw_test = pd.read_csv('../data/fraudTest.csv')
df = pd.concat([raw_train, raw_test], ignore_index=True)

# --- Parse the two date-like columns once and reuse the parsed series below.
trans_ts = pd.to_datetime(df['trans_date_trans_time'])
birth_ts = pd.to_datetime(df['dob'])
df['trans_date_trans_time'] = trans_ts
df['dob'] = birth_ts

# --- Derived features.
# Age in whole years at transaction time (elapsed days / 365.25, truncated).
elapsed_days = (trans_ts - birth_ts).dt.days
df['age_at_trans'] = (elapsed_days / 365.25).astype(int)
# Full cardholder name as a single string.
df['name'] = df['first'] + ' ' + df['last']
# Calendar components of the transaction timestamp.
df['hour'] = trans_ts.dt.hour
df['day_of_week'] = trans_ts.dt.dayofweek
df['week_of_year'] = trans_ts.dt.isocalendar().week
df['month'] = trans_ts.dt.month
# Binary gender flag: 1 for 'M', 0 for anything else.
df['gender_enc'] = (df['gender'] == 'M').astype('int64')

# --- Order chronologically, then discard the raw columns that are not needed again.
df = (
    df.sort_values('trans_date_trans_time')
      .reset_index(drop=True)
      .drop(columns=['Unnamed: 0', 'dob', 'first', 'last', 'street', 'state', 'city', 'zip', 'trans_num', 'unix_time'])
)

# --- Fix the final column order: identity, merchant, geography, time, amount, label.
cols = ['name', 'gender', 'gender_enc', 'age_at_trans', 'job', 'cc_num', 'merchant', 'category', 'merch_lat', 'merch_long', 'city_pop', 'lat', 'long', 'trans_date_trans_time', 'hour', 'day_of_week', 'week_of_year', 'month', 'amt', 'is_fraud']
df = df[cols]

In [4]:
# Chronological hold-out: `df` is sorted by transaction time, so the first 85%
# of rows become the training pool and the remaining (latest) 15% the test set.
split_point = int(len(df) * 0.85)
train_df, test_df = df.iloc[:split_point], df.iloc[split_point:]

In [5]:
# Carve a 20% validation set out of the training pool, stratified on the label
# so both parts keep the same fraud rate; seeded for reproducibility.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    stratify=train_df['is_fraud'],
    random_state=GLOBAL_SEED,
)

In [6]:
# The split keeps the original row labels; replace them with a clean 0..n-1 range.
train_df, val_df = (part.reset_index(drop=True) for part in (train_df, val_df))

In [7]:
# Categorical columns that get an integer-coded `<col>_enc` companion column.
cat_cols = ['cc_num', 'name', 'merchant', 'job', 'category']
# One encoder per column, kept in `les` so the same mapping can be reused later.
les = {col: LabelEncoder() for col in cat_cols}

# Fit on the training split only, so validation/test data never influence the codes.
for col in cat_cols:
    train_df[f'{col}_enc'] = les[col].fit_transform(train_df[col])


def keep_known_categories(frame, split_name):
    """Return the rows of `frame` whose categorical values were all seen in training.

    `LabelEncoder.transform` raises `ValueError` on any label it was not fitted on,
    so rows carrying an unseen value in *any* column of `cat_cols` must be removed
    before encoding. The original cell screened only `cc_num`, and only for the
    test split; this applies the same rule to every encoded column.

    Parameters
    ----------
    frame : DataFrame containing every column listed in `cat_cols`.
    split_name : label used in the printed summary (e.g. 'val', 'test').

    Returns
    -------
    DataFrame with the unknown-category rows removed (index is not reset).
    """
    masks = [frame[col].isin(les[col].classes_) for col in cat_cols]
    known = masks[0]
    for mask in masks[1:]:
        known = known & mask
    n_dropped = int((~known).sum())
    if n_dropped:
        # Surface the row loss instead of shrinking the split silently.
        print(f'{split_name}: dropped {n_dropped} of {len(frame)} rows with categories unseen in train')
    return frame[known]


# When every value is already known, both filters are no-ops and the result
# matches the previous behaviour exactly.
val_df = keep_known_categories(val_df, 'val').reset_index(drop=True)
test_df = keep_known_categories(test_df, 'test')
test_df = test_df.sort_values('trans_date_trans_time').reset_index(drop=True)

for col in cat_cols:
    val_df[f'{col}_enc'] = les[col].transform(val_df[col])
    test_df[f'{col}_enc'] = les[col].transform(test_df[col])

In [8]:
# Continuous features that receive a standardised `<col>_norm` companion column.
num_cols = [
    'amt',
    'lat',
    'long',
    'city_pop',
    'merch_lat',
    'merch_long',
    'age_at_trans',
]

In [9]:
# Standardise the numeric features. The scaler's statistics are learned from the
# training split only and then applied unchanged to validation and test.
scaler = StandardScaler()
scaled_train = scaler.fit_transform(train_df[num_cols])
scaled_val = scaler.transform(val_df[num_cols])
scaled_test = scaler.transform(test_df[num_cols])

# Column i of each scaled matrix corresponds to num_cols[i]; write it back as `<col>_norm`.
for frame, scaled in ((train_df, scaled_train), (val_df, scaled_val), (test_df, scaled_test)):
    for i, col in enumerate(num_cols):
        frame[f'{col}_norm'] = scaled[:, i]

In [10]:
# Persist the three processed splits; the row index is omitted from the files.
processed_outputs = {
    '../data/processed_train.csv': train_df,
    '../data/processed_val.csv': val_df,
    '../data/processed_test.csv': test_df,
}
for out_path, frame in processed_outputs.items():
    frame.to_csv(out_path, index=False)