In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import logging
import os
log = logging.getLogger('Exploratory_Data_Analysis')
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s : %(message)s', datefmt='%H:%M:%S')

In [2]:
if os.path.exists('../data/bank_transactions_data_2.csv'):
        df = pd.read_csv('../data/bank_transactions_data_2.csv')
        log.info('Data has successfully been loaded')
else:
    log.error('File Not Found')

11:19:09 - INFO : Data has successfully been loaded


In [3]:
df.head()

Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,CustomerAge,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate
0,TX000001,AC00128,14.09,2023-04-11 16:29:14,Debit,San Diego,D000380,162.198.218.92,M015,ATM,70,Doctor,81,1,5112.21,2024-11-04 08:08:08
1,TX000002,AC00455,376.24,2023-06-27 16:44:19,Debit,Houston,D000051,13.149.61.4,M052,ATM,68,Doctor,141,1,13758.91,2024-11-04 08:09:35
2,TX000003,AC00019,126.29,2023-07-10 18:16:08,Debit,Mesa,D000235,215.97.143.157,M009,Online,19,Student,56,1,1122.35,2024-11-04 08:07:04
3,TX000004,AC00070,184.5,2023-05-05 16:32:11,Debit,Raleigh,D000187,200.13.225.150,M002,Online,26,Student,25,1,8569.06,2024-11-04 08:09:06
4,TX000005,AC00411,13.45,2023-10-16 17:51:24,Credit,Atlanta,D000308,65.164.3.100,M091,Online,26,Student,198,1,7429.4,2024-11-04 08:06:39


In [4]:
df['amount_to_balance_ratio'] = (df['TransactionAmount'] / df['AccountBalance'] )

In [5]:
df['avg_txn_amount_account'] = df.groupby('AccountID')['TransactionAmount'].transform('mean')

In [6]:
df['txn_amount_account'] = df.groupby('AccountID')['TransactionAmount'].transform('count')

In [7]:
user_stats = df.groupby('AccountID')['TransactionAmount'].agg(['mean','std']).reset_index()
df = df.merge(user_stats, on='AccountID', how='left', suffixes=('','_user'))
df['amount_zscore'] = (df['TransactionAmount'] - df['mean']) / df['std']

In [8]:
df['TransactionDate'] = pd.to_datetime(df['TransactionDate'])
df['PreviousTransactionDate'] = pd.to_datetime(df['PreviousTransactionDate'])

In [9]:
df['diff_btwn_txn_times'] = abs(df['TransactionDate'] - df['PreviousTransactionDate']).dt.days
df['avg_time_btwn_txns'] = df.groupby('AccountID')['diff_btwn_txn_times'].transform('mean')

In [10]:
df['amount_to_user_avg'] = df['TransactionAmount'] / df['avg_txn_amount_account']

In [11]:
df['unique_merchant_user'] = df.groupby('AccountID')['MerchantID'].transform('nunique')

In [12]:
df['TransactionHour'] = df['TransactionDate'].dt.hour

In [13]:
df['TransactionDayOfTheWeek'] = df['TransactionDate'].dt.dayofweek

In [14]:
df['is_night'] = df['TransactionHour'].apply(lambda h: 1 if h < 6 or h > 22 else 0)

In [15]:
df['unique_devices_per_account'] = df.groupby('AccountID')['DeviceID'].transform('count')

In [16]:
df['unique_ips_per_account'] = df.groupby('AccountID')['IP Address'].transform('count')

In [17]:
df['unique_accounts_per_devices'] = df.groupby('DeviceID')['AccountID'].transform('count')

In [18]:
df['avg_loginAttempts'] = df.groupby('AccountID')['LoginAttempts'].transform('mean')

In [19]:
df['merchant_avg_amount'] = df.groupby('MerchantID')['TransactionAmount'].transform('mean')

In [20]:
df['merchant_amount_deviation'] = (df['TransactionAmount'] - df['merchant_avg_amount'])

In [21]:
df['loginAttemps_excess'] = df['LoginAttempts'] - df['avg_loginAttempts']

In [22]:
df['TransactionAmount'] = np.log1p(df['TransactionAmount'])

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import CountEncoder, HashingEncoder
# define numeric features
numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()
one_hot_features = ['Channel','CustomerOccupation','TransactionType']
freq_features = ['Location','MerchantID']
hash_features = ['AccountID','DeviceID','IP Address']
df.drop(columns=['TransactionID','TransactionDate','PreviousTransactionDate'])

Unnamed: 0,AccountID,TransactionAmount,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,CustomerAge,CustomerOccupation,...,TransactionHour,TransactionDayOfTheWeek,is_night,unique_devices_per_account,unique_ips_per_account,unique_accounts_per_devices,avg_loginAttempts,merchant_avg_amount,merchant_amount_deviation,loginAttemps_excess
0,AC00128,2.714032,Debit,San Diego,D000380,162.198.218.92,M015,ATM,70,Doctor,...,16,1,0,7,7,6,1.000,298.023438,-283.933438,0.000
1,AC00455,5.932882,Debit,Houston,D000051,13.149.61.4,M052,ATM,68,Doctor,...,16,1,0,7,7,5,1.000,295.009259,81.230741,0.000
2,AC00019,4.846468,Debit,Mesa,D000235,215.97.143.157,M009,Online,19,Student,...,18,0,0,4,4,5,1.000,350.868000,-224.578000,0.000
3,AC00070,5.223055,Debit,Raleigh,D000187,200.13.225.150,M002,Online,26,Student,...,16,4,0,8,8,6,1.375,290.427097,-105.927097,-0.375
4,AC00411,2.670694,Credit,Atlanta,D000308,65.164.3.100,M091,Online,26,Student,...,17,0,0,6,6,3,1.000,287.662903,-274.212903,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2507,AC00297,6.753683,Credit,Colorado Springs,D000625,21.157.41.17,M072,Branch,33,Doctor,...,17,2,0,10,10,3,1.000,314.232000,541.978000,0.000
2508,AC00322,5.531570,Debit,Tucson,D000410,49.174.157.140,M029,Branch,48,Doctor,...,17,2,0,9,9,5,1.000,271.292800,-19.752800,0.000
2509,AC00095,3.388787,Debit,San Diego,D000095,58.1.27.124,M087,Branch,56,Retired,...,17,0,0,8,8,5,1.000,227.976400,-199.346400,0.000
2510,AC00118,5.230948,Debit,Denver,D000634,21.190.11.223,M041,Online,23,Student,...,16,4,0,4,4,2,1.000,258.365500,-72.395500,0.000


In [None]:
# define transformers 
num_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

robust_features = ['AccountBalance', 'TransactionAmount_log']
robust_transformer = Pipeline(steps=[
    ('robust', RobustScaler())
])

onehot_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

hash_transformer = Pipeline(steps=[
    ('encoder',HashingEncoder(n_components=16))
])

freq_transformer = Pipeline(steps=[
    ('encoder',CountEncoder())
])

# combine into a column transformer
preprocessor = ColumnTransformer(transformers=[
    ('num',num_transformer, numeric_features),
    ('robust',robust_transformer,robust_features),
    ('onehot',onehot_transformer,one_hot_features),
    ('freq',freq_transformer, freq_features),
    ('hash',hash_transformer,hash_features)
])