In [15]:
# A script that cleans up and slims down the data, retaining only the features needed for anomaly detection
import numpy as np
import pandas as pd
import h5py
from datetime import datetime 

In [16]:
# Load the fraud detection dataset (bank transactions, from Kaggle) into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; it has to be adjusted
# before the notebook can run on another machine.
df = pd.read_csv("/Users/chiral/git_projects/fraud_detection/dataset/bank_transactions_data_2.csv")

In [17]:
# Take a first look at the raw data (first five rows, all columns as loaded)
df.head()

Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,CustomerAge,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate
0,TX000001,AC00128,14.09,4/11/23 16:29,Debit,San Diego,D000380,162.198.218.92,M015,ATM,70,Doctor,81,1,5112.21,11/4/24 08:08
1,TX000002,AC00455,376.24,6/27/23 16:44,Debit,Houston,D000051,13.149.61.4,M052,ATM,68,Doctor,141,1,13758.91,11/4/24 08:09
2,TX000003,AC00019,126.29,7/10/23 18:16,Debit,Mesa,D000235,215.97.143.157,M009,Online,19,Student,56,1,1122.35,11/4/24 08:07
3,TX000004,AC00070,184.5,5/5/23 16:32,Debit,Raleigh,D000187,200.13.225.150,M002,Online,26,Student,25,1,8569.06,11/4/24 08:09
4,TX000005,AC00411,13.45,10/16/23 17:51,Credit,Atlanta,D000308,65.164.3.100,M091,Online,26,Student,198,1,7429.4,11/4/24 08:06


In [18]:
# Summarize each column's dtype and non-null count to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2512 entries, 0 to 2511
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   TransactionID            2512 non-null   object 
 1   AccountID                2512 non-null   object 
 2   TransactionAmount        2512 non-null   float64
 3   TransactionDate          2512 non-null   object 
 4   TransactionType          2512 non-null   object 
 5   Location                 2512 non-null   object 
 6   DeviceID                 2512 non-null   object 
 7   IP Address               2512 non-null   object 
 8   MerchantID               2512 non-null   object 
 9   Channel                  2512 non-null   object 
 10  CustomerAge              2512 non-null   int64  
 11  CustomerOccupation       2512 non-null   object 
 12  TransactionDuration      2512 non-null   int64  
 13  LoginAttempts            2512 non-null   int64  
 14  AccountBalance          

In [19]:
# Derive time-based features from the transaction timestamps: hour of day,
# ISO week number, and the gap in days between the two recorded dates.
# An unusual amount of time between purchases could indicate fraud.

# Parse each timestamp column exactly once. The raw strings look like '4/11/23 16:29'.
DATETIME_FORMAT = '%m/%d/%y %H:%M'
df['TransactionDate_dt'] = pd.to_datetime(df['TransactionDate'], format=DATETIME_FORMAT)
df['PreviousTransactionDate_dt'] = pd.to_datetime(df['PreviousTransactionDate'], format=DATETIME_FORMAT)

# Reuse the parsed column rather than re-parsing the strings for every feature.
df['TransactionDate_hour'] = df['TransactionDate_dt'].dt.hour
df['TransactionWeekNumber'] = df['TransactionDate_dt'].dt.isocalendar().week

# Gap between the two timestamps in fractional days (seconds / seconds-per-day).
# The difference is taken as previous minus current; in the rows displayed in this
# notebook PreviousTransactionDate is the later of the two, so the values are positive.
# NOTE(review): confirm that ordering holds for every row of the dataset.
SECONDS_PER_DAY = 3600 * 24
df['DaysSinceLastPurchase'] = (
    df['PreviousTransactionDate_dt'] - df['TransactionDate_dt']
).dt.total_seconds() / SECONDS_PER_DAY

In [20]:
# Snapshot the column names and dtypes as they are before any encoding happens.
column_names = list(df.columns)
column_data_types = list(df.dtypes)

# Every object (string) column is replaced by the integer codes of its
# categorical representation so that the frame becomes fully numeric.
# NOTE(review): this also encodes the raw date strings ('TransactionDate',
# 'PreviousTransactionDate'); confirm such codes are meaningful as features.
object_columns = [
    name
    for name, dtype in zip(column_names, column_data_types)
    if dtype == "object"
]
for name in object_columns:
    as_category = df[name].astype('category')
    df[name] = as_category.cat.codes


In [21]:
# List the column names captured before encoding (includes the engineered date features)
print(column_names)

['TransactionID', 'AccountID', 'TransactionAmount', 'TransactionDate', 'TransactionType', 'Location', 'DeviceID', 'IP Address', 'MerchantID', 'Channel', 'CustomerAge', 'CustomerOccupation', 'TransactionDuration', 'LoginAttempts', 'AccountBalance', 'PreviousTransactionDate', 'TransactionDate_dt', 'PreviousTransactionDate_dt', 'TransactionDate_hour', 'TransactionWeekNumber', 'DaysSinceLastPurchase']


In [22]:
# Inspect the frame now that string columns hold integer category codes
df.head()

Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,...,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate,TransactionDate_dt,PreviousTransactionDate_dt,TransactionDate_hour,TransactionWeekNumber,DaysSinceLastPurchase
0,0,126,14.09,1247,1,36,365,186,14,0,...,0,81,1,5112.21,2,2023-04-11 16:29:00,2024-11-04 08:08:00,16,15,572.652083
1,1,450,376.24,1717,1,15,50,82,51,0,...,0,141,1,13758.91,3,2023-06-27 16:44:00,2024-11-04 08:09:00,16,26,495.642361
2,2,18,126.29,1810,1,23,229,343,8,2,...,3,56,1,1122.35,1,2023-07-10 18:16:00,2024-11-04 08:07:00,18,28,482.577083
3,3,68,184.5,1569,1,33,182,300,1,2,...,3,25,1,8569.06,3,2023-05-05 16:32:00,2024-11-04 08:09:00,16,18,548.650694
4,4,406,13.45,261,0,1,298,501,90,2,...,3,198,1,7429.4,0,2023-10-16 17:51:00,2024-11-04 08:06:00,17,42,384.59375


In [23]:
# df.describe()

In [24]:
# Slim the frame down to the columns used for training.

# Features to discard: identifiers, amount, location, and the raw/parsed
# timestamp columns.
# NOTE(review): 'PreviousTransactionDate' is not listed here, so its category
# code stays in the frame -- confirm that is intended.
extraneous_variables = [
    'TransactionID',
    'AccountID',
    'TransactionAmount',
    'Location',
    'TransactionDate',
    'DeviceID',
    'IP Address',
    'MerchantID',
    'TransactionDate_dt',
    'PreviousTransactionDate_dt',
]
df = df.drop(columns=extraneous_variables)

In [25]:
# Preview the slimmed-down frame: only the features kept for the clustering algorithm should remain
df.head()

Unnamed: 0,TransactionType,Channel,CustomerAge,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate,TransactionDate_hour,TransactionWeekNumber,DaysSinceLastPurchase
0,1,0,70,0,81,1,5112.21,2,16,15,572.652083
1,1,0,68,0,141,1,13758.91,3,16,26,495.642361
2,1,2,19,3,56,1,1122.35,1,18,28,482.577083
3,1,2,26,3,25,1,8569.06,3,16,18,548.650694
4,0,2,26,3,198,1,7429.4,0,17,42,384.59375


In [26]:
# Check the dtype of every remaining column before exporting
df.dtypes

TransactionType               int8
Channel                       int8
CustomerAge                  int64
CustomerOccupation            int8
TransactionDuration          int64
LoginAttempts                int64
AccountBalance             float64
PreviousTransactionDate       int8
TransactionDate_hour         int32
TransactionWeekNumber       UInt32
DaysSinceLastPurchase      float64
dtype: object

In [27]:
# Give every integer column (int8, int32, nullable UInt32, ...) one plain
# NumPy integer dtype. Presumably this keeps the HDF5 export free of pandas
# extension dtypes -- TODO confirm.
numeric_columns = df.select_dtypes(include='number').columns
integer_columns = [
    name for name in numeric_columns
    if pd.api.types.is_integer_dtype(df[name])
]
# Swap np.int64 for np.int32 here if a narrower type is needed.
df = df.astype({name: np.int64 for name in integer_columns})


In [38]:
# The data is now fully numeric, so export the cleaned feature table to HDF5.

# Output file name for the HDF5 dataset (relative to the current working directory)
filename = 'credit_card_fraud.h5'

# Write the frame under the key 'fraud_dataset'; mode='w' starts a fresh file,
# replacing any existing file with the same name.
# NOTE(review): pandas' to_hdf is backed by PyTables (`tables`) -- confirm it is installed.
df.to_hdf(filename, key='fraud_dataset', mode='w')


In [39]:
# Sanity-check the export: read the 'fraud_dataset' key back from the HDF5 file
# and print the first rows for a visual comparison with the frame that was written.
df_hdf5 = pd.read_hdf(filename, key='fraud_dataset')
print(df_hdf5.head())

   TransactionType  Channel  CustomerAge  CustomerOccupation  \
0                1        0           70                   0   
1                1        0           68                   0   
2                1        2           19                   3   
3                1        2           26                   3   
4                0        2           26                   3   

   TransactionDuration  LoginAttempts  AccountBalance  \
0                   81              1         5112.21   
1                  141              1        13758.91   
2                   56              1         1122.35   
3                   25              1         8569.06   
4                  198              1         7429.40   

   PreviousTransactionDate  TransactionDate_hour  TransactionWeekNumber  \
0                        2                    16                     15   
1                        3                    16                     26   
2                        1                    1

In [None]:
# Alright, so let's try to get training.
