# Import Packages and Read Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the labelled training accounts, the test accounts, and the raw
# transaction log, in that order.
df_train, df_test, df_transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('train_accounts.csv', 'test_accounts.csv', 'transactions.csv')
)

In [3]:
# Collect the account identifiers of both splits as plain Python lists.
train_list = df_train['account'].tolist()
test_list = df_test['account'].tolist()
whole_list = [*train_list, *test_list]

print('Training records:', len(train_list), 'Testing records', len(test_list))
# The second figure counts distinct accounts; it matches the first only when
# the two splits share no account and contain no duplicates.
print('Total records:', len(whole_list), len(set(whole_list)))

# Training accounts whose flag column equals 1.
fraud_list = df_train.loc[df_train['flag'] == 1, 'account'].tolist()
print('Fraud records:', len(fraud_list))

Training records: 25198 Testing records 6300
Total records: 31498 31498
Fraud records: 2455


In [4]:
# Size of the raw transaction log, followed by the number of distinct
# sending and receiving accounts.
print(df_transactions.shape)
print(*(df_transactions[side].nunique() for side in ('from_account', 'to_account')))

(874985, 6)
87799 80227


## Preprocessing

In [5]:
# Show the number of missing values per column, then discard every row that
# has at least one. Per the original author's note, the single incomplete
# record does not involve any account in the train / test lists.
print(df_transactions.isna().sum())
df_transactions = df_transactions.dropna(axis=0, how='any')

from_account            0
to_account              1
transaction_time_utc    1
value                   1
gas                     1
gas_price               1
dtype: int64


In [6]:
# Checkpoint: independent snapshot of the transactions right after the
# incomplete rows were dropped and before any derived column is added.
df_transactions_part_0 = df_transactions.copy(deep=True)

In [7]:
# Parse the timestamp column once, store it back, and derive the calendar
# parts from the parsed values.
timestamps = pd.to_datetime(df_transactions['transaction_time_utc'])
df_transactions['transaction_time_utc'] = timestamps
df_transactions['transaction_year'] = timestamps.dt.year
df_transactions['transaction_month'] = timestamps.dt.month
df_transactions['transaction_day'] = timestamps.dt.day
df_transactions['transaction_date'] = timestamps.dt.date

In [8]:
# Rescale gas_price by 1e9 (presumably wei -> gwei; confirm against the data
# source). The unscaled values are read from the df_transactions_part_0
# snapshot so that re-running this cell does not divide an already-rescaled
# column a second time.
df_transactions['gas_price'] = df_transactions_part_0['gas_price'].div(1_000_000_000)
# gas_fee is gas multiplied by the rescaled gas price.
df_transactions['gas_fee'] = df_transactions['gas'].mul(df_transactions['gas_price'])
# Last expression: display the distribution of the new column.
df_transactions['gas_fee'].describe()

count    8.749840e+05
mean     2.977605e+07
std      1.501229e+08
min      0.000000e+00
25%      2.404282e+06
50%      8.550000e+06
75%      2.529370e+07
max      4.085285e+10
Name: gas_fee, dtype: float64

In [9]:
# `value` is handled as a string here: is_token is 1 when it is exactly '0'
# (presumably zero-value transfers are token transactions; confirm).
df_transactions['is_token'] = np.where(df_transactions['value'] == '0', 1, 0)
# Number of characters in the value string, computed with the vectorised
# string accessor instead of a Python-level apply over every row.
df_transactions['value_digit'] = df_transactions['value'].str.len()

In [10]:
# Checkpoint: independent snapshot taken once the date, gas-fee, is_token and
# value_digit columns have been added.
df_transactions_part_1 = df_transactions.copy(deep=True)

In [11]:
# Display summary statistics of the preprocessed transactions frame.
df_transactions.describe()

Unnamed: 0,gas,gas_price,transaction_year,transaction_month,transaction_day,gas_fee,is_token,value_digit
count,874984.0,874984.0,874984.0,874984.0,874984.0,874984.0,874984.0,874984.0
mean,298439.2,100.692407,2020.0,6.97626,15.782499,29776050.0,0.698604,6.031943
std,554868.6,292.680415,0.0,1.198192,9.012622,150122900.0,0.458865,7.691872
min,21000.0,0.0,2020.0,5.0,1.0,0.0,0.0,1.0
25%,54901.0,33.0,2020.0,6.0,8.0,2404282.0,0.0,1.0
50%,130000.0,59.0,2020.0,7.0,16.0,8550000.0,1.0,1.0
75%,400000.0,100.8,2020.0,8.0,24.0,25293700.0,1.0,16.0
max,11903970.0,171397.020211,2020.0,9.0,31.0,40852850000.0,1.0,23.0


## Dataset Construction

In [12]:
# Display the columns currently available on the transactions frame.
df_transactions.columns

Index(['from_account', 'to_account', 'transaction_time_utc', 'value', 'gas',
       'gas_price', 'transaction_year', 'transaction_month', 'transaction_day',
       'transaction_date', 'gas_fee', 'is_token', 'value_digit'],
      dtype='object')

In [13]:
# Start from one row per account (train and test accounts together).
df_all = pd.DataFrame(data={'account': whole_list})
# Transaction-level columns to carry over for feature engineering.
usable_var_1 = ['transaction_date', 'gas', 'gas_price', 'gas_fee', 'is_token', 'value_digit']

# Attach every outgoing transaction to its sending account. The left join keeps
# accounts that never appear as a sender (their transaction columns are NaN).
# NOTE(review): the original cell was left unfinished (`df_transactions[]`);
# joining on the sender side is an assumption - confirm, and add the
# to_account side in the same way if receiver features are needed.
df_all = pd.merge(
    df_all,
    df_transactions[['from_account'] + usable_var_1],
    how='left',
    left_on='account',
    right_on='from_account',
)

## Feature Engineering

1. Features computed for each account as the sender (`from_account`)
2. Features computed for each account as the receiver (`to_account`)

- Maximum number of transactions on a single date
- Total number of transactions
- Whether the from/to account is identified as fraud
- Total number of token / non-token transactions
- Max, min, mean, and std of gas price / gas / gas_fee
- Max, min, mean, and std of the transaction value_digit

Explore the feature distributions to see whether any transformation is needed.

Implement models: regression, autoencoder, boosting, RNN.