# Ethereum Fraud Detection

## Import Packages and Read Data

In [17]:
import pandas as pd
import numpy as np

In [18]:
df_train = pd.read_csv('data/train_accounts.csv')
df_test = pd.read_csv('data/test_accounts.csv')
df_transactions = pd.read_csv('data/transactions.csv')

In [19]:
train_list = df_train.account.to_list()
test_list = df_test.account.to_list()
whole_list = train_list + test_list

print('Training records:', len(train_list), 'Testing records', len(test_list))
print('Total records:', len(whole_list), len(set(whole_list)))

fraud_list = df_train[df_train.flag == 1].account.to_list()
print('Fraud records:', len(fraud_list))

Training records: 25198 Testing records 6300
Total records: 31498 31498
Fraud records: 2455


In [20]:
# Flag Distribution
print(df_train.flag.value_counts())

0    22743
1     2455
Name: flag, dtype: int64


In [21]:
print(df_transactions.isnull().sum())

print('b316' in train_list)
print('b316' in test_list)

display(df_transactions[df_transactions.to_account.isnull()])
df_transactions = df_transactions.dropna()

from_account            0
to_account              0
transaction_time_utc    0
value                   0
gas                     0
gas_price               0
dtype: int64
False
False


Unnamed: 0,from_account,to_account,transaction_time_utc,value,gas,gas_price


## Exploratory Data Analysis & Preprocessing

In [22]:
print(df_transactions.shape)
print(df_transactions.from_account.nunique(), df_transactions.to_account.nunique())

(5826604, 6)
604847 419535


In [23]:
df_transactions.transaction_time_utc = pd.to_datetime(df_transactions.transaction_time_utc)
df_transactions['transaction_year'] = df_transactions.transaction_time_utc.dt.year
df_transactions['transaction_month'] = df_transactions.transaction_time_utc.dt.month
df_transactions['transaction_day'] = df_transactions.transaction_time_utc.dt.day

In [24]:
df_transactions['gas_price'] = df_transactions['gas_price'].div(1000000000)
df_transactions['gas_fee'] = df_transactions['gas'].mul(df_transactions['gas_price'])
df_transactions['gas_fee'].describe()

count    5.826604e+06
mean     1.526253e+07
std      9.673410e+07
min      0.000000e+00
25%      4.590000e+05
50%      1.890000e+06
75%      9.000000e+06
max      6.923966e+10
Name: gas_fee, dtype: float64

In [25]:
df_transactions['is_token'] = np.where(df_transactions['value']==0, 1, 0)
df_transactions['value_digit'] = df_transactions.value.apply(lambda x: len(x))

In [26]:
display(df_transactions)
display(df_transactions[df_transactions.from_account.isin(fraud_list)])
display(df_transactions[df_transactions.to_account.isin(fraud_list)])

Unnamed: 0,from_account,to_account,transaction_time_utc,value,gas,gas_price,transaction_year,transaction_month,transaction_day,gas_fee,is_token,value_digit
0,a00996,b31499,2020-05-04 14:54:03,0,72585,11.500000,2020,5,4,8.347275e+05,0,1
1,a07890,b31500,2020-05-04 14:55:06,0,54426,11.349723,2020,5,4,6.177200e+05,0,1
2,a22857,b31501,2020-05-04 14:55:23,0,200000,14.024585,2020,5,4,2.804917e+06,0,1
3,a07890,b31502,2020-05-04 14:55:23,108900000000000000,21000,11.349723,2020,5,4,2.383442e+05,0,18
4,a21390,b31501,2020-05-04 14:56:05,0,149999,32.000000,2020,5,4,4.799968e+06,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5826599,b37259,a16395,2020-05-04 13:20:57,2000000000000000,21000,8.000000,2020,5,4,1.680000e+05,0,16
5826600,a18542,b31501,2020-05-04 13:21:32,0,60000,10.800001,2020,5,4,6.480001e+05,0,1
5826601,a20151,b966524,2020-05-04 13:21:32,13000000000000000,21000,10.800001,2020,5,4,2.268000e+05,0,17
5826602,a25907,b31505,2020-05-04 13:22:10,0,1500000,12.000000,2020,5,4,1.800000e+07,0,1


Unnamed: 0,from_account,to_account,transaction_time_utc,value,gas,gas_price,transaction_year,transaction_month,transaction_day,gas_fee,is_token,value_digit
420,a05997,b31735,2020-05-04 15:35:10,0,210000,0.00,2020,5,4,0.000000e+00,0,1
500,a05997,b31735,2020-05-04 15:57:42,0,210000,0.00,2020,5,4,0.000000e+00,0,1
622,a26604,b31818,2020-05-04 16:29:48,6076665128599282058,21000,20.00,2020,5,4,4.200000e+05,0,19
1236,a15674,b32254,2020-05-04 17:26:13,97718000000000000,21000,30.00,2020,5,4,6.300000e+05,0,17
1448,a13198,b31704,2020-05-04 18:48:44,100000000000000000,279749,8.00,2020,5,4,2.237992e+06,0,18
...,...,...,...,...,...,...,...,...,...,...,...,...
5825441,a10789,a06893,2020-05-04 07:46:15,10000000000000000,21000,7.00,2020,5,4,1.470000e+05,0,17
5825679,a24708,b966219,2020-05-04 08:54:34,2389126830000000000,21000,10.89,2020,5,4,2.286900e+05,0,19
5825693,a04245,b966219,2020-05-04 08:58:13,20623693655805113552,21000,9.90,2020,5,4,2.079000e+05,0,20
5826254,a04164,b31710,2020-05-04 12:02:38,0,120000,12.00,2020,5,4,1.440000e+06,0,1


Unnamed: 0,from_account,to_account,transaction_time_utc,value,gas,gas_price,transaction_year,transaction_month,transaction_day,gas_fee,is_token,value_digit
51,b31530,a10789,2020-05-04 15:06:43,10000000000000000,21000,12.000000,2020,5,4,2.520000e+05,0,17
157,b31600,a10789,2020-05-04 14:52:36,10000000000000000,21000,12.000000,2020,5,4,2.520000e+05,0,17
163,b31604,a04046,2020-05-04 14:06:35,1087000000000000000,200000,12.000000,2020,5,4,2.400000e+06,0,19
459,b31754,a26604,2020-05-04 15:51:28,1999727000000000000,21000,13.000000,2020,5,4,2.730000e+05,0,19
472,b31762,a22356,2020-05-04 15:54:08,100000000000000000,21000,8.000000,2020,5,4,1.680000e+05,0,18
...,...,...,...,...,...,...,...,...,...,...,...,...
5826144,b966474,a14838,2020-05-04 11:21:06,999893950000000000,21000,5.050000,2020,5,4,1.060500e+05,0,18
5826256,b31709,a04164,2020-05-04 12:02:54,1500000000000000,120000,12.000000,2020,5,4,1.440000e+06,0,16
5826290,b921188,a10789,2020-05-04 12:12:59,10000000000000000,21000,8.000000,2020,5,4,1.680000e+05,0,17
5826512,b966512,a14838,2020-05-04 12:57:37,1989853000000000000,21000,7.000000,2020,5,4,1.470000e+05,0,19


In [27]:
print(df_transactions.value.value_counts())

0                      3677450
1000000000000000000      36483
100000000000000000       27323
10000000000000000        22847
1000000000000000         22662
                        ...   
299159748522984033           1
99212373461673444            1
1131243810000000000          1
200662480387428992           1
51949804000000000            1
Name: value, Length: 1248010, dtype: int64


In [28]:
print(df_transactions[df_transactions.from_account.isin(fraud_list)].to_account.value_counts())

b31735     3437
b35358     3330
b57644     3292
b141502    1460
b279004    1455
           ... 
b655785       1
b394623       1
b655793       1
b655950       1
a06893        1
Name: to_account, Length: 19254, dtype: int64


In [29]:
print(df_transactions[df_transactions.from_account.isin(fraud_list)].value.value_counts())

0                       31209
10000000000000000        1670
5000000000000000         1098
1000000000000000000       617
100000000000000000        541
                        ...  
102834000000000             1
75676000000000              1
105764000000000             1
44836000000000              1
20623693655805113552        1
Name: value, Length: 19521, dtype: int64


In [30]:
print(df_transactions[df_transactions.from_account.isin(fraud_list)].gas.value_counts())


21000      18313
60000      16578
210000      6411
250000      5652
100000       527
           ...  
152541         1
52467          1
53931          1
145379         1
3905623        1
Name: gas, Length: 6386, dtype: int64


In [31]:
print(df_transactions[df_transactions.from_account.isin(fraud_list)].gas_price.value_counts())

0.000000      6393
4.000000      3028
21.000000     2808
41.000000     2722
5.000000      2341
              ... 
143.428571       1
41.000001        1
39.600002        1
139.657143       1
9.900000         1
Name: gas_price, Length: 4449, dtype: int64


In [32]:
df_train

Unnamed: 0,account,flag
0,a17249,0
1,a03683,1
2,a22146,0
3,a26056,1
4,a13971,0
...,...,...
25193,a24443,0
25194,a12337,0
25195,a08122,0
25196,a27826,1
