# Ethereum transactions exploratory data analysis

### Read the pandas dataframe parquet files

In [1]:
import os
import json

import pandas as pd
import numpy as np
import altair as alt

In [2]:
# Folder, relative to this notebook, that the input datasets are read from.
PATH = "../data/"

In [3]:
# Configure how pandas renders dataframes in this notebook: show up to 100
# columns and 100 rows, allow up to 100 characters per cell, and print floats
# with four decimal places.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.float_format', '{:.4f}'.format)

In [4]:
# List everything under PATH, then keep the entries whose name starts with
# `df_` (the files later read with pd.read_parquet).
file_dir = os.listdir(PATH)
# os.listdir returns entries in arbitrary order; sort the paths so the files
# are always concatenated in the same order and a re-run yields the same frame.
file_list = sorted(os.path.join(PATH, name) for name in file_dir if name.startswith('df_'))

In [5]:
# Read each file in file_list as parquet and stack the frames row-wise into a
# single raw dataframe (the original per-file index values are kept).
df_raw = pd.concat(list(map(pd.read_parquet, file_list)))

In [6]:
# Work on an independent (deep) copy so df_raw stays untouched by the
# transformations applied to df below.
df = df_raw.copy(deep=True)

### Convert `type` to categorical data type

In [7]:
# Store `type` as an (unordered) pandas categorical instead of its raw dtype.
df['type'] = df['type'].astype(pd.CategoricalDtype())

### Add `eth_value` column
Convert `value` which by default is in Wei (the smallest unit of Ether) to Ether.  
1 ETH = 10^18 Wei  
1 ETH = 10^9 Gwei

In [8]:
# 1 ETH = 10**18 Wei, so dividing the Wei amount by 10**18 gives the value in Ether.
df['eth_value'] = df['value'] / 10**18

### Read the block aggregated transaction counts json file
This dataset contains the total count of transactions per block rather than the raw transactions; it is aggregated to daily totals further below.

In [9]:
# Load the block-level transaction counts. The file is opened in a `with`
# block so the handle is closed as soon as the JSON has been parsed.
with open('../data/raw/block_txn_counts.json') as block_txn_file:
    block_txn = json.load(block_txn_file)

# Keep the untouched frame in df_txn_raw and transform a copy (df_txn).
df_txn_raw = pd.DataFrame(block_txn)
df_txn = df_txn_raw.copy()

### Convert `block_timestamp` to datetime and create `block_date` field

In [10]:
# Cast the raw block timestamps to datetime64 at second resolution.
df_txn['block_timestamp'] = df_txn['block_timestamp'].astype('datetime64[s]')
# Truncate each timestamp to midnight to get its calendar date.
# `.dt.normalize()` is used instead of `.astype('datetime64[D]')` because
# pandas 2.x only accepts s/ms/us/ns units and rejects a day-resolution cast.
# The column stays datetime64, so `.dt` accessors and date-range filters on
# `block_date` keep working.
df_txn['block_date'] = df_txn['block_timestamp'].dt.normalize()

### Read the address labels parquet file

In [11]:
# Address labels, stored alongside the other datasets under PATH.
df_labels = pd.read_parquet(os.path.join(PATH, 'address_labels.parquet'))

### The Ethereum raw transactions dataset
- Due to the memory and storage limitations of the free cloud Jupyter instance, only raw transactions for 1 July 2022 to 7 July 2022 and for 6 July 2021 were extracted from the API of a remote node provider (Alchemy).
- Raw transactions for 30 June and 8 July 2022 are incomplete.
- The total number of transactions for each of the dates below (except 30 June and 8 July 2022) matches the count reported by [Dune](add link).

#### The number of transactions in the dataset by date

In [28]:
# Number of raw transaction rows available for each block date.
df.groupby('block_date').size()

block_date
2021-07-06    1355421
2022-06-30        511
2022-07-01    1178620
2022-07-02    1189382
2022-07-03    1157052
2022-07-04    1151999
2022-07-05    1182504
2022-07-06    1161432
2022-07-07    1194330
2022-07-08      76276
dtype: int64

In [29]:
# Peek at five random transactions. The fixed random_state makes the sample
# (and therefore the rendered output) identical on every run.
df.sample(5, random_state=42)

Unnamed: 0,hash,blockHash,blockNumber,chainId,from,gas,gasPrice,input,maxFeePerGas,maxPriorityFeePerGas,nonce,to,transactionIndex,type,value,block_timestamp,block_date,eth_value
1125015,0x78d88e4c50b9c1a97093d7f303dd43441da5d4b7636d95fe70b8dd132c44e4f9,0x64fe66e59f0b5395f0640790d9ea3b3279354220a1d121972b88ca3024782f2f,15059755,1.0,0x3cd751e6b0078be393132286c442345e5dc49699,21000,33525372544,0x,57000000000.0,2000000000.0,8243372,0x6794767befe0ff5fb1f308980668cbed1f494d6e,36,2,2.9096e+16,2022-07-02 00:18:01,2022-07-02,0.0291
622755,0x0f86345519a94e715c63120e6ad9f8d01e7bf38fd779107cfd060c7da02f755c,0xb661f9e5210ee76caef30cdc048e172cc74fb82bab8c3c1210e26c4aef7baac8,15085408,,0xdf6c4ff43d5f0578b27f0bd7ebf49ab34b52ddd9,500000,130000000000,0x095ea7b30000000000000000000000007a250d5630b4cf539739df2c5dacb4c659f2488dffffffffffffffffffffff...,,,379,0x4da8126300cb00c4fd956d218d732f90067c849e,8,0,0.0,2022-07-05 23:11:20,2022-07-05,0.0
1326773,0xf78adf1b11415d9a8a20440c9ccd8dfb9ef58136e74d8ca2abb47e65e60fb0bf,0xce738e86b5f4a00ee4813974c15642c6ee8f11064057c47e2650bdde886935ff,12776884,,0x46340b20830761efd32832a74d7169b29feb9758,350000,37000000000,0xa9059cbb0000000000000000000000006a9f839381ae927f17f842d33b3b29a679cef1f50000000000000000000000...,,,2648177,0x95ad61b0a150d79219dcf64e1e6cc01f0b64c4ce,127,0,0.0,2021-07-06 23:31:50,2021-07-06,0.0
2819143,0x319c8ad1cb3c28bc70aa50c0e838e4f054d7e394cc5c20c4b54d2601c2f1a031,0x7ed1c84977b5564a8ea216a25c56ea89adf5689ae09ac3112ef89deafe4f27f0,15097465,1.0,0x4d355e966ea4fe3eae6ccf8db60f5df70d653f78,47029,59195268370,0x5b34b966,118064488712.0,2500000000.0,90,0x00000000006c3852cbef3e08e8df289169ede581,88,2,0.0,2022-07-07 20:06:42,2022-07-07,0.0
610280,0xb3bf549c807938d92db38c9290345684bb862a84fa0565c26192eb8d7a633990,0x4b7e09589e42430117ff2f0d1d011f609c9d041b08ffcfe49dcbc12155d21885,15067603,1.0,0x46014290d380eda76a3e823023e5e88026248191,320000,16012104051,0xfb0f3ee100000000000000000000000000000000000000000000000000000000000000200000000000000000000000...,100000000000.0,2500000000.0,384,0x00000000006c3852cbef3e08e8df289169ede581,155,2,1.1e+16,2022-07-03 05:12:49,2022-07-03,0.011


### Year over year transactions difference July 2021 vs July 2022

In [21]:
# Keep the rows dated 1-7 July of 2021 or 2022 (both bounds inclusive), then
# add up `block_transactions` for each calendar day.
txn_daily = (
    df_txn
    .query('block_date.between("2021-07-01","2021-07-07", inclusive="both") or block_date.between("2022-07-01","2022-07-07", inclusive="both")')
    .groupby('block_date')
    .agg(transactions=('block_transactions', 'sum'))
)

In [22]:
# Move `block_date` out of the index into a regular column. The guard keeps
# the cell idempotent: re-running it once `block_date` is already a column
# would otherwise add a stray `index` column on every execution.
if 'block_date' not in txn_daily.columns:
    txn_daily = txn_daily.reset_index()

# Calendar year of each day, used to split the series by year.
txn_daily['block_year'] = txn_daily['block_date'].dt.year

In [28]:
# Daily transaction totals drawn as one line per year, so the same
# month-day can be compared across the two years.
year_color = alt.Color(
    'block_year:N',
    scale=alt.Scale(range=['saddlebrown', 'steelblue']),
    legend=alt.Legend(title='year'),
)
line_tooltips = [
    alt.Tooltip('transactions', format=','),
    alt.Tooltip('block_year', title='year'),
]

(
    alt.Chart(txn_daily)
    .mark_line()
    .encode(
        x=alt.X('monthdate(block_date):O', title='Month-Day'),
        y='transactions:Q',
        color=year_color,
        tooltip=line_tooltips,
    )
    .properties(width=800, height=500)
)

In [34]:
# Year-over-year change of the daily transaction totals.
# Each day is joined to the same month/day of the previous year explicitly,
# instead of relying on `pct_change(periods=7)`, which pairs rows purely by
# position and silently compares the wrong days if any day is missing.
# The result keeps the same shape as before: indexed by `block_date`, with one
# `transactions` column holding the fractional change (current / previous - 1)
# for the days that have a counterpart one year earlier.
_txn_keyed = txn_daily.assign(
    month=txn_daily['block_date'].dt.month,
    day=txn_daily['block_date'].dt.day,
)
# Shift the year forward by one so a row of year Y lines up with year Y + 1.
_txn_prev_year = _txn_keyed.assign(block_year=_txn_keyed['block_year'] + 1)[
    ['block_year', 'month', 'day', 'transactions']
]
txn_daily_pct = (
    _txn_keyed
    .merge(_txn_prev_year, on=['block_year', 'month', 'day'], suffixes=('', '_prev_year'))
    .assign(transactions=lambda d: d['transactions'] / d['transactions_prev_year'] - 1)
    .set_index('block_date')[['transactions']]
    .dropna()
)

In [41]:
# Bar chart of the year-over-year change per month-day: bars above zero are
# steel blue, all other bars are orange.
yoy_bar_color = alt.condition(
    alt.datum.transactions > 0,
    alt.value("steelblue"),
    alt.value("orange"),
)
yoy_axis = alt.Axis(format='%', title='Year-Over-Year 2022 vs 2021 % Difference')
yoy_tooltips = [
    alt.Tooltip('monthdate(block_date)', title='Month-Day'),
    alt.Tooltip('transactions', title='2022 vs 2021 % Difference', format='.2%'),
]

(
    alt.Chart(txn_daily_pct.reset_index())
    .mark_bar()
    .encode(
        x=alt.X('monthdate(block_date):O', title='Month-Day'),
        y=alt.Y('transactions:Q', axis=yoy_axis),
        color=yoy_bar_color,
        tooltip=yoy_tooltips,
    )
    .properties(width=800, height=500)
)

In [65]:
# Per-sender summary for 6 July 2022: transaction count, total ETH sent and
# average ETH per transaction, with the address label attached through a left
# join (senders without a match in df_labels are kept).
df_agg_2022 = (
    df.query('block_date == "2022-07-06"')
    .groupby('from')
    .agg(
        total_transactions=('hash', 'count'),
        total_value_eth=('eth_value', 'sum'),
    )
    .assign(value_per_transaction_eth=lambda agg: agg['total_value_eth'] / agg['total_transactions'])
    .reset_index()
    .merge(df_labels, left_on='from', right_on='address', how='left')
    # `address` duplicates `from` after the join, so it is dropped.
    .drop(columns='address')
)

In [70]:
# Take the 30 senders with the most transactions and total their transaction
# count and ETH value per address label.
df_agg_2022_label = (
    df_agg_2022
    .sort_values(by='total_transactions', ascending=False)
    .head(30)
    .groupby('address_label')[['total_transactions', 'total_value_eth']]
    .sum()
)

In [80]:
# Tag every row with the year this summary covers.
df_agg_2022_label = df_agg_2022_label.assign(year=2022)

In [81]:
# Show the 2022 label summary, busiest label first.
df_agg_2022_label.sort_values('total_transactions', ascending=False)

Unnamed: 0_level_0,total_transactions,total_value_eth,year
address_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Coinbase,48492,93370.428,2022
Binance,29069,175311.4319,2022
No Label,25425,13830.8358,2022
Ethermine,18967,3983.204,2022
FTX Exchange,13262,49302.1786,2022
Crypto.com,7354,4609.6422,2022
Hiveon,6633,1497.736,2022
2Miners: PPLNS,4661,961.9955,2022
Flexpool.io,3299,766.5415,2022
F2Pool Old,3298,1830.9624,2022


In [82]:
# Per-sender summary for 6 July 2021: transaction count, total ETH sent and
# average ETH per transaction, with the address label attached through a left
# join (senders without a match in df_labels are kept).
df_agg_2021 = (
    df.query('block_date == "2021-07-06"')
    .groupby('from')
    .agg(
        total_transactions=('hash', 'count'),
        total_value_eth=('eth_value', 'sum'),
    )
    .assign(value_per_transaction_eth=lambda agg: agg['total_value_eth'] / agg['total_transactions'])
    .reset_index()
    .merge(df_labels, left_on='from', right_on='address', how='left')
    # `address` duplicates `from` after the join, so it is dropped.
    .drop(columns='address')
)

In [83]:
# Take the 30 senders with the most transactions and total their transaction
# count and ETH value per address label.
df_agg_2021_label = (
    df_agg_2021
    .sort_values(by='total_transactions', ascending=False)
    .head(30)
    .groupby('address_label')[['total_transactions', 'total_value_eth']]
    .sum()
)

In [84]:
# Tag every row with the year this summary covers.
df_agg_2021_label = df_agg_2021_label.assign(year=2021)

In [85]:
# Show the 2021 label summary, busiest label first.
df_agg_2021_label.sort_values('total_transactions', ascending=False)

Unnamed: 0_level_0,total_transactions,total_value_eth,year
address_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Binance,93092,326876.0901,2021
Coinbase,65251,97064.3755,2021
Ethermine,20714,5344.9277,2021
Crypto.com,17182,4324.6382,2021
2Miners: PPLNS,9007,805.2077,2021
BlockFi,6986,0.0,2021
F2Pool Old,5685,1252.0328,2021
Gate.io,5300,3874.3286,2021
Nanopool,5049,860.9278,2021
FTX Exchange,4796,53.04,2021
