# Ethereum transactions exploratory data analysis

### Read the pandas dataframe parquet files

In [1]:
import os
import json

import pandas as pd
import numpy as np
import altair as alt

In [2]:
PATH = '../data/'

In [3]:
# Set dataframe display behaviours 
pd.set_option('display.max_columns', 100, 'display.max_rows', 100, 'display.max_colwidth', 100, 'display.float_format', '{:.4f}'.format)

In [4]:
file_dir = os.listdir(PATH)
file_list = [os.path.join(PATH, file) for file in file_dir if file.startswith('df_')]

In [5]:
df_raw = pd.concat([pd.read_parquet(file) for file in file_list])

In [6]:
df = df_raw.copy()

### Convert `type` to categorical data type

In [7]:
df['type'] = df['type'].astype('category')

### Add `eth_value` column
Convert `value` which by default is in Wei (the smallest unit of Ether) to Ether.  
1 ETH = 10^18 Wei  
1 ETH = 10^9 Gwei

In [8]:
df['eth_value'] = df['value'].div(10**18)

### Read the block aggregated transaction counts json file
This dataset is a total count of transactions per day rather than raw transactions.

In [9]:
block_txn = json.load(open('../data/raw/block_txn_counts.json'))
df_txn_raw = pd.DataFrame(block_txn)
df_txn = df_txn_raw.copy()

### Convert `block_timestamp` to datetime and create `block_date` field

In [10]:
df_txn['block_timestamp'] = df_txn['block_timestamp'].astype('datetime64[s]')
df_txn['block_date'] = df_txn['block_timestamp'].astype('datetime64[D]')

### Read the address labels json file

In [11]:
df_labels = pd.read_parquet(f'{PATH}address_labels.parquet')

### The Ethereum raw transactions dataset
- Due to memory and storage limitation of the Jupyter free instance on the cloud, only raw transactions from 1 July 2022 to 7 July 2022 and 6 July 2021 were extracted from the API of a remote node provider (Alchemy).
- Raw transactions for 30 June and 8 July 2022 are incomplete.
- The total number of transactions for below dates (except 30 June and 8 July 2022) matches those in [Dune](add link)

#### The number of transactions in the dataset by date

In [28]:
df.groupby(['block_date']).size()

block_date
2021-07-06    1355421
2022-06-30        511
2022-07-01    1178620
2022-07-02    1189382
2022-07-03    1157052
2022-07-04    1151999
2022-07-05    1182504
2022-07-06    1161432
2022-07-07    1194330
2022-07-08      76276
dtype: int64

In [29]:
df.sample(5)

Unnamed: 0,hash,blockHash,blockNumber,chainId,from,gas,gasPrice,input,maxFeePerGas,maxPriorityFeePerGas,nonce,to,transactionIndex,type,value,block_timestamp,block_date,eth_value
1125015,0x78d88e4c50b9c1a97093d7f303dd43441da5d4b7636d95fe70b8dd132c44e4f9,0x64fe66e59f0b5395f0640790d9ea3b3279354220a1d121972b88ca3024782f2f,15059755,1.0,0x3cd751e6b0078be393132286c442345e5dc49699,21000,33525372544,0x,57000000000.0,2000000000.0,8243372,0x6794767befe0ff5fb1f308980668cbed1f494d6e,36,2,2.9096e+16,2022-07-02 00:18:01,2022-07-02,0.0291
622755,0x0f86345519a94e715c63120e6ad9f8d01e7bf38fd779107cfd060c7da02f755c,0xb661f9e5210ee76caef30cdc048e172cc74fb82bab8c3c1210e26c4aef7baac8,15085408,,0xdf6c4ff43d5f0578b27f0bd7ebf49ab34b52ddd9,500000,130000000000,0x095ea7b30000000000000000000000007a250d5630b4cf539739df2c5dacb4c659f2488dffffffffffffffffffffff...,,,379,0x4da8126300cb00c4fd956d218d732f90067c849e,8,0,0.0,2022-07-05 23:11:20,2022-07-05,0.0
1326773,0xf78adf1b11415d9a8a20440c9ccd8dfb9ef58136e74d8ca2abb47e65e60fb0bf,0xce738e86b5f4a00ee4813974c15642c6ee8f11064057c47e2650bdde886935ff,12776884,,0x46340b20830761efd32832a74d7169b29feb9758,350000,37000000000,0xa9059cbb0000000000000000000000006a9f839381ae927f17f842d33b3b29a679cef1f50000000000000000000000...,,,2648177,0x95ad61b0a150d79219dcf64e1e6cc01f0b64c4ce,127,0,0.0,2021-07-06 23:31:50,2021-07-06,0.0
2819143,0x319c8ad1cb3c28bc70aa50c0e838e4f054d7e394cc5c20c4b54d2601c2f1a031,0x7ed1c84977b5564a8ea216a25c56ea89adf5689ae09ac3112ef89deafe4f27f0,15097465,1.0,0x4d355e966ea4fe3eae6ccf8db60f5df70d653f78,47029,59195268370,0x5b34b966,118064488712.0,2500000000.0,90,0x00000000006c3852cbef3e08e8df289169ede581,88,2,0.0,2022-07-07 20:06:42,2022-07-07,0.0
610280,0xb3bf549c807938d92db38c9290345684bb862a84fa0565c26192eb8d7a633990,0x4b7e09589e42430117ff2f0d1d011f609c9d041b08ffcfe49dcbc12155d21885,15067603,1.0,0x46014290d380eda76a3e823023e5e88026248191,320000,16012104051,0xfb0f3ee100000000000000000000000000000000000000000000000000000000000000200000000000000000000000...,100000000000.0,2500000000.0,384,0x00000000006c3852cbef3e08e8df289169ede581,155,2,1.1e+16,2022-07-03 05:12:49,2022-07-03,0.011


### Year over year transactions difference 1 to 7 July 2021 vs 1 to 7 July 2022

#### Create an aggregated daily transactions dataset

In [95]:
df_txn_daily = df_txn.query('block_date.between("2021-07-01","2021-07-07", inclusive="both") or block_date.between("2022-07-01","2022-07-07", inclusive="both")')
df_txn_daily = df_txn_daily.groupby(['block_date']).agg(transactions=('block_transactions','sum'))
df_txn_daily.reset_index(inplace=True)
df_txn_daily['block_year'] = df_txn_daily['block_date'].dt.year

In [105]:
df_txn_daily.head()

Unnamed: 0,block_date,transactions,block_year
0,2021-07-01,1211710,2021
1,2021-07-02,1181574,2021
2,2021-07-03,1122543,2021
3,2021-07-04,1105343,2021
4,2021-07-05,1147895,2021


In [123]:
alt.Chart(df_txn_daily).mark_line().encode(
    x=alt.X('monthdate(block_date):O', title='Month-Day'),
    y='transactions:Q',
    color=alt.Color('block_year:N', scale=alt.Scale(range=['saddlebrown','steelblue']), legend=alt.Legend(title='year')),
    tooltip=[alt.Tooltip('block_year', title='year'), alt.Tooltip('transactions', format=','), alt.Tooltip('monthdate(block_date)', title='Month-Day')],
).properties(
    title='14% decline in transactions on 6 July 2022 compared to 6 July 2021',
    height=500,
    width=800
)

#### Calculate the percentage difference year over year for each day

In [115]:
df_txn_daily_pct = df_txn_daily.set_index('block_date').drop(columns='block_year').pct_change(periods=7).dropna()
df_txn_daily_pct.rename(columns={'transactions':'pct_change'}, inplace=True)

In [117]:
df_txn_daily_pct.head()

Unnamed: 0_level_0,pct_change
block_date,Unnamed: 1_level_1
2022-07-01,-0.0273
2022-07-02,0.0066
2022-07-03,0.0307
2022-07-04,0.0422
2022-07-05,0.0301


In [119]:
alt.Chart(df_txn_daily_pct.reset_index()).mark_bar().encode(
    x=alt.X('monthdate(block_date):O', title='Month-Day'),
    y=alt.Y('pct_change:Q', axis=alt.Axis(format='%', title='Year-Over-Year 2022 vs 2021 % Difference')),
    color=alt.condition(
        alt.datum.pct_change > 0,
        alt.value("steelblue"),
        alt.value("orange")),
    tooltip=[alt.Tooltip('monthdate(block_date)', title='Month-Day'), alt.Tooltip('pct_change', title='2022 vs 2021 % Difference', format='.2%')]
).properties(
    height=500,
    width=800
)

#### Identify top 50 addresses with most transactions on 6 July 2022 and 2021
- The name (label) of addresses were manually sourced from https://etherscan.io/
- Note that an organization, for e.g. a crypto exchange can have multiple addresses.
  - For e.g. compare this address [0x3cd751e6b0078be393132286c442345e5dc49699](https://etherscan.io/address/0x3cd751e6b0078be393132286c442345e5dc49699) and [0xb5d85cbf7cb3ee0d56b3bb207d5fc4b82f43f511](https://etherscan.io/address/0xb5d85cbf7cb3ee0d56b3bb207d5fc4b82f43f511)

In [87]:
df_agg = df.query('block_date.isin(["2021-07-06","2022-07-06"])').groupby(['from','block_date']).agg(total_transactions=('hash','count'),total_value_eth=('eth_value','sum'))
df_agg.reset_index(inplace=True)
df_agg['year'] = df_agg['block_date'].dt.year
df_agg = df_agg.merge(df_labels, left_on='from', right_on='address', how='left')
df_agg.drop(columns='address', inplace=True)
df_agg_label = df_agg.sort_values(by='total_transactions', ascending=False).iloc[:50,:].groupby(['address_label','year']).agg({'total_transactions':'sum','total_value_eth':'sum'})

#### Aggregate the top 50 addresses by their address names (labels)

In [191]:
df_agg_label_pivot = df_agg_label.pivot_table(index='address_label', columns='year')
df_agg_label_pivot.columns = ['total_transactions_2021','total_transactions_2022','total_value_eth_2021','total_value_eth_2022']
df_agg_label_pivot.reset_index(inplace=True)
df_agg_label_pivot.fillna(0, inplace=True)

df_agg_label_pivot['total_transactions_pct_diff'] = df_agg_label_pivot['total_transactions_2022']/df_agg_label_pivot['total_transactions_2021']-1
df_agg_label_pivot['total_value_eth_pct_diff'] = df_agg_label_pivot['total_value_eth_2022']/df_agg_label_pivot['total_value_eth_2021']-1
df_agg_label_pivot['value_per_transaction_2021'] = df_agg_label_pivot['total_value_eth_2021']/df_agg_label_pivot['total_transactions_2021']
df_agg_label_pivot['value_per_transaction_2022'] = df_agg_label_pivot['total_value_eth_2022']/df_agg_label_pivot['total_transactions_2022']
df_agg_label_pivot['value_per_txn_pct_diff'] = df_agg_label_pivot['value_per_transaction_2022']/df_agg_label_pivot['value_per_transaction_2021']-1
df_agg_label_pivot.replace(np.inf, np.nan, inplace=True)
df_agg_label_pivot.set_index('address_label', inplace=True)

In [192]:
df_agg_label_pivot.sort_values(by='total_transactions_2021', ascending=False)

Unnamed: 0_level_0,total_transactions_2021,total_transactions_2022,total_value_eth_2021,total_value_eth_2022,total_transactions_pct_diff,total_value_eth_pct_diff,value_per_transaction_2021,value_per_transaction_2022,value_per_txn_pct_diff
address_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Binance,90505.0,26033.0,325242.787,165070.8563,-0.7124,-0.4925,3.5936,6.3408,0.7645
Coinbase,65251.0,45881.0,97064.3755,93370.428,-0.2969,-0.0381,1.4876,2.0351,0.3681
No Label,33066.0,8829.0,1888.5941,2042.9935,-0.733,0.0818,0.0571,0.2314,3.0513
Ethermine,20714.0,18967.0,5344.9277,3983.204,-0.0843,-0.2548,0.258,0.21,-0.1861
Crypto.com,17182.0,7354.0,4324.6382,4609.6422,-0.572,0.0659,0.2517,0.6268,1.4904
2Miners: PPLNS,9007.0,4661.0,805.2077,961.9955,-0.4825,0.1947,0.0894,0.2064,1.3087
Spark Pool: Distributor,7588.0,0.0,3219.7602,0.0,-1.0,-1.0,0.4243,,
Hiveon Pool,7210.0,0.0,981.8453,0.0,-1.0,-1.0,0.1362,,
BlockFi,6986.0,0.0,0.0,0.0,-1.0,,0.0,,
BeePool,6347.0,0.0,867.6726,0.0,-1.0,-1.0,0.1367,,


In [255]:
alt.Chart(df_agg_label_pivot.dropna(subset='total_transactions_pct_diff').reset_index()).mark_bar().encode(
    x=alt.X('total_transactions_pct_diff:Q', title='6 July 2022 vs 6 July 2021 Transactions % Difference', axis=alt.Axis(format='%')),
    y=alt.Y('address_label', sort='x', title='Addresses'),
    size=alt.Size('total_transactions_2021:Q', title='6 July 2021 Transaction Volume', scale=alt.Scale(range=[0,30])),
    color=alt.condition(
        alt.datum.total_transactions_pct_diff > 0,
        alt.value("steelblue"),
        alt.value("orange")),
    tooltip=[alt.Tooltip('total_transactions_pct_diff', title='6 July 2022 vs 2021 % Diff', format='.2%'), 
             alt.Tooltip('total_transactions_2022', format=',', title='6 July 2022 Transactions'),
             alt.Tooltip('total_transactions_2021', format=',', title='6 July 2021 Transactions')]
).properties(
    height=500,
    width=800
)

In [231]:
df_agg_label_pivot_excl = df_agg_label_pivot.dropna(subset='total_transactions_pct_diff').query('total_transactions_pct_diff < 0').reset_index()

In [279]:
bar = alt.Chart(df_agg_label_pivot_excl).mark_bar().encode(
    x=alt.X('total_transactions_pct_diff:Q', title='6 July 2022 vs 6 July 2021 Transactions and ETH Value % Difference', axis=alt.Axis(format='%')),
    y=alt.Y('address_label', sort='x', title='Addresses'),
    size=alt.Size('total_transactions_2021:Q', title='6 July 2021 Transaction Volume', scale=alt.Scale(range=[0,30])),
    color=alt.condition(
        alt.datum.total_transactions_pct_diff > 0,
        alt.value("steelblue"),
        alt.value("orange")),
    tooltip=[alt.Tooltip('total_transactions_pct_diff', title='6 July 2022 vs 2021 % Diff', format='.2%'), 
             alt.Tooltip('total_transactions_2022', format=',', title='6 July 2022 Transactions'),
             alt.Tooltip('total_transactions_2021', format=',', title='6 July 2021 Transactions')]
).properties(
    height=500,
    width=800
)


tick = alt.Chart(df_agg_label_pivot_excl).mark_tick(
    color='red',
    thickness=5,
    size=40 * 0.7
).encode(
         x=alt.X('total_value_eth_pct_diff', axis=alt.Axis(format='%')),
         y=alt.Y('address_label', sort='x', title='Addresses'),
         tooltip=[alt.Tooltip('total_value_eth_pct_diff', title='6 July 2022 vs 2021 ETH Transferred % Diff', format='.2%'), 
                  alt.Tooltip('total_value_eth_2022', format=',.6', title='6 July 2022 Total ETH Transferred'),
                  alt.Tooltip('total_value_eth_2021', format=',.6', title='6 July 2021 Total ETH Transferred')]
).properties(
    height=500,
    width=800
)

bar + tick