# Ethereum transactions analysis

## Read the pandas dataframe parquet files

In [1]:
import os
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt

In [2]:
PATH = '../data/'

In [3]:
# Widen the notebook display limits: show up to 100 columns, 100 rows and
# 100 characters per cell before pandas truncates the output.
for option_name in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option_name, 100)
# Alternative float format: no scientific notation and no decimals.
# pd.set_option('display.float_format', lambda x: '%.f' % x)
# Render floats with four decimal places.
pd.set_option('display.float_format', '{:.4f}'.format)

In [4]:
# List the data directory and keep the transaction chunks, i.e. the entries
# whose names start with 'df_'.
# os.listdir() returns names in arbitrary order, so the listing is sorted to
# make the order of the files (and of the concatenated rows) reproducible.
file_dir = sorted(os.listdir(PATH))
file_list = [os.path.join(PATH, name) for name in file_dir if name.startswith('df_')]

In [5]:
# Read every parquet chunk in file_list and stack them into one dataframe.
# ignore_index is not set, so each chunk keeps its own index labels and the
# labels of the combined frame may repeat across chunks.
df_raw = pd.concat([pd.read_parquet(file) for file in file_list])

In [6]:
df = df_raw.copy()

### Convert `type` to categorical data type
Seems to have reverted to 'int64' after reading from parquet into dataframe.

In [7]:
df['type'] = df['type'].astype('category')

### Add `eth_value` column
Convert `value` which is in Wei (the smallest unit of Ether) to Ether.  
1 ETH = 10^18 Wei  
1 ETH = 10^9 Gwei

In [8]:
# 1 ETH = 10^18 Wei: scale the Wei-denominated `value` column down to Ether.
WEI_PER_ETH = 10**18
df['eth_value'] = df['value'] / WEI_PER_ETH

## Read the address labels json file

In [10]:
# df_labels = pd.concat([df_labels, tmp_labels])
# df_labels.to_parquet(f'{PATH}address_labels.parquet')

In [11]:
df_labels = pd.read_parquet(f'{PATH}address_labels.parquet')

## Keep the dataset to 1 to 7 July 2022

In [11]:
# Keep only the transactions from 1 July through 7 July 2022.
# The upper bound is midnight of 8 July, so the interval is half-open
# (inclusive="left"): a block stamped exactly 2022-07-08 00:00:00 belongs to
# 8 July and must not be kept.
df = df.query('block_timestamp.between("2022-07-01", "2022-07-08", inclusive="left")')

## Read the block transaction counts json file

In [11]:
# Load the per-block transaction counts. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open.
with open('../data/raw/block_txn_counts.json', encoding='utf-8') as f:
    block_txn = json.load(f)

In [12]:
df_txn_raw = pd.DataFrame(block_txn)

In [13]:
df_txn = df_txn_raw.copy()

In [14]:
# Convert the block timestamps to second-resolution datetimes.
df_txn['block_timestamp'] = df_txn['block_timestamp'].astype('datetime64[s]')
# Calendar day of each block: the timestamp truncated to midnight.
# dt.normalize() replaces astype('datetime64[D]') because pandas 2.x only
# supports the s/ms/us/ns units and refuses the day unit, whereas older
# versions returned day-truncated datetimes. The resulting values are the same.
df_txn['block_date'] = df_txn['block_timestamp'].dt.normalize()

### Get block numbers for 6 July 2021

In [16]:
# tmp_df = df_txn.query('block_date == "2021-07-06"').sort_values(by='block_number')

In [16]:
# tmp_df['block_number'].min(), tmp_df['block_number'].max()

(12770590, 12777014)

In [31]:
# tmp_missing_blocks = tmp_df['block_number'].tolist()

In [10]:
# Save the list of missing blocks for re-extraction
# tmp_str = json.dumps(tmp_missing_blocks)
# with open('../data/raw/tmp_missing_blocks.json', 'w') as f:
#     f.write(tmp_str)

### Identify missing transactions in the main dataset

In [44]:
# isin_main_array = df.query('block_date.between("2022-07-04","2022-07-05", inclusive="both")')['blockNumber'].unique().tolist()

In [46]:
# isin_txn_array = df_txn.query('block_date.between("2022-07-04","2022-07-05", inclusive="both")')['block_number'].unique().tolist()

In [48]:
# missing 470 blocks from the main dataset
# len(isin_txn_array) - len(isin_main_array)

470

In [49]:
# missing_list = [i for i in isin_txn_array if (i not in isin_main_array) or (i not in isin_txn_array)]

In [12]:
#Save the list of missing blocks for re-extraction
# tmp_str = json.dumps(tmp_missing_blocks)
# with open('../data/raw/tmp_missing_blocks.json', 'w') as f:
#     f.write(tmp_str)

#### Data fields
- `hash` - Hash of the transaction
- `blockHash` - Hash of the block
- `blockNumber` - Block number
- `from` - Address of the sender
- `gas` - Gas provided by the sender
- `gasPrice` - Gas price provided by the sender in Wei
- `input` - The data sent along with the transaction. Commonly used as part of contract interaction or as a message sent to the recipient.
- `nonce` - The number of transactions made by the sender prior to this one
- `r` - The Elliptic Curve Digital Signature Algorithm (ECDSA) signature r. The standardised R field of the signature. See: https://openethereum.github.io/JSONRPC
- `s` - The Elliptic Curve Digital Signature Algorithm (ECDSA) signature s
- `to` - Address of the receiver
- `transactionIndex` - Integer of the transactions index position in the block
- `type` - Over time, Ethereum Improvement Proposals (EIPs) have changed what a valid transaction looks like, whilst maintaining a high level of backwards compatibility with other transaction types. Today, the main types of transactions are legacy transactions and typed transactions (i.e. Type 1, Type 2).  
See https://mycelium.xyz/research/the-journey-of-an-ethereum-transaction/.  
Also see https://docs.dune.com/data-tables/data-tables/raw-data/ethereum-data#ethereum.transactions. 
   - 0: 'legacy' 
   - 1: 'accessList/Type 1' 
   - 2: 'DynamicFee/Type 2'    
- `v` - The Elliptic Curve Digital Signature Algorithm (ECDSA) recovery id. The standardised V field of the signature. See: https://openethereum.github.io/JSONRPC
- `value` - The amount of ether transferred in Wei. 1 ETH = 10^18 Wei. 1 ETH = 10^9 Gwei.
- `accessList` - List of addresses and storage keys that the transaction plans to access. Introduced by EIP-2930 (Type 1 transactions) and also available on EIP-1559 (Type 2) transactions; legacy transactions do not carry it. See: https://openethereum.github.io/JSONRPC
- `chainId` - Value used in replay-protected transaction signing as introduced by EIP-155
- `maxFeePerGas` - The maximum fee per gas the transaction sender is willing to pay total (introduced by EIP1559). For detailed explanation, refer to https://docs.alchemy.com/alchemy/guides/eip-1559/maxpriorityfeepergas-vs-maxfeepergas
- `maxPriorityFeePerGas` - The maximum priority fee (tip) per gas the transaction sender is willing to pay to the block producer on top of the base fee (introduced by EIP-1559). Refer to https://docs.alchemy.com/alchemy/guides/eip-1559/maxpriorityfeepergas-vs-maxfeepergas
- `block_timestamp` - Timestamp of the block

References:  
https://docs.dune.com/data-tables/data-tables/raw-data/ethereum-data#ethereum.transactions  
The "bigquery-public-data.crypto_ethereum.transactions" column description  
https://ethereum.org/en/developers/docs/apis/json-rpc/  
https://docs.alchemy.com/alchemy/apis/ethereum/eth-gettransactionbyhash  

### We have the following entities in our domain:
- Blocks
- Transaction
- Value and Fees
- Account

The process/relationship:
- A block contains multiple transactions
- A transaction is a request for computation on the Ethereum Virtual Machine (EVM) and it is a fulfilled transaction request and the associated change in the EVM.  
When a request is broadcast, other participants on the network verify, validate and carry out the computation.  
A transaction refers to an action initiated by an externally owned account, in other words, managed by a human not a contract (smart contract, ie. controlled by code).
A transaction requires a fee and must be mined to be valid.
- An account can be externally-owned or a contract (smart contract). An account is not a wallet. An account is the keypair for a user-owned Ethereum account. A wallet is an interface or application that lets you interact with your Ethereum account

References:
https://ethereum.org/en/developers/docs/intro-to-ethereum/  
https://ethereum.org/en/developers/docs/accounts/#a-note-on-wallets

### Externally-owned account vs contract (smart contract)
References:   
https://ethereum.org/en/developers/docs/accounts/#types-of-account  
https://info.etherscan.com/understanding-ethereum-accounts
- Externally-owned (controlled by anyone with private keys):
  - Creating an account costs nothing
  - Can initiate transactions
  - Transactions between externally-owned accounts can only be ETH/token transfers
- Contract (smart contract):
  - Creating a contract has a cost because you are using network storage
  - Can only send transactions in response to receiving a transaction
  - Transactions from an external account to a contract account can trigger code which can execute many different actions, such as transferring tokens or even creating a new contract

### Types of transactions
References:  
https://ethereum.org/en/developers/docs/transactions/#types-of-transactions  
https://medium.com/coinmonks/discovering-the-secrets-of-an-ethereum-transaction-64febb00935c  
- Regular transactions: a transaction from one account to another.
  - The `input` field is empty if it is a transaction from one account to another.
- Contract deployment transactions: a transaction without a `to` address, and the data field is used for the contract code.
  - The `to` address is empty, if it is a smart contract deployment and `input` field must be filled.
  - If it is a contract creation, the `input` will contain information about which function to call and the arguments. This will require decoding to identify the details.
- Execution of a contract: a transaction that interacts with a deployed smart contract. In this case, 'to' address is the smart contract address
  - This means that when the `to` address is not empty, the transaction could be either a plain transfer to an externally-owned account or a call that executes code at a smart contract address.

### We have the following information about our entities:
- Blocks: `blockHash`, `blockNumber`, `block_timestamp`
- Transaction: `hash`, `from`, `to`, `transactionIndex`, `type`, `input`, `chainId`
- Value and Fees: `value`, `gas`, `gasPrice`, `maxFeePerGas`, `maxPriorityFeePerGas`
- Account: `nonce`

## Univariate analysis

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8212460 entries, 0 to 3011962
Data columns (total 16 columns):
 #   Column                Dtype         
---  ------                -----         
 0   hash                  object        
 1   blockHash             object        
 2   blockNumber           int64         
 3   chainId               float64       
 4   from                  object        
 5   gas                   int64         
 6   gasPrice              int64         
 7   input                 object        
 8   maxFeePerGas          float64       
 9   maxPriorityFeePerGas  float64       
 10  nonce                 int64         
 11  to                    object        
 12  transactionIndex      int64         
 13  type                  category      
 14  value                 float64       
 15  block_timestamp       datetime64[ns]
dtypes: category(1), datetime64[ns](1), float64(4), int64(5), object(5)
memory usage: 1010.3+ MB


In [17]:
df.head(3)

Unnamed: 0,hash,blockHash,blockNumber,chainId,from,gas,gasPrice,input,maxFeePerGas,maxPriorityFeePerGas,nonce,to,transactionIndex,type,value,block_timestamp
0,0x4ba4952586cd471bf7be6f5bce210a52fb1139e279994bbfe05353884b77f325,0xc21f383b86f543e0038ada2a3554bf2b7ced841fad4363871f1fd8400721aff9,15053949,1.0,0xea674fdde714fd979de3edf0f56aa9716b898ec8,250000,17082324542,0x,38052090000.0,1000000000.0,43640737,0x40329687913cf9d84e3132cb4d8d61130704599d,0,2,9.971602e+16,2022-07-01 02:36:35
1,0xd04e6bed821f4c616a400d5bb43a525dbad01fff53b2fb42c9a40d448df371c4,0xc21f383b86f543e0038ada2a3554bf2b7ced841fad4363871f1fd8400721aff9,15053949,1.0,0xea674fdde714fd979de3edf0f56aa9716b898ec8,250000,17082324542,0x,38052090000.0,1000000000.0,43640738,0xb55ec04cbda8d2d602f3108ddb66496a3cb1ee9b,1,2,9.962975e+16,2022-07-01 02:36:35
2,0xc1f95ac0a6f41e9a53713193e83af95694052181a2ec9b788ab00ab389412cb6,0xc21f383b86f543e0038ada2a3554bf2b7ced841fad4363871f1fd8400721aff9,15053949,1.0,0xea674fdde714fd979de3edf0f56aa9716b898ec8,250000,17082324542,0x,38052090000.0,1000000000.0,43640739,0xdd27415aa937d75f13bf5877f7cbf092e42612b3,2,2,1.734777e+16,2022-07-01 02:36:35


How many blocks are there in our dataset?

In [23]:
df['blockNumber'].nunique()

43706

How many transactions?

In [21]:
df.shape[0]

8212460

How many senders?

In [22]:
df['from'].nunique()

1782142

How many recipients?

In [24]:
df['to'].nunique()

1311994

In [37]:
pd.set_option('display.float_format', lambda x: '%.f' % x)

What is the minimum and maximum `gas`?

In [38]:
df['gas'].describe()

count    8212460
mean      163931
std       385066
min        21000
25%        31500
50%        90000
75%       207128
max     30029295
Name: gas, dtype: float64

In [None]:
df.query('gas == 30029295')

Unnamed: 0,hash,blockHash,blockNumber,chainId,from,gas,gasPrice,input,maxFeePerGas,maxPriorityFeePerGas,nonce,to,transactionIndex,type,value,block_timestamp
642917,0xdd6d61a22ede7b658487fd45bb645f1dfecab607d48e2e1e50cf0915e3d18abf,0x35ffb57b42bb69b018f95c39b7b0e84bac7e91d8c854149d7192eb20912b854d,15078507,1,0x6d72dc1b254fb818a1382a05212466b83424e469,30029295,25375730731,0x8fe5d2e90000000000000000000000000000000000000000000000000000000000e611910000000000000000000000...,37015077777,7000000000,37,0x30f7bf69d92828441f5a6bfcf818df25deb2c4b4,0,2,0,2022-07-04 21:33:21


What is the minimum and maximum `gasPrice`?

In [27]:
df['gasPrice'].describe()

count    8.212460e+06
mean     3.015800e+10
std      1.227708e+11
min      3.717514e+09
25%      1.455000e+10
50%      2.138334e+10
75%      3.489424e+10
max      2.294510e+14
Name: gasPrice, dtype: float64

In [28]:
df['nonce'].describe()

count    8.212460e+06
mean     1.317989e+06
std      5.830671e+06
min      0.000000e+00
25%      1.500000e+01
50%      1.830000e+02
75%      1.693500e+04
max      4.377498e+07
Name: nonce, dtype: float64

In [29]:
df['transactionIndex'].describe()

count    8.212460e+06
mean     1.383877e+02
std      1.141302e+02
min      0.000000e+00
25%      5.000000e+01
50%      1.120000e+02
75%      2.020000e+02
max      1.307000e+03
Name: transactionIndex, dtype: float64

In [30]:
df['type'].value_counts(dropna=False)

2    6881664
0    1315335
1      15461
Name: type, dtype: int64

In [31]:
df['type'].value_counts(dropna=False, normalize=True)

2    0.837954
0    0.160163
1    0.001883
Name: type, dtype: float64

In [32]:
# pd.set_option('display.float_format', lambda x: '%.f' % x)

In [33]:
df['value'].describe()

count    8.212460e+06
mean     1.697553e+18
std      1.871322e+20
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      4.800000e+16
max      1.500000e+23
Name: value, dtype: float64

In [34]:
df['chainId'].value_counts(dropna=False, normalize=True)

1.0    0.839837
NaN    0.160163
Name: chainId, dtype: float64

In [44]:
# pd.set_option('display.float_format', lambda x: '%.f' % x)

In [35]:
df['maxFeePerGas'].describe()

count    6.881664e+06
mean     6.971994e+10
std      1.914036e+11
min      3.717514e+09
25%      2.200000e+10
50%      3.469551e+10
75%      6.196162e+10
max      2.294510e+14
Name: maxFeePerGas, dtype: float64

In [36]:
df['maxPriorityFeePerGas'].describe()

count    6.881664e+06
mean     5.248661e+09
std      1.149491e+11
min      0.000000e+00
25%      1.500000e+09
50%      2.000000e+09
75%      2.500000e+09
max      2.294510e+14
Name: maxPriorityFeePerGas, dtype: float64

## Overall figures
In this analysis, we will not look into:  
The `r`, `s` and `v` which are the ECDSA signatures.  
`accessList` which contains addresses and storage keys from legacy transactions. The field also needs to be flattened.  
All transactions are included regardless of status, ie. success or failed (no field to determine this in the dataset).
- How many blocks are there in our dataset?  
43,706
- How many transactions?  
8,212,460
- How many senders:  
1,782,142
- How many recipients:  
1,311,994
- What are the categories in `type`?  
84% of transactions are type 2, 16% are type 0 and 0.2% are type 1
- What are the categories in `chainId`?  
84% are 1 and 16% are missing the `chainId`
- Which dates were these transactions executed?
2022-07-01 to 2022-07-07

## Key Findings on Overall Figures
- There are more senders than recipients. Why?
- 84% of transactions have a `type` of 2 and 16% have a `type` of 0. This 84% vs 16% split is the same as the `chainId` split. What is the relationship? 
- An example of an outlier with `gas` = 30,029,295 was a failed transaction, see `hash` = '0xdd6d61a22ede7b658487fd45bb645f1dfecab607d48e2e1e50cf0915e3d18abf'. Note that the `value` is 0. Also confirmed in https://etherscan.io/tx/0xdd6d61a22ede7b658487fd45bb645f1dfecab607d48e2e1e50cf0915e3d18abf. Comments on Etherscan say "WRONG_BLOCK_NUMBER", but a separate check against the Alchemy API with eth_getTransactionByHash confirmed that the block number is correct.

How many blocks per day?

In [12]:
df.resample('D', on='block_timestamp').agg(blocks=('blockNumber','nunique'))

Unnamed: 0_level_0,blocks
block_timestamp,Unnamed: 1_level_1
2022-07-01,6253
2022-07-02,6227
2022-07-03,6315
2022-07-04,6200
2022-07-05,6242
2022-07-06,6280
2022-07-07,6189


How many transactions per day?

In [14]:
# Count the transactions (rows) per calendar day. The buckets are built on
# `block_timestamp`, the same column the per-day block and value summaries in
# this section use, so the cell does not depend on a separate `block_date`
# column being present on `df`.
df.resample('D', on='block_timestamp').size()

block_date
2022-07-01    1178620
2022-07-02    1189382
2022-07-03    1157052
2022-07-04    1150609
2022-07-05    1181035
2022-07-06    1161432
2022-07-07    1194330
Freq: D, dtype: int64

Total value per day?

In [49]:
df.resample('D', on='block_timestamp').agg(total_value=('value','sum'))

Unnamed: 0_level_0,total_value
block_timestamp,Unnamed: 1_level_1
2022-07-01,2529957095970754663546880
2022-07-02,1242727481254419196018688
2022-07-03,802698618545780836794368
2022-07-04,1979039773764470716760064
2022-07-05,1838238726207302323404800
2022-07-06,1882904639433685247983616
2022-07-07,3665519196793809473110016


### Year over year transactions difference July 2021 vs July 2022

In [34]:
# Select the blocks from 1-7 July 2021 and 1-7 July 2022 (both bounds
# inclusive), then total the transaction counts per calendar day.
block_day = df_txn['block_date']
in_july_2021 = block_day.between("2021-07-01", "2021-07-07", inclusive="both")
in_july_2022 = block_day.between("2022-07-01", "2022-07-07", inclusive="both")
txn_daily = (
    df_txn[in_july_2021 | in_july_2022]
    .groupby(['block_date'])
    .agg(transactions=('block_transactions', 'sum'))
)

In [35]:
# Relative change of each row versus the row 7 positions earlier.
# txn_daily is built from two 7-day windows, so - assuming all 14 days are
# present and in date order - each 2022 day is compared with the same calendar
# day of 2021. The 2021 rows have no predecessor and are removed by dropna().
txn_daily_pct = txn_daily.pct_change(periods=7).dropna()

In [36]:
# Bar chart of the year-over-year change in daily transactions:
# one bar per date on the x axis, the change formatted as a percentage on the
# y axis, with the date and the value shown in the tooltip.
alt.Chart(txn_daily_pct.reset_index()).mark_bar().encode(
    x=alt.X('yearmonthdate(block_date):O', axis=alt.Axis(title='Date')),
    y=alt.Y('transactions:Q', axis=alt.Axis(format='%', title='2022 vs 2021 Percentage Difference')),
    tooltip=['block_date','transactions']                    
).properties(
    height=500,
    width=800
)

In [67]:
# Per-sender activity on 6 July 2022, split by transaction type:
# number of transactions, total ETH sent and average ETH per transaction.
sender_groups = df.query('block_date == "2022-07-06"').groupby(['from', 'transaction_type'])
df_agg = sender_groups.agg(
    total_transactions=('hash', 'count'),
    total_value_eth=('eth_value', 'sum'),
)
df_agg['value_per_transaction_eth'] = df_agg['total_value_eth'].div(df_agg['total_transactions'])
# Attach the known address labels to each sender; the left join keeps senders
# without a label, and the redundant join key is dropped afterwards.
df_agg_label = (
    df_agg.reset_index()
    .merge(df_labels, how='left', left_on='from', right_on='address')
    .drop(columns='address')
)

In [68]:
df_agg_label.sort_values(by='total_transactions', ascending=False).head(50)

Unnamed: 0,from,transaction_type,total_transactions,total_value_eth,value_per_transaction_eth,address_label
332748,0xea674fdde714fd979de3edf0f56aa9716b898ec8,externally owned,18967,3983.204,0.21,Ethermine
87082,0x3cd751e6b0078be393132286c442345e5dc49699,externally owned,11734,30881.9334,2.6318,Coinbase
258229,0xb5d85cbf7cb3ee0d56b3bb207d5fc4b82f43f511,externally owned,10516,8623.8305,0.8201,Coinbase
315370,0xddfabcdc4d8ffc6d5beaf154f18b778f892a0740,externally owned,10074,29116.8499,2.8903,Coinbase
333789,0xeb2629a2734e272bcc07bda959863f316f4bd4cf,externally owned,8601,24747.8142,2.8773,Coinbase
100391,0x46340b20830761efd32832a74d7169b29feb9758,externally owned,7354,4609.6422,0.6268,Crypto.com
273538,0xc098b2a3aa256d2140208c3de6543aaef5cd3a94,externally owned,6896,49230.2986,7.139,FTX Exchange
68321,0x2faf487a4414fe77e2327f0bf4ae2a264a776ad2,externally owned,6366,71.88,0.0113,FTX Exchange
48592,0x21a31ee1afc51d94c2efccaa2092ad1028285549,externally owned,5904,21444.8212,3.6323,Binance
58672,0x28c6c06298d514db089934071355e5743bf21d60,externally owned,5752,98599.0455,17.1417,Binance


In [43]:
# tmp_labels = pd.DataFrame({'address':['0xeae33deef684802dd05b7026d09fe9831edb7b37'],
#                            'labels':['No Label']})

In [21]:
df_agg_label.groupby(['from','transaction_type']).agg({'total_transactions':'sum',
                                                         'total_value_eth':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,total_transactions,total_value_eth
from,transaction_type,Unnamed: 2_level_1,Unnamed: 3_level_1
0x0000000000006e543164be036824fcf832e67e47,externally owned,1,0.0750
0x000000000003ebf123909630caed826f160fcb7f,externally owned,1,0.0000
0x000000000005e4314ea8efd6f9ec0e8d65444384,externally owned,1,0.0100
0x00000000005dbcb0d0513fcda746382fe8a53468,externally owned,1,0.0000
0x000000000062c524bd4d32f62f2e569bf171ffdd,externally owned,5,0.1000
...,...,...,...
0xfffff449f1a35eb0facca8d4659d8e15cf2f77ba,externally owned,1,0.5000
0xffffff5800b709071d4adc74759ae4b89bef2a9d,externally owned,11,0.0000
0xffffff5b36f8388d0a6c6586654cd2da65ea59be,externally owned,2,0.4700
0xfffffff42beda895371adfd3f3b05fb35b9b5239,externally owned,1,0.0000


In [40]:
df_agg_label.sort_values(by=['total_transactions','total_value_eth'], ascending=[False,False]).head(10)

Unnamed: 0,from,total_transactions,total_value_eth,value_per_transaction,labels
332296,0xea674fdde714fd979de3edf0f56aa9716b898ec8,18967,3983.2,0.21,Ethermine
86959,0x3cd751e6b0078be393132286c442345e5dc49699,11734,30881.93,2.63,Coinbase
257875,0xb5d85cbf7cb3ee0d56b3bb207d5fc4b82f43f511,10516,8623.83,0.82,Coinbase
314943,0xddfabcdc4d8ffc6d5beaf154f18b778f892a0740,10074,29116.85,2.89,Coinbase
333336,0xeb2629a2734e272bcc07bda959863f316f4bd4cf,8601,24747.81,2.88,Coinbase
100250,0x46340b20830761efd32832a74d7169b29feb9758,7354,4609.64,0.63,Crypto.com
273166,0xc098b2a3aa256d2140208c3de6543aaef5cd3a94,6896,49230.3,7.14,FTX Exchange
68225,0x2faf487a4414fe77e2327f0bf4ae2a264a776ad2,6366,71.88,0.01,FTX Exchange
48525,0x21a31ee1afc51d94c2efccaa2092ad1028285549,5904,21444.82,3.63,Binance
58591,0x28c6c06298d514db089934071355e5743bf21d60,5752,98599.05,17.14,Binance


In [42]:
df_agg_label.sort_values(by=['total_value_eth'], ascending=False).head(10)

Unnamed: 0,from,total_transactions,total_value_eth,value_per_transaction,labels
58591,0x28c6c06298d514db089934071355e5743bf21d60,5752,98599.05,17.14,Binance
71808,0x3229149012a035ef51d724e0343eb31ce3e4bb7d,36,93499.11,2597.2,No Label
298416,0xd24400ae8bfebb18ca49be86258a3c749cf46853,3101,49303.25,15.9,Gemini
273166,0xc098b2a3aa256d2140208c3de6543aaef5cd3a94,6896,49230.3,7.14,FTX Exchange
152887,0x6b3e9111635944e494da15714c60e06c58c367cc,1,48274.23,48274.23,
46822,0x2077817661a2a0a12f8885dd39ed394f44cdc28a,1,48274.22,48274.22,No Label
8907,0x0600d21f9877caf9a823f4ef49049a417ad569f9,2,45495.26,22747.63,No Label
86959,0x3cd751e6b0078be393132286c442345e5dc49699,11734,30881.93,2.63,Coinbase
314943,0xddfabcdc4d8ffc6d5beaf154f18b778f892a0740,10074,29116.85,2.89,Coinbase
55279,0x267be1c1d684f78cb4f6a176c4911b741e4ffdc0,1331,28273.76,21.24,


In [32]:
# Per-sender activity on 6 July 2021: number of transactions, total ETH sent
# and average ETH per transaction, joined with the known address labels
# (left join, so senders without a label are kept).
# NOTE(review): an earlier cell restricts `df` to 2022-07-01..2022-07-08, so on
# a top-to-bottom run this filter would presumably match no rows - confirm
# which dataframe is meant to hold the 2021 transactions.
# NOTE(review): relies on `block_date` and `eth_value` columns being present
# on `df`; verify where `block_date` is created.
df_agg_2021 = df.query('block_date == "2021-07-06"').groupby(['from']).agg(total_transactions=('hash','count'),
                                                                           total_value_eth=('eth_value','sum'))
df_agg_2021['value_per_transaction'] = df_agg_2021['total_value_eth']/df_agg_2021['total_transactions']
df_agg_2021_label = df_agg_2021.reset_index().merge(df_labels, left_on='from', right_on='address', how='left')
df_agg_2021_label.drop(columns='address', inplace=True)

In [41]:
df_agg_2021_label.sort_values(by=['total_transactions','total_value_eth'], ascending=[False,False]).head(10)

Unnamed: 0,from,total_transactions,total_value_eth,value_per_transaction,labels
58801,0x28c6c06298d514db089934071355e5743bf21d60,26214,199100.24,7.6,Binance
333333,0xea674fdde714fd979de3edf0f56aa9716b898ec8,20714,5344.93,0.26,Ethermine
48836,0x21a31ee1afc51d94c2efccaa2092ad1028285549,17730,15577.6,0.88,Binance
100911,0x46340b20830761efd32832a74d7169b29feb9758,17182,4324.64,0.25,Crypto.com
87484,0x3cd751e6b0078be393132286c442345e5dc49699,16787,31528.93,1.88,Coinbase
259277,0xb5d85cbf7cb3ee0d56b3bb207d5fc4b82f43f511,16046,24213.11,1.51,Coinbase
315797,0xddfabcdc4d8ffc6d5beaf154f18b778f892a0740,15380,17467.0,1.14,Coinbase
318455,0xdfd5293d8e347dfe59e90efd55b2956a1343963d,15225,26675.91,1.75,Binance
124381,0x56eddb7aa87536c09ccc2793473599fd21a8b17f,13690,40479.4,2.96,Binance
214634,0x9696f59e4d72e237be84ffd425dcad154bf96976,13268,43348.34,3.27,Binance


## Create a list of addresses with labels
Source: https://dune.com/labels and https://etherscan.io/

In [82]:
# address_labels = {}
# address_labels['address'] = df_agg.sort_values(by='total_transactions', ascending=False).reset_index().loc[:29,'from'].tolist()

In [83]:
# address_labels['labels'] = ['Ethermine','Coinbase','Coinbase','Coinbase','Coinbase','Crypto.com','FTX Exchange','FTX Exchange','Binance','Binance',
#                             'Binance','No Label','KuCoin','Binance','2Miners: PPLNS','Binance','No Label','Hiveon: Spreader','Flexpool.io','F2Pool Old',
#                             'Hiveon: Spreader','Gemini','No Label','Binance','No Label','No Label','No Label','Coinbase','No Label','No Label']

In [84]:
# address_labels_str = json.dumps(address_labels)
# with open(f'{PATH}address_labels.json', 'w') as f:
#     f.write(address_labels_str)

## Key Findings
- Compared to 6 July 2021, there were 14% fewer transactions on 6 July 2022.