## Wrap Up: Etherscan


In [1]:
url = "https://etherscan.io/"

Simple GET request without JS.

In [2]:
import requests

html = requests.get(url)

html.content




With custom headers.

In [3]:
hdr = {'User-Agent': 'Mozilla/5.0'}

html = requests.get(url, headers=hdr)

html.content


b'\n<!doctype html>\n<html lang="en">\n<head><title>\r\n\t Ethereum (ETH) Blockchain Explorer\r\n</title><meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" /><meta name="author" content="etherscan.io" /><meta name="Description" content="Etherscan allows you to explore and search the Ethereum blockchain for transactions, addresses, tokens, prices and other activities taking place on Ethereum (ETH)" /><meta name="keywords" content="ethereum, explorer, ether, search, blockchain, crypto, currency" /><meta name="format-detection" content="telephone=no" />\n<meta property="og:title" content=" Ethereum (ETH) Blockchain Explorer" /> <meta property="og:description" content="Etherscan allows you to explore and search the Ethereum blockchain for transactions, addresses, tokens, prices and other activities taking place on Ethereum (ETH)" /> <meta property="og:type" content="website" /> <meta property="og:site_name" content="Ethereum (ETH) 

Improved GET request.

In [4]:

import urllib.request

page = urllib.request.urlopen(url).read()

HTTPError: HTTP Error 403: Forbidden

Improved GET request with custom headers.

In [5]:

from urllib.request import Request, urlopen

req = Request(url, headers = hdr)
page = urlopen(req)


Make soup.

In [6]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(page)
soup

<!DOCTYPE html>
<html lang="en">
<head><title>
	 Ethereum (ETH) Blockchain Explorer
</title><meta charset="utf-8"/><meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/><meta content="etherscan.io" name="author"/><meta content="Etherscan allows you to explore and search the Ethereum blockchain for transactions, addresses, tokens, prices and other activities taking place on Ethereum (ETH)" name="Description"/><meta content="ethereum, explorer, ether, search, blockchain, crypto, currency" name="keywords"/><meta content="telephone=no" name="format-detection"/>
<meta content=" Ethereum (ETH) Blockchain Explorer" property="og:title"/> <meta content="Etherscan allows you to explore and search the Ethereum blockchain for transactions, addresses, tokens, prices and other activities taking place on Ethereum (ETH)" property="og:description"/> <meta content="website" property="og:type"/> <meta content="Ethereum (ETH) Blockchain Explorer" property="og:site_name"/> 

^C


In [1]:
# Import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By


In [2]:
# Load driver

# from selenium.webdriver.chrome.options import Options
# options = Options()
# options.add_argument("--headless")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

TypeError: __init__() got an unexpected keyword argument 'service'

In [None]:
driver.get(url)

Finding one specific element with XPath and class name.

_Note: the browser can export these selectors._

In [None]:
# CSS selector not available in Selenium.
# mCSB_1_container > div:nth-child(1) > div.col-sm-8 > div > div.text-nowrap > span.d-block.mb-1.mb-sm-0 > a

my_xpath = '//*[@id="mCSB_1_container"]/div[1]/div[2]/div/div[1]/span[1]/a'

elem = driver.find_element(By.XPATH, my_xpath)



In [None]:

elem.text

In [None]:
elem = driver.find_element(By.CLASS_NAME, 'hash-tag')

elem.text


In [None]:
elements = driver.find_elements(By.CLASS_NAME, 'hash-tag')

In [None]:
type(elements), len(elements)

#### All transactions

Let's now get all transactions, not just what is on the home page.


In [2]:
url = 'https://etherscan.io/txs'

By visual inspection in DevTools, each row contains 3 elements:
- transaction hash
- from
- to

In [3]:
driver.get(url)
elements = driver.find_elements(By.CLASS_NAME, 'hash-tag')

NameError: name 'driver' is not defined

In [None]:
# Let's prepare for it.

transactions = []
toAdx = ''
fromAdx = ''

## Go through all elements containing an address or a transaction hash.
for i in range(len(elements)):
    adx = elements[i-1].text
    if i % 2 and not i % 3 == 0:
        fromAdx = adx
    if i % 3 == 0:
        toAdx = adx
        transactions.append((fromAdx, toAdx))

print(transactions)

print(len(transactions))

Let's build some simple statistics, counting the number of times an address receive/send a transaction. 


In [None]:
from collections import defaultdict
sentCounts = defaultdict(int)
receivedCounts = defaultdict(int)

for i in range(len(elements)):
    adx = elements[i-1].text
    if i % 2 and not i % 3 == 0:
        sentCounts[adx] += 1
    if i % 3 == 0:
        receivedCounts[adx] += 1


print('Sent Counts')
print(sentCounts)
print('Received Counts')
print(receivedCounts)

In [None]:
# Support functions.

from datetime import datetime

def print_time():
    now = datetime.now()
    # dd/mm/YY H:M:S
    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
    print("time: ", dt_string)

def update_to_from(elements):
    for i in range(len(elements)):
        adx = elements[i-1].text
        if i % 2 and not i % 3 == 0:
            sentCounts[adx] += 1
        if i % 3 == 0:
            receivedCounts[adx] += 1


In [None]:
# Fetch multiple pages of transactions.

url = 'https://etherscan.io/txs?ps=100&p='

# Reset counts.
sentCounts = defaultdict(int)
receivedCounts = defaultdict(int)

for i in range(1,4):
    my_url = url + str(i)
    driver.get(my_url)
    elements = driver.find_elements(By.CLASS_NAME, 'hash-tag')
    update_to_from(elements)
    print_time()



In [None]:
def printSorted(dic):
    sorted_dic = sorted(dic.items(), key=lambda x: x[1], reverse=True)
    for i in sorted_dic:
        print(i[0], i[1])


In [None]:
printSorted(sentCounts)


In [None]:
printSorted(receivedCounts)

Let's gather more pages of transactions.

We can also try out different methods.

In [None]:
def getTxs(n=3, reset=True, showCounts=True, method="requests"):
    # Access global variables from within a function's body.
    global sentCounts
    global receivedCounts

    url = 'https://etherscan.io/txs?ps=100&p='

    # Reset counts.
    if reset:
        print('Counts reseted')
        sentCounts = defaultdict(int)
        receivedCounts = defaultdict(int)

    for i in range(1,n+1):
        my_url = url + str(i)

        if method == "requests":
            html = requests.get(my_url, headers=hdr)
            soup = BeautifulSoup(html.content, 'html.parser')
            elements = soup.select(".hash-tag")
        elif method == "selenium":
            driver.get(my_url)
            elements = driver.find_elements(By.CLASS_NAME, 'hash-tag')
        elif method == "urllib":
            req = Request(url, headers = hdr)
            page = urlopen(req)
            soup = BeautifulSoup(page, 'html.parser')
            elements = soup.select(".hash-tag")

        update_to_from(elements) 
        print_time()
    
    if showCounts:
        printSorted(sentCounts)
        printSorted(receivedCounts)
 

In [None]:
getTxs(1, method = "selenium", showCounts = False)

In [None]:
getTxs(1, method = "requests", showCounts = False)

In [None]:
getTxs(1, method = "urllib", showCounts = False)

In [None]:
printSorted(sentCounts)

In [None]:
printSorted(receivedCounts)

In [None]:
import pandas as pd

In [None]:
# Sent.
items = sentCounts.items()
df_sent = pd.DataFrame.from_dict(items)
df_sent.columns = ["address", "sent"]
df_sent.sort_values(by="sent", ascending=False)

In [None]:
# Sent.
items = receivedCounts.items()
df_received = pd.DataFrame.from_dict(items)
df_received.columns = ["address", "received"]
df_received.sort_values(by="received", ascending=False)

In [None]:
df_txs = pd.merge(df_sent, df_received, on = "address", how = "outer")
df_txs = df_txs.fillna(0)
df_txs.sort_values(by="received", ascending=False)

In [None]:
df_txs.plot.scatter(x='sent',
                    y='received',
                    c='DarkBlue')

In [None]:
import random

def jitter(x):
    return x + random.uniform(0, .5) -.25

df_txs['sent_jit'] = df_txs['sent'].apply(lambda x: jitter(x))
df_txs['received_jit'] = df_txs['received'].apply(lambda x: jitter(x))

In [None]:
df_txs.plot.scatter(x='sent_jit',
                    y='received_jit',
                    c='DarkBlue')