In [1]:
import requests
import io

import numpy as np
import pandas as pd

from pycoingecko import CoinGeckoAPI

from bs4 import BeautifulSoup as bts

from selenium import webdriver

from PyPDF2 import PdfReader

### Notes

**Why isn't this a script?**

We are only generating the dataset once. We don't need to write a script that does that is properly formatted and tested, we can do it using a notebooks and test our results at the end.

**Wtf is going on here?**

Haha, yeah it's kind of complicated. And very possibly (probably) overly engineered and convoluted. Oh well.

This what's going on:

1. Use the CG API to extract market caps of top 100 coins
2. Scrape the ramining market caps of all other coins from CG web
3. Get whitepaper pdf page urls by scraping whitepaper.io
4. Use those to obtain the raw pdf urls
5. Feed those pdf urls into a PDF parser and extract pdf data

**Lots of data is wrong!!!**

Some of the data might be slightly off but it is all roughly correct. We only need to be roughly correct here because 1) we're primarily concerned with relativity (features systematically reduced due to bad data extraxction doesn't matter) 2) errors will be small (read code) and 3) I don't really care because this whole thing is just kind of illustrative.

## Market Cap DataFrame

### Top 100

First we use the CG API to get the first 100 (by market cap).

In [2]:
# Make a df with all api coin data
cg = CoinGeckoAPI()
api_cap_df = pd.DataFrame(cg.get_coins_markets(vs_currency="usd"))

In [3]:
api_cap_df.head(3)

Unnamed: 0,id,symbol,name,image,current_price,market_cap,market_cap_rank,fully_diluted_valuation,total_volume,high_24h,...,total_supply,max_supply,ath,ath_change_percentage,ath_date,atl,atl_change_percentage,atl_date,roi,last_updated
0,bitcoin,btc,Bitcoin,https://coin-images.coingecko.com/coins/images...,60957.0,1204355492027,1,1281363000000.0,29156415737,61495.0,...,21000000.0,21000000.0,73738.0,-17.35629,2024-03-14T07:10:36.635Z,67.81,89769.73003,2013-07-06T00:00:00.000Z,,2024-08-10T10:15:29.933Z
1,ethereum,eth,Ethereum,https://coin-images.coingecko.com/coins/images...,2636.51,317233476684,2,317233500000.0,15121476590,2658.04,...,120265400.0,,4878.26,-46.01014,2021-11-10T14:24:19.604Z,0.432979,608189.75289,2015-10-20T00:00:00.000Z,"{'times': 56.8249595834471, 'currency': 'btc',...",2024-08-10T10:15:29.303Z
2,tether,usdt,Tether,https://coin-images.coingecko.com/coins/images...,1.0,115771693420,3,115771700000.0,27956890949,1.008,...,115659600000.0,,1.32,-24.449,2018-07-24T00:00:00.000Z,0.572521,74.59827,2015-03-02T00:00:00.000Z,,2024-08-10T10:15:30.228Z


### All the rest

Unfortunately the API (free version) only allows us to access the top 100 coins.

For the others, we scrape their market cap and add this scraped data to a df.

In [4]:
scrape_dict_list = []  # instantiate empty list
for page_num in range(2, 47):  # only have market cap info for coins up to page 47 on CG
    # Get HTML for the page
    url = 'https://www.coingecko.com/?page='+ str(page_num)
    result = requests.get(url, headers={"User-Agent":"Mozilla/5.0"})
    soup = bts(result.text, 'html.parser')
    # Get list of names and tickers
    raw_name_list = soup.findAll('div', class_='tw-text-gray-700 dark:tw-text-moon-100 tw-font-semibold tw-text-sm tw-leading-5')
    name_ticker_list = [raw_name.text.split() for raw_name in raw_name_list][1:-1] # [1:] to skip "Highlights"
    # Get list of market caps
    if page_num == 2:
        start_mc = 800000000  # starting market cap for the second page of results
    else:
        start_mc = scrape_dict_list[-1]['market_cap']
    raw_price_list = soup.findAll('span', {'data-price-target': 'price'})
    mc_list = [start_mc]
    for i in range(10, len(raw_price_list)):
        num_item = float(raw_price_list[i].text.replace('$', '').replace(',', ''))
        if mc_list[-1] > num_item > mc_list[-1]*0.8:  # to ensure we're finding market cap and not other values
            mc_list.append(num_item)
    mc_list = mc_list[1:] # [1:] to skip start_mc
    # Combine into a dictionary and append to list
    if len(name_ticker_list) <= len(mc_list):
        scrape_len = len(name_ticker_list)
    else:
        scrape_len = len(mc_list)
    for i in range(scrape_len):
        coin_dict={}
        coin_dict['symbol'] = name_ticker_list[i][-1]
        coin_dict['name'] = ' '.join(name_ticker_list[i][:-1])
        coin_dict['market_cap'] = mc_list[i]
        scrape_dict_list.append(coin_dict)

In [5]:
# Make a dataframe with all scrape coin data
scrape_cap_df = pd.DataFrame(scrape_dict_list)

In [6]:
scrape_cap_df.head(3)

Unnamed: 0,symbol,name,market_cap
0,DYDX,dYdX,643229311.0
1,ENS,Ethereum Name Service,642401803.0
2,GALA,GALA,642192226.0


### Combine to one market cap df

In [7]:
# Transform API df
api_trans_df = api_cap_df[['name', 'market_cap']]
# Transform scrape df
scrape_trans_df = scrape_cap_df[['name', 'market_cap']].astype({'market_cap': 'int64'})

In [8]:
# Union - should be roughly disjoint sets
combined_cap_df = pd.concat([api_trans_df, scrape_trans_df], axis=0)
# Dropping duplicates to be safe
clean_combined_cap_df = combined_cap_df.drop_duplicates(subset=['name'], keep=False)

In [9]:
clean_combined_cap_df.head(3)

Unnamed: 0,name,market_cap
0,Bitcoin,1204355492027
1,Ethereum,317233476684
2,Tether,115771693420


## Whitepaper DataFrame

Need to scrape extract information from whitepapers found online at https://whitepaper.io/

### Getting WP URLs

Firstly that means scraping the links to the online WP PDF.

In [50]:
link_dict = {}

driver = webdriver.Chrome()

for j in range(1, 242):
    web = f"https://whitepaper.io/coins?page={j}"
    for i in range(1, 11):
        try:
            driver.get(web)
            
            driver.implicitly_wait(3)
            
            button_path = f'//*[@id="whitePapersList"]/div/div[2]/table/tbody/tr[{i}]/td[2]/a/div/div[1]'  # like button element
            button_location = driver.find_element(by='xpath', value=button_path)  # find the like button
            driver.execute_script("arguments[0].click();", button_location)  # click on like
            
            driver.implicitly_wait(3)
            
            button_path = '//*[@id="coin-view"]/div[1]/div[1]/div[1]/div[1]/div/div/div[3]/div/div/div/div[4]/a'  # like button element
            button_location = driver.find_element(by='xpath', value=button_path)  # find the like button
            driver.execute_script("arguments[0].click();", button_location)  # click on like
            
            driver.implicitly_wait(3)
    
            url = driver.current_url
            
            link_dict.update({url[url.rfind("/")+1:url.rfind("whitepaper")-1].replace("-", "_"): url})
        except:
            pass

        print(f"Page: {j}, url: {url})")

#driver.quit()

Page: 1, url: https://whitepaper.io/document/0/bitcoin-whitepaper)
Page: 1, url: https://whitepaper.io/document/718/ethereum-whitepaper)
Page: 1, url: https://whitepaper.io/document/6/tether-whitepaper)
Page: 1, url: https://whitepaper.io/document/716/usd-coin-whitepaper)
Page: 1, url: https://whitepaper.io/document/10/binance-whitepaper)
Page: 1, url: https://whitepaper.io/document/1/ripple-whitepaper)
Page: 1, url: https://whitepaper.io/document/581/cardano-whitepaper)
Page: 1, url: https://whitepaper.io/document/602/solana-whitepaper)
Page: 1, url: https://whitepaper.io/document/672/dogecoin-whitepaper)
Page: 1, url: https://whitepaper.io/document/594/matic-network-whitepaper)
Page: 2, url: https://whitepaper.io/document/596/polkadot-whitepaper)
Page: 2, url: https://whitepaper.io/document/758/shiba-inu-whitepaper)
Page: 2, url: https://whitepaper.io/document/588/dai-whitepaper)
Page: 2, url: https://whitepaper.io/document/646/polygon-whitepaper)
Page: 2, url: https://whitepaper.io/

In [51]:
link_dict

{'bitcoin': 'https://whitepaper.io/document/0/bitcoin-whitepaper',
 'ethereum': 'https://whitepaper.io/document/718/ethereum-whitepaper',
 'tether': 'https://whitepaper.io/document/6/tether-whitepaper',
 'usd_coin': 'https://whitepaper.io/document/716/usd-coin-whitepaper',
 'binance': 'https://whitepaper.io/document/10/binance-whitepaper',
 'ripple': 'https://whitepaper.io/document/1/ripple-whitepaper',
 'cardano': 'https://whitepaper.io/document/581/cardano-whitepaper',
 'solana': 'https://whitepaper.io/document/602/solana-whitepaper',
 'dogecoin': 'https://whitepaper.io/document/672/dogecoin-whitepaper',
 'matic_network': 'https://whitepaper.io/document/594/matic-network-whitepaper',
 'polkadot': 'https://whitepaper.io/document/596/polkadot-whitepaper',
 'shiba_inu': 'https://whitepaper.io/document/758/shiba-inu-whitepaper',
 'dai': 'https://whitepaper.io/document/588/dai-whitepaper',
 'polygon': 'https://whitepaper.io/document/646/polygon-whitepaper',
 'tron': 'https://whitepaper.io

In [52]:
len(link_dict)

175

In [53]:
# Make a df with all the scraped links
wp_url_df = pd.DataFrame({'name': link_dict.values(), 'raw_wp_url': link_dict.keys()})

In [54]:
# Write results to a csv so I don't have to run this again
wp_url_df.to_csv('wp_url_df')

### Get PDF URLs

Then we get the PDF link from those pages to pass to our PDF viewer,

In [12]:
# Function for getting pdf url from raw wp link
def get_pdf_url(raw_wp_url):
    result = requests.get(raw_wp_url, headers={"User-Agent":"Mozilla/5.0"})
    soup = bts(result.text, 'html.parser')
    return soup.find('div', class_="flex flex-col flex-1").object.attrs['data']

In [13]:
# Use this function to add pdf url to our df
wp_url_df['pdf_url'] = wp_url_df.apply(lambda x: get_pdf_url(x))

ValueError: Cannot set a DataFrame with multiple columns to the single column pdf_url

### Get WP attributes

Use PyPDF2 to extract some useful attributes from these PDFs.

In [None]:
pdf_att_dict_list = []  # instantiate empty list
for url in wp_url_df['pdf_url']:
    coin_feature_dict = {}
    coin_feature_dict['pdf_url'] = url

    with io.BytesIO(response.content) as f:
        pdf = PdfReader(f)
        
        coin_feature_dict['number_pages'] = len(pdf.pages)
        number_images = 0
        number_words = 0
        number_equations = 0
    
        for page in pdf.pages:
            try:
                number_images += len(page.images)
            except:
                number_images += 1
            coin_feature_dict['number_images'] = number_images
            coin_feature_dict['number_characters'] += len(page.extract_text())
            coin_feature_dict['number_equations'] += page.extract_text().count("=")

    pdf_att_dict_list.append(coin_feature_dict)

In [None]:
# Make a dataframe with all pdf data
pdf_att_dict_list = pd.DataFrame(pdf_att_dict_list)

In [None]:
# Combine with url df
combined_wp_df = wp_url_df.merge(pdf_att_dict_list, how='inner', on='pdf_url')

In [None]:
# CHECK FOR DUPLICATES!!!

## Combined DataFrame

In [None]:
# Create lower case column for join
clean_combined_cap_df['name_lower'] = clean_combined_cap_df['name'].apply(lambda x: x.lower().replace(" ", "_")

In [None]:
# Join on lower case name
combined_total_df = clean_combined_cap_df.merge(
    right=clean_combined_pdf_df,
    how='inner',
    left_on='name_lower',
    right_on='name'
)

In [None]:
# CHECKS!!!

In [None]:
# Reformat final DF and save
raw_df_to_save = combined_total_df[['']].rename()
raw_df_to_save.to_csv(
    path_or_buf=,
    encoding='utf-8'
)