In [0]:
# Link to limitations:
# https://docs.databricks.com/aws/en/getting-started/free-edition-limitations
%pip install lxml
%pip install requests
import time
from io import StringIO

import pandas as pd
import requests

In [0]:
def scrape_data(start):
    """Download one page of Yahoo Finance's most-active crypto screener.

    Args:
        start: Row offset sent as the ``start`` query parameter
            (the notebook passes 0 for the first page).

    Returns:
        pandas.DataFrame: the first HTML table parsed from the response body.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (for example when the request is blocked).
        requests.Timeout: if the server does not respond within the timeout.
        ValueError: raised by ``pd.read_html`` when the page has no table.
    """
    # Must be an f-string: as a plain string the literal text "{start}" was
    # sent to the server and the caller's offset was silently ignored.
    url = f"https://finance.yahoo.com/markets/crypto/most-active/?start={start}&count=100"
    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/118.0.0.0 Safari/537.36"
    }

    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()  # raises if blocked

    # read_html expects a path, URL or file-like object; passing the raw HTML
    # string directly is deprecated, so wrap it in StringIO.
    dfs = pd.read_html(StringIO(response.text))
    return dfs[0]


In [0]:
# Start from an empty frame; the scrape cell that follows concatenates its results onto it
crypto_table = pd.DataFrame()

In [0]:
# Pull the first page (offset 0) of the most-active crypto screener as of now
# and add its rows to the accumulated table.
# NOTE(review): the request URL asks for count=100, while the earlier note on
# this cell said 25 rows come back - confirm how many rows are actually returned.
crypto_table = pd.concat([crypto_table, scrape_data(0)])

In [0]:
# Remove the "Unnamed: 2" column, then discard any row that has no Name
crypto_table = (
    crypto_table
    .drop(columns="Unnamed: 2")
    .dropna(subset=["Name"])
)

In [0]:
# Replace the scraped headers with underscore-separated names
# (no spaces or "%" / parentheses in the new names)
readable_column_names = {
    "Change %": "Change_Percent",
    "Market Cap": "Market_Cap",
    "Volume In Currency (24hr)": "Volume_In_Currency_24hr",
    "Total Volume All Currencies (24hr)": "Total_Volume_All_Currencies_24hr",
    "Circulating Supply": "Circulating_Supply",
    "52 Wk Change %": "52_Wk_Change_Percent",
    "52 Wk Range": "52_Wk_Range",
}
crypto_table = crypto_table.rename(columns=readable_column_names)

In [0]:
# Convert the percent-change text to a float
change_text = crypto_table["Change_Percent"].astype(str)  # work on strings
# Strip, as literal characters, the percent sign, any leading plus, and commas
for symbol in ('%', '+', ','):
    change_text = change_text.str.replace(symbol, '', regex=False)
crypto_table["Change_Percent"] = change_text.astype(float)

In [0]:
# Fix the price column: the scraped text is split on spaces and only the first
# token is kept, then commas are stripped and the value is parsed as a float.
# No rounding is applied here: the previous round(...) call carried no digit
# count, which rounds to whole numbers and turned every price below 0.5 into 0.
crypto_table["Price"] = (
    crypto_table["Price"]
    .astype(str)
    .str.split(" ")          # split on space
    .str[0]                  # take first element
    .str.replace(",", "", regex=False)  # remove commas
    .astype(float)
)

In [0]:
from datetime import datetime
from pyspark.sql import functions as F

# Add scrape_date: stamp every row with the time of this run,
# formatted as a "YYYY-MM-DD HH:MM:SS" string from datetime.now()
scrape_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
crypto_table["scrape_date"] = scrape_timestamp

# Build a Spark DataFrame from the pandas frame
spark_df = spark.createDataFrame(crypto_table)

# Append this snapshot to the Delta table that accumulates the history
delta_writer = spark_df.write.format("delta").mode("append")
delta_writer.saveAsTable("historical_crypto_trends")

In [0]:
# Future idea: include social media sentiment.
## Scrape Twitter's trending page and evaluate whether there is ever a spike in activity around a coin.
### Example: Elon Musk puts Bitcoin in his bio, and Bitcoin rises 20%.