In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import config 
import sql_con
from requests import Session
from requests.exceptions import ConnectionError, Timeout, TooManyRedirects
import json
import csv

In [2]:
# get module variables
ROOT_DIR = config.ROOT_DIR
select_records = sql_con.select_records
insert_records = sql_con.insert_records
update_records = sql_con.update_records
conn_odbc = sql_con.conn_odbc
read_contents = sql_con.read_contents

In [3]:
# function to make calls to cryptocompare API
def get_data(url, parameters, headers):
    session = Session()
    session.headers.update(headers)
    try:
        response = session.get(url, params=parameters)
        data = json.loads(response.text)
        return data
    except (ConnectionError, Timeout, TooManyRedirects) as e:
        print(e)

In [4]:
# function to get api data for top 10 coins
def get_coin_data(coin_list, url, headers):
    coin_data = []
    parameters = { 
        "tsym":"USD",
        "allData":"true"
    }
    for coin in coin_list:
        parameters["fsym"] = coin
        res_json = get_data(url, parameters, headers)
        data = res_json["Data"]
        # iterate through the data and add the coin name to each row
        for row in data:
            row["symbol"] = coin
        coin_data.extend(data)
    return coin_data

In [5]:
# authorization header for making calls to crypto compare API
# read api key from config file using dotev module
headers = {
  "authorization": f"Apikey {config.API_KEY}"
}

In [6]:
# relevant urls for making calls to crypto compare API
top10_url = "https://min-api.cryptocompare.com/data/top/mktcapfull"
hist_url = "https://min-api.cryptocompare.com/data/histoday"

In [7]:
# get top 10 coins by market cap, capture json response
parameters = {
  "tsym":"USD",
  "limit": 10
}

res_json_top10 = get_data(top10_url, parameters, headers)
data_top10 = res_json_top10["Data"]

In [8]:
# capture top 10 coins in a list of dictionaries and write to json file (ingestion layer)
top10_coins = [{"Name": coin["CoinInfo"]["Name"], "FullName": coin["CoinInfo"]["FullName"], "Algorithm": coin["CoinInfo"]["Algorithm"], "ProofType": coin["CoinInfo"]["ProofType"]} for coin in data_top10]
with open(rf"{ROOT_DIR}/data/top10_coins.json", "w") as f:
    f.write(json.dumps(top10_coins))

In [9]:
# make request to cryptocompare api to get historical data for bitcoin quote prices in USD

# parameters = {
#   "fsym": "BTC",
#   "tsym":"USD",
#   "allData":"true"
# }

# res_json = get_data(hist_url, parameters, headers)
# data = res_json["Data"]

In [10]:
import findspark
findspark.init()

In [11]:
# spark session start to begin transforming data (processing layer)
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("crypto_analysis").getOrCreate()

In [12]:
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType

schema = StructType([
    StructField("Name", StringType(), True),
    StructField("FullName", StringType(), True),
    StructField("Algorithm", StringType(), True),
    StructField("ProofType", StringType(), True)
])
df_top10 = spark.read.option("schema", schema).json(rf"{ROOT_DIR}\data\top10_coins.json").select("Name", "FullName", "Algorithm", "ProofType")
df_top10.show(truncate=False)

+----+------------+---------+---------+
|Name|FullName    |Algorithm|ProofType|
+----+------------+---------+---------+
|BTC |Bitcoin     |SHA-256  |PoW      |
|ETH |Ethereum    |Ethash   |PoS      |
|USDT|Tether      |N/A      |N/A      |
|XRP |XRP         |N/A      |XRP LCP  |
|BNB |Binance Coin|BEP-2    |PoSA     |
|USDC|USD Coin    |N/A      |N/A      |
|ADA |Cardano     |Ouroboros|PoS      |
|DOGE|Dogecoin    |Scrypt   |PoW      |
|APT |Aptos       |N/A      |N/A      |
|SOL |Solana      |N/A      |PoH      |
+----+------------+---------+---------+



In [13]:
coin_list = df_top10.rdd.map(lambda x: x[0]).collect()
coin_list

['BTC', 'ETH', 'USDT', 'XRP', 'BNB', 'USDC', 'ADA', 'DOGE', 'APT', 'SOL']

In [14]:
# read historical data for top 10 coins from cryptocompare API
all_coins_data = get_coin_data(coin_list, hist_url, headers)

In [15]:
# capture data from reponse and write to json file (ingestion layer)
with open(rf"{ROOT_DIR}\data\all_coins_data.json", "w") as f:
    f.write(json.dumps(all_coins_data))

In [16]:
# read ingested json file and print out first 10 records

schema = StructType([
    StructField("symbol", StringType(), True),
    StructField("time", LongType(), True),
    StructField("close", DoubleType(), True),
    StructField("high", DoubleType(), True),
    StructField("low", DoubleType(), True),
    StructField("open", DoubleType(), True),
    StructField("volumefrom", DoubleType(), True),
    StructField("volumeto", DoubleType(), True),
    StructField("conversionType", StringType(), True),
    StructField("conversionSymbol", StringType(), True)
])

df = spark.read.option("schema", schema).json(rf"{ROOT_DIR}/data/all_coins_data.json")
df.show(n=10)

+-------+----------------+--------------+-------+-------+-------+------+----------+----------+--------+
|  close|conversionSymbol|conversionType|   high|    low|   open|symbol|      time|volumefrom|volumeto|
+-------+----------------+--------------+-------+-------+-------+------+----------+----------+--------+
|0.04951|                |        direct|0.04951|0.04951|0.04951|   BTC|1279324800|      20.0|  0.9902|
|0.08584|                |        direct|0.08585|0.05941|0.04951|   BTC|1279411200|     75.01|   5.092|
| 0.0808|                |        direct|0.09307|0.07723|0.08584|   BTC|1279497600|     574.0|   49.66|
|0.07474|                |        direct|0.08181|0.07426| 0.0808|   BTC|1279584000|     262.0|   20.59|
|0.07921|                |        direct|0.07921|0.06634|0.07474|   BTC|1279670400|     575.0|   42.26|
| 0.0505|                |        direct|0.08181| 0.0505|0.07921|   BTC|1279756800|    2160.0|  129.78|
|0.06262|                |        direct|0.06767| 0.0505| 0.0505

In [17]:
# get ingestion date as current unix epoch time
# write data to csv file after adding ingestion date (csv ingestion point for data pipeline)
from pyspark.sql.functions import unix_timestamp, from_unixtime, col

df = df.withColumn("ingestion_date (unix epoch)", unix_timestamp()).withColumnRenamed("time", "time (unix epoch)")
df.write.mode("overwrite").csv(rf"{ROOT_DIR}/data/btc_price.csv", header=True)

In [18]:
# read from ingested csv file and print out first 10 records
df = spark.read.csv(rf"{ROOT_DIR}/data/btc_price.csv", header=True)
df.show(n=10,truncate=False)

+-------+----------------+--------------+-------+-------+-------+------+-----------------+----------+--------+---------------------------+
|close  |conversionSymbol|conversionType|high   |low    |open   |symbol|time (unix epoch)|volumefrom|volumeto|ingestion_date (unix epoch)|
+-------+----------------+--------------+-------+-------+-------+------+-----------------+----------+--------+---------------------------+
|0.04951|null            |direct        |0.04951|0.04951|0.04951|BTC   |1279324800       |20.0      |0.9902  |1680552807                 |
|0.08584|null            |direct        |0.08585|0.05941|0.04951|BTC   |1279411200       |75.01     |5.092   |1680552807                 |
|0.0808 |null            |direct        |0.09307|0.07723|0.08584|BTC   |1279497600       |574.0     |49.66   |1680552807                 |
|0.07474|null            |direct        |0.08181|0.07426|0.0808 |BTC   |1279584000       |262.0     |20.59   |1680552807                 |
|0.07921|null            |d

In [19]:
df.show(n=10,truncate=False)

+-------+----------------+--------------+-------+-------+-------+------+-----------------+----------+--------+---------------------------+
|close  |conversionSymbol|conversionType|high   |low    |open   |symbol|time (unix epoch)|volumefrom|volumeto|ingestion_date (unix epoch)|
+-------+----------------+--------------+-------+-------+-------+------+-----------------+----------+--------+---------------------------+
|0.04951|null            |direct        |0.04951|0.04951|0.04951|BTC   |1279324800       |20.0      |0.9902  |1680552807                 |
|0.08584|null            |direct        |0.08585|0.05941|0.04951|BTC   |1279411200       |75.01     |5.092   |1680552807                 |
|0.0808 |null            |direct        |0.09307|0.07723|0.08584|BTC   |1279497600       |574.0     |49.66   |1680552807                 |
|0.07474|null            |direct        |0.08181|0.07426|0.0808 |BTC   |1279584000       |262.0     |20.59   |1680552807                 |
|0.07921|null            |d

In [20]:
# extract necessary columns
df = df.select(["symbol", "time (unix epoch)", "open", "close", "high", "low", "volumefrom", "volumeto"])

In [21]:
df.dtypes

[('symbol', 'string'),
 ('time (unix epoch)', 'string'),
 ('open', 'string'),
 ('close', 'string'),
 ('high', 'string'),
 ('low', 'string'),
 ('volumefrom', 'string'),
 ('volumeto', 'string')]

In [22]:
df.printSchema()

root
 |-- symbol: string (nullable = true)
 |-- time (unix epoch): string (nullable = true)
 |-- open: string (nullable = true)
 |-- close: string (nullable = true)
 |-- high: string (nullable = true)
 |-- low: string (nullable = true)
 |-- volumefrom: string (nullable = true)
 |-- volumeto: string (nullable = true)



In [23]:
# get descriptive statistics for numeric columns
df.describe(["symbol", "open", "close", "high", "low", "volumefrom", "volumeto"]).show()

+-------+------+-----------------+-----------------+-----------------+-----------------+--------------------+-------------------+
|summary|symbol|             open|            close|             high|              low|          volumefrom|           volumeto|
+-------+------+-----------------+-----------------+-----------------+-----------------+--------------------+-------------------+
|  count| 46440|            46440|            46440|            46440|            46440|               46440|              46440|
|   mean|  null|946.4146105644052|947.0533118057502|972.5415717613753| 917.450032232025|2.4748964267424885E7| 9.24603992402797E7|
| stddev|  null| 5303.62724937596| 5305.08385670817|5446.683901175829|5141.835064620559| 1.595190753321859E8|3.553256959223039E8|
|    min|   ADA|              0.0|              0.0|              0.0|              0.0|                 0.0|                0.0|
|    max|   XRP|          9999.93|          9999.93|           9990.4|           999.73|  

In [24]:
df.show(n=10)

+------+-----------------+-------+-------+-------+-------+----------+--------+
|symbol|time (unix epoch)|   open|  close|   high|    low|volumefrom|volumeto|
+------+-----------------+-------+-------+-------+-------+----------+--------+
|   BTC|       1279324800|0.04951|0.04951|0.04951|0.04951|      20.0|  0.9902|
|   BTC|       1279411200|0.04951|0.08584|0.08585|0.05941|     75.01|   5.092|
|   BTC|       1279497600|0.08584| 0.0808|0.09307|0.07723|     574.0|   49.66|
|   BTC|       1279584000| 0.0808|0.07474|0.08181|0.07426|     262.0|   20.59|
|   BTC|       1279670400|0.07474|0.07921|0.07921|0.06634|     575.0|   42.26|
|   BTC|       1279756800|0.07921| 0.0505|0.08181| 0.0505|    2160.0|  129.78|
|   BTC|       1279843200| 0.0505|0.06262|0.06767| 0.0505|    2402.5|  141.07|
|   BTC|       1279929600|0.06262|0.05454|0.06161|0.05049|    496.32|   26.73|
|   BTC|       1280016000|0.05454| 0.0505|0.05941| 0.0505|   1551.48|   85.06|
|   BTC|       1280102400| 0.0505|  0.056|  0.056|  

In [25]:
# set spark session timezone to UTC to have a uniform reference point for all date related fields
spark.conf.set("spark.sql.session.timeZone", "UTC")
df = df.withColumn("date_time (unix)", from_unixtime("time (unix epoch)", "yyyy-MM-dd HH:mm:ss"))
spark.conf.unset("spark.sql.session.timeZone")

In [26]:
# timezone will default to system timezone (Easter Standard Time) in absence of specific spark.sql.session.timeZone setting
df.withColumn("date_time", from_unixtime("time (unix epoch)", "yyyy-MM-dd HH:mm:ss")).show(n=10, truncate=False)

+------+-----------------+-------+-------+-------+-------+----------+--------+-------------------+-------------------+
|symbol|time (unix epoch)|open   |close  |high   |low    |volumefrom|volumeto|date_time (unix)   |date_time          |
+------+-----------------+-------+-------+-------+-------+----------+--------+-------------------+-------------------+
|BTC   |1279324800       |0.04951|0.04951|0.04951|0.04951|20.0      |0.9902  |2010-07-17 00:00:00|2010-07-16 20:00:00|
|BTC   |1279411200       |0.04951|0.08584|0.08585|0.05941|75.01     |5.092   |2010-07-18 00:00:00|2010-07-17 20:00:00|
|BTC   |1279497600       |0.08584|0.0808 |0.09307|0.07723|574.0     |49.66   |2010-07-19 00:00:00|2010-07-18 20:00:00|
|BTC   |1279584000       |0.0808 |0.07474|0.08181|0.07426|262.0     |20.59   |2010-07-20 00:00:00|2010-07-19 20:00:00|
|BTC   |1279670400       |0.07474|0.07921|0.07921|0.06634|575.0     |42.26   |2010-07-21 00:00:00|2010-07-20 20:00:00|
|BTC   |1279756800       |0.07921|0.0505 |0.0818

#### Create a new dataframe with a column called HV Ratio that is the ratio of the High Price versus volume of stock traded for a day

In [27]:
df2 = df.withColumn("HV Ratio", col("high")/col("volumefrom"))

# display dataframe in descending order of HV Ratio
df2.sort("HV Ratio", ascending=False).show(truncate=False)

+------+-----------------+------+------+------+------+----------+--------+-------------------+------------------+
|symbol|time (unix epoch)|open  |close |high  |low   |volumefrom|volumeto|date_time (unix)   |HV Ratio          |
+------+-----------------+------+------+------+------+----------+--------+-------------------+------------------+
|USDT  |1424476800       |1.15  |1.5   |1.5   |1.15  |1.415E-4  |2.122E-4|2015-02-21 00:00:00|10600.706713780919|
|USDT  |1441152000       |0.96  |1.15  |1.15  |0.96  |1.252E-4  |1.44E-4 |2015-09-02 00:00:00|9185.303514376996 |
|USDT  |1456185600       |1.1   |1.1   |1.1   |1.1   |1.518E-4  |1.67E-4 |2016-02-23 00:00:00|7246.376811594203 |
|USDT  |1420761600       |1.0   |1.0   |1.0   |1.0   |2.4E-4    |2.4E-4  |2015-01-09 00:00:00|4166.666666666667 |
|USDT  |1422662400       |0.9274|4.37  |4.37  |0.874 |0.005     |0.02185 |2015-01-31 00:00:00|874.0             |
|USDT  |1441065600       |0.96  |0.96  |0.96  |0.96  |0.002304  |2.301   |2015-09-01 00:

In [29]:
# sort by date_time (unix) in descending order to get HV Ratiio for most recent dates
df2.sort("date_time (unix)", ascending=False).show(truncate=False)

+------+-----------------+--------+--------+--------+--------+---------------+---------------+-------------------+----------------------+
|symbol|time (unix epoch)|open    |close   |high    |low     |volumefrom     |volumeto       |date_time (unix)   |HV Ratio              |
+------+-----------------+--------+--------+--------+--------+---------------+---------------+-------------------+----------------------+
|BTC   |1680480000       |28186.76|27759.57|28494.64|27604.93|41256.85       |1.15862636569E9|2023-04-03 00:00:00|0.6906644593564463    |
|XRP   |1680480000       |0.5185  |0.5021  |0.5242  |0.5019  |6.15062376E7   |3.150123519E7  |2023-04-03 00:00:00|8.522712824820877E-9  |
|BNB   |1680480000       |313.85  |304.06  |314.19  |303.62  |9014.86        |2804823.13     |2023-04-03 00:00:00|0.034852454724754456  |
|USDT  |1680480000       |1.0     |1.0     |1.0     |0.9998  |2.7159850404E8 |2.7163548483E8 |2023-04-03 00:00:00|3.6819054049455432E-9 |
|ADA   |1680480000       |0.3817  

In [32]:
df2.groupby("symbol").agg({"close": "max"}).show()

+------+----------+
|symbol|max(close)|
+------+----------+
|   ADA|     2.968|
|   APT|     9.789|
|   BNB|     9.995|
|   BTC|   9999.93|
|  DOGE|    9.9E-5|
|   ETH|    999.64|
|   SOL|     99.81|
|  USDC|     1.026|
|  USDT|     5.498|
|   XRP|      2.78|
+------+----------+



In [34]:
from pyspark.sql.functions import avg
df2.groupby("symbol").agg(avg("close").alias("avg_close"),
                        avg("high").alias("avg_high"),
                        avg("low").alias("avg_low"),
                        avg("open").alias("avg_open"),
                        avg("volumefrom").alias("avg_volumefrom"),
                        avg("volumeto").alias("avg_volumeto"),
                        avg("HV Ratio").alias("avg_HV Ratio")
                        ).show()


+------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+
|symbol|           avg_close|            avg_high|             avg_low|           avg_open|      avg_volumefrom|        avg_volumeto|        avg_HV Ratio|
+------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+
|   ETH|   501.0253513996553|   517.4536457795006|  481.83859629629563|  500.6383101421187|   674272.1391968105|3.0955675140206695E8|0.002274162741457...|
|  DOGE|0.026344117136089567|0.027883436963824296|0.024782380691214433|0.02632436447028423|1.4018262050905445E8|1.7948530156456403E7|1.683267362620944...|
|  USDC| 0.35299541343669266| 0.36152579672695906| 0.34332220583548656|0.35279746543927665|  1320531.0947394487|   1311705.874812661|2.811381384068696...|
|   BTC|   8890.733470411278|   9126.361142426802|   8616.743333591725

#### What day had the Peak High in Price for each cryptocurrency in dataset?

In [55]:
df_max_high = df2.groupby(col("symbol").alias("sym")).agg({"high": "max"})
df2.join(df_max_high, (df2["symbol"] == df_max_high["sym"]) & (df2["high"] == df_max_high["max(high)"])).select(["symbol", "date_time (unix)", "high"]).show()

+------+-------------------+------+
|symbol|   date_time (unix)|  high|
+------+-------------------+------+
|   BTC|2020-06-10 00:00:00|9990.4|
|   ETH|2018-02-03 00:00:00| 997.6|
|  USDT|2015-03-09 00:00:00| 5.797|
|   XRP|2018-01-04 00:00:00|  3.29|
|   BNB|2018-10-01 00:00:00| 9.991|
|  USDC|2020-03-12 00:00:00| 1.643|
|   ADA|2021-09-02 00:00:00| 3.097|
|  DOGE|2015-06-29 00:00:00|9.9E-5|
|   APT|2022-10-24 00:00:00| 9.963|
|   SOL|2022-01-24 00:00:00| 99.89|
+------+-------------------+------+

