In [67]:
import pandas as pd
import numpy as np
%matplotlib inline
import config 
import sql_con
from requests import Session
from requests.exceptions import ConnectionError, Timeout, TooManyRedirects
import json
import csv

In [68]:
# get module variables
ROOT_DIR = config.ROOT_DIR
select_records = sql_con.select_records
insert_records = sql_con.insert_records
update_records = sql_con.update_records
conn_odbc = sql_con.conn_odbc
read_contents = sql_con.read_contents

In [69]:
# make request to cryptocompare api to get historical data for bitcoin quote prices in USD
url = "https://min-api.cryptocompare.com/data/histoday"
parameters = {
  "fsym": "BTC",
  "tsym":"USD",
  "allData":"true"
}
headers = {
  "authorization": f"Apikey {config.API_KEY}",
}

session = Session()
session.headers.update(headers)

try:
  response = session.get(url, params=parameters)
  res_json = json.loads(response.text)
except (ConnectionError, Timeout, TooManyRedirects) as e:
  print(e)

# res1 = requests.get(url, headers=headers)

In [70]:
# capture data from reponse and write to json file (ingestion layer)
data = res_json["Data"]
with open(rf"{ROOT_DIR}\data\btc_api_data.json", "w") as f:
    f.write(json.dumps(data))

In [71]:
# spark session start to begin transforming data (processing layer)
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("crypto_analysis").getOrCreate()

In [72]:
# define schema for data
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType

schema = StructType([
    StructField("time", LongType(), True),
    StructField("close", DoubleType(), True),
    StructField("high", DoubleType(), True),
    StructField("low", DoubleType(), True),
    StructField("open", DoubleType(), True),
    StructField("volumefrom", DoubleType(), True),
    StructField("volumeto", DoubleType(), True),
    StructField("conversionType", StringType(), True),
    StructField("conversionSymbol", StringType(), True)
])

In [73]:
# read ingested json file and print out first 10 records
df = spark.read.option("schema", schema).json(rf"{ROOT_DIR}/data/btc_api_data.json")
df.show(n=10)

+-------+----------------+--------------+-------+-------+-------+----------+----------+--------+
|  close|conversionSymbol|conversionType|   high|    low|   open|      time|volumefrom|volumeto|
+-------+----------------+--------------+-------+-------+-------+----------+----------+--------+
|0.04951|                |        direct|0.04951|0.04951|0.04951|1279324800|      20.0|  0.9902|
|0.08584|                |        direct|0.08585|0.05941|0.04951|1279411200|     75.01|   5.092|
| 0.0808|                |        direct|0.09307|0.07723|0.08584|1279497600|     574.0|   49.66|
|0.07474|                |        direct|0.08181|0.07426| 0.0808|1279584000|     262.0|   20.59|
|0.07921|                |        direct|0.07921|0.06634|0.07474|1279670400|     575.0|   42.26|
| 0.0505|                |        direct|0.08181| 0.0505|0.07921|1279756800|    2160.0|  129.78|
|0.06262|                |        direct|0.06767| 0.0505| 0.0505|1279843200|    2402.5|  141.07|
|0.05454|                |    

In [74]:
# get ingestion date as current unix epoch time
# write data to csv file after adding ingestion date (csv ingestion point for data pipeline)
from pyspark.sql.functions import unix_timestamp, current_timestamp, from_unixtime, col, date_format

df = df.withColumn("ingestion_date (unix epoch)", unix_timestamp()).withColumnRenamed("time", "time (unix epoch)")
df.write.mode("overwrite").csv(rf"{ROOT_DIR}/data/btc_price.csv", header=True)


In [75]:
# read from ingested csv file and print out first 10 records
df = spark.read.csv(rf"{ROOT_DIR}/data/btc_price.csv", header=True)
df.show(n=10,truncate=False)

+-------+----------------+--------------+-------+-------+-------+-----------------+----------+--------+---------------------------+
|close  |conversionSymbol|conversionType|high   |low    |open   |time (unix epoch)|volumefrom|volumeto|ingestion_date (unix epoch)|
+-------+----------------+--------------+-------+-------+-------+-----------------+----------+--------+---------------------------+
|0.04951|null            |direct        |0.04951|0.04951|0.04951|1279324800       |20.0      |0.9902  |1680479765                 |
|0.08584|null            |direct        |0.08585|0.05941|0.04951|1279411200       |75.01     |5.092   |1680479765                 |
|0.0808 |null            |direct        |0.09307|0.07723|0.08584|1279497600       |574.0     |49.66   |1680479765                 |
|0.07474|null            |direct        |0.08181|0.07426|0.0808 |1279584000       |262.0     |20.59   |1680479765                 |
|0.07921|null            |direct        |0.07921|0.06634|0.07474|1279670400 

In [76]:
df.show(n=10,truncate=False)

+-------+----------------+--------------+-------+-------+-------+-----------------+----------+--------+---------------------------+
|close  |conversionSymbol|conversionType|high   |low    |open   |time (unix epoch)|volumefrom|volumeto|ingestion_date (unix epoch)|
+-------+----------------+--------------+-------+-------+-------+-----------------+----------+--------+---------------------------+
|0.04951|null            |direct        |0.04951|0.04951|0.04951|1279324800       |20.0      |0.9902  |1680479765                 |
|0.08584|null            |direct        |0.08585|0.05941|0.04951|1279411200       |75.01     |5.092   |1680479765                 |
|0.0808 |null            |direct        |0.09307|0.07723|0.08584|1279497600       |574.0     |49.66   |1680479765                 |
|0.07474|null            |direct        |0.08181|0.07426|0.0808 |1279584000       |262.0     |20.59   |1680479765                 |
|0.07921|null            |direct        |0.07921|0.06634|0.07474|1279670400 

In [77]:
# extract necessary columns
df = df.select(["time (unix epoch)", "open", "close", "high", "low", "volumefrom", "volumeto"])

In [78]:
df.dtypes

[('time (unix epoch)', 'string'),
 ('open', 'string'),
 ('close', 'string'),
 ('high', 'string'),
 ('low', 'string'),
 ('volumefrom', 'string'),
 ('volumeto', 'string')]

In [79]:
df.printSchema()

root
 |-- time (unix epoch): string (nullable = true)
 |-- open: string (nullable = true)
 |-- close: string (nullable = true)
 |-- high: string (nullable = true)
 |-- low: string (nullable = true)
 |-- volumefrom: string (nullable = true)
 |-- volumeto: string (nullable = true)



In [80]:
# get descriptive statistics for numeric columns
df.describe(["open", "close", "high", "low", "volumefrom", "volumeto"]).show()

+-------+-----------------+------------------+------------------+-----------------+------------------+--------------------+
|summary|             open|             close|              high|              low|        volumefrom|            volumeto|
+-------+-----------------+------------------+------------------+-----------------+------------------+--------------------+
|  count|             4643|              4643|              4643|             4643|              4643|                4643|
|   mean|8880.602872302385|  8886.67325578074| 9122.189641488276|8612.653696144727|53761.577872065376|4.5704536163200855E8|
| stddev|14496.22260263107|14498.411483115498|14886.111362812839|14052.53673852626| 48761.98430856715| 8.097913929815264E8|
|    min|          0.04951|           0.04951|           0.04951|             0.01|               0.0|                 0.0|
|    max|          9999.93|           9999.93|            9990.4|           999.73|          99812.97|          9999594.97|
+-------

In [81]:
df.show(n=10)

+-----------------+-------+-------+-------+-------+----------+--------+
|time (unix epoch)|   open|  close|   high|    low|volumefrom|volumeto|
+-----------------+-------+-------+-------+-------+----------+--------+
|       1279324800|0.04951|0.04951|0.04951|0.04951|      20.0|  0.9902|
|       1279411200|0.04951|0.08584|0.08585|0.05941|     75.01|   5.092|
|       1279497600|0.08584| 0.0808|0.09307|0.07723|     574.0|   49.66|
|       1279584000| 0.0808|0.07474|0.08181|0.07426|     262.0|   20.59|
|       1279670400|0.07474|0.07921|0.07921|0.06634|     575.0|   42.26|
|       1279756800|0.07921| 0.0505|0.08181| 0.0505|    2160.0|  129.78|
|       1279843200| 0.0505|0.06262|0.06767| 0.0505|    2402.5|  141.07|
|       1279929600|0.06262|0.05454|0.06161|0.05049|    496.32|   26.73|
|       1280016000|0.05454| 0.0505|0.05941| 0.0505|   1551.48|   85.06|
|       1280102400| 0.0505|  0.056|  0.056|   0.05|     877.0|   46.91|
+-----------------+-------+-------+-------+-------+----------+--

In [82]:
# set spark session timezone to UTC to have a uniform reference point for all date related fields
from pyspark.sql.functions import to_timestamp
spark.conf.set("spark.sql.session.timeZone", "UTC")
df = df.withColumn("date_time (unix)", from_unixtime("time (unix epoch)", "yyyy-MM-dd HH:mm:ss"))
spark.conf.unset("spark.sql.session.timeZone")

In [83]:
# timezone will default to system timezone (Easter Standard Time) in absence of specific spark.sql.session.timeZone setting
df.withColumn("date_time", from_unixtime("time (unix epoch)", "yyyy-MM-dd HH:mm:ss")).show(n=10, truncate=False)

+-----------------+-------+-------+-------+-------+----------+--------+-------------------+-------------------+
|time (unix epoch)|open   |close  |high   |low    |volumefrom|volumeto|date_time (unix)   |date_time          |
+-----------------+-------+-------+-------+-------+----------+--------+-------------------+-------------------+
|1279324800       |0.04951|0.04951|0.04951|0.04951|20.0      |0.9902  |2010-07-17 00:00:00|2010-07-16 20:00:00|
|1279411200       |0.04951|0.08584|0.08585|0.05941|75.01     |5.092   |2010-07-18 00:00:00|2010-07-17 20:00:00|
|1279497600       |0.08584|0.0808 |0.09307|0.07723|574.0     |49.66   |2010-07-19 00:00:00|2010-07-18 20:00:00|
|1279584000       |0.0808 |0.07474|0.08181|0.07426|262.0     |20.59   |2010-07-20 00:00:00|2010-07-19 20:00:00|
|1279670400       |0.07474|0.07921|0.07921|0.06634|575.0     |42.26   |2010-07-21 00:00:00|2010-07-20 20:00:00|
|1279756800       |0.07921|0.0505 |0.08181|0.0505 |2160.0    |129.78  |2010-07-22 00:00:00|2010-07-21 20

#### Create a new dataframe with a column called HV Ratio that is the ratio of the High Price versus volume of stock traded for a day

In [84]:
df2 = df.withColumn("HV Ratiio", df["high"]/df["volumefrom"])
df2.show(truncate=False)

+-----------------+-------+-------+-------+-------+----------+--------+-------------------+---------------------+
|time (unix epoch)|open   |close  |high   |low    |volumefrom|volumeto|date_time (unix)   |HV Ratiio            |
+-----------------+-------+-------+-------+-------+----------+--------+-------------------+---------------------+
|1279324800       |0.04951|0.04951|0.04951|0.04951|20.0      |0.9902  |2010-07-17 00:00:00|0.0024755            |
|1279411200       |0.04951|0.08584|0.08585|0.05941|75.01     |5.092   |2010-07-18 00:00:00|0.001144514064791361 |
|1279497600       |0.08584|0.0808 |0.09307|0.07723|574.0     |49.66   |2010-07-19 00:00:00|1.6214285714285715E-4|
|1279584000       |0.0808 |0.07474|0.08181|0.07426|262.0     |20.59   |2010-07-20 00:00:00|3.122519083969465E-4 |
|1279670400       |0.07474|0.07921|0.07921|0.06634|575.0     |42.26   |2010-07-21 00:00:00|1.3775652173913044E-4|
|1279756800       |0.07921|0.0505 |0.08181|0.0505 |2160.0    |129.78  |2010-07-22 00:00:

In [87]:
# sort by date_time (unix) in descending order to get HV Ratiio for most recent dates
df2.sort("date_time (unix)", ascending=False).show(truncate=False)

+-----------------+--------+--------+--------+--------+----------+---------------+-------------------+-------------------+
|time (unix epoch)|open    |close   |high    |low     |volumefrom|volumeto       |date_time (unix)   |HV Ratiio          |
+-----------------+--------+--------+--------+--------+----------+---------------+-------------------+-------------------+
|1680393600       |28465.3 |28204.02|28538.36|27880.95|24746.49  |6.9881659115E8 |2023-04-02 00:00:00|1.1532285992882223 |
|1680307200       |28477.29|28465.3 |28810.95|28265.42|19774.03  |5.6329940551E8 |2023-04-01 00:00:00|1.4570095220852806 |
|1680220800       |28037.46|28477.29|28650.47|27541.23|50916.39  |1.43674028379E9|2023-03-31 00:00:00|0.5626964126875452 |
|1680134400       |28355.87|28037.46|29172.9 |27731.64|63033.78  |1.79002247511E9|2023-03-30 00:00:00|0.4628137484377425 |
|1680048000       |27274.9 |28355.87|28640.99|27261.75|52216.18  |1.47107639838E9|2023-03-29 00:00:00|0.5485079528988908 |
|1679961600     