In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import config 
import sql_con
from requests import Session
from requests.exceptions import ConnectionError, Timeout, TooManyRedirects
import json
import csv

In [2]:
# get module variables
ROOT_DIR = config.ROOT_DIR
select_records = sql_con.select_records
insert_records = sql_con.insert_records
update_records = sql_con.update_records
conn_odbc = sql_con.conn_odbc
read_contents = sql_con.read_contents

In [3]:
# make request to cryptocompare api to get historical data for bitcoin quote prices in USD
url = "https://min-api.cryptocompare.com/data/histoday"
parameters = {
  "fsym":"BTC",
  "tsym":"USD",
  "allData":"true"
}
headers = {
  "authorization": f"Apikey {config.API_KEY}",
}

session = Session()
session.headers.update(headers)

try:
  response = session.get(url, params=parameters)
  res_json = json.loads(response.text)
except (ConnectionError, Timeout, TooManyRedirects) as e:
  print(e)

# res1 = requests.get(url, headers=headers)

In [14]:
# capture data from reponse and write to json file (ingestion layer)
data = res_json["Data"]
with open(rf"{ROOT_DIR}\data\btc_api_data.json", "w") as f:
    f.write(json.dumps(data))

In [15]:
# spark session start to begin transforming data (processing layer)
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("crypto_analysis").getOrCreate()

In [16]:
# define schema for data
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType

schema = StructType([
    StructField("time", LongType(), True),
    StructField("close", DoubleType(), True),
    StructField("high", DoubleType(), True),
    StructField("low", DoubleType(), True),
    StructField("open", DoubleType(), True),
    StructField("volumefrom", DoubleType(), True),
    StructField("volumeto", DoubleType(), True),
    StructField("conversionType", StringType(), True),
    StructField("conversionSymbol", StringType(), True)
])

In [17]:
df = spark.read.option("schema", schema).json(rf"{ROOT_DIR}/data/btc_api_data.json")
df.show(n=10)

+-------+----------------+--------------+-------+-------+-------+----------+----------+--------+
|  close|conversionSymbol|conversionType|   high|    low|   open|      time|volumefrom|volumeto|
+-------+----------------+--------------+-------+-------+-------+----------+----------+--------+
|0.04951|                |        direct|0.04951|0.04951|0.04951|1279324800|      20.0|  0.9902|
|0.08584|                |        direct|0.08585|0.05941|0.04951|1279411200|     75.01|   5.092|
| 0.0808|                |        direct|0.09307|0.07723|0.08584|1279497600|     574.0|   49.66|
|0.07474|                |        direct|0.08181|0.07426| 0.0808|1279584000|     262.0|   20.59|
|0.07921|                |        direct|0.07921|0.06634|0.07474|1279670400|     575.0|   42.26|
| 0.0505|                |        direct|0.08181| 0.0505|0.07921|1279756800|    2160.0|  129.78|
|0.06262|                |        direct|0.06767| 0.0505| 0.0505|1279843200|    2402.5|  141.07|
|0.05454|                |    

In [18]:
# get ingestion date as current unix epoch time
# write data to csv file after adding ingestion date (csv ingestion point for data pipeline)
from pyspark.sql.functions import unix_timestamp, current_timestamp, from_unixtime, col

df = df.withColumn("ingestion_date (unix epoch)", unix_timestamp()).withColumnRenamed("time", "time (unix epoch)")
df.show(n=10,truncate=False)
df.write.mode("overwrite").csv(rf"{ROOT_DIR}/data/btc_price.csv", header=True)


+-------+----------------+--------------+-------+-------+-------+-----------------+----------+--------+---------------------------+
|close  |conversionSymbol|conversionType|high   |low    |open   |time (unix epoch)|volumefrom|volumeto|ingestion_date (unix epoch)|
+-------+----------------+--------------+-------+-------+-------+-----------------+----------+--------+---------------------------+
|0.04951|                |direct        |0.04951|0.04951|0.04951|1279324800       |20.0      |0.9902  |1680128004                 |
|0.08584|                |direct        |0.08585|0.05941|0.04951|1279411200       |75.01     |5.092   |1680128004                 |
|0.0808 |                |direct        |0.09307|0.07723|0.08584|1279497600       |574.0     |49.66   |1680128004                 |
|0.07474|                |direct        |0.08181|0.07426|0.0808 |1279584000       |262.0     |20.59   |1680128004                 |
|0.07921|                |direct        |0.07921|0.06634|0.07474|1279670400 

In [19]:
# extract necessary columns
df = df.select(["time (unix epoch)", "open", "close", "high", "low", "volumefrom", "volumeto"])

In [20]:
df.dtypes

[('time (unix epoch)', 'bigint'),
 ('open', 'double'),
 ('close', 'double'),
 ('high', 'double'),
 ('low', 'double'),
 ('volumefrom', 'double'),
 ('volumeto', 'double')]

In [21]:
df.printSchema()

root
 |-- time (unix epoch): long (nullable = true)
 |-- open: double (nullable = true)
 |-- close: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- volumefrom: double (nullable = true)
 |-- volumeto: double (nullable = true)



In [22]:
# get descriptive statistics for numeric columns
df.describe(["open", "close", "high", "low", "volumefrom", "volumeto"]).show()

+-------+------------------+------------------+------------------+------------------+-----------------+-------------------+
|summary|              open|             close|              high|               low|       volumefrom|           volumeto|
+-------+------------------+------------------+------------------+------------------+-----------------+-------------------+
|  count|              4639|              4639|              4639|              4639|             4639|               4639|
|   mean| 8863.829104569946| 8869.925597454188| 9105.228244326378| 8596.062054580721|53773.24717180417|4.564568673610941E8|
| stddev|14491.204850836328|14493.431332483877|14881.309372455418|14047.221579806623|48778.41408221796|8.097486945744911E8|
|    min|           0.04951|           0.04951|           0.04951|              0.01|              0.0|                0.0|
|    max|          67549.14|          67549.14|          68978.64|          66312.42|        572349.32|  1.112022085477E10|
+-------

In [23]:
df.show(n=10)

+-----------------+-------+-------+-------+-------+----------+--------+
|time (unix epoch)|   open|  close|   high|    low|volumefrom|volumeto|
+-----------------+-------+-------+-------+-------+----------+--------+
|       1279324800|0.04951|0.04951|0.04951|0.04951|      20.0|  0.9902|
|       1279411200|0.04951|0.08584|0.08585|0.05941|     75.01|   5.092|
|       1279497600|0.08584| 0.0808|0.09307|0.07723|     574.0|   49.66|
|       1279584000| 0.0808|0.07474|0.08181|0.07426|     262.0|   20.59|
|       1279670400|0.07474|0.07921|0.07921|0.06634|     575.0|   42.26|
|       1279756800|0.07921| 0.0505|0.08181| 0.0505|    2160.0|  129.78|
|       1279843200| 0.0505|0.06262|0.06767| 0.0505|    2402.5|  141.07|
|       1279929600|0.06262|0.05454|0.06161|0.05049|    496.32|   26.73|
|       1280016000|0.05454| 0.0505|0.05941| 0.0505|   1551.48|   85.06|
|       1280102400| 0.0505|  0.056|  0.056|   0.05|     877.0|   46.91|
+-----------------+-------+-------+-------+-------+----------+--