In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import config 
import sql_con
from requests import Session
from requests.exceptions import ConnectionError, Timeout, TooManyRedirects
import json
import csv

In [3]:
# get module variables
ROOT_DIR = config.ROOT_DIR
select_records = sql_con.select_records
insert_records = sql_con.insert_records
update_records = sql_con.update_records
conn_odbc = sql_con.conn_odbc
read_contents = sql_con.read_contents

In [4]:
# make request to cryptocompare api to get historical data for bitcoin quote prices in USD
url = "https://min-api.cryptocompare.com/data/histoday"
parameters = {
  "fsym":"BTC",
  "tsym":"USD",
  "allData":"true"
}
headers = {
  "authorization": f"Apikey {config.API_KEY}",
}

session = Session()
session.headers.update(headers)

try:
  response = session.get(url, params=parameters)
  res_json = json.loads(response.text)
except (ConnectionError, Timeout, TooManyRedirects) as e:
  print(e)

# res1 = requests.get(url, headers=headers)

In [5]:
# capture data from reponse and write to json file (ingestion layer)
data = res_json["Data"]
with open(rf"{ROOT_DIR}\data\btc_api_data.json", "w") as f:
    f.write(json.dumps(data))

In [6]:
# spark session start to begin transforming data (processing layer)
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("crypto_analysis").getOrCreate()

In [7]:
# define schema for data
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType

schema = StructType([
    StructField("time", LongType(), True),
    StructField("close", DoubleType(), True),
    StructField("high", DoubleType(), True),
    StructField("low", DoubleType(), True),
    StructField("open", DoubleType(), True),
    StructField("volumefrom", DoubleType(), True),
    StructField("volumeto", DoubleType(), True),
    StructField("conversionType", StringType(), True),
    StructField("conversionSymbol", StringType(), True)
])

In [8]:
df = spark.read.option("schema", schema).json(rf"{ROOT_DIR}/data/btc_api_data.json")
df.show()

+-------+----------------+--------------+-------+-------+-------+----------+----------+--------+
|  close|conversionSymbol|conversionType|   high|    low|   open|      time|volumefrom|volumeto|
+-------+----------------+--------------+-------+-------+-------+----------+----------+--------+
|0.04951|                |        direct|0.04951|0.04951|0.04951|1279324800|      20.0|  0.9902|
|0.08584|                |        direct|0.08585|0.05941|0.04951|1279411200|     75.01|   5.092|
| 0.0808|                |        direct|0.09307|0.07723|0.08584|1279497600|     574.0|   49.66|
|0.07474|                |        direct|0.08181|0.07426| 0.0808|1279584000|     262.0|   20.59|
|0.07921|                |        direct|0.07921|0.06634|0.07474|1279670400|     575.0|   42.26|
| 0.0505|                |        direct|0.08181| 0.0505|0.07921|1279756800|    2160.0|  129.78|
|0.06262|                |        direct|0.06767| 0.0505| 0.0505|1279843200|    2402.5|  141.07|
|0.05454|                |    

In [9]:
df.dtypes

[('close', 'double'),
 ('conversionSymbol', 'string'),
 ('conversionType', 'string'),
 ('high', 'double'),
 ('low', 'double'),
 ('open', 'double'),
 ('time', 'bigint'),
 ('volumefrom', 'double'),
 ('volumeto', 'double')]

In [10]:
df.printSchema()

root
 |-- close: double (nullable = true)
 |-- conversionSymbol: string (nullable = true)
 |-- conversionType: string (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- open: double (nullable = true)
 |-- time: long (nullable = true)
 |-- volumefrom: double (nullable = true)
 |-- volumeto: double (nullable = true)



In [24]:
# get descriptive statistics for numeric columns
df.describe(["open", "close", "high", "low", "volumefrom", "volumeto"]).show()

+-------+------------------+------------------+------------------+------------------+------------------+-------------------+
|summary|              open|             close|              high|               low|        volumefrom|           volumeto|
+-------+------------------+------------------+------------------+------------------+------------------+-------------------+
|  count|              4639|              4639|              4639|              4639|              4639|               4639|
|   mean| 8863.829104569946| 8869.954069107562| 9105.228244326378| 8596.062054580721| 53772.31459150669|4.564303813039892E8|
| stddev|14491.204850836328|14493.469641538006|14881.309372455418|14047.221579806623|48778.531906651115|8.097197773104932E8|
|    min|           0.04951|           0.04951|           0.04951|              0.01|               0.0|                0.0|
|    max|          67549.14|          67549.14|          68978.64|          66312.42|         572349.32|  1.112022085477E10|


In [12]:
df.describe().printSchema()

root
 |-- summary: string (nullable = true)
 |-- close: string (nullable = true)
 |-- conversionSymbol: string (nullable = true)
 |-- conversionType: string (nullable = true)
 |-- high: string (nullable = true)
 |-- low: string (nullable = true)
 |-- open: string (nullable = true)
 |-- time: string (nullable = true)
 |-- volumefrom: string (nullable = true)
 |-- volumeto: string (nullable = true)



In [22]:
df.columns

['close',
 'conversionSymbol',
 'conversionType',
 'high',
 'low',
 'open',
 'time',
 'volumefrom',
 'volumeto']

In [13]:
# ingest data into excel file (csv ingestion point for data pipeline)
# after initial transformation, data is written to csv file for ingestion into database
df.write.mode("overwrite").csv(rf"{ROOT_DIR}/data/btc_price.csv", header=True)