In [5]:
import pandas as pd
import numpy as np
%matplotlib inline
import config 
import sql_con
from requests import Session
from requests.exceptions import ConnectionError, Timeout, TooManyRedirects
import json
import csv

In [6]:
# get module variables
ROOT_DIR = config.ROOT_DIR
select_records = sql_con.select_records
insert_records = sql_con.insert_records
update_records = sql_con.update_records
conn_odbc = sql_con.conn_odbc
read_contents = sql_con.read_contents

In [7]:
# make request to cryptocompare api to get historical data for bitcoin quote prices in USD
url = "https://min-api.cryptocompare.com/data/histoday"
parameters = {
  "fsym":"BTC",
  "tsym":"USD",
  "allData":"true"
}
headers = {
  "authorization": f"Apikey {config.API_KEY}",
}

session = Session()
session.headers.update(headers)

try:
  response = session.get(url, params=parameters)
  res_json = json.loads(response.text)
except (ConnectionError, Timeout, TooManyRedirects) as e:
  print(e)

# res1 = requests.get(url, headers=headers)

In [8]:
# get data from json response and normalize it into a pandas dataframe
data = res_json["Data"]
data = pd.json_normalize(data)

In [9]:
data

Unnamed: 0,time,high,low,open,volumefrom,volumeto,close,conversionType,conversionSymbol
0,1279324800,0.04951,0.04951,0.04951,20.00,9.902000e-01,0.04951,direct,
1,1279411200,0.08585,0.05941,0.04951,75.01,5.092000e+00,0.08584,direct,
2,1279497600,0.09307,0.07723,0.08584,574.00,4.966000e+01,0.08080,direct,
3,1279584000,0.08181,0.07426,0.08080,262.00,2.059000e+01,0.07474,direct,
4,1279670400,0.07921,0.06634,0.07474,575.00,4.226000e+01,0.07921,direct,
...,...,...,...,...,...,...,...,...,...
4633,1679616000,28417.00000,27047.96000,28345.69000,56653.76,1.580517e+09,27491.73000,direct,
4634,1679702400,27810.74000,27188.56000,27491.73000,26897.42,7.398358e+08,27493.43000,direct,
4635,1679788800,28212.64000,27447.43000,27493.43000,29291.76,8.151863e+08,27996.81000,direct,
4636,1679875200,28044.15000,26565.75000,27996.81000,48705.37,1.332455e+09,27145.09000,direct,


In [15]:
data["high"] / data["volumeto"]

0       0.050000
1       0.016860
2       0.001874
3       0.003973
4       0.001874
          ...   
4633    0.000018
4634    0.000038
4635    0.000035
4636    0.000021
4637    0.000423
Length: 4638, dtype: float64

In [13]:
data["volumeto"] * data["close"]

0       4.902480e-02
1       4.370973e-01
2       4.012528e+00
3       1.538897e+00
4       3.347415e+00
            ...     
4633    4.345115e+13
4634    2.034062e+13
4635    2.282262e+13
4636    3.616960e+13
4637    1.732729e+12
Length: 4638, dtype: float64

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4638 entries, 0 to 4637
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   time              4638 non-null   int64  
 1   high              4638 non-null   float64
 2   low               4638 non-null   float64
 3   open              4638 non-null   float64
 4   volumefrom        4638 non-null   float64
 5   volumeto          4638 non-null   float64
 6   close             4638 non-null   float64
 7   conversionType    4638 non-null   object 
 8   conversionSymbol  4638 non-null   object 
dtypes: float64(6), int64(1), object(2)
memory usage: 326.2+ KB


In [22]:
# ingest data into excel file (ingestion point for data pipeline)
data.to_csv(rf"{ROOT_DIR}/data/btc_price.csv", index=False)

In [23]:
# spark session start
from pyspark.sql import SparkSession

In [24]:
spark=SparkSession.builder.appName("crypto_analysis").getOrCreate()

In [25]:
df = spark.read.option("header", "true").csv(rf"{ROOT_DIR}/data/btc_price.csv")

In [26]:
df.show()

+----------+-------+-------+-------+----------+--------+-------+--------------+----------------+
|      time|   high|    low|   open|volumefrom|volumeto|  close|conversionType|conversionSymbol|
+----------+-------+-------+-------+----------+--------+-------+--------------+----------------+
|1279324800|0.04951|0.04951|0.04951|      20.0|  0.9902|0.04951|        direct|            null|
|1279411200|0.08585|0.05941|0.04951|     75.01|   5.092|0.08584|        direct|            null|
|1279497600|0.09307|0.07723|0.08584|     574.0|   49.66| 0.0808|        direct|            null|
|1279584000|0.08181|0.07426| 0.0808|     262.0|   20.59|0.07474|        direct|            null|
|1279670400|0.07921|0.06634|0.07474|     575.0|   42.26|0.07921|        direct|            null|
|1279756800|0.08181| 0.0505|0.07921|    2160.0|  129.78| 0.0505|        direct|            null|
|1279843200|0.06767| 0.0505| 0.0505|    2402.5|  141.07|0.06262|        direct|            null|
|1279929600|0.06161|0.05049|0.

In [27]:
df.printSchema()

root
 |-- time: string (nullable = true)
 |-- high: string (nullable = true)
 |-- low: string (nullable = true)
 |-- open: string (nullable = true)
 |-- volumefrom: string (nullable = true)
 |-- volumeto: string (nullable = true)
 |-- close: string (nullable = true)
 |-- conversionType: string (nullable = true)
 |-- conversionSymbol: string (nullable = true)

