In [1]:
!pip install pyspark



In [2]:
import requests
import json
import pandas as pd
from datetime import datetime
from pyspark.sql import HiveContext, Row, SparkSession

In [3]:
spark = SparkSession.builder.appName("HiveContext_Bitcoin").enableHiveSupport().getOrCreate()
hiveCtx = HiveContext(spark.sparkContext)



In [4]:
# Fetch Bitcoin Price Index API data

url = "https://data-api.cryptocompare.com/index/cc/v1/historical/days?market=cadli&instrument=BTC-USD&limit=365&aggregate=1&fill=true&apply_mapping=true&response_format=JSON"
response = requests.get(url)
data = response.json()

In [5]:
# Process data
filtered_data = []

if 'Data' in data:
    for entry in data['Data']:
        # Transfer timestamp to YYYY-MM-DD format
        timestamp = entry.get('TIMESTAMP')
        close_price = entry.get('CLOSE')

        if timestamp and close_price:
            dt_object = datetime.fromtimestamp(timestamp)
            date_str = dt_object.strftime('%Y-%m-%d')

            # Filter out the data: from 2024-10-01 to 2024-12-31
            if "2024-10-01" <= date_str <= "2024-12-31":
                filtered_data.append({
                    "date": date_str,
                    "price": float(close_price)
                })

print(f"Extract {len(filtered_data)} observation.\n")

# Print first 2 observation
print("First 2 observation:")
filtered_data[:2]

Extract 17 observation.

First 2 observation:


[{'date': '2024-12-15', 'price': 104389.474858669},
 {'date': '2024-12-16', 'price': 106079.090411121}]

In [6]:
with open("bitcoin_jan2023.json", "w") as f:
    json.dump(filtered_data, f)

In [7]:
# Load Json file into PySpark df
df = hiveCtx.read.json("bitcoin_jan2023.json")

In [8]:
# Register df as a table
df.registerTempTable("bitcoin_table")



In [9]:
# Display the dataFrame schema
print("DataFrame Schema:")
df.printSchema()

DataFrame Schema:
root
 |-- date: string (nullable = true)
 |-- price: double (nullable = true)



### SQL Query:
**Calculate and display the average Bitcoin closing price during the period**

In [10]:
# Query with HiveQL
print("Calculating 2024 Q4 average price...")

result = hiveCtx.sql("""
  SELECT AVG(price) as average_closing_price
  FROM bitcoin_table
""")

result.show()

Calculating 2024 Q4 average price...
+---------------------+
|average_closing_price|
+---------------------+
|    97776.07012522963|
+---------------------+



In [11]:
# Close Spark
spark.stop()