In [5]:
import requests
import os


def get_crypto_price(timeout=10.0):
    """Fetch the current BTC ticker from CoinPaprika and return its USD quote.

    The returned dict is the ``quotes['USD']`` entry of the API response,
    augmented with three top-level fields of the response:

    * ``symbol``      <- ``data['symbol']``
    * ``beta_value``  <- ``data['beta_value']``
    * ``timestamp``   <- ``data['last_updated']``

    The all-time-high fields (``ath_price``, ``ath_date``,
    ``percent_from_price_ath``) are removed from the quote when present.

    Args:
        timeout: Seconds to wait for the HTTP request before ``requests``
            raises a timeout error. Defaults to 10 seconds.

    Returns:
        The augmented USD quote dict, or ``None`` (after printing
        "Error retrieving price") when the response does not have the
        expected structure.

    Raises:
        requests.RequestException: Network-level failures are not caught
            here and propagate to the caller.
        ValueError: If the response body is not valid JSON.
    """
    url = "https://api.coinpaprika.com/v1/tickers/btc-bitcoin"

    headers = {'Accept-Encoding': 'gzip'}
    # Only attach the bearer token when one is actually configured; otherwise
    # the header would carry the literal string "Bearer None".
    # NOTE(review): the env var is named COINCAP_API_KEY but the request goes
    # to CoinPaprika - confirm this credential is meant for this service.
    api_key = os.getenv("COINCAP_API_KEY")
    if api_key:
        headers['Authorization'] = f'Bearer {api_key}'

    # A timeout keeps a stalled connection from hanging the kernel forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    data = response.json()
    try:
        diction = data['quotes']['USD']
        diction['symbol'] = data['symbol']
        diction['beta_value'] = data['beta_value']
        diction['timestamp'] = data['last_updated']
        # These fields are discarded anyway, so their absence must not turn
        # an otherwise valid quote into an error.
        for unwanted_key in ('ath_price', 'ath_date', 'percent_from_price_ath'):
            diction.pop(unwanted_key, None)
        return diction
    except (KeyError, TypeError):
        # Unexpected payload shape (e.g. an API error object instead of a
        # ticker). Catch only lookup errors so KeyboardInterrupt/SystemExit
        # are never swallowed.
        print("Error retrieving price")
        return None

In [13]:
# Fetch the latest ticker snapshot. get_crypto_price() returns None when the
# response cannot be parsed, so fail fast here with a clear message instead of
# letting later cells fail on a DataFrame built from None.
data = get_crypto_price()
if data is None:
    raise RuntimeError("get_crypto_price() returned no data; cannot build the feature frame")

In [15]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.functions import col, sum as _sum, last, window, from_unixtime, lag, round, when, lit


# Start (or reuse) the Spark session used by the rest of the notebook
spark = SparkSession.builder.appName("BTCData").getOrCreate()

# Wrap the single ticker dict in a one-row pandas frame so columns can be picked by name
df = pd.DataFrame([data])

# Keep only the percentage-change fields of the quote
percent_change_columns = [
    'percent_change_15m',
    'percent_change_30m',
    'percent_change_1h',
    'percent_change_6h',
    'percent_change_12h',
    'percent_change_24h',
]
filtered_df = df[percent_change_columns]

# Hand the filtered row over to Spark
spark_df = spark.createDataFrame(filtered_df)

# target is 1 when the 15-minute change is positive and 0 otherwise; the
# 15-minute column itself is dropped once the label has been derived from it
rose_in_last_15m = col("percent_change_15m") > 0
df_final = spark_df.withColumn(
    "target",
    when(rose_in_last_15m, 1).otherwise(0),
).drop('percent_change_15m')

# Display the resulting Spark DataFrame
df_final.show()

+------------------+-----------------+-----------------+------------------+------------------+------+
|percent_change_30m|percent_change_1h|percent_change_6h|percent_change_12h|percent_change_24h|target|
+------------------+-----------------+-----------------+------------------+------------------+------+
|             -0.03|             0.09|             0.74|              1.09|              1.49|     0|
+------------------+-----------------+-----------------+------------------+------------------+------+



In [17]:
import re
from pyspark.ml.feature import VectorAssembler



# Select the percent-change feature columns and assemble them into a single vector column
# Any column whose name matches the 'percent' pattern is treated as a feature
feature_columns = [
    column_name
    for column_name in df_final.columns
    if re.search(r'percent', column_name)
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Keep only the assembled feature vector and the label
assembled_data_train = (
    assembler
    .transform(df_final)
    .select(['features', 'target'])
)

In [2]:
from btc_streamer.ml.model_utils import load_model

In [3]:
# Location of the saved model, relative to the notebook's working directory
MODEL_PATH = '../models/xgboost_model'
model = load_model(MODEL_PATH)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/12 16:04:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [18]:
# Run the loaded model on the assembled feature row and display the result
predictions = model.transform(assembled_data_train)
predictions.show()

2024-10-12 16:16:17,555 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs

+--------------------+------+--------------------+----------+--------------------+
|            features|target|       rawPrediction|prediction|         probability|
+--------------------+------+--------------------+----------+--------------------+
|[-0.03,0.09,0.74,...|     0|[1.64740657806396...|       0.0|[0.83854019641876...|
+--------------------+------+--------------------+----------+--------------------+



                                                                                