In [4]:
from xgboost.spark import SparkXGBClassifier
from btc_streamer.ml.preprocessing import BTCDataloader


In [51]:
btc = BTCDataloader()
btc.setup_spark()
df = btc.load_data('../data/')
train_data, test_data, preproc_spark = btc.preproc_split(df)

                                                                                

> [0;32m/Users/michieldekoninck/code/Michiel-DK/kafka_streamer/btc_streamer/ml/preprocessing.py[0m(90)[0;36mpreproc_split[0;34m()[0m
[0;32m     89 [0;31m        [0;31m# Convert feature columns into a single vector column[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 90 [0;31m        [0mfeature_columns[0m [0;34m=[0m [0;34m[[0m[0mx[0m[0;34m.[0m[0mname[0m [0;32mfor[0m [0mx[0m [0;32min[0m [0mdf_final[0m[0;34m.[0m[0mschema[0m [0;32mif[0m [0mre[0m[0;34m.[0m[0msearch[0m[0;34m([0m[0;34mr'percent'[0m[0;34m,[0m [0mx[0m[0;34m.[0m[0mname[0m[0;34m)[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     91 [0;31m        [0massembler[0m [0;34m=[0m [0mVectorAssembler[0m[0;34m([0m[0minputCols[0m[0;34m=[0m[0mfeature_columns[0m[0;34m,[0m [0moutputCol[0m[0;34m=[0m[0;34m'features'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
['percent_change_30m', 'percent_change_1h', 'percent_change_6h', 'percent_change_12h', 'percent_change_

In [35]:
xgb_classifier = SparkXGBClassifier(
            features_col='features',
            label_col='target',
            num_workers=4,
            device='cpu',
            booster='gbtree',
            eval_metric='logloss',
        )

xgb_classifier.setParams(early_stopping_rounds=5)


In [36]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from xgboost.spark import SparkXGBClassifier

# Define the classifier
#xgb_classifier = SparkXGBClassifier()

# Create a parameter grid to search over
paramGrid = ParamGridBuilder() \
    .addGrid(xgb_classifier.max_depth, [3, 5, 7]) \
    .addGrid(xgb_classifier.learning_rate, [0.1, 0.05, 0.01]) \
    .addGrid(xgb_classifier.n_estimators, [100, 200]) \
    .addGrid(xgb_classifier.subsample, [0.8, 1.0]) \
    .build()


In [37]:
evaluator = BinaryClassificationEvaluator(
    labelCol='target',
    rawPredictionCol='rawPrediction',
    metricName='areaUnderROC'
)

#evaluator = MulticlassClassificationEvaluator()


In [38]:
crossval = CrossValidator(estimator=xgb_classifier,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)  

In [39]:
cv_model = crossval.fit(train_data)

24/09/26 12:02:59 WARN CacheManager: Asked to cache already cached data.
24/09/26 12:02:59 WARN CacheManager: Asked to cache already cached data.
2024-09-26 12:03:05,257 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 4 workers with
	booster params: {'booster': 'gbtree', 'device': 'cpu', 'eval_metric': 'logloss', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
INFO:XGBoost-PySpark:Running xgboost-2.1.1 on 4 workers with
	booster params: {'booster': 'gbtree', 'device': 'cpu', 'eval_metric': 'logloss', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-26 12:03:06,496 INFO XGBoost-PySpark: _train_booster Training on CPUs 4]
[12:03:07

In [42]:
best_model = cv_model.bestModel
predictions = best_model.transform(test_data)
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy}")

2024-09-26 12:12:00,198 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
                                                                                

Test Accuracy: 0.7925291544333053


In [47]:
best_sklearn = best_model.__dict__['_xgb_sklearn_model']

In [53]:
best_sklearn.feature_importances_

array([0.85061383, 0.04248043, 0.03901724, 0.03337708, 0.03451145],
      dtype=float32)

In [50]:
df.show(10)

+----------+-------------------+-------+--------+--------+--------+--------+-----------+-----------------+
|      unix|               date| symbol|    open|    high|     low|   close| Volume BTC|       Volume USD|
+----------+-------------------+-------+--------+--------+--------+--------+-----------+-----------------+
|1514764740|2017-12-31 23:59:00|BTC/USD|13913.28|13913.28|13867.18| 13880.0| 0.59174759|     8213.4565492|
|1514764680|2017-12-31 23:58:00|BTC/USD|13913.26|13953.83|13884.69|13953.77| 1.39878396| 19518.3096575292|
|1514764620|2017-12-31 23:57:00|BTC/USD|13908.73|13913.26|13874.99|13913.26| 0.77501206| 10782.9442939156|
|1514764560|2017-12-31 23:56:00|BTC/USD| 13827.0|13908.69| 13827.0|13859.58| 0.66645895|   9236.841134241|
|1514764500|2017-12-31 23:55:00|BTC/USD|13825.05|13825.05|13825.05|13825.05|  0.0655014|     905.56013007|
|1514764440|2017-12-31 23:54:00|BTC/USD|13884.14|13884.14|13823.88|13854.28|  4.8107019|  66648.811119132|
|1514764380|2017-12-31 23:53:00|BTC/U

In [52]:
import matplotlib.pyplot as plt
import wandb

wandb.sklearn.plot_feature_importances(best_sklearn, ['percent_change_30m', 'percent_change_1h', 'percent_change_6h', 'percent_change_12h', 'percent_change_24h']
)



Error: You must call wandb.init() before wandb.log()

In [None]:
wandb.sklearn.plot_learning_curve(best_sklearn, test_data, y)

In [22]:
from btc_streamer.ml.preprocessing import BTCDataloader
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from xgboost.spark import SparkXGBClassifier
import re
import logging
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col

evaluator = BinaryClassificationEvaluator(
    labelCol='target',
    rawPredictionCol='rawPrediction',
    metricName='areaUnderROC'
)
predictions = model.transform(test_data)
roc_auc = evaluator.evaluate(predictions)

preds = model.transform(test_data).select("target", "prediction", 'probability')

preds = preds.withColumn("probability_array", vector_to_array(col("probability")))
preds = preds.withColumn("index_0_probability", col("probability_array")[0])
preds = preds.withColumn("index_1_probability", col("probability_array")[1])

preds_numpy = np.array(preds.select("target", "prediction", 'index_0_probability','index_1_probability').collect())

y_true = preds_numpy[:,0]
y_pred = preds_numpy[:,1]
y_prob = preds_numpy[:,-2:]

acc = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)

scores = {
    'roc_auc': roc_auc,
    'accuracy': acc,
    'precision': precision,
    'recall':recall
}


2024-09-26 11:59:31,164 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-09-26 11:59:36,121 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs


In [23]:
scores

{'roc_auc': 0.7909111057560808,
 'accuracy': 0.7216613628881817,
 'precision': 0.7314635718891038,
 'recall': 0.7256156060121522}