In [11]:
import pandas as pd

# Read the Dogecoin price history from its parquet snapshot into a DataFrame
dogecoin = pd.read_parquet(path='../databases/dogecoin_pure.parquet')

# Sanity check: show the first five rows
print(dogecoin.head(n=5))

                               Open      High       Low     Close     Volume
Date                                                                        
2022-09-18 00:00:00+00:00  0.062210  0.062210  0.057238  0.057520  265405124
2022-09-19 00:00:00+00:00  0.057519  0.058744  0.056180  0.058612  301268389
2022-09-20 00:00:00+00:00  0.058612  0.060262  0.057738  0.058417  294929293
2022-09-21 00:00:00+00:00  0.058423  0.060478  0.056378  0.057404  406017754
2022-09-22 00:00:00+00:00  0.057386  0.059838  0.056945  0.059599  241738855


In [12]:
# Move the 'Date' index into an ordinary column so it can be renamed below
dogecoin_reset = dogecoin.reset_index()

# Build the frame Prophet expects: the timestamp column is called 'ds' and the
# target (closing price) is called 'y'; the remaining columns are kept as-is.
# Any timezone on 'ds' is dropped in the same chain.
dogecoin_prophet = (
    dogecoin_reset
    .loc[:, ['Date', 'Close', 'Open', 'High', 'Low', 'Volume']]
    .rename(columns={'Date': 'ds', 'Close': 'y'})
    .assign(ds=lambda frame: pd.to_datetime(frame['ds']).dt.tz_localize(None))
)

dogecoin_prophet

Unnamed: 0,ds,y,Open,High,Low,Volume
0,2022-09-18,0.057520,0.062210,0.062210,0.057238,265405124
1,2022-09-19,0.058612,0.057519,0.058744,0.056180,301268389
2,2022-09-20,0.058417,0.058612,0.060262,0.057738,294929293
3,2022-09-21,0.057404,0.058423,0.060478,0.056378,406017754
4,2022-09-22,0.059599,0.057386,0.059838,0.056945,241738855
...,...,...,...,...,...,...
727,2024-09-14,0.105467,0.107113,0.107558,0.104643,426157956
728,2024-09-15,0.102933,0.105467,0.106359,0.102577,428890116
729,2024-09-16,0.099693,0.102938,0.103132,0.098458,528755469
730,2024-09-17,0.101137,0.099696,0.102467,0.098844,480290242


In [16]:
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

# Extra regressor columns, declared once so that the columns registered on the
# model and the columns passed to predict() cannot drift apart.
REGRESSORS = ['Open', 'High', 'Low', 'Volume']

# Initialize TimeSeriesSplit (5 folds; each fold tests on rows that come after
# its training rows).
# NOTE(review): the split is positional, so this assumes dogecoin_prophet is
# already sorted by 'ds' in ascending order — confirm for the source parquet.
tscv = TimeSeriesSplit(n_splits=5)

# Lists to store the metrics for each fold (appended in fold order)
mae_list = []
mse_list = []
rmse_list = []
r2_list = []

# NOTE(review): the regressors are read from the same row as the target 'y',
# so the scores below assume Open/High/Low/Volume of a day are known when that
# day's Close is predicted — confirm this matches the intended use.

# Iterate over the train-test splits, numbering folds from 1 for the report
for fold, (train_index, test_index) in enumerate(tscv.split(dogecoin_prophet), start=1):
    train_df = dogecoin_prophet.iloc[train_index]
    test_df = dogecoin_prophet.iloc[test_index]

    # Initialize a new Prophet model for each fold
    model = Prophet()

    # Register every extra regressor before fitting
    for regressor in REGRESSORS:
        model.add_regressor(regressor)

    # Train the model on the train set
    model.fit(train_df)

    # Predict on the test set; only the date and regressor columns are passed,
    # the target 'y' is withheld
    forecast_test = model.predict(test_df[['ds'] + REGRESSORS])

    # Calculate the performance metrics against the held-out closing prices
    mae_test = mean_absolute_error(test_df['y'], forecast_test['yhat'])
    mse_test = mean_squared_error(test_df['y'], forecast_test['yhat'])
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(test_df['y'], forecast_test['yhat'])

    # Append the metrics to the lists
    mae_list.append(mae_test)
    mse_list.append(mse_test)
    rmse_list.append(rmse_test)
    r2_list.append(r2_test)

    # Print the metrics for the current fold, labelled with the fold number so
    # they can be told apart when interleaved with the fitting logs
    print(f"Fold {fold} results:")
    print(f"MAE: {mae_test}")
    print(f"MSE: {mse_test}")
    print(f"RMSE: {rmse_test}")
    print(f"R²: {r2_test}")
    print("-" * 30)

# Average of each metric across all folds
print(f"Average MAE across folds: {np.mean(mae_list)}")
print(f"Average MSE across folds: {np.mean(mse_list)}")
print(f"Average RMSE across folds: {np.mean(rmse_list)}")
print(f"Average R² across folds: {np.mean(r2_list)}")


11:32:09 - cmdstanpy - INFO - Chain [1] start processing
11:32:09 - cmdstanpy - INFO - Chain [1] done processing
11:32:09 - cmdstanpy - INFO - Chain [1] start processing
11:32:09 - cmdstanpy - INFO - Chain [1] done processing


Fold results:
MAE: 0.0010110303765614886
MSE: 1.7676830675270952e-06
RMSE: 0.0013295424278777625
R²: 0.9649875187527454
------------------------------
Fold results:
MAE: 0.0007257211286717256
MSE: 7.62984824783246e-07
RMSE: 0.0008734900255774224
R²: 0.9728109801407517
------------------------------


11:32:09 - cmdstanpy - INFO - Chain [1] start processing
11:32:09 - cmdstanpy - INFO - Chain [1] done processing
11:32:09 - cmdstanpy - INFO - Chain [1] start processing
11:32:09 - cmdstanpy - INFO - Chain [1] done processing


Fold results:
MAE: 0.001091774818374478
MSE: 2.2144388026789273e-06
RMSE: 0.001488099056742839
R²: 0.9863737123539337
------------------------------


11:32:09 - cmdstanpy - INFO - Chain [1] start processing
11:32:09 - cmdstanpy - INFO - Chain [1] done processing


Fold results:
MAE: 0.002801690579368385
MSE: 1.8317817439936142e-05
RMSE: 0.0042799319433766865
R²: 0.9893629628381245
------------------------------
Fold results:
MAE: 0.002046117293306413
MSE: 6.407865501736435e-06
RMSE: 0.0025313762070732267
R²: 0.9867368081763539
------------------------------
Average MAE across folds: 0.0015352668392564983
Average MSE across folds: 5.894157927332369e-06
Average RMSE across folds: 0.0021004879321295873
Average R² across folds: 0.9800543964523818


In [17]:
import joblib

# After the cross-validation loop, `model` is whatever the last fold left
# behind: it was fitted on that fold's training rows only, with the final test
# window held out. Refit on the complete dataset so the persisted model has
# seen every available observation. `model` is rebound so the in-memory object
# and the file on disk stay the same model.
model = Prophet()
for regressor in ('Open', 'High', 'Low', 'Volume'):
    model.add_regressor(regressor)
model.fit(dogecoin_prophet)

# Save the multivariate Prophet model
# NOTE(review): Prophet's documentation recommends
# prophet.serialize.model_to_json over pickle-based formats; joblib is kept
# here so existing loaders of this .joblib file keep working.
joblib.dump(model, '../models/prophet_model_multi_cross.joblib')

['../models/prophet_model_multi_cross.joblib']