In [1]:
!pip install pyreadr

Collecting pyreadr
  Downloading pyreadr-0.5.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading pyreadr-0.5.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (418 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m418.3/418.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyreadr
Successfully installed pyreadr-0.5.3


In [2]:
import pandas as pd
import numpy as np
import pyreadr
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Load the .rds file
result = pyreadr.read_r('btc_features.rds')
data_df = result[None] # [None] is the default key for a single .rds object

# Convert date and set index
data_df['date'] = pd.to_datetime(data_df['date'])
data_df = data_df.set_index('date')

print(data_df.head())

                    open          high           low         close  \
date                                                                 
2018-02-19  10552.599609  11273.799805  10513.200195  11225.299805   
2018-02-20  11231.799805  11958.500000  11231.799805  11403.700195   
2018-02-21  11372.200195  11418.500000  10479.099609  10690.400391   
2018-02-22  10660.400391  11039.099609   9939.089844  10005.000000   
2018-02-23   9937.070312  10487.299805   9734.559570  10301.099609   

                  volume      adjusted  log_returns     rsi_14       sma_20  \
date                                                                          
2018-02-19  7.652090e+09  11225.299805     0.061874  57.789772  9103.446533   
2018-02-20  9.926540e+09  11403.700195     0.015768  58.746797  9162.576563   
2018-02-21  9.405340e+09  10690.400391    -0.064592  53.521679  9238.569580   
2018-02-22  8.040080e+09  10005.000000    -0.066261  49.010805  9297.282080   
2018-02-23  7.739500e+09  10301.099

In [3]:
# Create a copy to avoid warnings
df_scaled = data_df.copy()

# Log-transform volume to reduce its extreme scale
df_scaled['volume'] = np.log(df_scaled['volume'])

# We will scale ALL features between 0 and 1
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df_scaled)

# We also need a separate scaler for JUST the 'close' price
# This is so we can inverse-transform our predictions back to dollars
scaler_close = MinMaxScaler(feature_range=(0, 1))
scaler_close.fit(data_df[['close']])

In [4]:
# We will use a "lookback" of 60 days
def create_dataset(dataset, look_back=60):
    X, Y = [], []
    for i in range(look_back, len(dataset)):
        # X is the 60 days of features
        X.append(dataset[i-look_back:i, :])
        # Y is the 61st day's 'close' price (which is at column index 4)
        Y.append(dataset[i, 4])
    return np.array(X), np.array(Y)

look_back = 60
num_features = scaled_data.shape[1]

# Create the full dataset
X_full, y_full = create_dataset(scaled_data, look_back)

# Split into train/test (80/20 split, same as R)
split_point = int(len(X_full) * 0.8)
X_train, X_test = X_full[:split_point], X_full[split_point:]
y_train, y_test = y_full[:split_point], y_full[split_point:]

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

X_train shape: (2202, 60, 12)
y_train shape: (2202,)


In [5]:
# Build the LSTM Model
model_lstm = Sequential()
model_lstm.add(LSTM(units=50, return_sequences=True, input_shape=(look_back, num_features)))
model_lstm.add(Dropout(0.2))
model_lstm.add(LSTM(units=50, return_sequences=False))
model_lstm.add(Dropout(0.2))
model_lstm.add(Dense(units=25))
model_lstm.add(Dense(units=1)) # Output layer (predicts 1 value)

# Compile the model
model_lstm.compile(optimizer='adam', loss='mean_squared_error')

# Train the model (this will take a few minutes)
print("Training LSTM Model...")
model_lstm.fit(X_train, y_train, batch_size=32, epochs=50, validation_data=(X_test, y_test))
print("LSTM Training complete.")

  super().__init__(**kwargs)


Training LSTM Model...
Epoch 1/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 61ms/step - loss: 0.0362 - val_loss: 0.0101
Epoch 2/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 69ms/step - loss: 0.0062 - val_loss: 0.0083
Epoch 3/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - loss: 0.0053 - val_loss: 0.0080
Epoch 4/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - loss: 0.0058 - val_loss: 0.0077
Epoch 5/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 68ms/step - loss: 0.0050 - val_loss: 0.0075
Epoch 6/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 54ms/step - loss: 0.0046 - val_loss: 0.0090
Epoch 7/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - loss: 0.0045 - val_loss: 0.0109
Epoch 8/50
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 71ms/step - loss: 0.0046 - val_loss: 0.0075
Epoch 9/50
[1m69/69[0m 

In [6]:
# Make predictions
predictions_scaled = model_lstm.predict(X_test)

# Inverse transform the predictions back to dollar amounts
predictions_lstm = scaler_close.inverse_transform(predictions_scaled)

# Save predictions to a CSV file
pred_df_lstm = pd.DataFrame(predictions_lstm, columns=['lstm_pred'])
pred_df_lstm.to_csv('lstm_predictions.csv', index=False)

print("LSTM predictions saved to 'lstm_predictions.csv'")

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step
LSTM predictions saved to 'lstm_predictions.csv'


In [7]:
# Load the .rds file with the ARIMA residuals
result_resid = pyreadr.read_r('arimax_residuals.rds')
residuals = result_resid[None].values.flatten() # Get as a flat numpy array

print(f"Loaded {len(residuals)} residuals from ARIMAX model.")

Loaded 2250 residuals from ARIMAX model.


In [8]:
# We will use the same scaled features (df_scaled) from before,
# but our target (Y) is now the residuals.

# Scale the residuals
scaler_resid = MinMaxScaler(feature_range=(0, 1))
scaled_residuals = scaler_resid.fit_transform(residuals.reshape(-1, 1))

# Align features and residuals (residuals start after ARIMAX training)
aligned_features = scaled_data[len(scaled_data) - len(scaled_residuals):, :]

# Create the dataset for the hybrid model
def create_hybrid_dataset(features, residuals, look_back=60):
    X, Y = [], []
    for i in range(look_back, len(residuals)):
        X.append(features[i-look_back:i, :])
        Y.append(residuals[i, 0]) # Target is the residual
    return np.array(X), np.array(Y)

X_full_hybrid, y_full_hybrid = create_hybrid_dataset(aligned_features, scaled_residuals, look_back)

# Split into train/test
split_point_hybrid = int(len(X_full_hybrid) * 0.8)
X_train_hybrid, X_test_hybrid = X_full_hybrid[:split_point_hybrid], X_full_hybrid[split_point_hybrid:]
y_train_hybrid, y_test_hybrid = y_full_hybrid[:split_point_hybrid], y_full_hybrid[split_point_hybrid:]

print(f"X_train_hybrid shape: {X_train_hybrid.shape}")

X_train_hybrid shape: (1752, 60, 12)


In [9]:
# Build the Hybrid LSTM Model
model_hybrid = Sequential()
model_hybrid.add(LSTM(units=50, return_sequences=True, input_shape=(look_back, num_features)))
model_hybrid.add(Dropout(0.2))
model_hybrid.add(LSTM(units=50, return_sequences=False))
model_hybrid.add(Dropout(0.2))
model_hybrid.add(Dense(units=25))
model_hybrid.add(Dense(units=1))

# Compile
model_hybrid.compile(optimizer='adam', loss='mean_squared_error')

# Train
print("Training Hybrid (Residual) LSTM Model...")
model_hybrid.fit(X_train_hybrid, y_train_hybrid, batch_size=32, epochs=50, validation_data=(X_test_hybrid, y_test_hybrid))
print("Hybrid Training complete.")

Training Hybrid (Residual) LSTM Model...
Epoch 1/50


  super().__init__(**kwargs)


[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 74ms/step - loss: 0.0703 - val_loss: 0.0072
Epoch 2/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 68ms/step - loss: 0.0145 - val_loss: 0.0081
Epoch 3/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 55ms/step - loss: 0.0130 - val_loss: 0.0083
Epoch 4/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 58ms/step - loss: 0.0120 - val_loss: 0.0065
Epoch 5/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 68ms/step - loss: 0.0120 - val_loss: 0.0106
Epoch 6/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 54ms/step - loss: 0.0112 - val_loss: 0.0084
Epoch 7/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 74ms/step - loss: 0.0104 - val_loss: 0.0085
Epoch 8/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 54ms/step - loss: 0.0115 - val_loss: 0.0066
Epoch 9/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [10]:
# 1. Predict the scaled residuals
predicted_scaled_residuals = model_hybrid.predict(X_test_hybrid)

# 2. Inverse-transform the residuals
predicted_residuals = scaler_resid.inverse_transform(predicted_scaled_residuals)

# 3. Load the ARIMAX forecast from R
# We need the ARIMAX test-set predictions. Let's upload that file.
# NOTE: Go back to RStudio, add this line to 03_modeling.R and re-run:
# saveRDS(as.vector(arimax_forecast$mean), here::here("output", "models", "arimax_test_forecast.rds"))
#
# Then, upload 'arimax_test_forecast.rds' to Colab.

# --- Once 'arimax_test_forecast.rds' is uploaded, run this ---
result_arimax_fc = pyreadr.read_r('arimax_test_forecast.rds')
arimax_forecast_values = result_arimax_fc[None].values.flatten()

# 4. Align the forecasts (they may have different start points)
len_arimax = len(arimax_forecast_values)
len_hybrid = len(predicted_residuals)

# Get the part of the ARIMAX forecast that aligns with the hybrid
aligned_arimax_forecast = arimax_forecast_values[len_arimax - len_hybrid:]

# 5. Create final hybrid forecast
final_hybrid_forecast = aligned_arimax_forecast + predicted_residuals.flatten()

# 6. Save to CSV
pred_df_hybrid = pd.DataFrame(final_hybrid_forecast, columns=['hybrid_pred'])
pred_df_hybrid.to_csv('hybrid_predictions.csv', index=False)

print("Hybrid predictions saved to 'hybrid_predictions.csv'")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step
Hybrid predictions saved to 'hybrid_predictions.csv'
