**Imports** 👌

In [56]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.signal import detrend
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense

**Functions** 🤌

In [57]:
def create_sequences(X, y, time_steps=60):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X.iloc[i:(i + time_steps)].values)
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)



**Preprocessing & Feature Engineering** 👻

In [58]:
# Load the dataset
data_path = "kc_btc_60min_ha_ti.csv"
df = pd.read_csv(data_path)

# Convert 'time' column to datetime format and set as index
df['time'] = pd.to_datetime(df['time'], unit='s')  # Assuming 'time' is in Unix timestamp format
df.set_index('time', inplace=True)

# Handle missing values and duplicates
df.fillna(method='ffill', inplace=True)  # Forward fill
df.fillna(method='bfill', inplace=True)  # Backward fill
df = df.drop_duplicates()

# Replace infinite values
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Convert categorical variables to numerical
df["color_num"] = df["color"].map({'green': 1, 'red': 0}) 

# Create new column for color change
df['color_change'] = df['color_num'].diff().abs()
df['color_change'].fillna(0, inplace=True)

# Drop the original 'color' column
df = df.drop('color', axis=1)

# Create lag features
window_size = 10  # Increasing the number of lags
for i in range(window_size):
    df[f"lag_{i+1}"] = df["close"].shift(i + 1)

# Drop rows with missing values (created by lag features)
df = df.dropna()

# Include more features
feature_cols = ['open', 'high', 'low', 'close', 'volume', 'turnover', 'avg_vol_last_100', 'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9', 'PP', 'R1', 'S1', 'R2', 'S2', 'R3', 'S3', 'BBL_5_2.0', 'BBM_5_2.0', 'BBU_5_2.0', 'BBB_5_2.0', 'BBP_5_2.0', 'RSI', 'SMA_5', 'SMA_10', 'SMA_20', 'EMA_5', 'EMA_10', 'EMA_20', 'STOCHk_14_3_3', 'STOCHd_14_3_3', 'ATR', 'ROC', 'CCI'] + [f"lag_{i+1}" for i in range(window_size)]

# Volume Change
df['volume_change'] = df['volume'].pct_change()
df['volume_change'].fillna(0, inplace=True)

# Price Change (Close Price)
df['price_change'] = df['close'].pct_change()
df['price_change'].fillna(0, inplace=True)

# Volatility (e.g., standard deviation of price changes over past N periods)
N = 5  # Choose a value for N
df['volatility'] = df['price_change'].rolling(window=N).std()
df['volatility'].fillna(0, inplace=True)

# Extract trend and seasonal components
result = seasonal_decompose(df['close'], model='additive', period=24)  # for hourly data, 24 could be a starting point for period
df['trend'] = result.trend
df['seasonal'] = result.seasonal

# Calculate rolling variance
df['rolling_var'] = df['close'].rolling(window=24).var()
df['rolling_var'].fillna(0, inplace=True)

# Detrend the 'close' column
df['detrended_close'] = detrend(df['close'])

# Update feature_cols list
feature_cols += ['volume_change', 'price_change', 'volatility', 'trend', 'seasonal', 'rolling_var', 'detrended_close']

# Normalize, but exclude 'RSI' and 'color_num'
to_normalize = [col for col in feature_cols if col not in ['RSI', 'color_num']]
scaler = MinMaxScaler(feature_range=(0, 1))
df[to_normalize] = scaler.fit_transform(df[to_normalize])

# Create sequences
features = df[feature_cols]
target = df['color_change']
X, y = create_sequences(features, target, time_steps)

# Check the shape of the data
print("Shape of X: ", X.shape)
print("Shape of y: ", y.shape)

# Checking the first few rows
df.head()



Shape of X:  (1923, 60, 51)
Shape of y:  (1923,)


Unnamed: 0_level_0,open,close,high,low,volume,turnover,avg_vol_last_100,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,...,lag_8,lag_9,lag_10,volume_change,price_change,volatility,trend,seasonal,rolling_var,detrended_close
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-01-03 14:00:00,0.006596,0.006319,0.009867,0.00305,0.163359,0.112966,0.124722,0.387319,0.514991,0.390588,...,0.009328,0.006722,0.005338,0.043062,0.464726,0.0,,0.759607,0.0,0.361351
2023-01-03 15:00:00,0.005775,0.003007,0.005017,0.001242,0.09742,0.067041,0.133524,0.383776,0.505716,0.389828,...,0.008244,0.009328,0.006722,0.023002,0.431732,0.0,,0.993017,0.0,0.355817
2023-01-03 16:00:00,0.003696,0.002151,0.00312,0.000803,0.104484,0.071915,0.143028,0.380366,0.498641,0.388473,...,0.00794,0.008244,0.009328,0.046555,0.456176,0.0,,0.801137,0.0,0.354038
2023-01-03 17:00:00,0.002225,0.0,0.001562,0.0,0.028038,0.019186,0.139485,0.376111,0.490782,0.386458,...,0.008873,0.00794,0.008244,0.007632,0.443234,0.0,,0.762805,0.0,0.350279
2023-01-03 18:00:00,0.000407,0.001061,0.0,0.001926,0.023794,0.016285,0.13545,0.373692,0.489276,0.384316,...,0.009092,0.008873,0.00794,0.03692,0.475342,0.073654,,0.908196,0.0,0.351431


**Train-Test**

**Model** 🏴‍☠️

In [61]:
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.metrics import mean_squared_error

# Train-test split
train_size = int(len(df) * 0.7)  # 70% of data for training
test_size = len(df) - train_size

train, test = df.iloc[0:train_size], df.iloc[train_size:len(df)]
print(len(train), len(test))

# Create train and test sets for features and target
X_train, y_train = create_sequences(train[feature_cols], train['color_change'], time_steps)
X_test, y_test = create_sequences(test[feature_cols], test['color_change'], time_steps)

# Check the shape of the data
print("Shape of X_train: ", X_train.shape)
print("Shape of y_train: ", y_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_test: ", y_test.shape)

# Define LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Make predictions
y_pred = model.predict(X_test)

# Since this is a regression problem, you might want to use Mean Squared Error (MSE) as the evaluation metric
print("Model MSE: ", mean_squared_error(y_test, y_pred))


1388 595
Shape of X_train:  (1328, 60, 51)
Shape of y_train:  (1328,)
Shape of X_test:  (535, 60, 51)
Shape of y_test:  (535,)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


ValueError: Input contains NaN.