<a href="https://colab.research.google.com/github/Neelakash-gituser/ETHUSDT-Prediction/blob/main/Crypto_Prediction_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Statements

In [None]:
!wget http://prdownloads.sourceforge.net/ta-lib/ta-lib-0.4.0-src.tar.gz
!tar -xzvf ta-lib-0.4.0-src.tar.gz
%cd ta-lib
!./configure --prefix=/usr
!make
!make install
!pip install Ta-Lib

In [None]:
import warnings
warnings.filterwarnings('ignore')

import talib as tb
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from tqdm import tqdm

from sklearn.preprocessing import Normalizer
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

# Data Loading

In [None]:
# read in the dataset

FILENAME = "ETHUSDT_data_new.csv"
PATH = f"/content/{FILENAME}"
data = pd.read_csv(PATH, sep=";")

In [None]:
# glimpse of the data

data.head()

Unnamed: 0,symbol,datetime,open,high,low,close,volume,symbol_id
0,ETHUSDT,2021-01-01 00:00:00,737.18,740.0,730.44,731.64,46772.61,334
1,ETHUSDT,2021-01-01 00:15:00,731.7,732.99,730.0,732.36,20375.178,334
2,ETHUSDT,2021-01-01 00:30:00,732.36,735.1,732.21,734.18,14593.525,334
3,ETHUSDT,2021-01-01 00:45:00,734.18,736.35,733.04,734.6,16351.214,334
4,ETHUSDT,2021-01-01 01:00:00,734.61,744.49,734.0,744.47,42580.2,334


# Data Cleaning

In [None]:
def clean_dataset(df:pd.DataFrame, dtypes:dict) -> pd.DataFrame:
    """
    Given a dataframe - Remove whitespaces, use proper dtypes

    parameters:
        df: input dataframe
        dtypes: a dictionary with column_names as key and desired dtypes as values (D: Datetime, I:Integer, S:String, F:Float)
    returns:
        df: dataframe
    """
    # make a copy of the df and use it to clean
    data = df.copy()

    # remove any leading or trailing whitespace in columns
    data.columns = pd.Series(data.columns).apply(lambda x: x.strip())

    assert list(dtypes.keys()) == list(data.columns), "one or more columns in dictionary not present in dataframe"

    # convert to desirable dtypes
    for key, val in dtypes.items():
        try:
            if val.upper() == "D":
                data[key] = pd.to_datetime(data[key], format="mixed")
                # sort by chronological order
                data.sort_values(by=key, inplace=True)
            elif val.upper() == "I":
                data[key] = data[key].astype(int)
            elif val.upper() == "F":
                data[key] = data[key].astype(float)
            elif val.upper() == "S":
                data[key] = data[key].astype(str)
        except Exception as e:
            print(e)

    return data

In [None]:
# clean the data of inconsistencies

data_types_cols = {"symbol":"S", "datetime":"D", "open":"F", "high":"F", "low":"F", "close":"F", "volume":"F", "symbol_id":"I"}
cleanedDf = clean_dataset(df=data, dtypes=data_types_cols)

time data '2021-01-01 00:00:00' does not match format 'mixed' (match)


In [None]:
# check for missing or null values

cleanedDf.isnull().sum()

symbol       0
datetime     0
open         0
high         0
low          0
close        0
volume       0
symbol_id    0
dtype: int64

# Feature Engineering

In [None]:
indicators = ["SMA_10", "SMA_20", "EMA_10", "EMA_20", "RSI", "MACD", "BB", "OBV"]

# Calculate the indicator values
for indicator in indicators:
    if "SMA" in indicator:
        cleanedDf[indicator] = tb.SMA(cleanedDf['close'], timeperiod=int(indicator.split("_")[-1]))
    elif "EMA" in indicator:
        cleanedDf[indicator] = tb.EMA(cleanedDf['close'], timeperiod=int(indicator.split("_")[-1]))
    elif "RSI" in indicator:
        cleanedDf[indicator] = tb.RSI(cleanedDf['close'])
    elif "MACD" in indicator:
        macd, macdsignal, macdhist = tb.MACD(cleanedDf['close'])
        cleanedDf["MACD"] = macd
        cleanedDf["MACD_Signal"] = macdsignal
        cleanedDf["MACD_HIST"] = macdhist
    elif "BB" in indicator:
        low, high, simple = tb.BBANDS(cleanedDf['close'])
        cleanedDf["BB_LOW"] = low
        cleanedDf["BB_HIGH"] = high
    elif "OBV" in indicator:
        cleanedDf[indicator] = tb.OBV(cleanedDf['close'], cleanedDf['volume'])

In [None]:
cleanedDf.head()

Unnamed: 0,symbol,datetime,open,high,low,close,volume,symbol_id,SMA_10,SMA_20,EMA_10,EMA_20,RSI,MACD,MACD_Signal,MACD_HIST,BB_LOW,BB_HIGH,OBV
0,ETHUSDT,2021-01-01 00:00:00,737.18,740.0,730.44,731.64,46772.61,334,,,,,,,,,,,46772.61
1,ETHUSDT,2021-01-01 00:15:00,731.7,732.99,730.0,732.36,20375.178,334,,,,,,,,,,,67147.788
2,ETHUSDT,2021-01-01 00:30:00,732.36,735.1,732.21,734.18,14593.525,334,,,,,,,,,,,81741.313
3,ETHUSDT,2021-01-01 00:45:00,734.18,736.35,733.04,734.6,16351.214,334,,,,,,,,,,,98092.527
4,ETHUSDT,2021-01-01 01:00:00,734.61,744.49,734.0,744.47,42580.2,334,,,,,,,,,744.734826,735.45,140672.727


In [None]:
# Insert some custom features
cleanedDf['ohlc'] = -np.sign(cleanedDf['open'] - cleanedDf['close'])
cleanedDf['normalised_ohcl'] = (cleanedDf['close'] - cleanedDf['open']) / (cleanedDf['high'] - cleanedDf['low'])

cleanedDf['SMA_Diff'] = cleanedDf['SMA_10'] - cleanedDf['SMA_20']
cleanedDf['SMA_Diff_Signal'] = np.sign(cleanedDf['SMA_10'] - cleanedDf['SMA_20']) # if positive shows upward trend

cleanedDf['EMA_Diff'] = cleanedDf['EMA_10'] - cleanedDf['EMA_20']
cleanedDf['EMA_Diff_Signal'] = np.sign(cleanedDf['EMA_10'] - cleanedDf['EMA_20']) # if positive shows upward trend

In [None]:
# Calculate shifted values
steps = 5

for i in range(1, steps):
    cleanedDf[f"shifted_{i}"] = cleanedDf['close'].shift(i) # shifted close values give an idea about how the nex

# Label Calculation

In [None]:
# 15 min trading labels

cleanedDf['15min_trading_label'] = cleanedDf['close'].pct_change().apply(lambda x: 1 if x > 0 else 0)

In [None]:
# checking for distribution of labels

cleanedDf['15min_trading_label'].value_counts()

1    17501
0    17093
Name: 15min_trading_label, dtype: int64

In [None]:
# cleanup NaN values

cleanedDf.dropna(inplace=True)

In [None]:
# set date as index
cleanedDf.set_index("datetime", inplace=True)
cleanedDf.index = pd.to_datetime(cleanedDf.index)

In [None]:
train_start, train_end, test_start, test_end = "2021-01-01", "2021-10-01", "2021-10-02", "2021-12-31" # keeping a 9 days gap to prevent any lookahead

In [None]:
drop_cols = ["15min_trading_label", "symbol", "symbol_id"] # cols to drop
features = list(set(cleanedDf.columns) - set(drop_cols))

# 15 min set
X_train, X_test = cleanedDf.loc[train_start:train_end, features], cleanedDf.loc[test_start:test_end, features]
y_train, y_test = cleanedDf.loc[train_start:train_end, drop_cols[0]], cleanedDf.loc[test_start:test_end, drop_cols[0]]

In [None]:
# normalise the datatset

norm = Normalizer()

In [None]:
X_train, X_test = norm.fit_transform(X_train), norm.transform(X_test) # normalised features

In [None]:
# distribution of labels in train set
y_train.value_counts()

1    14056
0    13438
Name: 15min_trading_label, dtype: int64

# Model Building

In [None]:
def prepare_time_series_data(data, n_time_steps, n_features, n_samples):
    """
    Prepare time series data for training a neural network.

    Args:
    data (numpy.ndarray): The original time series data with shape (n_samples, n_features).
    n_time_steps (int): The number of time steps to consider in each sequence.

    Returns:
    X_train (numpy.ndarray): The formatted input data with shape (n_samples, n_time_steps, n_features).
    """
    n_samples, n_features = data.shape
    X_train = []

    for i in range(n_samples - n_time_steps + 1):
        X_train.append(data[i:i + n_time_steps])

    X_train = np.array(X_train)

    return X_train

n_time_steps = 50  # You can adjust this value as needed


In [None]:
X_train_, X_test_ = prepare_time_series_data(X_train, n_time_steps=n_time_steps, n_features=26, n_samples=X_train.shape[0]), \
                            prepare_time_series_data(X_test, n_time_steps=n_time_steps,  n_features=26, n_samples=X_test.shape[0]) # lstm data format

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define your model
model = Sequential()

# Add LSTM layers
model.add(LSTM(units=64, input_shape=(X_train_.shape[1], X_train_.shape[2]), return_sequences=True))
model.add(LSTM(units=64, return_sequences=True))
model.add(LSTM(units=64))

# Add dropout for regularization (optional)
model.add(Dropout(0.2))

# Add a dense layer for classification
model.add(Dense(units=1, activation='sigmoid'))  # Use 'softmax' for multi-class classification

# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
model.fit(X_train_, y_train[n_time_steps-1:], epochs=50, batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7bcc5ff230a0>

In [None]:
# Evaluate the model

loss, accuracy = model.evaluate(X_test_, y_test[n_time_steps-1:])
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

Test Loss: 0.6516
Test Accuracy: 0.6574


# Backtest

In [None]:
def backtestCalculator(cash:float, backtest_df:pd.DataFrame, weights:dict) -> tuple:
    """
    Given cash value, backtest_df and weights it calculates the value invested, current value, balance left and shares.

    parameters
        cash: Cash value before trading
        backtest_df: Contains price information of t and t+1 th days
        weights: weight to assign to each asset
    returns
        cash_invested, current_value_of_investment, shares
    """
    amount_allocation = {}
    shares, total, balance, total_invested = {}, 0, 0, 0
    prices = backtest_df.iloc[0].to_dict()
    new_prices = backtest_df.iloc[len(backtest_df)-1].to_dict()

    for keys in backtest_df.columns:
        amount_allocation[keys] = weights[keys] * cash

    for keys in backtest_df.columns:
        shares[keys] = (amount_allocation[keys] // prices[keys])

    for keys in backtest_df.columns:
        total_invested = total_invested + (shares[keys] * prices[keys])

    balance = cash - total_invested

    for keys in backtest_df.columns:
        total = total + (shares[keys] * new_prices[keys])

    return total_invested, total, balance, shares[keys]

In [None]:
# iterator to keep track of timestamps

cash = 1000000 # initial cash
K = 0 # keep track of timestamps
dates = cleanedDf.loc[test_start:test_end].index.to_list()[n_time_steps-1:] # available timesteps of backtest
port_values, dates_portfolio = [], [] # store portfolio values

# read-in the predictions
preds = pd.DataFrame(data = model.predict(X_test_), columns = ["Prediction Sigmoid"])
preds['Date'] = dates



In [None]:
preds['Prediction'] = preds["Prediction Sigmoid"].apply(lambda x: 1 if x >= 0.5 else 0) # 0.5 is threshold
preds.set_index("Date", inplace=True)

In [None]:
# Run backtest
for j in tqdm(range(len(dates) - 1)):
    t, t_1 = dates[K], dates[K+1] # pick current time and the t+1 timestep
    testDf = cleanedDf.loc[t:t_1, ['close']] # take close of t and t+1 timesteps
    testDf.rename(columns={"close":"ETHUSDT"}, inplace=True)

    # receive the value invested at (t)th time, value at (t+1)th time, balance left and shares held
    cash_invested, current_value, balance, shares = backtestCalculator(cash=cash, backtest_df=testDf, weights={"ETHUSDT": 1 if preds.loc[t, "Prediction"] > 0 else 0})

    # portfolio value
    portfolio = balance + current_value
    cash = portfolio
    port_values.append(cash)
    dates_portfolio.append(t_1)

    K = K + 1 # goes to next trading time

100%|██████████| 7014/7014 [00:09<00:00, 760.08it/s]


In [None]:
# creates a timestamp wise portfolio dataframe

portfolioDf = pd.DataFrame({"Timestamp": dates_portfolio, "Portfolio Value": port_values})

In [None]:
portfolioDf['returns'] = portfolioDf['Portfolio Value'].pct_change().fillna(0) # returns calculated
portfolioDf['cum_returns'] = (1+portfolioDf['returns']).cumprod() # cumulative returns

# Backtest Results

In [None]:
# Example cumulative returns data
cumulative_returns = portfolioDf['cum_returns'].to_list()

# Define the number of trading minutes
trading_days_per_year = 252 * 96 # (in 15 minute interval, for 252 days)

# annual returns
cagr = (cumulative_returns[-1] / (len(cumulative_returns))) * trading_days_per_year * 100 # in % terms

# Calculate the annualized volatility (standard deviation of daily returns)
annualized_volatility = np.std(portfolioDf['returns'].to_list(), ddof=1) * np.sqrt(trading_days_per_year) * 100 # in % terms

# Specify the risk-free rate (replace with your preferred rate)
risk_free_rate = 0.0 # 0% as an example

# Calculate the Sharpe ratio
sharpe_ratio = (cagr - risk_free_rate) / annualized_volatility

# Create a DataFrame to display the results
results = pd.DataFrame({
    'CAGR': [f'{str(round(cagr, 2))} %'],
    'Annualized Volatility': [f'{str(round(annualized_volatility, 2))} %'],
    'Sharpe Ratio': [round(sharpe_ratio, 2)]
}, index=["Metrics"])

results.T

Unnamed: 0,Metrics
CAGR,862.51 %
Annualized Volatility,288.35 %
Sharpe Ratio,2.99


# Inference

The improvements are significant over the previous result, achieved an `Annual Return` of **8.6%** and `Sharpe Ratio` of almost **3**, which is very optimistic.

In [None]:
portfolioDf.tail()

Unnamed: 0,Timestamp,Portfolio Value,returns,cum_returns
7009,2021-12-28 23:00:00,2499882.46,0.0,2.510655
7010,2021-12-28 23:15:00,2498970.62,-0.000365,2.509739
7011,2021-12-28 23:30:00,2500381.02,0.000564,2.511156
7012,2021-12-28 23:45:00,2489963.74,-0.004166,2.500693
7013,2021-12-29 00:00:00,2489963.74,0.0,2.500693


In [78]:
portfolioDf.head()

Unnamed: 0,Timestamp,Portfolio Value,returns,cum_returns
0,2021-10-06 12:00:00,995709.33,0.0,1.0
1,2021-10-06 12:15:00,995709.33,0.0,1.0
2,2021-10-06 12:30:00,999162.85,0.003468,1.003468
3,2021-10-06 12:45:00,1003921.73,0.004763,1.008248
4,2021-10-06 13:15:00,1003921.73,0.0,1.008248
