# Import Statements

In [1]:
import warnings
warnings.filterwarnings('ignore')

import talib as tb
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from tqdm import tqdm
from financiallib.plots import max_drawdown_plots, return_plots, candlestick_plot
from financiallib.finstats import financial_summary

from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

# Data Loading

Data loading is a fundamental step in the data analysis and processing pipeline. It refers to the process of bringing external data into a software environment or system where it can be analyzed, manipulated, and utilized for various tasks

Loading the relevant dataset for research and analysis - `ETHUSDT`

In [2]:
# read in the dataset

FILENAME = "ETHUSDT_data_new.csv"
PATH = f"./{FILENAME}"
data = pd.read_csv(PATH, sep=";")

In [3]:
# glimpse of the data

data.head()

Unnamed: 0,symbol,datetime,open,high,low,close,volume,symbol_id
0,ETHUSDT,2021-01-01 00:00:00,737.18,740.0,730.44,731.64,46772.61,334
1,ETHUSDT,2021-01-01 00:15:00,731.7,732.99,730.0,732.36,20375.178,334
2,ETHUSDT,2021-01-01 00:30:00,732.36,735.1,732.21,734.18,14593.525,334
3,ETHUSDT,2021-01-01 00:45:00,734.18,736.35,733.04,734.6,16351.214,334
4,ETHUSDT,2021-01-01 01:00:00,734.61,744.49,734.0,744.47,42580.2,334


In [4]:
data.tail()

Unnamed: 0,symbol,datetime,open,high,low,close,volume,symbol_id
34589,ETHUSDT,28/12/21 23:00,3808.65,3814.98,3793.3,3807.11,9375.533,334
34590,ETHUSDT,28/12/21 23:15,3807.11,3817.56,3805.72,3805.72,6684.468,334
34591,ETHUSDT,28/12/21 23:30,3811.25,3816.14,3801.26,3807.87,6094.621,334
34592,ETHUSDT,28/12/21 23:45,3807.87,3809.02,3788.94,3791.99,7712.899,334
34593,ETHUSDT,29/12/21 0:00,3792.0,3819.0,3783.0,3817.27,17594.83,334


# Data Cleaning

Perform data cleaning and preprocessing on the loaded dataset to make it usable for analysis. Also, identifying and handling missing data, outliers, and anomalies is a crucial part of EDA. It helps ensure that the data used for analysis is accurate and reliable.

In [5]:
def clean_dataset(df:pd.DataFrame, dtypes:dict) -> pd.DataFrame:
    """
    Given a dataframe - Remove whitespaces, use proper dtypes
    
    parameters:
        df: input dataframe
        dtypes: a dictionary with column_names as key and desired dtypes as values (D: Datetime, I:Integer, S:String, F:Float)
    returns:
        df: dataframe
    """
    # make a copy of the df and use it to clean
    data = df.copy()
    
    # remove any leading or trailing whitespace in columns
    data.columns = pd.Series(data.columns).apply(lambda x: x.strip())
    
    assert list(dtypes.keys()) == list(data.columns), "one or more columns in dictionary not present in dataframe"
    
    # convert to desirable dtypes
    for key, val in dtypes.items():
        try:
            if val.upper() == "D":
                data[key] = pd.to_datetime(data[key], format="mixed")
                # sort by chronological order
                data.sort_values(by=key, inplace=True)
            elif val.upper() == "I":
                data[key] = data[key].astype(int)
            elif val.upper() == "F":
                data[key] = data[key].astype(float)
            elif val.upper() == "S":
                data[key] = data[key].astype(str)
        except Exception as e:
            print(e)
            
    return data

In [6]:
# clean the data of inconsistencies

data_types_cols = {"symbol":"S", "datetime":"D", "open":"F", "high":"F", "low":"F", "close":"F", "volume":"F", "symbol_id":"I"}
cleanedDf = clean_dataset(df=data, dtypes=data_types_cols)

In [7]:
# check for missing or null values

cleanedDf.isnull().sum()

symbol       0
datetime     0
open         0
high         0
low          0
close        0
volume       0
symbol_id    0
dtype: int64

# Exploratory Data Analysis

It is an essential step in the data analysis process, particularly in the field of statistics and data science. EDA involves the examination, visualization, and summary of data to gain insights, identify patterns, spot anomalies, and generally understand the characteristics of a dataset before performing more advanced analyses or building predictive models.

In [8]:
def plot_time_series(y:list, x:list, y_title:str, x_title:str, chart_title:str, width:int=1600, height:int=600, fontsize:int=12) -> go.Figure:
    """
    Takes two list and plots a time-series
    
    parameters
        y: y_axis values
        x: x_axis values
        x_title: x-axis title
        y_title: y-axis title
        chart_title: Title of the chart
        width: width of the chart
        height: height of the chart
        fontsize: font size of the chart 
    return
        plotly graph object
    """
    try:
        fig = go.Figure()

        fig.add_trace(go.Scatter(
            x = x,
            y = y
        ))

        fig.update_layout(plot_bgcolor="white", yaxis_title=y_title, xaxis_title=x_title, title=chart_title, width=width, height=height, font=dict(family = "Courier New, monospace",
                                                                                                                   size = fontsize,
                                                                                                                   color = "RebeccaPurple"
                                                                                                                ))
        # formatting background
        fig.update_xaxes(
            mirror=True,
            ticks='outside',
            showline=True,
            linecolor='black',
            gridcolor='white')

        fig.update_yaxes(
                    mirror=True,
                    ticks='outside',
                    showline=True,
                    linecolor='black',
                    gridcolor='white')
    
        return fig
    
    except Exception as e:
        print(e)

In [9]:
# plot the close value of the series

plot_time_series(y=cleanedDf['close'].to_list(), x=cleanedDf['datetime'].to_list(), x_title="Time", y_title="Price (USD $)", chart_title="Price vs Time | ETHUSDT")

In [10]:
# calculate the returns

cleanedDf['returns'] = cleanedDf['close'].pct_change() * 100

In [11]:
# statistical overview of returns

cleanedDf['returns'].describe()

count    34593.000000
mean         3.835257
std         34.677670
min        -78.623766
25%         -0.339580
50%          0.005587
75%          0.350756
max        365.943908
Name: returns, dtype: float64

Lets now see a distribution chart to understand better

In [12]:
def plot_distribution(dist:pd.Series, type_of_dist:str) -> go.Figure:
    """
    Given a series plots the distribution of the same.

    parameters
        dist: series whose distribution is needed
        tyep_of_dist: type of column - return or close values
    returns
        plotly graph objects
    """
    # mean and median
    mean = dist.mean()
    median = dist.median()

    # std
    posi_2_std = mean + (2*dist.std())
    neg_2_std = mean - (2*dist.std())

    maxi, mini = round(dist.max()), round(dist.min())
    bin_size = round((1 + 3.322*np.log(len(dist))))
    steps = (maxi - mini) / bin_size

    # Define the bin edges and bin centers
    bin_edges = np.arange(dist.min(), dist.max() + steps, steps)
    bin_centers = bin_edges[:]
    
    counts_percent = []
    for j in range(len(bin_centers)-1):
        upper = bin_centers[j+1]
        lower = bin_centers[j]
        counts_percent.append(len(dist[(dist >= lower) & (dist <= upper)]))

    # last interval
    lower = bin_centers[j]
    upper = bin_centers[j+1]
    counts_percent.append(len(dist[(dist >= lower) & (dist <= upper)]))

    total_sum = sum(counts_percent)
    counts_percent = pd.Series(counts_percent).apply(lambda x: round((x/total_sum)*100, 2))

    # Compute the histogram values
    hist, _ = np.histogram(dist, bins=bin_edges)

    # Create the histogram trace with bin ticks
    histogram_trace = go.Histogram(x=dist, xbins=dict(start=dist.min(), end=dist.max(), size=steps), text=[f'{round((percent*total_sum)/100)}' for percent in counts_percent], 
                                   marker=dict(color="#99ebff"), textposition="outside") 
    
    # Create the layout object with explicit tickvals
    layout = go.Layout(plot_bgcolor="white", title=f"{type_of_dist} Distribution", xaxis=dict(title=f'{type_of_dist}', tickvals=bin_centers, 
                                ticktext=[f'{round((tick))}' for tick in bin_centers]), yaxis=dict(title=f'Count'), font=dict(family = "Courier New, monospace",
                                size = 12,
                                color = "RebeccaPurple"
                            ), width = 1600, height = 600)
    
    # Create the figure object and add the trace and layout
    fig = go.Figure(data=[histogram_trace], layout=layout)
    
    # formatting background
    fig.update_xaxes(
        mirror=True,
        ticks='outside',
        showline=True,
        linecolor='black',
        gridcolor='white')

    fig.update_yaxes(
                mirror=True,
                ticks='outside',
                showline=True,
                linecolor='black',
                gridcolor='white')
    
    

    return fig

The distribution of returns doesn't make much sense here, since these are `15 mins` returns, they are mostly centered around the mean, with a little skewness towrads the positive side.

In [13]:
# plot the distribution
plot_distribution(cleanedDf['returns'], type_of_dist="Returns (%)")

Lets plot the drawdowns and returns using a library called `Financiallib` made by my colleagues and me. (`Financiallib` is hosted in `pypi`)

In [14]:
# Lets see some other charts 
daily_data = cleanedDf.set_index("datetime").resample("D").last().dropna() # resampling the data to see the daily max drawdowns
max_drawdown_plots(daily_data[['close']])

The cumulative return of the asset over the entire period - `2021` is shown below. 

In [15]:
# Return charts -> Cumulative Return over the years

daily_data.index.name = "Dates"
return_plots(daily_data[['close']])

Lets also see the candlestick patterns to identify trends and patterns using `Financiallib`

In [16]:
# Renaming some of the columns in the clean dataset to match the plotting libraries conventions

df = cleanedDf.reset_index()
df.columns = pd.Series(df.columns).apply(lambda x: x.capitalize()).to_list()


start_timestamps, end_timestamps = 0, 200
figure = candlestick_plot(df.iloc[start_timestamps:end_timestamps], date_column_name="Datetime", graph_width=1600, graph_height=600) # plotting candlesticks for first few timestamps
figure.update_layout(font=dict(family = "Courier New, monospace",
                               size = 12,
                               color = "RebeccaPurple"
                            ))

# formatting background
figure.update_xaxes(
      mirror=True,
      ticks='outside',
      showline=True,
      linecolor='black',
      gridcolor='white')

figure.update_yaxes(
            mirror=True,
            ticks='outside',
            showline=True,
            linecolor='black',
            gridcolor='white')

In [17]:
# Plotting it for next few days to identify patterns

start_timestamps, end_timestamps = 200, 400
figure = candlestick_plot(df.iloc[start_timestamps:end_timestamps], date_column_name="Datetime", graph_width=1600, graph_height=600) # plotting candlesticks for next few timestamps
figure.update_layout(font=dict(family = "Courier New, monospace",
                               size = 12,
                               color = "RebeccaPurple"
                            ))

# formatting background
figure.update_xaxes(
      mirror=True,
      ticks='outside',
      showline=True,
      linecolor='black',
      gridcolor='white')

figure.update_yaxes(
            mirror=True,
            ticks='outside',
            showline=True,
            linecolor='black',
            gridcolor='white')

The above `Candlestick` patterns reveal many trading opportunities which is easily identifiable from the `Candlesticks`. `Candlesticks` can be useful while making the buy-sell predictions.

# Technical Analysis using Indicators

In [18]:
# Calculating SMA's based on last 10 timestamps of the close values

cleanedDf['SMA_10'] = tb.SMA(cleanedDf['close'], timeperiod=10)

In [19]:
# Candle Stick 

figure.show()

In [20]:
# Let's plot the calculated moving averages 

plot_time_series(cleanedDf['SMA_10'].to_list()[start_timestamps:end_timestamps], x=cleanedDf['datetime'].to_list()[start_timestamps:end_timestamps], y_title="Prices", 
                                                                        x_title="Time", chart_title="Simple Moving Average of Close Prices")

The `SMA` seems to be a good indicator in identifying trends in the data. 

# Feature Engineering

Feature engineering is a crucial process in machine learning and data analysis. It refers to the practice of selecting, transforming, or creating relevant and informative features (variables or attributes) from raw data to improve the performance of a machine learning model or enhance the interpretability of data.


- Feature Selection: This involves choosing the most relevant features from the existing set of features. Irrelevant or redundant features can introduce noise and lead to overfitting. Feature selection methods aim to identify and keep the most important attributes while discarding others.

- Feature Transformation: Feature transformation techniques involve changing the representation of the data to make it more suitable for modeling. Common transformations include scaling features (e.g., normalization or standardization), encoding categorical variables, and applying mathematical operations (e.g., logarithms) to make the data more linear or interpretable.

- Feature Creation: Sometimes, the existing features may not capture the underlying patterns in the data effectively. In such cases, feature engineering can involve creating entirely new features by combining, aggregating, or deriving information from existing ones. For example, you might create a feature that represents the ratio of two existing features or generate time-based features from timestamps.

In [21]:
cleanedDf.set_index("datetime", inplace=True) # set datetime as index

1. **MACD (Moving Average Convergence Divergence):**
   - The MACD is a popular momentum indicator used by traders and analysts to identify potential trend reversals and gauge the strength of a trend.
   - It is calculated by subtracting the 26-period Exponential Moving Average (EMA) from the 12-period EMA.
   - The result is plotted as a line, known as the MACD line.
   - Additionally, a 9-period EMA of the MACD line, called the signal line, is plotted alongside the MACD line.
   - Traders often look for crossovers between the MACD and its signal line as buy or sell signals.
   - MACD also includes a histogram that represents the difference between the MACD line and the signal line, helping traders visualize momentum changes.

2. **RSI (Relative Strength Index):**
   - The RSI is a momentum oscillator that measures the speed and change of price movements.
   - It ranges from 0 to 100 and is typically displayed as an oscillating line.
   - RSI is used to identify overbought and oversold conditions in a market.
   - When the RSI crosses above 70, it suggests that the asset may be overbought and due for a potential pullback.
   - Conversely, when the RSI falls below 30, it indicates that the asset may be oversold and could experience a rebound.
   - Traders often use RSI to confirm trends and identify potential reversal points.

3. **SMA (Simple Moving Average):**
   - The Simple Moving Average is a basic technical analysis tool used to smooth out price data over a specified period.
   - It calculates the average of closing prices over a set number of periods.
   - SMA assigns equal weight to each data point in the calculation.
   - Traders use SMAs to identify trends and potential support or resistance levels.
   - Short-term SMAs respond more quickly to price changes, while long-term SMAs provide a smoother trend representation.

4. **EMA (Exponential Moving Average):**
   - The Exponential Moving Average is similar to the SMA but gives more weight to recent data points, making it more responsive to current price movements.
   - EMA is calculated by applying a smoothing factor to the previous EMA and the current price.
   - EMAs react faster to price changes and are useful for identifying short-term trends.
   - Like SMAs, EMAs can be used to identify potential support and resistance levels and determine trend direction.

In [22]:
indicators = ["SMA_10", "SMA_20", "EMA_10", "EMA_20", "RSI", "MACD", "BB", "OBV"]

# Calculate the indicator values
for indicator in indicators:
    if "SMA" in indicator:
        cleanedDf[indicator] = tb.SMA(cleanedDf['close'], timeperiod=int(indicator.split("_")[-1]))
    elif "EMA" in indicator:
        cleanedDf[indicator] = tb.EMA(cleanedDf['close'], timeperiod=int(indicator.split("_")[-1]))
    elif "RSI" in indicator:
        cleanedDf[indicator] = tb.RSI(cleanedDf['close'])
    elif "MACD" in indicator:
        macd, macdsignal, macdhist = tb.MACD(cleanedDf['close'])
        cleanedDf["MACD"] = macd
        cleanedDf["MACD_Signal"] = macdsignal
        cleanedDf["MACD_HIST"] = macdhist
    elif "BB" in indicator:
        low, high, simple = tb.BBANDS(cleanedDf['close'])
        cleanedDf["BB_LOW"] = low
        cleanedDf["BB_HIGH"] = high
    elif "OBV" in indicator:
        cleanedDf[indicator] = tb.OBV(cleanedDf['close'], cleanedDf['volume'])

In [23]:
# These are the calculated values which will serve as our features

cleanedDf.head()

Unnamed: 0_level_0,symbol,open,high,low,close,volume,symbol_id,returns,SMA_10,SMA_20,EMA_10,EMA_20,RSI,MACD,MACD_Signal,MACD_HIST,BB_LOW,BB_HIGH,OBV
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2021-01-01 00:00:00,ETHUSDT,737.18,740.0,730.44,731.64,46772.61,334,,,,,,,,,,,,46772.61
2021-01-01 00:15:00,ETHUSDT,731.7,732.99,730.0,732.36,20375.178,334,0.098409,,,,,,,,,,,67147.788
2021-01-01 00:30:00,ETHUSDT,732.36,735.1,732.21,734.18,14593.525,334,0.248512,,,,,,,,,,,81741.313
2021-01-01 00:45:00,ETHUSDT,734.18,736.35,733.04,734.6,16351.214,334,0.057207,,,,,,,,,,,98092.527
2021-01-01 01:00:00,ETHUSDT,734.61,744.49,734.0,744.47,42580.2,334,1.343588,,,,,,,,,744.734826,735.45,140672.727


- Take the negative of the `Open - Close` value, so if `+1` means close is greater than open indicating uptrend, else other way round.
- Capture variation in OHLC data
- Take difference of SMA's and their signals
- Shifted Values to capture previous pattern

In [24]:
# Insert some custom features
cleanedDf['ohlc'] = -np.sign(cleanedDf['open'] - cleanedDf['close']) 
cleanedDf['normalised_ohcl'] = (cleanedDf['close'] - cleanedDf['open']) / (cleanedDf['high'] - cleanedDf['low'])

cleanedDf['SMA_Diff'] = cleanedDf['SMA_10'] - cleanedDf['SMA_20']
cleanedDf['SMA_Diff_Signal'] = np.sign(cleanedDf['SMA_10'] - cleanedDf['SMA_20']) # if positive shows upward trend 

cleanedDf['EMA_Diff'] = cleanedDf['EMA_10'] - cleanedDf['EMA_20']
cleanedDf['EMA_Diff_Signal'] = np.sign(cleanedDf['EMA_10'] - cleanedDf['EMA_20']) # if positive shows upward trend 

In [25]:
# Calculate shifted values
steps = 5

for i in range(1, steps):
    cleanedDf[f"shifted_{i}"] = cleanedDf['close'].shift(i) # shifted close values give an idea about how the next move is going to be.

In [26]:
cleanedDf.iloc[30] # snapshot of a row

symbol                ETHUSDT
open                   736.75
high                   738.68
low                    735.55
close                  737.73
volume              11056.998
symbol_id                 334
returns              0.124863
SMA_10                739.808
SMA_20                 742.43
EMA_10             739.472359
EMA_20             740.698552
RSI                 48.943597
MACD                      NaN
MACD_Signal               NaN
MACD_HIST                 NaN
BB_LOW             742.246885
BB_HIGH                 738.2
OBV                161903.528
ohlc                      1.0
normalised_ohcl      0.313099
SMA_Diff               -2.622
SMA_Diff_Signal          -1.0
EMA_Diff            -1.226192
EMA_Diff_Signal          -1.0
shifted_1              736.81
shifted_2              735.45
shifted_3              740.76
shifted_4              740.25
Name: 2021-01-01 07:30:00, dtype: object

# Label Calculation

In a general sense, "label calculation" might refer to the process of determining or assigning labels to data points or objects based on certain criteria, rules, or algorithms. This could be applicable in various domains such as machine learning, data analysis, data processing, or even in manufacturing and logistics where products are labeled based on certain attributes or criteria.

In [27]:
labels_for_intervals = [("15m", 1), ("30m", 2), ("1hr", 4), ("1d", 96)] # for 15m, 30m, 1hr, 1day - 1, 2, 4, 96 are multiples of 15min

for label in labels_for_intervals:
    cleanedDf[f"label_{label[0]}"] = cleanedDf['close'].pct_change(-label[1]) # percentage of change in close 

## Train Test Split

In [28]:
train_start, train_end, test_start, test_end = "2021-01-01", "2021-10-01", "2021-10-10", "2021-12-31" # keeping a 9 days gap to prevent any lookahead

In [29]:
cleanedDf.dropna(inplace=True)

label = ["label_15m", "label_30m", "label_1hr", "label_1d"]
features = list(set(cleanedDf.columns) - set(label))

# remove symbol and symbol_id - they are of no use to us
features.remove("symbol")
features.remove("symbol_id")

# 15 min set
X_train, X_test, y_train, y_test = cleanedDf.loc[train_start:train_end, features], cleanedDf.loc[test_start:test_end, features], \
                                                    cleanedDf.loc[train_start:train_end, label[0]], cleanedDf.loc[test_start:test_end, label[0]]

In [30]:
def prediction_model_reg(model, model_name:str, X:pd.DataFrame, y:pd.DataFrame, X_test:pd.DataFrame, y_test:pd.DataFrame, freq:str) -> pd.DataFrame:
    """
    Given model instances and data of train and test, fits model, predicts and gives metric scores

    parameters
        model - model instance or object class
        model_name: Name of model
        X: train feature set
        y: train labels set
        X_test: test feature set
        y_test: test label set
    returns
        tuple of train and test metrics (mae, mape, mse)
    """
    # Fit the model 
    model.fit(X, y)
    y_pred = model.predict(X_test) # take predictions
    y_pred_train = model.predict(X_train) # take prediction on train set

    pred_vals = pd.Series(y_pred, index=X_test.index)

    # saving the prediction
    pred_vals.to_csv(f"result_store/{model_name}_{freq}.csv")

    mA, mB, mC = mean_absolute_error(y_true=y_test, y_pred=y_pred), mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred), \
                                                                        mean_squared_error(y_true=y_test, y_pred=y_pred)    # metric A, B, C
    
    mA_train, mB_train, mC_train = mean_absolute_error(y_true=y_train, y_pred=y_pred_train), mean_absolute_percentage_error(y_true=y_train, y_pred=y_pred_train), \
                                                                        mean_squared_error(y_true=y_train, y_pred=y_pred_train)    # metric A, B, C on train set
    
    return mA, mB, mC, mA_train, mB_train, mC_train

Training various models and storing their performances - we will take the best performing model for using it to backtest

In [31]:
model_list = [LinearRegression(), Lasso(0.25), Ridge(), ElasticNet(), RandomForestRegressor(), 
              XGBRegressor(), GradientBoostingRegressor()]

model_names = ['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet', 'RandomForestRegressor', 'XGBRegressor',
              'GradientBoostingRegressor']

# Df to store the result of the trainings
results_test = pd.DataFrame(columns=["MAPE", "MAE", "MSE"], index=model_names)
results_train = pd.DataFrame(columns=["MAPE", "MAE", "MSE"], index=model_names)


for model, model_name in tqdm(zip(model_list, model_names)):
    mae, mape, mse, mae_train, mape_train, mse_train = prediction_model_reg(
                                         model=model,
                                         model_name=model_name,
                                         X=X_train,
                                         y=y_train,
                                         X_test=X_test,
                                         y_test=y_test,
                                         freq="15m")
    
    # store test results
    results_test.loc[model_name, "MAPE"] = mape
    results_test.loc[model_name, "MAE"] = mae
    results_test.loc[model_name, "MSE"] = mse

    # store train results
    results_train.loc[model_name, "MAPE"] = mape_train
    results_train.loc[model_name, "MAE"] = mae_train
    results_train.loc[model_name, "MSE"] = mse_train

7it [03:03, 26.16s/it]


In [32]:
# Test results 

results_test

Unnamed: 0,MAPE,MAE,MSE
LinearRegression,45.794942,0.029248,0.006145
Lasso,52.378206,0.032159,0.00732
Ridge,45.79482,0.029248,0.006145
ElasticNet,57.618137,0.035136,0.00745
RandomForestRegressor,110.731682,0.05851,0.006332
XGBRegressor,48.648399,0.029495,0.002834
GradientBoostingRegressor,19.118791,0.015016,0.001842


# Backtest - Using Best Model

In [33]:
def backtestCalculator(cash:float, backtest_df:pd.DataFrame, weights:dict) -> tuple:
    """
    Given cash value, backtest_df and weights it calculates the value invested, current value, balance left and shares.

    parameters
        cash: Cash value before trading
        backtest_df: Contains price information of t and t+1 th days
        weights: weight to assign to each asset
    returns
        cash_invested, current_value_of_investment, shares 
    """
    amount_allocation = {}
    shares, total, balance, total_invested = {}, 0, 0, 0
    prices = backtest_df.iloc[0].to_dict()
    new_prices = backtest_df.iloc[len(backtest_df)-1].to_dict()

    for keys in backtest_df.columns:
        amount_allocation[keys] = weights[keys] * cash

    for keys in backtest_df.columns:
        shares[keys] = (amount_allocation[keys] // prices[keys])

    for keys in backtest_df.columns:
        total_invested = total_invested + (shares[keys] * prices[keys])

    balance = cash - total_invested

    for keys in backtest_df.columns:
        total = total + (shares[keys] * new_prices[keys])

    return total_invested, total, balance, shares[keys]

## 15 Minute Trading - Backtest

In [34]:
# iterator to keep track of timestamps
cash = 1000000 # initial cash 
K = 0 # keep track of timestamps
dates = X_test.index.to_list() # available timesteps of backtest
port_values, dates_portfolio = [], [] # store portfolio values

# read-in the predictions
preds = pd.read_csv("result_store/GradientBoostingRegressor_15m.csv", index_col=0)
preds.columns = ["Prediction"]
preds.index = pd.to_datetime(preds.index)

# Run backtest 
for j in tqdm(range(len(dates) - 1)):
    t, t_1 = dates[K], dates[K+1] # pick current time and the t+1 timestep
    testDf = cleanedDf.loc[t:t_1, ['close']] # take close of t and t+1 timesteps
    testDf.rename(columns={"close":"ETHUSDT"}, inplace=True)

    # receive the value invested at (t)th time, value at (t+1)th time, balance left and shares held
    cash_invested, current_value, balance, shares = backtestCalculator(cash=cash, backtest_df=testDf, weights={"ETHUSDT": 1 if preds.loc[t, "Prediction"] > 0 else 0})

    # portfolio value 
    portfolio = balance + current_value
    cash = portfolio
    port_values.append(cash)
    dates_portfolio.append(t_1)

    K = K + 1 # goes to next trading time

  0%|          | 0/6590 [00:00<?, ?it/s]

100%|██████████| 6590/6590 [00:03<00:00, 1744.21it/s]


In [35]:
# creates a timestamp wise portfolio dataframe

portfolioDf = pd.DataFrame({"Timestamp": dates_portfolio, "Portfolio Value": port_values})
portfolioDf = portfolioDf.set_index("Timestamp").resample("D").last().dropna().reset_index() # resampling to daily EOD portfolio values to get performance review 
portfolioDf['returns'] = portfolioDf['Portfolio Value'].pct_change() # daily EOD returns
portfolioDf.dropna(inplace=True)

In [36]:
# Stats of Performance

financial_summary(df_rets=portfolioDf, benchmark_rets=portfolioDf, col_name_cagr="Portfolio Value", date_col="Timestamp")

Unnamed: 0,Unnamed: 1,Meta Data,Unnamed: 3,Summary,Unnamed: 5,Statistics
0,Start Date,2021-10-11 00:00:00,Annual Return,-314.18%,Sharpe Ratio,-1.55
1,End Date,2021-12-27 00:00:00,Annual Volatility,203.06%,Kurtosis,9.49
2,Time Period (in Day),77,CAGR,-100.0%,Information Ratio,
3,Strategy,,Sortino Ratio,-1.61,Turnover,0.0%


# 30 Minute Trading 

In [37]:
# 30 min set
X_train, X_test, y_train, y_test = cleanedDf.loc[train_start:train_end, features], cleanedDf.loc[test_start:test_end, features], \
                                                    cleanedDf.loc[train_start:train_end, label[1]], cleanedDf.loc[test_start:test_end, label[1]]



# Df to store the result of the trainings
results_test = pd.DataFrame(columns=["MAPE", "MAE", "MSE"], index=model_names)
results_train = pd.DataFrame(columns=["MAPE", "MAE", "MSE"], index=model_names)


for model, model_name in tqdm(zip(model_list, model_names)):
    mae, mape, mse, mae_train, mape_train, mse_train = prediction_model_reg(
                                         model=model,
                                         model_name=model_name,
                                         X=X_train,
                                         y=y_train,
                                         X_test=X_test,
                                         y_test=y_test,
                                         freq="30m")
    
    # store test results
    results_test.loc[model_name, "MAPE"] = mape
    results_test.loc[model_name, "MAE"] = mae
    results_test.loc[model_name, "MSE"] = mse

    # store train results
    results_train.loc[model_name, "MAPE"] = mape_train
    results_train.loc[model_name, "MAE"] = mae_train
    results_train.loc[model_name, "MSE"] = mse_train

7it [03:00, 25.76s/it]


In [38]:
# Test results

results_test

Unnamed: 0,MAPE,MAE,MSE
LinearRegression,4982344142.640738,0.013586,0.00168
Lasso,20179765940.294235,0.016977,0.001939
Ridge,4982647312.467808,0.013586,0.00168
ElasticNet,22801935530.766804,0.018738,0.002007
RandomForestRegressor,5451639820.225951,0.031579,0.002653
XGBRegressor,235495809363.64096,0.043376,0.006507
GradientBoostingRegressor,38813698608.702255,0.018495,0.002222


In [39]:
# iterator to keep track of timestamps
cash = 1000000 # initial cash 
K = 0 # keep track of timestamps
dates = X_test.resample("30T").last().index.to_list() # available timesteps of backtest
port_values, dates_portfolio = [], [] # store portfolio values

# read-in the predictions
preds = pd.read_csv("result_store/GradientBoostingRegressor_30m.csv", index_col=0)
preds.columns = ["Prediction"]
preds.index = pd.to_datetime(preds.index)

# Run backtest 
for j in tqdm(range(len(dates) - 1)):
    try:
        t, t_1 = dates[K], dates[K+1] # pick current time and the t+1 timestep
        testDf = cleanedDf.loc[t:t_1, ['close']] # take close of t and t+1 timesteps
        testDf.rename(columns={"close":"ETHUSDT"}, inplace=True)

        # receive the value invested at (t)th time, value at (t+1)th time, balance left and shares held
        cash_invested, current_value, balance, shares = backtestCalculator(cash=cash, backtest_df=testDf, weights={"ETHUSDT": 1 if preds.loc[t, "Prediction"] > 0 else 0})

        # portfolio value 
        portfolio = balance + current_value
        cash = portfolio
        port_values.append(cash)
        dates_portfolio.append(t_1)

        K = K + 1 # goes to next trading time
    except:
        pass

  0%|          | 0/6590 [00:00<?, ?it/s]

100%|██████████| 6590/6590 [00:03<00:00, 1976.09it/s]


In [40]:
# creates a timestamp wise portfolio dataframe

portfolioDf = pd.DataFrame({"Timestamp": dates_portfolio, "Portfolio Value": port_values})
portfolioDf = portfolioDf.set_index("Timestamp").resample("D").last().dropna().reset_index() # resampling to daily EOD portfolio values to get performance review 
portfolioDf['returns'] = portfolioDf['Portfolio Value'].pct_change() # daily EOD returns
portfolioDf.dropna(inplace=True)

In [41]:
# Stats of Performance

financial_summary(df_rets=portfolioDf, benchmark_rets=portfolioDf, col_name_cagr="Portfolio Value", date_col="Timestamp")

Unnamed: 0,Unnamed: 1,Meta Data,Unnamed: 3,Summary,Unnamed: 5,Statistics
0,Start Date,2021-10-11 00:00:00,Annual Return,-257.79%,Sharpe Ratio,-3.29
1,End Date,2021-10-30 00:00:00,Annual Volatility,78.45%,Kurtosis,3.65
2,Time Period (in Day),19,CAGR,-89.74%,Information Ratio,
3,Strategy,,Sortino Ratio,-4.38,Turnover,0.0%


# 1 Hour Trading

In [42]:
# 1 hr set
X_train, X_test, y_train, y_test = cleanedDf.loc[train_start:train_end, features], cleanedDf.loc[test_start:test_end, features], \
                                                    cleanedDf.loc[train_start:train_end, label[2]], cleanedDf.loc[test_start:test_end, label[2]]



# Df to store the result of the trainings
results_test = pd.DataFrame(columns=["MAPE", "MAE", "MSE"], index=model_names)
results_train = pd.DataFrame(columns=["MAPE", "MAE", "MSE"], index=model_names)


for model, model_name in tqdm(zip(model_list, model_names)):
    mae, mape, mse, mae_train, mape_train, mse_train = prediction_model_reg(
                                         model=model,
                                         model_name=model_name,
                                         X=X_train,
                                         y=y_train,
                                         X_test=X_test,
                                         y_test=y_test,
                                         freq="1hr")
    
    # store test results
    results_test.loc[model_name, "MAPE"] = mape
    results_test.loc[model_name, "MAE"] = mae
    results_test.loc[model_name, "MSE"] = mse

    # store train results
    results_train.loc[model_name, "MAPE"] = mape_train
    results_train.loc[model_name, "MAE"] = mae_train
    results_train.loc[model_name, "MSE"] = mse_train

7it [02:47, 23.96s/it]


In [43]:
# Test results

results_test

Unnamed: 0,MAPE,MAE,MSE
LinearRegression,8956534408.521397,0.014066,0.002334
Lasso,11099574076.627705,0.016549,0.00252
Ridge,8956573589.829142,0.014066,0.002334
ElasticNet,15221163613.33231,0.01852,0.002645
RandomForestRegressor,33674048666.79524,0.042841,0.007242
XGBRegressor,52668950412.28054,0.086865,0.019552
GradientBoostingRegressor,1210611123.192556,0.020098,0.002536


In [44]:
# iterator to keep track of timestamps
cash = 1000000 # initial cash 
K = 0 # keep track of timestamps
dates = X_test.resample("1H").last().index.to_list() # available timesteps of backtest
port_values, dates_portfolio = [], [] # store portfolio values

# read-in the predictions
preds = pd.read_csv("result_store/GradientBoostingRegressor_1hr.csv", index_col=0)
preds.columns = ["Prediction"]
preds.index = pd.to_datetime(preds.index)

# Run backtest 
for j in tqdm(range(len(dates) - 1)):
    try:
        t, t_1 = dates[K], dates[K+1] # pick current time and the t+1 timestep
        testDf = cleanedDf.loc[t:t_1, ['close']] # take close of t and t+1 timesteps
        testDf.rename(columns={"close":"ETHUSDT"}, inplace=True)

        # receive the value invested at (t)th time, value at (t+1)th time, balance left and shares held
        cash_invested, current_value, balance, shares = backtestCalculator(cash=cash, backtest_df=testDf, weights={"ETHUSDT": 1 if preds.loc[t, "Prediction"] > 0 else 0})

        # portfolio value 
        portfolio = balance + current_value
        cash = portfolio
        port_values.append(cash)
        dates_portfolio.append(t_1)

        K = K + 1 # goes to next trading time
    except:
        pass

  0%|          | 0/6590 [00:00<?, ?it/s]

100%|██████████| 6590/6590 [00:03<00:00, 2029.87it/s]


In [45]:
# creates a timestamp wise portfolio dataframe

portfolioDf = pd.DataFrame({"Timestamp": dates_portfolio, "Portfolio Value": port_values})
portfolioDf = portfolioDf.set_index("Timestamp").resample("D").last().dropna().reset_index() # resampling to daily EOD portfolio values to get performance review 
portfolioDf['returns'] = portfolioDf['Portfolio Value'].pct_change() # daily EOD returns
portfolioDf.dropna(inplace=True)

In [46]:
# Stats of Performance

financial_summary(df_rets=portfolioDf, benchmark_rets=portfolioDf, col_name_cagr="Portfolio Value", date_col="Timestamp")

Unnamed: 0,Unnamed: 1,Meta Data,Unnamed: 3,Summary,Unnamed: 5,Statistics
0,Start Date,2021-10-11 00:00:00,Annual Return,115.3%,Sharpe Ratio,0.82
1,End Date,2021-11-01 00:00:00,Annual Volatility,140.36%,Kurtosis,9.88
2,Time Period (in Day),21,CAGR,-92.05%,Information Ratio,
3,Strategy,,Sortino Ratio,1.85,Turnover,0.0%
