In [1]:
%pip install lightgbm joblib

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from typing import List
import lightgbm as lgb
import joblib
import logging
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

from utils import (
    TRAIN_DATASET_FILE_PATH,
    VALIDATION_DATASET_FILE_PATH,
    SAMPLE_SYMBOLS)

logging.getLogger('lightgbm').setLevel(logging.WARNING)


TRAIN_START_DATE updated to: 2020-06-01, TRAIN_END_DATE updated to: 2024-07-03


# Train dataset

In [3]:
# Load the train dataset
train_data = pd.read_parquet(TRAIN_DATASET_FILE_PATH)
train_data['date'] = pd.to_datetime(train_data['date'])
train_data.head()

Unnamed: 0,date,symbol,target,open,high,low,close,volume,name,keywords,...,cpi_Brazil,cpi_Russia,cpi_South Korea,cpi_Mexico,cpi_Japan,cpi_Saudi Arabia,cpi_Australia,cpi_Canada,cpi_France,year
0,2020-06-01,0xBTC,0.0,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
1,2020-06-01,JUV,0.0,7.194488,7.584891,7.17679,7.532392,146975,Juventus Fan Token,Juventus Fan Token,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
2,2020-06-01,ZANO,0.0,0.725818,0.73945,0.675988,0.736188,92454,Zano,Zano,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
3,2020-06-01,LA,0.0,0.027122,0.035149,0.026726,0.034941,15897,LATOKEN,LATOKEN,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0
4,2020-06-01,XMV,0.0,0.001817,0.001838,0.001717,0.00183,21,MoneroV,MoneroV,...,124.0019,123.837,105.027448,122.5766,101.8071,107.8078,96.96129,106.97271,106.01,2020.0


# Validation dataset

In [4]:
# Load the train dataset
validation_data = pd.read_parquet(VALIDATION_DATASET_FILE_PATH)
validation_data['date'] = pd.to_datetime(train_data['date'])
validation_data.head()

Unnamed: 0,date,symbol,target,open,high,low,close,volume,name,keywords,...,cpi_Brazil,cpi_Russia,cpi_South Korea,cpi_Mexico,cpi_Japan,cpi_Saudi Arabia,cpi_Australia,cpi_Canada,cpi_France,year
0,2020-06-01,DOME,0.0,0.000713,0.000741,0.000667,0.000678,2084441,Everdome,Everdome,...,157.1481,,,151.0167,,114.227,101.9929,,121.49,2023.0
1,2020-06-01,EL,0.0,0.00208,0.002188,0.00206,0.002187,136486,ELYSIA,ELYSIA,...,157.1481,,,151.0167,,114.227,101.9929,,121.49,2023.0
2,2020-06-01,OMG,0.5,0.640361,0.669655,0.585167,0.616681,48801552,OMG Network,OMG Network,...,157.1481,,,151.0167,,114.227,101.9929,,121.49,2023.0
3,2020-06-01,CGPT,0.0,0.05221,0.055227,0.050684,0.052477,5099763,ChainGPT,ChainGPT,...,157.1481,,,151.0167,,114.227,101.9929,,121.49,2023.0
4,2020-06-01,NULS,0.5,0.212408,0.219739,0.206767,0.214189,1847556,NULS,NULS coin,...,157.1481,,,151.0167,,114.227,101.9929,,121.49,2023.0


In [5]:
def train_and_save_model(data: pd.DataFrame, feature_cols: List[str], model_file_path: str):
    # Initialize the LightGBM model
    model = lgb.LGBMRegressor(
        n_estimators=2000,
        learning_rate=0.01,
        max_depth=5,
        num_leaves=2 ** 5,
        colsample_bytree=0.1,
        verbosity=-1  # Suppress LightGBM messages
    )

    # Train the model
    model.fit(
        data[feature_cols],
        data["target"]
    )

    print(f'Saving the model to: {model_file_path}')

    # Save the model
    joblib.dump(model, model_file_path)
    
    return model

In [6]:
# Generate training features
def generate_ohlcv_training_features() -> List[str]:
    return ['open', 'high', 'low', 'close', 'volume']

def generate_fear_greed_training_features() -> List[str]:
    return ['fear_greed_value']

def generate_supply_training_features() -> List[str]:
    return ['circulating_supply', 'market_cap', 'infinite_supply']

def generate_google_trends_training_features() -> List[str]:
    return ['google_trend_score']

def generate_interest_rates_training_features(df: pd.DataFrame) -> List[str]:
    return [col for col in df.columns if col.startswith('interest_rate')]

def generate_cpi_training_features(df: pd.DataFrame) -> List[str]:
    return [col for col in df.columns if col.startswith('cpi')]

def generate_gdp_training_features(df: pd.DataFrame) -> List[str]:
    return [col for col in df.columns if col.startswith('gdp')]

def generate_inflation_training_features(df: pd.DataFrame) -> List[str]:
    return [col for col in df.columns if col.startswith('inflation_rate')]

# Add training features for each (symbol, date)
ohlcv_cols = generate_ohlcv_training_features()
ohlcv_fear_cols = ohlcv_cols + generate_fear_greed_training_features()
ohlcv_fear_supply_cols = ohlcv_fear_cols + generate_supply_training_features()
ohlcv_fear_supply_trends_cols = ohlcv_fear_supply_cols + generate_google_trends_training_features()
ohlcv_fear_supply_trends_interest_rate_cols = ohlcv_fear_supply_trends_cols + generate_interest_rates_training_features(train_data)
ohlcv_fear_supply_trends_interest_rate_cpi_cols = ohlcv_fear_supply_trends_interest_rate_cols + generate_cpi_training_features(train_data)
ohlcv_fear_supply_trends_interest_gdp_rate_cols = ohlcv_fear_supply_trends_interest_rate_cols + generate_gdp_training_features(train_data)
ohlcv_fear_supply_trends_complete_cols = ohlcv_fear_supply_trends_interest_gdp_rate_cols + generate_inflation_training_features(train_data)

# Call the function to train and save the model
model1 = train_and_save_model(train_data, ohlcv_cols, '../data/model1.pkl')
model2 = train_and_save_model(train_data, ohlcv_fear_cols, '../data/model2.pkl')
model3 = train_and_save_model(train_data, ohlcv_fear_supply_cols, '../data/model3.pkl')
model4 = train_and_save_model(train_data, ohlcv_fear_supply_trends_cols, '../data/model4.pkl')
model5 = train_and_save_model(train_data, ohlcv_fear_supply_trends_interest_rate_cols, '../data/model5.pkl')
model6 = train_and_save_model(train_data, ohlcv_fear_supply_trends_interest_rate_cpi_cols, '../data/model6.pkl')
model7 = train_and_save_model(train_data, ohlcv_fear_supply_trends_interest_gdp_rate_cols, '../data/model7.pkl')
model8 = train_and_save_model(train_data, ohlcv_fear_supply_trends_complete_cols, '../data/model8.pkl')

Saving the model to: ../data/model1.pkl
Saving the model to: ../data/model2.pkl
Saving the model to: ../data/model3.pkl
Saving the model to: ../data/model4.pkl
Saving the model to: ../data/model5.pkl
Saving the model to: ../data/model6.pkl
Saving the model to: ../data/model7.pkl
Saving the model to: ../data/model8.pkl


In [9]:
def calculate_nmse(model, validation_data: pd.DataFrame, feature_cols: List[str]):
    predictions = model.predict(validation_data[feature_cols])
    mse = mean_squared_error(validation_data["close"], predictions)
    nmse = mse / validation_data["close"].var()
    return nmse

nmse1 = calculate_nmse(model1, validation_data, ohlcv_cols)
nmse2 = calculate_nmse(model2, validation_data, ohlcv_fear_cols)
nmse3 = calculate_nmse(model3, validation_data, ohlcv_fear_supply_cols)
nmse4 = calculate_nmse(model4, validation_data, ohlcv_fear_supply_trends_cols)
nmse5 = calculate_nmse(model5, validation_data, ohlcv_fear_supply_trends_interest_rate_cols)
nmse6 = calculate_nmse(model6, validation_data, ohlcv_fear_supply_trends_interest_rate_cpi_cols)
nmse7 = calculate_nmse(model7, validation_data, ohlcv_fear_supply_trends_interest_gdp_rate_cols)
nmse8 = calculate_nmse(model8, validation_data, ohlcv_fear_supply_trends_complete_cols)

print(f'NMSE for model1: {nmse1}')
print(f'NMSE for model2: {nmse2}')
print(f'NMSE for model3: {nmse3}')
print(f'NMSE for model4: {nmse4}')
print(f'NMSE for model5: {nmse5}')
print(f'NMSE for model6: {nmse6}')
print(f'NMSE for model7: {nmse7}')
print(f'NMSE for model8: {nmse8}')

NMSE for model1: 1.004197789705909
NMSE for model2: 1.0041981628762982
NMSE for model3: 1.004197418033201
NMSE for model4: 1.004197550576641
NMSE for model5: 1.0041972794644172
NMSE for model6: 1.0041983791329157
NMSE for model7: 1.004198061531351
NMSE for model8: 1.0041980803654982


In [None]:
def generate_ranked_signal(model, data: pd.DataFrame, feature_cols: List[str]) -> pd.DataFrame:
    data["signal"] = model.predict(data[feature_cols])
    data["signal_ranked"] = data["signal"].rank(pct=True)
    return data


ranked_signals = generate_ranked_signal(model7, validation_data, ohlcv_fear_supply_trends_complete_cols)
ranked_signals.head(30)
