In [1]:
%pip install lightgbm joblib

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from typing import List
import lightgbm as lgb
import joblib
import logging
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

from utils import (
    TRAIN_DATASET_FILE_PATH,
    VALIDATION_DATASET_FILE_PATH,
    SAMPLE_SYMBOLS)

logging.getLogger('lightgbm').setLevel(logging.WARNING)


# Train dataset

In [3]:
# Load the train dataset
train_data = pd.read_parquet(TRAIN_DATASET_FILE_PATH)
train_data['date'] = pd.to_datetime(train_data['date'])
train_data.head()

Unnamed: 0,date,symbol,target,open,high,low,close,volume,name,keywords,...,inflation_rate_India,inflation_rate_Italy,inflation_rate_Japan,inflation_rate_Korea Rep,inflation_rate_Mexico,inflation_rate_Russian Federation,inflation_rate_Saudi Arabia,inflation_rate_United Kingdom,inflation_rate_United States,year
0,2020-06-01,0xBTC,0.0,0.142721,0.152836,0.133416,0.145287,1227624,0xBitcoin,0xBitcoin,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
1,2020-06-01,NEO,0.25,10.959956,12.6478,10.917013,12.424067,783678511,Neo,Neo,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
2,2020-06-01,DAWN,0.0,0.111987,0.18855,0.049547,0.088647,82537,Dawn Protocol,Dawn Protocol,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
3,2020-06-01,PNT,0.0,6.6e-05,7e-05,6.1e-05,6.8e-05,11798,pNetwork,pNetwork,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0
4,2020-06-01,NEX,0.25,0.935004,1.030365,0.929747,0.963004,1287319,Nash,Nash,...,182.988823,110.471259,105.457901,115.777368,146.350488,186.862622,122.479242,120.806362,118.690502,2020.0


# Validation dataset

In [4]:
# Load the train dataset
validation_data = pd.read_parquet(VALIDATION_DATASET_FILE_PATH)
validation_data['date'] = pd.to_datetime(train_data['date'])
validation_data.head()

Unnamed: 0,date,symbol,target,open,high,low,close,volume,name,keywords,...,inflation_rate_India,inflation_rate_Italy,inflation_rate_Japan,inflation_rate_Korea Rep,inflation_rate_Mexico,inflation_rate_Russian Federation,inflation_rate_Saudi Arabia,inflation_rate_United Kingdom,inflation_rate_United States,year
0,2020-06-01,LTO,0.5,0.070485,0.071733,0.067215,0.068773,1995042,LTO Network,LTO Network,...,216.862025,128.617292,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0
1,2020-06-01,WMT,0.5,0.154297,0.163586,0.150708,0.15475,2509539,World Mobile Token,World Mobile Token,...,216.862025,128.617292,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0
2,2020-06-01,GOZ,0.0,0.713936,0.723211,0.692981,0.714705,161040,Göztepe S.K. Fan Token,Göztepe S.K. Fan Token,...,216.862025,128.617292,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0
3,2020-06-01,ALEX,1.0,0.055003,0.058988,0.053239,0.054397,1350074,ALEX Lab,ᛤ ALEX 🟧 THE Finance Layer on Bitcoin ᛤᛤᛤ,...,216.862025,128.617292,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0
4,2020-06-01,NSURE,0.0,0.004474,0.005108,0.004111,0.005091,577050,Nsure.Network,Nsure.Network,...,216.862025,128.617292,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0


In [5]:
def train_and_save_model(data: pd.DataFrame, feature_cols: List[str], model_file_path: str):
    # Initialize the LightGBM model
    model = lgb.LGBMRegressor(
        n_estimators=2000,
        learning_rate=0.01,
        max_depth=5,
        num_leaves=2 ** 5,
        colsample_bytree=0.1,
        verbosity=-1  # Suppress LightGBM messages
    )

    # Train the model
    model.fit(
        data[feature_cols],
        data["target"]
    )

    print(f'Saving the model to: {model_file_path}')

    # Save the model
    joblib.dump(model, model_file_path)
    
    return model

In [6]:
# Generate training features
def generate_ohlcv_training_features() -> List[str]:
    return ['open', 'high', 'low', 'close', 'volume']

def generate_fear_greed_training_features() -> List[str]:
    return ['fear_greed_value']

def generate_supply_training_features() -> List[str]:
    return ['circulating_supply', 'market_cap', 'infinite_supply']

def generate_google_trends_training_features() -> List[str]:
    return ['google_trend_score']

def generate_interest_rates_training_features(df: pd.DataFrame) -> List[str]:
    return [col for col in df.columns if col.startswith('interest_rate')]

def generate_gdp_training_features(df: pd.DataFrame) -> List[str]:
    return [col for col in df.columns if col.startswith('gdp')]

def generate_inflation_training_features(df: pd.DataFrame) -> List[str]:
    return [col for col in df.columns if col.startswith('inflation_rate')]

# Add training features for each (symbol, date)
ohlcv_cols = generate_ohlcv_training_features()
ohlcv_fear_cols = ohlcv_cols + generate_fear_greed_training_features()
ohlcv_fear_supply_cols = ohlcv_fear_cols + generate_supply_training_features()
ohlcv_fear_supply_trends_cols = ohlcv_fear_supply_cols + generate_google_trends_training_features()
ohlcv_fear_supply_trends_interest_rate_cols = ohlcv_fear_supply_trends_cols + generate_interest_rates_training_features(train_data)
ohlcv_fear_supply_trends_interest_gdp_rate_cols = ohlcv_fear_supply_trends_interest_rate_cols + generate_gdp_training_features(train_data)
ohlcv_fear_supply_trends_complete_cols = ohlcv_fear_supply_trends_interest_gdp_rate_cols + generate_inflation_training_features(train_data)

# Call the function to train and save the model
model1 = train_and_save_model(train_data, ohlcv_cols, '../data/model1.pkl')
model2 = train_and_save_model(train_data, ohlcv_fear_cols, '../data/model2.pkl')
model3 = train_and_save_model(train_data, ohlcv_fear_supply_cols, '../data/model3.pkl')
model4 = train_and_save_model(train_data, ohlcv_fear_supply_trends_cols, '../data/model4.pkl')
model5 = train_and_save_model(train_data, ohlcv_fear_supply_trends_interest_rate_cols, '../data/model5.pkl')
model6 = train_and_save_model(train_data, ohlcv_fear_supply_trends_interest_gdp_rate_cols, '../data/model6.pkl')
model7 = train_and_save_model(train_data, ohlcv_fear_supply_trends_complete_cols, '../data/model7.pkl')

Saving the model to: ../data/model1.pkl
Saving the model to: ../data/model2.pkl
Saving the model to: ../data/model3.pkl
Saving the model to: ../data/model4.pkl
Saving the model to: ../data/model5.pkl
Saving the model to: ../data/model6.pkl
Saving the model to: ../data/model7.pkl


In [7]:
def calculate_nmse(model, validation_data: pd.DataFrame, feature_cols: List[str]):
    predictions = model.predict(validation_data[feature_cols])
    mse = mean_squared_error(validation_data["close"], predictions)
    nmse = mse / validation_data["close"].var()
    return nmse

nmse1 = calculate_nmse(model1, validation_data, ohlcv_cols)
nmse2 = calculate_nmse(model2, validation_data, ohlcv_fear_cols)
nmse3 = calculate_nmse(model3, validation_data, ohlcv_fear_supply_cols)
nmse4 = calculate_nmse(model4, validation_data, ohlcv_fear_supply_trends_cols)
nmse5 = calculate_nmse(model5, validation_data, ohlcv_fear_supply_trends_interest_rate_cols)
nmse6 = calculate_nmse(model6, validation_data, ohlcv_fear_supply_trends_interest_gdp_rate_cols)
nmse7 = calculate_nmse(model7, validation_data, ohlcv_fear_supply_trends_complete_cols)

print(f'NMSE for model1: {nmse1}')
print(f'NMSE for model2: {nmse2}')
print(f'NMSE for model3: {nmse3}')
print(f'NMSE for model4: {nmse4}')
print(f'NMSE for model5: {nmse5}')
print(f'NMSE for model6: {nmse6}')
print(f'NMSE for model7: {nmse7}')

NMSE for model1: 1.00416815883314
NMSE for model2: 1.004168531469433
NMSE for model3: 1.0041691598662352
NMSE for model4: 1.0041691828863417
NMSE for model5: 1.0041685779783895
NMSE for model6: 1.0041698678420417
NMSE for model7: 1.004170767180865


In [8]:
def generate_ranked_signal(model, data: pd.DataFrame, feature_cols: List[str]) -> pd.DataFrame:
    data["signal"] = model.predict(data[feature_cols])
    data["signal_ranked"] = data["signal"].rank(pct=True)
    return data


ranked_signals = generate_ranked_signal(model7, validation_data, ohlcv_fear_supply_trends_complete_cols)
ranked_signals.head(30)


Unnamed: 0,date,symbol,target,open,high,low,close,volume,name,keywords,...,inflation_rate_Japan,inflation_rate_Korea Rep,inflation_rate_Mexico,inflation_rate_Russian Federation,inflation_rate_Saudi Arabia,inflation_rate_United Kingdom,inflation_rate_United States,year,signal,signal_ranked
0,2020-06-01,LTO,0.5,0.070485,0.071733,0.067215,0.068773,1995042,LTO Network,LTO Network,...,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0,0.364225,0.857561
1,2020-06-01,WMT,0.5,0.154297,0.163586,0.150708,0.15475,2509539,World Mobile Token,World Mobile Token,...,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0,0.427159,0.96216
2,2020-06-01,GOZ,0.0,0.713936,0.723211,0.692981,0.714705,161040,Göztepe S.K. Fan Token,Göztepe S.K. Fan Token,...,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0,0.031135,0.282455
3,2020-06-01,ALEX,1.0,0.055003,0.058988,0.053239,0.054397,1350074,ALEX Lab,ᛤ ALEX 🟧 THE Finance Layer on Bitcoin ᛤᛤᛤ,...,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0,0.302754,0.69183
4,2020-06-01,NSURE,0.0,0.004474,0.005108,0.004111,0.005091,577050,Nsure.Network,Nsure.Network,...,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0,0.253508,0.585468
5,2020-06-01,LM,0.0,0.008541,0.008855,0.008524,0.008726,484277,LeisureMeta,LeisureMeta,...,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0,-0.004754,0.115316
6,2020-06-01,XTN,0.0,0.083254,0.085357,0.070198,0.076645,30519,Neutrino Index,Neutrino Index,...,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0,-0.015458,0.075738
7,2020-06-01,DERO,0.0,4.541214,4.912441,4.518896,4.649829,32870,Dero,Dero,...,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0,0.106778,0.492266
8,2020-06-01,CUMMIES,0.0,0.001826,0.001932,0.001821,0.0019,29665,CumRocket,CumRocket,...,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0,0.005934,0.159938
9,2020-06-01,PLASTIK,0.0,0.016273,0.016392,0.015632,0.016273,65879,Plastiks,Plastiks,...,111.364036,129.190176,176.116004,199.372063,132.36437,142.740891,139.735794,2023.0,-0.024275,0.049685
