## Install sktime and  dependencies

In [1]:
!python --version

Python 3.10.5


In [2]:
%%capture --no-display

!pip install sktime
!pip install pmdarima
!pip install matplotlib
!pip install seaborn
!pip install xgboost

In [3]:
import warnings
# hide warnings
warnings.filterwarnings('ignore')
#warnings.filterwarnings(action='once')

## BBPro/Gems Script

In [41]:
# -*- coding: utf-8 -*-

"""##Imports"""
import pandas as pd
import numpy as np
import traceback

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

from sktime.utils.plotting import plot_series
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.datasets import load_longley
from sktime.forecasting.arima import ARIMA
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.compose import ForecastX
from sktime.forecasting.var import VAR
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.compose import make_reduction
from sktime.forecasting.compose import EnsembleForecaster

"""##Constants"""

global DATA_DIR
DATA_DIR = '/data/'

global TEST_DATA_FILE
#TEST_DATA_FILE = 'historicalData2020_20201.csv'
TEST_DATA_FILE = 'aire.csv'
global COLUMNS
COLUMNS = [
    'date',
    'slug',
    'price',
    'marketCap',
    'volume24h',
    'twitterFollowers',
    'redditSubscribers',
    'redditAccountsActive48h',
    'avgRedditComments48h',
    'avgRedditPosts48h',
    'developerForks',
    'developerStars',
    'developerSubscribers',
    'pullRequestMerged',
    'pullRequestContributors',
]

global MIN_COIN_GECKO_MARKET_MEAN_BBPRO
global MIN_COIN_GECKO_VOLUME_MEAN_BBPRO
global MAX_COIN_GECKO_MARKET_MEAN_BBGEMS
global MIN_COIN_GECKO_VOLUME_MEAN_BBGEMS
global MAX_DELTA_PRICE_PERCENT
global COIN_GECKO_VOLUME_MEAN_WEEKS
global COIN_GECKO_MARKET_MEAN_WEEKS

MIN_COIN_GECKO_MARKET_MEAN_BBPRO = 7e7
MIN_COIN_GECKO_VOLUME_MEAN_BBPRO = 3.5e5

#OLD
#MIN_COIN_GECKO_VOLUME_MEAN_BBPRO = 3.5e6

MAX_COIN_GECKO_MARKET_MEAN_BBGEMS = 1.5e7
MIN_COIN_GECKO_VOLUME_MEAN_BBGEMS = 5e4

#OLD
#MIN_COIN_GECKO_VOLUME_MEAN_BBGEMS = 5e5
MAX_DELTA_PRICE_PERCENT = 20
COIN_GECKO_VOLUME_MEAN_WEEKS = 1
COIN_GECKO_MARKET_MEAN_WEEKS = 1

global EVAL_WEEKS
global MIN_TRAIN_WEEKS
global MIN_DATA_WEEKS
global MIN_PERCENT_WITH_DATA

EVAL_WEEKS = 1
MIN_TRAIN_WEEKS = 1
MIN_DATA_WEEKS = MIN_TRAIN_WEEKS + EVAL_WEEKS + 1
MIN_PERCENT_WITH_DATA = 0.8

global EVAL_TOPS
global MAX_TOPS

EVAL_TOPS = 10
MAX_TOPS = 100

"""## Analysis Functions"""

def pre_process(df):
    # Only relevant columns
    df_preprocess = df[COLUMNS].copy()
    
    print('Projects before pre-processing', df_preprocess.groupby('date')['slug'].count().tail(1))

    df_preprocess.drop_duplicates(inplace=True)
    df_preprocess.fillna(0, inplace=True)

    # Converts date field to date, removing time, and sorting
    df_preprocess['date'] = pd.to_datetime(df_preprocess['date'])
    df_preprocess['date'] = df_preprocess['date'].dt.date
    df_preprocess = df_preprocess.sort_values(by='date')

    # Shift to closest saturday to unify weeks
    df_preprocess['date'] = df_preprocess.date + pd.offsets.Week(n=0, weekday=6) - pd.DateOffset(1)

    # Drop duplicates by date and slug
    df_preprocess.drop_duplicates(subset=['date', 'slug'],inplace=True)
    
    # Add month field to evaluate its impact in the prediction
    df_preprocess['month'] = df_preprocess['date'].dt.month
    
    # Add week for sktime
    df_preprocess['week'] = df_preprocess['date'].dt.to_period('W')

    # Add delta price
    df_pivot = df_preprocess.pivot_table(index=['date'], columns='slug', values=['price'], aggfunc='first',
                                         fill_value=0)
    df_pivot.columns = [col[1] for col in df_pivot.columns.values]
    df_returns = df_pivot.pct_change()
    df_returns = df_returns[1:]

    # Replace empty
    df_returns.fillna(0, inplace=True)
    df_returns.replace(np.inf, 0, inplace=True)
    df_returns.mask(df_returns == -1).fillna(0, inplace=True)

    df_returns = df_returns.stack(level=0).reset_index()
    df_returns = df_returns.rename(columns={df_returns.columns[1]: 'slug', df_returns.columns[2]: 'deltaPricePercent'})

    df_preprocess = pd.merge(df_preprocess, df_returns, on=['slug', 'date'], how='inner')

    print('Projects After pre-processing', df_preprocess.groupby('date')['slug'].count().tail(1))

    return df_preprocess


def filter_func_market_cap(x, min_m_cap, max_m_cap):
    x = x.tail(COIN_GECKO_MARKET_MEAN_WEEKS)
    return (x['marketCap'].mean() > min_m_cap) & (x['marketCap'].mean() < max_m_cap)


def filter_func_market_vol(x, min_m_vol, max_m_vol):
    x = x.tail(COIN_GECKO_VOLUME_MEAN_WEEKS)
    return (x['volume24h'].mean() > min_m_vol) & (x['volume24h'].mean() < max_m_vol)


def filter_func_data_percentage(x, total=MIN_DATA_WEEKS): 
    x = x.tail(total)
    return x['date'].count() >= (total * MIN_PERCENT_WITH_DATA)


def filter_delta_price_percent(df):
    df = df.mask(df.isin([np.nan, np.inf, -np.inf])).fillna(0)
    df.loc[df.deltaPricePercent > MAX_DELTA_PRICE_PERCENT, 'deltaPricePercent'] = 0

    return df

def filter_func_date(x, date_filter): 
    return x['date'].iloc[-1] == date_filter


def filter(df, min_m_cap=0, max_m_cap=sys.maxsize, min_m_vol=0, max_m_vol=sys.maxsize):
    # Filter out by Market Cap & Vol

    df = df.groupby('slug').filter(filter_func_market_cap, True, min_m_cap, max_m_cap)
    df = df.groupby('slug').filter(filter_func_market_vol, True, min_m_vol, max_m_vol)

    # Filters out by delta price

    df = filter_delta_price_percent(df)

    # Filters out by weeks w/o data

    df = df.groupby('slug').filter(filter_func_data_percentage, True)
    df = df.groupby('slug').filter(filter_func_date, date_filter=df['date'].iloc[-1])

    return df


def label_top(ranking):
    return 1 if ranking <= EVAL_TOPS else 0


def label_data(df):
    # Obtain cumulative return per date
    df_pivot = df.pivot_table(index=['date'], columns='slug', values=['deltaPricePercent'], aggfunc='first',
                              fill_value=0)
    df_pivot.columns = [col[1] for col in df_pivot.columns.values]

    df_cum = (1 + df_pivot).cumprod() - 1
    df_cum = df_cum.stack(level=0).reset_index()

    # df_cum_returns = df_returns.add(1).cumprod(axis=1)
    # df_cum_returns = df_cum_returns.stack(level=0).reset_index()
    df_cum = df_cum.rename(columns={df_cum.columns[1]: 'slug', df_cum.columns[2]: 'compound'})
    df_cum['realRankCompound'] = df_cum.groupby('date')['compound'].rank(method="first", ascending=False)

    # Obtain real top

    df_compound = pd.merge(df, df_cum, on=['slug', 'date'], how='inner')
    df_compound['realTop'] = df_compound.apply(lambda x: label_top(x['realRankCompound']), axis=1)

    # Shift compound, RealRank, realTop 1 week back:
    # if week w has a real compound Interest of c(w)
    # we want to predict c(w) = f(w-1), this is, with the data available a week earlier

    df_compound_delay = df_compound[['slug', 'date', 'compound', 'realRankCompound', 'realTop']].copy()
    df_compound_delay = df_compound_delay.rename(
        columns={'compound': 'compoundW1', 'realRankCompound': 'realRankCompoundW1', 'realTop': 'realTopW1'})
    df_compound_delay['date'] = df_compound_delay['date'] - pd.to_timedelta(1, unit='w')
    df_compound = pd.merge(df_compound, df_compound_delay, on=['slug', 'date'], how='left')
    df_compound.fillna(0, inplace=True)

    return df_compound


def get_regression(y, X, forecaster_y, forecaster_X, fh):
    pipe = ForecastX(forecaster_y=forecaster_y, forecaster_X=forecaster_X,)
    pipe = pipe.fit(y, X=X, fh=fh)
    
    return pipe.predict(fh=fh) 

    
def eval_regression(df):
    all_rmse = []
    all_nrmse = []
    all_mae = []
    all_nmae = []

    all_weeks = df['week'].unique()

    for week in all_weeks:

        try:
            df_week = df[df['week'] == week]
            nf_week = df_week['compoundW1'].max() - df_week['compoundW1'].min()
            rmse_week = mean_squared_error(df_week['compoundW1'], df_week['compoundW1pred'])
            mae_week = mean_absolute_error(df_week['compoundW1'], df_week['compoundW1pred'])
            all_rmse.append(rmse_week)
            all_nrmse.append(rmse_week / nf_week)
            all_mae.append(mae_week)
            all_nmae.append(mae_week / nf_week)

        except:
            print(f"Cannot eval week {week}", flush=True)
            traceback.print_exc()

    return [np.mean(all_rmse), np.mean(all_nrmse), np.mean(all_mae), np.mean(all_nmae)]


def prep_classification(df):
    df_eval = df.copy()
    df_eval['rankCompoundW1pred'] = df_eval.groupby('week')['compoundW1pred'].rank(method="first",
                                                                                             ascending=False)
    df_eval = pd.merge(df, df_eval, how='left', left_on=['week', 'slug'], right_on=['week', 'slug'],
                       suffixes=('', '_y'))
    df_eval = df_eval[[c for c in df_eval.columns if not c.endswith('_y')]]
    df_eval['topW1pred'] = df_eval.apply(lambda x: label_top(x['rankCompoundW1pred']), axis=1)

    return df_eval


def eval_classification(df):
    all_f1 = []
    all_ba = []

    all_weeks = df['week'].unique()

    for week in all_weeks:
        
        try:
            df_week = df[df['week'] == week]
            
            #display(df_week[df_week['realTopW1']==1][['slug', 'realTopW1','topW1pred']])
            #print(f"real:{df_week[df_week['realTopW1']==1]['slug'].nunique()}, pred:{df_week[df_week['topW1pred']==1]['slug'].nunique()}")
            
            f1_week = f1_score(df_week['realTopW1'], df_week['topW1pred'], pos_label=1, average='binary')
            ba_week = balanced_accuracy_score(df_week['realTopW1'], df_week['topW1pred'])
            all_f1.append(f1_week)
            all_ba.append(ba_week)

        except:
            print(f"Cannot eval week {week}", flush=True)
            traceback.print_exc()

    return [np.mean(all_f1), np.mean(all_ba)]


def get_tops(df, forecaster_y, forecaster_X):
    df_labeled = label_data(df)
    
    # Prepare for sktime
    df_labeled = df_labeled.set_index(['slug', 'week'])
    df_labeled.sort_index(inplace=True)
    
    # Train and eval
    
    numeric_feature_columns = list(df_labeled.columns.difference(
        ['slug', 'date', 'month', 'deltaPricePercent', 'compound', 'realRankCompound', 'realTop', 'compoundW1',
         'realRankCompoundW1', 'realTopW1']).values)
    test_size = EVAL_WEEKS
    y_train, y_test = temporal_train_test_split(df_labeled, test_size=EVAL_WEEKS)

    fh = list(range(1, EVAL_WEEKS + 1))
    #fh = ForecastingHorizon(y_test.index.get_level_values(1).unique(), is_relative=False)

    y_pred = get_regression(
        y_train[['compoundW1']], y_train[numeric_feature_columns], 
        forecaster_y=forecaster_y, forecaster_X=forecaster_X,
        fh=fh)
    
    df_prediction_regression = y_pred.join(y_test[['compoundW1', 'realTopW1']], lsuffix='pred')
    df_prediction_regression.reset_index(inplace=True)
    
    evaluation_regression = eval_regression(df_prediction_regression)
    
    df_prediction_classification = prep_classification(df_prediction_regression)
    weeks = list(df_prediction_classification.week.unique())
    eval_weeks = weeks[-test_size:]
    evaluation_classification = eval_classification(
            df_prediction_classification[df_prediction_classification.week.isin(eval_weeks)])
    
    # Predict tops for next week
    
    fh = [1]
    
    y_pred = get_regression(
        df_labeled[['compoundW1']], df_labeled[numeric_feature_columns], 
        forecaster_y=forecaster_y, forecaster_X=forecaster_X,
        fh=fh)
        
    df_prediction_regression = y_pred.reset_index()
    df_prediction_regression.rename(columns={"compoundW1": "compoundW1pred"}, inplace=True)
    
    df_prediction_classification = prep_classification(df_prediction_regression)      
    df_predicted_tops_next_week = df_prediction_classification[
        ['slug', 'rankCompoundW1pred']].sort_values("rankCompoundW1pred", ascending=True)
    df_predicted_tops_next_week.rename(columns={"rankCompoundW1pred": "rank"}, inplace=True)    
    
    return [df_predicted_tops_next_week.head(MAX_TOPS), evaluation_classification + evaluation_regression]


def execute(df_analysis, last_n_weeks=0, forecaster_y=NaiveForecaster(), forecaster_X=NaiveForecaster()):
    # Subset by weeks

    if last_n_weeks > 0:
        all_weeks = list(df_analysis['date'].unique())

        df_analysis = df_analysis[~(df_analysis['date'].isin(all_weeks[-last_n_weeks:]))]

    prediction_week = df_analysis['date'].iloc[-1] + pd.to_timedelta(1, unit='w')

    print(f"Obtaining prediction for week {prediction_week}...", flush=True)

    # Filter by type of project

    df_analysis_bbpro = filter(
        df_analysis, min_m_cap=MIN_COIN_GECKO_MARKET_MEAN_BBPRO, min_m_vol=MIN_COIN_GECKO_VOLUME_MEAN_BBPRO)

    df_analysis_bbgems = filter(
        df_analysis, max_m_cap=MAX_COIN_GECKO_MARKET_MEAN_BBGEMS, min_m_vol=MIN_COIN_GECKO_VOLUME_MEAN_BBGEMS)

    print(f"\tProjects in BBPro: {df_analysis_bbpro['slug'].nunique()}", flush=True)
    print(f"\tProjects in BBGems: {df_analysis_bbgems['slug'].nunique()}", flush=True)

    # Obtain tops and evaluation

    [tops_bbpro, eval_bbpro] = get_tops(df_analysis_bbpro, forecaster_y, forecaster_X)
    [tops_bbgems, eval_bbgems] = get_tops(df_analysis_bbgems, forecaster_y, forecaster_X)

    print(f"\tBBPro evaluation (F1, BA, RMSE, NRMSE, MAE, NMAE): {eval_bbpro}", flush=True)
    print(f"\tBBGems evaluation ((F1, BA, RMSE, NRMSE, MAE, NMAE): {eval_bbgems}", flush=True)

    # Store

    prediction_week_str = prediction_week.strftime("%Y_%m_%d")

    return [tops_bbpro, tops_bbgems]

## Obtain tops with different forecasters

In [42]:
data = pd.read_csv('aire.csv')

# Prepare data

df_analysis = pre_process(data)

last_week = df_analysis['date'].iloc[-1]
prediction_week = last_week + pd.to_timedelta(1, unit='w')

print(f"Last week with data {last_week}...", flush=True)

Projects before pre-processing date
2020-01-13    180
Name: slug, dtype: int64
Projects After pre-processing date
2020-01-18    180
Name: slug, dtype: int64
Last week with data 2020-01-18 00:00:00...


#### ARIMA (endogenous) and VAR (exogenous)

In [43]:
[tops_bbpro, tops_bbgems] = execute(
    df_analysis, forecaster_X=VAR(trend='n'),forecaster_y=ARIMA(with_intercept=False))

Obtaining prediction for week 2020-01-25 00:00:00...
	Projects in BBPro: 21
	Projects in BBGems: 55




	BBPro evaluation (F1, BA, RMSE, NRMSE, MAE, NMAE): [0.0, 0.5238095238095238, 1.504379938696733, inf, 0.7206544501740527, inf]
	BBGems evaluation ((F1, BA, RMSE, NRMSE, MAE, NMAE): [0.0, 0.8181818181818182, 0.2241295225787308, inf, 0.28997594002687216, inf]


In [40]:
display(tops_bbpro)
display(tops_bbgems)

Unnamed: 0,slug,rank
9,dash,1.0
4,bitcoin-diamond,2.0
11,ethereum-classic,3.0
5,bitcoin-gold,4.0
3,bitcoin,5.0
13,monacoin,6.0
7,cardano,7.0
20,zencash,8.0
2,binancecoin,9.0
15,omisego,10.0


Unnamed: 0,slug,rank
52,unibright,1.0
43,reserve-rights-token,2.0
28,melon,3.0
22,force-protocol,4.0
6,chronobank,5.0
32,noia-network,6.0
38,pirate-chain,7.0
12,dero,8.0
16,eosforce,9.0
41,pumapay,10.0


#### ExtraTreesRegressor

In [None]:
regressor = ExtraTreesRegressor()

#Using make_reduction
regression_forecaster = make_reduction(regressor, window_length=10, strategy="recursive")

[tops_bbpro, tops_bbgems] = execute(
    df_analysis, forecaster_X=regression_forecaster, forecaster_y=regression_forecaster)

Obtaining prediction for week 2021-07-24 00:00:00...
	Projects in BBPro: 213
	Projects in BBGems: 363
	BBPro evaluation (F1, BA, RMSE, NRMSE, MAE, NMAE): [0.57, 0.8221770855015148, 1636.1188705144891, 17.624818491747497]
	BBGems evaluation ((F1, BA, RMSE, NRMSE, MAE, NMAE): [0.7200000000000001, 0.9046956039925395, 923.3297090214164, 7.221462426899317]


In [None]:
display(tops_bbpro)
display(tops_bbgems)

Unnamed: 0,slug,rank
101,kusama,1.0
142,pirate-chain,2.0
191,uquid-coin,3.0
94,kardiachain,4.0
173,swissborg,5.0
...,...,...
169,stratis,96.0
91,iotex,97.0
203,wazirx,98.0
5,aragon,99.0


Unnamed: 0,slug,rank
143,free-coin,1.0
350,wownero,2.0
195,lcx,3.0
94,degenerator,4.0
261,quadrant-protocol,5.0
...,...,...
300,suqa,96.0
90,datx,97.0
342,viacoin,98.0
98,digitalnote,99.0


#### NaiveForecaster

In [None]:
forecaster = NaiveForecaster(strategy='mean', window_length=5)

[tops_bbpro, tops_bbgems] = execute(df_analysis, forecaster_X=forecaster, forecaster_y=forecaster)

Obtaining prediction for week 2021-07-24 00:00:00...
	Projects in BBPro: 213
	Projects in BBGems: 363
	BBPro evaluation (F1, BA, RMSE, NRMSE, MAE, NMAE): [0.5900000000000001, 0.8326696963389532, 1005.6393215423366, inf, 15.734518994835984, inf]
	BBGems evaluation ((F1, BA, RMSE, NRMSE, MAE, NMAE): [0.7200000000000001, 0.9046956039925395, 579.6979267539541, inf, 6.413916787223494, inf]


In [None]:
display(tops_bbpro)
display(tops_bbgems)

Unnamed: 0,slug,rank
191,uquid-coin,1.0
101,kusama,2.0
181,theta-fuel,3.0
142,pirate-chain,4.0
80,helium,5.0
...,...,...
20,bitcoin-gold,96.0
27,cartesi,97.0
170,streamr-datacoin,98.0
52,digibyte,99.0


Unnamed: 0,slug,rank
195,lcx,1.0
350,wownero,2.0
94,degenerator,3.0
97,dfohub,4.0
205,matryx,5.0
...,...,...
43,bitwhite,96.0
250,poa-network,97.0
150,genaro-network,98.0
50,bomb,99.0


## Appendix. Sktime

In [None]:
#LIST ALL

from sktime.registry import all_estimators

all_estimators(
    "forecaster", as_dataframe=True, filter_tags={"ignores-exogeneous-X": [False], }
)

#Hierarchical
all_estimators(
    "forecaster", as_dataframe=True, return_tags=['y_inner_mtype','X_inner_mtype']
)

Unnamed: 0,name,estimator,y_inner_mtype,X_inner_mtype
0,ARDL,<class 'sktime.forecasting.ardl.ARDL'>,pd.Series,pd.DataFrame
1,ARIMA,<class 'sktime.forecasting.arima.ARIMA'>,pd.Series,pd.DataFrame
2,AutoARIMA,<class 'sktime.forecasting.arima.AutoARIMA'>,pd.Series,pd.DataFrame
3,AutoETS,<class 'sktime.forecasting.ets.AutoETS'>,pd.Series,pd.DataFrame
4,AutoEnsembleForecaster,<class 'sktime.forecasting.compose._ensemble.A...,pd.Series,pd.DataFrame
5,BATS,<class 'sktime.forecasting.bats.BATS'>,pd.Series,pd.DataFrame
6,BaggingForecaster,<class 'sktime.forecasting.compose._bagging.Ba...,pd.Series,pd.DataFrame
7,ColumnEnsembleForecaster,<class 'sktime.forecasting.compose._column_ens...,"[pd.DataFrame, pd-multiindex, pd_multiindex_hier]","[pd.DataFrame, pd-multiindex, pd_multiindex_hier]"
8,ConformalIntervals,<class 'sktime.forecasting.conformal.Conformal...,pd.Series,pd.DataFrame
9,Croston,<class 'sktime.forecasting.croston.Croston'>,pd.Series,pd.DataFrame
