In [18]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import matplotlib.pylab as plt
import ruptures as rpt

In [19]:
import pandas as pd
import os

def load_df(asset_name: str, base_dir: str = None):
    """Read the processed price table of one asset from its parquet file.

    The file is looked up at ``<base_dir>/<asset_name>_price_processed.parquet``.

    Parameters
    ----------
    asset_name : str
        Prefix of the parquet file name (for example ``"BTC"``).
    base_dir : str, optional
        Folder that holds the parquet files. When omitted, the
        ``crypto_qts_data_processed`` folder under the current working
        directory is used.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_parquet`` on the file.

    Raises
    ------
    FileNotFoundError
        If the expected parquet file does not exist.
    """
    folder = base_dir
    if folder is None:
        # Fall back to the data folder next to wherever the notebook is run from.
        folder = os.path.join(os.getcwd(), 'crypto_qts_data_processed')

    parquet_path = os.path.join(folder, f"{asset_name}_price_processed.parquet")

    # Guard clause: report the full path that was tried.
    if not os.path.exists(parquet_path):
        raise FileNotFoundError(f"File not found: {parquet_path}")

    return pd.read_parquet(parquet_path)





In [20]:
import numpy as np
def segmentation_rule(crypto: str,
                      jump = 5,
                      pen = 30, plot = False, dates= True):
    """Detect change points in an asset's closing price and return their dates.

    The price table is loaded with ``load_df``, down-sampled by keeping every
    24th row, and its ``price_close`` column is segmented with ruptures' PELT
    search using the ``"rbf"`` cost model and ``min_size=3``.

    Parameters
    ----------
    crypto : str
        Asset name forwarded to ``load_df``.
    jump : int, default 5
        ``jump`` argument of ``rpt.Pelt``.
    pen : float, default 30
        Penalty passed to ``predict``.
    plot : bool, default False
        When true, draw the down-sampled price in blue with a dashed red
        vertical line at every breakpoint.
    dates : bool, default True
        When true, return the boundary timestamps. When false, the flag itself
        is returned unchanged (kept for backward compatibility).

    Returns
    -------
    list
        When ``dates`` is true: the first index label of the full table,
        followed by the index label of down-sampled row ``b - 1`` for every
        breakpoint ``b``.
    """
    prices = load_df(crypto)

    # Keep every 24th row (presumably hourly rows -> one sample per day; confirm
    # against the data) and only the column that is segmented.
    daily = prices.iloc[::24][['price_close']]
    signal = daily.values

    breakpoints = rpt.Pelt(model="rbf", min_size=3, jump=jump).fit(signal).predict(pen=pen)

    if plot:
        fig, ax = plt.subplots()
        ax.plot(signal, color='b')
        for bkp in breakpoints:
            ax.axvline(x=bkp, color='r', linestyle='--')

    if not dates:
        return dates

    # ruptures reports each breakpoint as the exclusive end of a segment, the
    # last one being len(signal); subtracting 1 keeps every lookup inside the
    # table. Labels are read straight from the index so the result does not
    # depend on what the index is called.
    boundaries = [prices.index[0]]
    boundaries.extend(daily.index[bkp - 1] for bkp in breakpoints)
    return boundaries






In [21]:
def sparse_crypto(crypto = 'crypto',
                  jump = 5,
                  pen = 30,
                  plot = False,
                  dates = True):
    """Split an asset's price table into one DataFrame per detected segment.

    Segment boundaries come from ``segmentation_rule``; the table comes from
    ``load_df`` with its index moved into a column via ``reset_index()``.

    Parameters
    ----------
    crypto : str
        Asset name forwarded to ``segmentation_rule`` and ``load_df``.
    jump, pen, plot
        Forwarded unchanged to ``segmentation_rule``.
    dates : bool
        Accepted for signature compatibility only; ``segmentation_rule`` is
        always called with ``dates=True``.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Keys are ``"df_0"``, ``"df_1"``, ... in boundary order. Each frame
        holds the rows whose ``index`` column lies in ``[start, end)`` plus
        constant ``start_date`` / ``end_date`` columns. The final segment is
        closed on the right: it runs to the last timestamp in the table, and
        its ``end_date`` is that timestamp, so no trailing rows are lost.
    """
    boundaries = segmentation_rule(crypto = crypto,
                                   jump = jump,
                                   pen = pen, plot = plot, dates = True)

    # Load once: the table is identical for every segment.
    # Assumes the stored index is unnamed, so reset_index() yields an 'index' column.
    table = load_df(crypto).reset_index()
    stamps = table['index']

    n_segments = len(boundaries) - 1
    dfs = {}
    for j in range(n_segments):
        start, end = boundaries[j], boundaries[j + 1]

        if j == n_segments - 1:
            # The last boundary is the last down-sampled row; a half-open
            # interval would drop that row and everything recorded after it.
            end = stamps.max()
            in_segment = stamps >= start
        else:
            in_segment = (stamps >= start) & (stamps < end)

        # Copy before adding columns so we never write into a view of `table`.
        segment = table[in_segment].copy()
        segment['start_date'] = start
        segment['end_date'] = end

        dfs[f"df_{j}"] = segment

    return dfs


In [22]:
# Segment BTC; the result is a dictionary with one DataFrame per segment,
# stored under the keys "df_0", "df_1", ...
btc_dfs = sparse_crypto(crypto='BTC', jump=5, pen=30, plot=False, dates=True)

In [23]:
def summary_table(crypto = 'BTC',
                  jump = 5,
                  pen = 30,
                  plot = False,
                  dates = True):
    """Summarise every price segment of an asset in one table.

    Parameters
    ----------
    crypto, jump, pen, plot
        Forwarded unchanged to ``sparse_crypto``.
    dates : bool
        Accepted for signature compatibility only; ``sparse_crypto`` is always
        called with ``dates=True``.

    Returns
    -------
    pandas.DataFrame
        One row per segment, indexed by the segment key (``"df_0"``, ...),
        with columns ``mean`` and ``stdv`` (of ``price_close``, using pandas'
        default ``.mean()`` / ``.std()``), ``start_date`` and ``end_date``.
    """
    segments = sparse_crypto(crypto = crypto,
                             jump = jump,
                             pen = pen,
                             plot = plot,
                             dates = True)

    dfs_summary = {}
    for key, segment in segments.items():
        close = segment['price_close']
        dfs_summary[key] = {
            'mean': close.mean(),
            'stdv': close.std(),
            # start_date / end_date are constant within a segment, so the first
            # row is enough; reading the second row fails on one-row segments.
            'start_date': segment['start_date'].values[0],
            'end_date': segment['end_date'].values[0],
        }

    return pd.DataFrame.from_dict(dfs_summary, orient='index')





In [24]:
from datetime import timedelta
import numpy as np
def normalize_data(crypto = 'BTC', jump = 5, pen = 30):
    """Z-score the closing price inside every detected segment of an asset.

    Each segment returned by ``sparse_crypto`` is standardised with its own
    ``price_close`` mean and standard deviation (pandas' default ``.mean()``
    and ``.std()``), so the segmentation only has to be computed once.

    Parameters
    ----------
    crypto : str, default 'BTC'
        Asset name forwarded to ``sparse_crypto``.
    jump : int, default 5
        Forwarded to ``sparse_crypto``.
    pen : float, default 30
        Forwarded to ``sparse_crypto``.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Same keys as ``sparse_crypto``. Every frame gains two columns:
        ``normalized_price`` (``(price_close - mean) / std`` of the segment)
        and ``returns_normalized`` (``pct_change()`` of ``normalized_price``).
    """
    segments = sparse_crypto(crypto = crypto,
                             jump = jump,
                             pen = pen,
                             plot = False,
                             dates = True)

    normalized = {}
    for key, segment in segments.items():
        close = segment['price_close']
        zscore = (close - close.mean()) / close.std()

        # assign() builds a new frame, so the segment frames are left untouched.
        # NOTE(review): pct_change of a z-score divides by values that cross
        # zero, so this column can contain very large or infinite entries --
        # confirm this is the intended definition of "normalized returns".
        normalized[key] = segment.assign(
            normalized_price = zscore,
            returns_normalized = zscore.pct_change(),
        )

    return normalized


In [25]:
def concat_dicts(dates, crypto = 'BTC'):
    """Stack the normalized segments of an asset into a single DataFrame.

    Parameters
    ----------
    dates : list
        Not used. Kept so existing calls keep working. The segments returned
        by ``normalize_data`` already begin at the first row of the table, so
        seeding the result with ``token.loc[:dates[0]]`` (an inclusive label
        slice) only duplicated rows, with NaN in every segment-only column.
    crypto : str, default 'BTC'
        Asset name forwarded to ``normalize_data``.

    Returns
    -------
    pandas.DataFrame
        Frames ``df_0``, ``df_1``, ... concatenated in that order with a fresh
        ``0..n-1`` index.
    """
    segments = normalize_data(crypto = crypto)

    # Collect the frames in segment order and concatenate once, instead of
    # re-copying the growing result on every iteration.
    ordered = [segments[f'df_{i}'] for i in range(len(segments))]
    return pd.concat(ordered, ignore_index = True)


In [None]:
# Segment, normalize and export every coin to Data/data_segmented/<coin>_data.parquet.
coins = set(['arb', 'avax', 'btc', 'eth', 'matic', 'sol', 'tron'])
out_dir = 'Data/data_segmented'

# Iterate in sorted order: set iteration order for strings changes between
# interpreter runs, which makes logs and partial failures hard to compare.
for coin in sorted(coins):
    dates = segmentation_rule(coin.upper())

    df = concat_dicts(dates, crypto = coin.upper())

    # Create the target folder on demand so the first export cannot fail on a
    # fresh checkout; done after the computation so a bad input creates nothing.
    os.makedirs(out_dir, exist_ok=True)
    outpath = f'{out_dir}/{coin}_data.parquet'
    df.to_parquet(outpath, engine="pyarrow", index=True)