In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import matplotlib.pylab as plt
import ruptures as rpt

In [11]:
import pandas as pd
import os

def load_df(asset_name: str, base_dir: str = None):
    """Load the price history of one asset from its parquet file.

    Parameters
    ----------
    asset_name : str
        Ticker of the asset in any letter case (e.g. ``'BTC'``). It is
        lower-cased to build the file name ``<asset>_data.parquet``.
    base_dir : str, optional
        Directory that holds the parquet files. ``None`` means ``'data'``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``pd.read_parquet`` for the resolved path.
    """
    directory = 'data' if base_dir is None else base_dir
    # NOTE(review): the naming scheme is inferred from the single known file,
    # data/btc_data.parquet -- confirm other assets follow it.
    path = os.path.join(directory, f'{asset_name.lower()}_data.parquet')
    return pd.read_parquet(path)





In [12]:
import numpy as np
def segmentation_rule(crypto: str,
                      jump = 5,
                      pen = 30, plot = False, dates= True):
    """Detect change points in an asset's closing price.

    The frame from ``load_df`` is thinned to every 24th row and PELT with an
    RBF cost is fitted on the ``price_close`` column of that sample.

    Parameters
    ----------
    crypto : str
        Asset identifier forwarded to ``load_df``.
    jump : int
        Forwarded to ``rpt.Pelt``.
    pen : float
        Penalty forwarded to ``Pelt.predict``.
    plot : bool
        When true, draw the sampled prices with one dashed red vertical line
        per breakpoint.
    dates : bool
        When true (default) return index labels; otherwise return the raw
        breakpoint positions within the sampled series.

    Returns
    -------
    list
        With ``dates`` true: the first index label of the full frame followed
        by, for every breakpoint ``b``, the index label of sampled row
        ``b - 1``. With ``dates`` false: the breakpoint positions themselves.
    """
    df = load_df(crypto)
    # Rows are taken by position, not by clock time.
    # presumably the data is hourly, making this ~one sample per day -- TODO confirm
    daily = df.iloc[::24][['price_close']]

    detector = rpt.Pelt(model = "rbf", min_size=3, jump=jump).fit(daily.values)
    breakpoints = detector.predict(pen=pen)

    if plot:
        fig, ax = plt.subplots()
        ax.plot(daily.values, color = 'b')
        for bkp in breakpoints:
            ax.axvline(x = bkp, color = 'r', linestyle = '--')

    if not dates:
        return breakpoints

    # Each breakpoint is shifted back by one so it addresses an existing
    # sampled row; the label is read straight from the index, so the result
    # does not depend on how reset_index() would name the column.
    boundaries = [df.index[0]]
    boundaries.extend(daily.index[bkp - 1] for bkp in breakpoints)
    return boundaries






In [13]:
def sparse_crypto(crypto = 'crypto',
                      jump = 5,
                      pen = 30,
                    plot = False,
                    dates = True):
    """Split an asset's price history into one DataFrame per detected segment.

    Parameters
    ----------
    crypto : str
        Asset identifier forwarded to ``segmentation_rule`` and ``load_df``.
    jump, pen, plot
        Forwarded unchanged to ``segmentation_rule``.
    dates : bool
        Accepted for signature compatibility but not used: the segmentation
        is always requested with ``dates=True``.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Keys ``'df_0'``, ``'df_1'``, ... in boundary order. Each frame holds
        the rows whose ``'index'`` value lies in ``[start, end)`` plus two
        constant columns ``start_date`` and ``end_date``.
    """
    boundaries = segmentation_rule(crypto = crypto,
                                   jump = jump,
                                   pen = pen, plot = plot, dates = True)

    # Read the file once; the data does not change between segments.
    # reset_index() exposes the timestamps as a column named 'index'
    # (this assumes the stored index is unnamed).
    prices = load_df(crypto).reset_index()

    dfs = {}
    for j, (start, end) in enumerate(zip(boundaries[:-1], boundaries[1:])):
        in_segment = (prices['index'] >= start) & (prices['index'] < end)
        # .assign builds a new frame, so nothing is written onto a slice.
        dfs[f"df_{j}"] = prices[in_segment].assign(start_date = start,
                                                   end_date = end)

    # NOTE(review): intervals are half-open, so rows at or after the final
    # boundary are not placed in any segment.
    return dfs


{'df_0':                           index                        time_open  \
 0     2015-01-14 18:00:00+00:00        2015-01-14 16:07:05+00:00   
 1     2015-01-14 19:00:00+00:00        2015-01-14 18:50:59+00:00   
 2     2015-01-14 21:00:00+00:00        2015-01-14 19:05:36+00:00   
 3     2015-01-14 22:00:00+00:00        2015-01-14 19:05:36+00:00   
 4     2015-01-14 23:00:00+00:00        2015-01-14 19:05:36+00:00   
 ...                         ...                              ...   
 23611 2017-10-10 08:00:00+00:00 2017-10-10 07:00:00.694000+00:00   
 23612 2017-10-10 09:00:00+00:00 2017-10-10 08:00:00.731000+00:00   
 23613 2017-10-10 10:00:00+00:00 2017-10-10 09:00:00.907000+00:00   
 23614 2017-10-10 11:00:00+00:00 2017-10-10 10:00:01.087000+00:00   
 23615 2017-10-10 12:00:00+00:00 2017-10-10 11:00:01.672000+00:00   
 
                             time_close  price_open  price_high  price_low  \
 0            2015-01-14 16:55:16+00:00      173.93      197.97     110.20   
 1    

In [14]:
# Per-segment BTC frames, held in a dict keyed 'df_0', 'df_1', ...
btc_dfs = sparse_crypto(crypto='BTC', jump=5, pen=30, plot=False, dates=True)

In [15]:
def summary_table(crypto = 'BTC',
                      jump = 5,
                      pen = 30,
                    plot = False,
                    dates = True):
    """Summarise every price segment in one table.

    Parameters
    ----------
    crypto, jump, pen, plot
        Forwarded unchanged to ``sparse_crypto``.
    dates : bool
        Accepted for signature compatibility but not used; ``sparse_crypto``
        is always called with ``dates=True``.

    Returns
    -------
    pandas.DataFrame
        One row per segment, indexed by the segment key (``'df_0'``, ...),
        with columns ``mean`` and ``stdv`` of ``price_close`` and the
        segment's ``start_date`` and ``end_date``.
    """
    dfs = sparse_crypto(crypto = crypto,
                        jump = jump,
                        pen = pen,
                        plot = plot,
                        dates = True)

    dfs_summary = {}
    for k, v in dfs.items():
        dfs_summary[k] = {
            'mean': v['price_close'].mean(),
            'stdv': v['price_close'].std(),
            # The column is constant within a segment, so the first element is
            # enough and also exists for a one-row segment. `.values` is kept
            # so the scalar has the same form normalize_data compares against.
            'start_date': v['start_date'].values[0],
            'end_date': v['end_date'].values[0],
        }

    return pd.DataFrame.from_dict(dfs_summary, orient='index')





In [16]:
from datetime import timedelta
import numpy as np
def normalize_data(crypto = 'BTC', jump = 5, pen = 30):
    """Z-score the closing price inside every detected segment.

    Parameters
    ----------
    crypto : str
        Asset identifier forwarded to ``sparse_crypto``.
    jump : int
        Forwarded to ``sparse_crypto`` (default 5).
    pen : float
        Forwarded to ``sparse_crypto`` (default 30).

    Returns
    -------
    dict[str, pandas.DataFrame]
        The segment frames from ``sparse_crypto``, each extended with
        ``normalized_price`` (``price_close`` minus the segment mean, divided
        by the segment standard deviation) and ``returns_normalized`` (the
        ``pct_change`` of ``normalized_price``).
    """
    dfs = sparse_crypto(crypto = crypto,
                        jump = jump,
                        pen = pen,
                        plot = False,
                        dates = True)

    normalized = {}
    for k, v in dfs.items():
        # Same statistics summary_table reports for this segment, computed
        # here directly so the segmentation is run only once.
        mean = v['price_close'].mean()
        std = v['price_close'].std()

        normalized_price = (v['price_close'] - mean) / std
        # NOTE(review): a z-score crosses zero, so its pct_change can be
        # extremely large or infinite -- confirm this column is what is wanted.
        normalized[k] = v.assign(
            normalized_price = normalized_price,
            returns_normalized = normalized_price.pct_change(),
        )

    return normalized


In [17]:
def concat_dicts(dates, crypto = 'BTC'):
    """Stitch the normalised segments back into a single DataFrame.

    Parameters
    ----------
    dates : sequence
        Segment boundaries; only ``dates[0]`` is read, to find rows that
        precede the first segment.
    crypto : str
        Asset identifier forwarded to ``normalize_data`` and ``load_df``.

    Returns
    -------
    pandas.DataFrame
        Rows strictly before ``dates[0]`` (if any), followed by the frames
        ``df_0``, ``df_1``, ... from ``normalize_data``, with a fresh
        0..n-1 index.
    """
    dfs = normalize_data(crypto = crypto)

    token = load_df(crypto)

    # The first segment already starts at dates[0], so only rows strictly
    # before it belong here. reset_index() keeps their timestamps as a column,
    # matching the layout of the segment frames.
    prefix = token.loc[token.index < dates[0]].reset_index()

    frames = [] if prefix.empty else [prefix]
    frames.extend(dfs[f'df_{i}'] for i in range(len(dfs)))

    if not frames:
        # Nothing before the first boundary and no segments at all.
        return prefix

    # One concat over the whole list instead of growing the frame per step.
    return pd.concat(frames, ignore_index = True)


In [18]:
# Build the segmented, normalised table for every coin and write it to disk.
coins = {'btc'}
for coin in coins:
    ticker = coin.upper()
    dates = segmentation_rule(ticker)

    df = concat_dicts(dates, crypto = ticker)

    # NOTE(review): the file is written in parquet format although its name
    # ends in '.csv' -- confirm what downstream readers expect before renaming.
    outpath = os.path.join('data', f'{coin}.csv')
    df.to_parquet(outpath, engine="pyarrow", index=True)

In [24]:
# Display the segment boundaries currently held in `dates`.
dates

[Timestamp('2015-01-14 18:00:00+0000', tz='UTC'),
 Timestamp('2017-10-10 13:00:00+0000', tz='UTC'),
 Timestamp('2020-12-26 10:00:00+0000', tz='UTC'),
 Timestamp('2022-05-10 13:00:00+0000', tz='UTC'),
 Timestamp('2023-10-26 20:00:00+0000', tz='UTC'),
 Timestamp('2024-02-24 03:00:00+0000', tz='UTC'),
 Timestamp('2024-11-11 20:00:00+0000', tz='UTC'),
 Timestamp('2025-02-25 04:00:00+0000', tz='UTC')]

In [31]:
# Distinct calendar dates (time of day dropped) found in the two boundary columns.
start_dates, end_dates = (
    df[column].dt.date.unique() for column in ('start_date', 'end_date')
)

In [59]:
# Wrap both collections as Series so the pandas methods used next are available.
start_dates, end_dates = pd.Series(start_dates), pd.Series(end_dates)

In [60]:
# Discard missing entries from both Series.
start_dates = start_dates.dropna()
end_dates = end_dates.dropna()

In [61]:
# Give both Series the same name, 'dates'.
start_dates = start_dates.rename('dates')
end_dates = end_dates.rename('dates')

In [62]:
# Order both Series chronologically (sort_values sorts ascending by default).
start_dates, end_dates = start_dates.sort_values(), end_dates.sort_values()

In [57]:
# Display the segment start dates.
start_dates

1    2015-01-14
2    2017-10-10
3    2020-12-26
4    2022-05-10
5    2023-10-26
6    2024-02-24
7    2024-11-11
Name: dates, dtype: object

In [40]:
# Add a calendar-date column derived from each row's time_close.
df = df.assign(date = df['time_close'].dt.date)

In [43]:
# Last price_close value recorded for each calendar date.
closes_by_date = df.groupby('date')['price_close']
prices = closes_by_date.last()

In [65]:
# Price on each segment's start date (label lookup by date).
x = prices.loc[start_dates]

In [66]:
# Price on each segment's end date (label lookup by date).
y = prices.loc[end_dates]

In [81]:
# Percentage price change from each start-date price to the matching end-date price.
price_delta = y.values - x.values
pct_change = price_delta / x.values * 100

In [82]:
# Display the per-segment percentage changes.
pct_change

array([3858.33333333,  457.37578947,   17.09907518,   10.16799027,
         50.9913274 ,   72.13420812,    4.04730253])