In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from functools import reduce
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, TimeSeriesResampler
import plotly.graph_objects as go
import math

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [2]:
# Load every asset series from the first sheet of PCA1.xlsx.
# Column 0 holds the dates; each remaining column is one asset.
# Every series is reversed with np.flip (presumably the sheet is stored
# newest-first and this makes it chronological -- TODO confirm against the file).

asset_pchange_list = []
asset_price_list = []
asset_name_list = []

df = pd.read_excel('PCA1.xlsx', sheet_name=0)
date_list = np.flip(df.iloc[:, 0].to_numpy().astype('datetime64[D]'))
num_of_asset = len(df.columns)
for i in range(1, num_of_asset):
    asset_array = df.iloc[:, i].to_numpy()
    # NaNs are dropped per asset before flipping.
    # NOTE(review): a series with gaps becomes shorter than date_list and is
    # no longer aligned with it position-by-position -- confirm the sheet has none.
    asset_array_no_na = asset_array[~np.isnan(asset_array)]
    asset_price_list.append(np.flip(asset_array_no_na))
    asset_name_list.append(df.columns[i])

# A change at position k describes the move from price k to price k + 1,
# so the change series is dated from the second price onwards.
p_date_list = date_list[1:]
for i, asset_price in enumerate(asset_price_list):
    print(asset_name_list[i], i)
    # The first 12 assets (or the first 19 when more than 20 assets are loaded)
    # are converted to relative changes; the remaining ones keep plain
    # differences. In the recorded run the latter are the USGG*/USYC* series.
    uses_relative_change = i < 12 or (len(asset_price_list) > 20 and i < 19)
    if uses_relative_change:
        try:
            diff_list = np.diff(asset_price) / asset_price[:-1]
        except Exception as exc:
            # Previously a bare `except` only printed the index, after which the
            # previous asset's diff_list was appended under this asset's slot.
            raise ValueError(
                f"could not compute relative changes for asset {i} "
                f"({asset_name_list[i]})"
            ) from exc
    else:
        diff_list = np.diff(asset_price)
    asset_pchange_list.append(diff_list)

# training_asset_pchange_list = np.array(asset_pchange_list)[:, :960]
# testing_asset_pchange_list = np.array(asset_pchange_list)[:, 960:]

# training_p_date_list = np.array(p_date_list)[:960]
# testing_p_date_list = np.array(p_date_list)[960:]

# training_asset_price_list = np.array(asset_price_list)[:,:960]
# testing_asset_price_list = np.array(asset_price_list)[:,960:]

# training_date_list = np.array(date_list)[:960]
# testing_date_list = np.array(date_list)[960:]



SPX index 0
MXEF Index 1
USCRWTIC Index 2
XAU Curncy 3
LMCADS03 Comdty 4
WEATTKHR index 5
USDEUR Curncy 6
USDAUD Curncy 7
USDKRW Curncy 8
USDINR Curncy 9
MOVE Index 10
VIX Index 11
USGG2YR Index 12
USGG10YR Index 13
USGG5YR INDEX 14
USYC2Y10 Index 15
USYC5Y30 Index 16


In [61]:
class All_TSMD():
    """Multi-asset analogue search over fixed-length windows.

    The target window is compared with every earlier window of the same length,
    the per-asset distances are summed, and the closest windows are used to
    project the target forward from what followed them.

    Typical use: ``fit()`` -> ``transform(n)`` -> ``plot()`` / ``score()``.

    Parameters
    ----------
    price_lists : sequence of 1-D arrays
        One price series per asset.
    price_date_list : sequence of dates
        Dates aligned with the price series.
    data_lists : sequence of equal-length 1-D arrays
        One change series per asset; this is the data that is searched.
    data_date_list : sequence of dates
        Stored, but not read by any method of this class.
    asset_name_list : sequence of str
        Used as plot titles.
    target_index : int
        Start position of the target window inside ``data_lists``.
    windows : int
        Length of the target window and of every candidate window.
    n_closest : int
        Number of most similar candidate windows to keep.
    is_p_change : bool
        True: windows are compared as-is with a distance whose weights grow
        linearly towards the end of the window. False: each window is
        standardised with ``TimeSeriesScalerMeanVariance`` and compared with
        the plain Euclidean distance.
    """

    def __init__(self, price_lists, price_date_list, data_lists, data_date_list, asset_name_list, target_index, windows=10, n_closest=5, is_p_change = True):
        self.__price_lists = price_lists
        self.__price_date_list = price_date_list
        self.__data_lists = data_lists
        self.__data_date_list = data_date_list
        self.__target_index = target_index
        self.__windows = windows
        self.__n_closest = n_closest
        self.__is_p_change = is_p_change
        self.__asset_name_list = asset_name_list

        # Requires all change series to have the same length (2-D slicing).
        data_matrix = np.array(data_lists)
        self.__before_target_data_lists = data_matrix[:, :target_index]
        self.__target_data_lists = data_matrix[:, target_index:target_index+windows]

    def _uses_relative_change(self, asset_position):
        """Return True when the asset's changes are compounded, False when added.

        Same rule as the one used to build the change series: the first 12
        assets (or the first 19 when more than 20 assets are supplied) hold
        relative changes. The count comes from the price lists given to this
        instance rather than from a notebook-level global.
        """
        n_assets = len(self.__price_lists)
        return asset_position < 12 or (n_assets > 20 and asset_position < 19)

    def fit(self):
        """Score every candidate window and keep the ``n_closest`` best ones.

        Sets ``cumulative_asset_e_dist_list`` (summed distance per candidate
        start), ``n_smallest_index`` (start positions of the selected windows,
        ascending) and ``n_smallest_e_dist_list`` (their distances, same order).
        Exactly ``min(n_closest, number of candidates)`` windows are selected;
        ties are resolved in favour of the earlier window. Prints the selected
        start positions.

        Raises
        ------
        ValueError
            If there is no candidate window before the target.
        """
        windows = self.__windows
        is_p_change = self.__is_p_change
        n_closest = self.__n_closest

        # Weights 1..windows: later points in the window count more.
        weights = np.arange(windows) + 1

        all_asset_e_dist_list = []
        for asset_data, target_data in zip(self.__before_target_data_lists, self.__target_data_lists):
            if is_p_change:
                reference = np.array(target_data)
            else:
                # The standardised target does not depend on the candidate.
                reference = TimeSeriesScalerMeanVariance().fit_transform([target_data])[0]

            e_dist_list = []
            for j in range(len(asset_data) - windows + 1):
                if is_p_change:
                    squared = np.square(np.array(asset_data[j:j+windows]) - reference)
                    e_dist_list.append(np.sqrt(np.sum(squared * weights)))
                else:
                    compare_data = TimeSeriesScalerMeanVariance().fit_transform([asset_data[j:j+windows]])[0]
                    e_dist_list.append(np.linalg.norm(np.array(compare_data) - reference))
            all_asset_e_dist_list.append(e_dist_list)

        cumulative_asset_e_dist_list = np.asarray(reduce(np.add, all_asset_e_dist_list))
        if cumulative_asset_e_dist_list.size == 0:
            raise ValueError("no candidate window fits before target_index; lower windows or raise target_index")

        # A stable sort picks exactly n_closest positions even when distances
        # tie, and tolerates n_closest >= number of candidates.
        n_smallest_index = np.sort(np.argsort(cumulative_asset_e_dist_list, kind="stable")[:n_closest])
        n_smallest_e_dist_list = cumulative_asset_e_dist_list[n_smallest_index]
        print(n_smallest_index)
        self.cumulative_asset_e_dist_list = cumulative_asset_e_dist_list
        self.n_smallest_e_dist_list = n_smallest_e_dist_list
        self.n_smallest_index = n_smallest_index

    def plot(self, show_mean = True, show_all = True):
        """Show one plotly figure per asset. Requires ``fit()`` and ``transform()``.

        Each figure contains the full price series, the selected similar
        windows (red), the target window (yellow) and, optionally, every
        individual projection (gray) and their mean (green).
        """
        asset_price_lists = self.__price_lists
        asset_price_date_list = self.__price_date_list
        target_index = self.__target_index
        windows = self.__windows
        n_smallest_index = self.n_smallest_index
        asset_name_list = self.__asset_name_list
        all_prediction_list = self.__all_prediction_list
        new_date_list = self.__new_date_list

        # A window of `windows` changes spans `windows + 1` prices.
        extra_windows = 1 if self.__is_p_change else 0
        span = windows + extra_windows

        for i in range(len(asset_price_lists)):
            asset_price_list = asset_price_lists[i]
            fig = go.Figure()
            fig.add_trace(go.Scatter(x=asset_price_date_list[:len(asset_price_list)], y=asset_price_list, name='index',  mode='lines'))
            for j in range(len(n_smallest_index)):
                index = n_smallest_index[j]
                fig.add_trace(go.Scatter(x=asset_price_date_list[index:index+span], y=asset_price_list[index:index+span], name= f"similar {j}", line = dict(color='red'),  mode='lines'))
                if show_all:
                    fig.add_trace(go.Scatter(x=new_date_list, y=all_prediction_list[i][j], name= f"predict {j}", line = dict(color='gray'),  mode='lines'))

            mean_list = self.__all_predict_mean_list[i]
            if show_mean:
                fig.add_trace(go.Scatter(x=new_date_list, y=mean_list, name= f"mean predict", line = dict(color='green'),  mode='lines'))
            fig.add_trace(go.Scatter(x=asset_price_date_list[target_index:target_index+span], y=asset_price_list[target_index:target_index+span], name= f"search target", line = dict(color='yellow'),  mode='lines'))

            fig.update_layout(title=asset_name_list[i],
                   xaxis_title='Date',
                   yaxis_title='Value')
            fig.show()

    def transform(self, n_data_points):
        """Project every asset ``n_data_points`` steps from the end of the target.

        For each selected window the changes that followed it are replayed on
        top of the price at ``target_index + windows`` (compounded or added,
        see ``_uses_relative_change``). Stores, per asset, every individual
        path, the mean path and the mean of the replayed changes; the projected
        dates are spaced 7 days apart. Prints the mean paths. Requires ``fit()``.

        Raises
        ------
        ValueError
            If ``fit()`` selected no window (e.g. ``n_closest`` is 0).
        """
        self.__n_data_points = n_data_points
        n_smallest_index = self.n_smallest_index
        data_lists = self.__data_lists
        price_lists = self.__price_lists
        asset_price_date_list = self.__price_date_list
        windows = self.__windows

        if len(n_smallest_index) == 0:
            raise ValueError("fit() selected no similar window; nothing to project from")

        anchor = self.__target_index + windows

        # The projected dates are identical for every asset and every match,
        # so they are built once.
        latest_date = asset_price_date_list[anchor]
        new_date_list = [latest_date]
        for _ in range(n_data_points - 1):
            latest_date = latest_date + np.timedelta64(7, 'D')
            new_date_list.append(latest_date)

        all_prediction_list = []
        all_predict_mean_list = []
        all_predict_percentage_change_list = []

        for i in range(len(data_lists)):
            data_list = data_lists[i]
            price_list = price_lists[i]
            relative = self._uses_relative_change(i)

            asset_prediction_list = []
            predict_percentage_change_list = []
            for index in n_smallest_index:
                latest_price = price_list[anchor]
                new_data_list = [latest_price]
                for j in range(n_data_points - 1):
                    change = data_list[index+windows+j]
                    if relative:
                        latest_price = latest_price * (1 + change)
                    else:
                        latest_price = latest_price + change
                    new_data_list.append(latest_price)
                asset_prediction_list.append(new_data_list)
                predict_percentage_change_list.append(data_list[index+windows:index+windows+n_data_points])

            all_predict_mean_list.append(np.mean(np.array(asset_prediction_list), axis=0))
            all_prediction_list.append(asset_prediction_list)
            all_predict_percentage_change_list.append(np.mean(np.array(predict_percentage_change_list), axis=0))

        self.__all_prediction_list = all_prediction_list
        self.__new_date_list = np.array(new_date_list).astype('datetime64[D]')
        self.__all_predict_mean_list = all_predict_mean_list
        self.__all_predict_percentage_change_list = np.array(all_predict_percentage_change_list)
        print(all_predict_mean_list)

    def plot_e_dist(self):
        """Plot the summed distance of every candidate window against its start date.

        Requires ``fit()``. Lower values mean more similar windows.
        """
        cumulative_asset_e_dist_list = self.cumulative_asset_e_dist_list
        price_date_list = self.__price_date_list

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=price_date_list[:len(cumulative_asset_e_dist_list)], y=cumulative_asset_e_dist_list, name='similarity'))
        fig.update_layout(title='Similarity',
                   xaxis_title='Date',
                   yaxis_title='1/similarity')
        fig.show()

    def score(self):
        """Compare the mean replayed changes with the changes that actually followed.

        Sums, over assets, the Euclidean distance between the mean replayed
        changes and the realised changes right after the target window, then
        divides by ``n_data_points``. The result is stored in ``total_score``
        (lower is better). Requires ``transform()``.
        """
        all_predict_percentage_change_list = self.__all_predict_percentage_change_list
        data_lists = self.__data_lists
        start = self.__target_index + self.__windows
        total_score = 0
        for i in range(len(data_lists)):
            mean = all_predict_percentage_change_list[i]
            realised = data_lists[i][start:start+len(mean)]
            total_score += np.linalg.norm(np.array(realised) - np.array(mean))
        self.total_score = total_score/self.__n_data_points
    

In [63]:
# One look-up with All_TSMD: target window of 12 changes starting at position
# 1180, keep the 5 closest historical windows, project 16 points, plot all assets.
# (Disabled sweep over several target positions, kept for reference:)
# indexs = [i for i in range(1190,1212)]
# for index in indexs:
all_tsmd = All_TSMD(
    price_lists=asset_price_list,
    price_date_list=date_list,
    data_lists=asset_pchange_list,
    data_date_list=p_date_list,
    asset_name_list=asset_name_list,
    target_index=1180,
    windows=12,
    n_closest=5,
    is_p_change=True,
)
all_tsmd.fit()
all_tsmd.transform(n_data_points=16)
#all_tsmd.score()
all_tsmd.plot(show_mean=True, show_all=True)

[247 549 649 669 782]
[array([3992.93      , 4024.87179359, 4025.62269284, 4006.72317704,
       4048.09861661, 4011.92125715, 4056.28179323, 4046.52526876,
       4068.79922707, 4066.23130208, 4050.55165498, 4091.57380146,
       4077.49139301, 4087.05772687, 4107.9785918 , 4126.04902787]), array([935.73      , 959.33842231, 952.73882701, 957.88092035,
       967.90806503, 960.31063859, 965.49027406, 956.30584839,
       964.61027513, 962.97256481, 951.23602909, 959.62566658,
       956.11051689, 956.33765342, 965.05981689, 956.31877654]), array([88.96      , 89.44233749, 89.57170556, 92.26317411, 92.33775701,
       92.7351682 , 91.71722458, 92.16966595, 93.30215643, 95.44491207,
       95.48984876, 96.71080342, 97.1143123 , 94.58726789, 97.4107201 ,
       93.89031933]), array([1771.24      , 1784.95784981, 1759.65045595, 1757.68108608,
       1758.04419875, 1744.63423639, 1727.75702558, 1746.80644248,
       1756.4923766 , 1759.2579588 , 1743.14955664, 1743.95376147,
       1730.02

In [56]:

# Grid search over the All_TSMD hyper-parameters.
# For every combination the model is re-fitted at each target position in
# [first_target_index, last usable position) and the scores are averaged.
# NOTE: fit() and transform() print on every call, so this cell produces a lot
# of output and is slow (the recorded run was interrupted).

n_similar_list = [3, 4, 5, 6, 7]
n_data_points_list = [12, 14, 16, 18]
windows_list = [12, 14, 16, 18]

first_target_index = 500
# Upper bound used for the target positions.
# NOTE(review): presumably the length of the change series -- confirm, or
# derive it from asset_pchange_list.
series_end_index = 1230

result = []
for n_similar in n_similar_list:
    for n_data_point in n_data_points_list:
        for windows in windows_list:
            # Leave room after the target for the window itself and for the
            # points that score() compares against.
            target_indices = range(first_target_index, series_end_index - n_data_point - windows - 1)
            total_score = 0
            for i in target_indices:
                all_tsmd = All_TSMD(asset_price_list, date_list, asset_pchange_list, p_date_list, asset_name_list, i, windows, n_similar, True)
                all_tsmd.fit()
                all_tsmd.transform(n_data_point)
                all_tsmd.score()
                total_score += all_tsmd.total_score
            print(total_score)
            # Key was misspelled "n_smaimilar"; the divisor is now taken from
            # the range itself so the two cannot drift apart.
            result.append({"n_similar":n_similar, "n_data_point":n_data_point, "windows":windows, "total_score":total_score/len(target_indices)})

KeyboardInterrupt: 

In [59]:
class New_TSMD():
    """Multi-asset analogue search that also tries candidate windows of other lengths.

    Candidate windows of each requested length are resampled to the length of
    the target window before being compared, so slower or faster versions of
    the same pattern can match. The changes that followed the best candidates
    are then resampled and replayed to project the target forward.

    Typical use: ``fit(...)`` -> ``transform(n)`` -> ``plot()``.

    Parameters
    ----------
    asset_price_lists : sequence of 1-D arrays
        One price series per asset.
    asset_price_date_list : sequence of dates
        Dates aligned with the price series.
    asset_percentage_change_lists : sequence of equal-length 1-D arrays
        One change series per asset; this is the data that is searched.
    asset_percentage_change_date_list : sequence of dates
        Stored, but not read by any method of this class.
    asset_name_list : sequence of str
        Used as plot titles.
    """

    def __init__(self, asset_price_lists, asset_price_date_list, asset_percentage_change_lists, asset_percentage_change_date_list, asset_name_list):
        self.__asset_price_lists = asset_price_lists
        self.__asset_price_date_list = asset_price_date_list
        self.__asset_percentage_change_lists = asset_percentage_change_lists
        self.__asset_percentage_change_date_list = asset_percentage_change_date_list
        self.__asset_name_list = asset_name_list

    def _uses_relative_change(self, asset_position):
        """Return True when the asset's changes are compounded, False when added.

        Same rule as the one used to build the change series: the first 12
        assets (or the first 19 when more than 20 assets are supplied) hold
        relative changes. The count comes from the price lists given to this
        instance rather than from a notebook-level global.
        """
        n_assets = len(self.__asset_price_lists)
        return asset_position < 12 or (n_assets > 20 and asset_position < 19)

    def fit(self, target_index, target_windows, n_similar, search_windows_range):
        """Find the ``n_similar`` candidate windows closest to the target window.

        Parameters
        ----------
        target_index : int
            Start position of the target window in the change series.
        target_windows : int
            Length of the target window.
        n_similar : int
            Number of candidates to keep. Exactly
            ``min(n_similar, number of candidates)`` are kept; ties go to the
            candidate that was evaluated first.
        search_windows_range : iterable of int
            Candidate window lengths to try.

        Raises
        ------
        ValueError
            If no candidate window fits before ``target_index``.
        """
        self.__target_index = target_index
        self.__target_windows = target_windows

        # Requires all change series to have the same length (2-D slicing).
        data_matrix = np.array(self.__asset_percentage_change_lists)
        before_target_data_lists = data_matrix[:, :target_index]
        target_data_lists = data_matrix[:, target_index:target_index+target_windows]
        self.__before_target_data_lists = before_target_data_lists
        self.__target_data_lists = target_data_lists

        num_of_asset = len(before_target_data_lists)
        num_of_data = len(before_target_data_lists[0])

        # Weights 1..target_windows: later points in the window count more.
        weights = np.arange(target_windows) + 1

        # Plain lists: appending to a NumPy array copies it on every step.
        all_sum_of_error_list = []
        all_details_list = []

        for search_windows in search_windows_range:
            for i in range(num_of_data - search_windows + 1):
                sum_of_error = 0
                for j in range(num_of_asset):
                    search_data = before_target_data_lists[j][i:i+search_windows]
                    # Stretch/compress the candidate to the target's length.
                    resized_search_data = np.ravel(TimeSeriesResampler(sz=target_windows).fit_transform(search_data)[0])
                    squared = np.square(np.array(target_data_lists[j]) - np.array(resized_search_data))
                    sum_of_error += np.sqrt(np.sum(squared * weights))
                all_sum_of_error_list.append(sum_of_error)
                all_details_list.append({"search_windows": search_windows, "index": i, "sum_of_error": sum_of_error})

        errors = np.asarray(all_sum_of_error_list, dtype=float)
        if errors.size == 0:
            raise ValueError("no candidate window fits before target_index for the given search_windows_range")

        # A stable sort picks exactly n_similar candidates even when errors
        # tie, and tolerates n_similar >= number of candidates.
        chosen = np.sort(np.argsort(errors, kind="stable")[:n_similar])
        self.__details_list = [all_details_list[k] for k in chosen]

    def transform(self, n_data_points):
        """Project every asset ``n_data_points`` steps from the end of the target.

        For each selected candidate, the changes that followed it are taken
        over a span scaled by ``search_windows / target_windows``, resampled to
        ``n_data_points`` values and replayed on top of the price at
        ``target_index + target_windows`` (compounded or added, see
        ``_uses_relative_change``). The projected dates are spaced 7 days
        apart. Requires ``fit()``.

        Raises
        ------
        ValueError
            If ``fit()`` selected no candidate (e.g. ``n_similar`` is 0).
        """
        details_list = self.__details_list
        asset_price_lists = self.__asset_price_lists
        asset_percentage_change_lists = self.__asset_percentage_change_lists
        asset_price_date_list = self.__asset_price_date_list
        target_windows = self.__target_windows

        if len(details_list) == 0:
            raise ValueError("fit() selected no similar window; nothing to project from")

        anchor = self.__target_index + target_windows

        # The projected dates are identical for every asset and every match,
        # so they are built once.
        latest_date = asset_price_date_list[anchor]
        new_date_list = [latest_date]
        for _ in range(n_data_points - 1):
            latest_date = latest_date + np.timedelta64(7, 'D')
            new_date_list.append(latest_date)

        all_prediction_list = []
        all_predict_mean_list = []
        all_predict_percentage_change_list = []

        for i in range(len(asset_percentage_change_lists)):
            percentage_change_list = asset_percentage_change_lists[i]
            price_list = asset_price_lists[i]
            relative = self._uses_relative_change(i)

            asset_prediction_lists = []
            predict_percentage_change_list = []
            for details in details_list:
                start_index = details["index"]
                search_windows = details["search_windows"]
                # A longer candidate window implies a proportionally longer
                # follow-up span.
                scale = search_windows/target_windows
                follow_start = start_index + search_windows
                search_percentage_change_list = percentage_change_list[follow_start:follow_start+math.ceil(n_data_points*scale)]
                resized_percentage_change_list = np.ravel(TimeSeriesResampler(sz=n_data_points).fit_transform(search_percentage_change_list)[0])

                latest_price = price_list[anchor]
                new_data_list = [latest_price]
                # The first projected point is the anchor price itself, so only
                # n_data_points - 1 changes are applied.
                for resized_percentage_change in resized_percentage_change_list[:-1]:
                    if relative:
                        latest_price = latest_price * (1 + resized_percentage_change)
                    else:
                        latest_price = latest_price + resized_percentage_change
                    new_data_list.append(latest_price)

                asset_prediction_lists.append(np.array(new_data_list))
                predict_percentage_change_list.append(resized_percentage_change_list)

            all_predict_mean_list.append(np.mean(np.array(asset_prediction_lists), axis=0))
            all_prediction_list.append(asset_prediction_lists)
            all_predict_percentage_change_list.append(np.mean(np.array(predict_percentage_change_list), axis=0))

        self.__all_prediction_list = all_prediction_list
        self.__new_date_list = np.array(new_date_list).astype('datetime64[D]')
        self.__all_predict_mean_list = np.array(all_predict_mean_list)
        # One row per asset (previously all assets were flattened into a single vector).
        self.__all_predict_percentage_change_list = np.array(all_predict_percentage_change_list)

    def plot(self, show_mean = True, show_all = True):
        """Show one plotly figure per asset. Requires ``fit()`` and ``transform()``.

        Each figure contains the full price series, the selected similar
        windows (red), the target window (yellow) and, optionally, every
        individual projection (gray) and their mean (green).
        """
        details_list = self.__details_list
        asset_price_lists = self.__asset_price_lists
        asset_price_date_list = self.__asset_price_date_list
        target_index = self.__target_index
        target_windows = self.__target_windows
        asset_name_list = self.__asset_name_list
        all_prediction_list = self.__all_prediction_list
        new_date_list = self.__new_date_list

        for i in range(len(asset_price_lists)):
            asset_price_list = asset_price_lists[i]
            fig = go.Figure()
            fig.add_trace(go.Scatter(x=asset_price_date_list[:len(asset_price_list)], y=asset_price_list, name='index',  mode='lines'))
            for j in range(len(details_list)):
                details = details_list[j]
                index = details["index"]
                # A window of k changes spans k + 1 prices.
                stop = index + details["search_windows"] + 1
                fig.add_trace(go.Scatter(x=asset_price_date_list[index:stop], y=asset_price_list[index:stop], name= f"similar {j}", line = dict(color='red'),  mode='lines'))
                if show_all:
                    fig.add_trace(go.Scatter(x=new_date_list, y=all_prediction_list[i][j], name= f"predict {j}", line = dict(color='gray'),  mode='lines'))

            mean_list = self.__all_predict_mean_list[i]
            if show_mean:
                fig.add_trace(go.Scatter(x=new_date_list, y=mean_list, name= f"mean predict", line = dict(color='green'),  mode='lines'))
            fig.add_trace(go.Scatter(x=asset_price_date_list[target_index:target_index+target_windows+1], y=asset_price_list[target_index:target_index+target_windows+1], name= f"search target", line = dict(color='yellow'),  mode='lines'))

            fig.update_layout(title=asset_name_list[i],
                   xaxis_title='Date',
                   yaxis_title='Value')
            fig.show()

In [67]:
# One look-up with New_TSMD: target window of 12 changes starting at position
# 1200, candidate windows of 12/16/18/20 changes, keep the 5 closest,
# project 16 points and plot every asset.
newtsm = New_TSMD(
    asset_price_lists=asset_price_list,
    asset_price_date_list=date_list,
    asset_percentage_change_lists=asset_pchange_list,
    asset_percentage_change_date_list=p_date_list,
    asset_name_list=asset_name_list,
)
newtsm.fit(
    target_index=1200,
    target_windows=12,
    n_similar=5,
    search_windows_range=[12, 16, 18, 20],
)
newtsm.transform(n_data_points=16)
newtsm.plot()

In [53]:
class TSMD():
    """Single-series nearest-window search.

    Attributes set by ``search``:
    e_dist_list            -- distance of every candidate window, by start position
    n_smallest_index       -- start positions of the selected windows, ascending
    n_smallest_e_dist_list -- distances of the selected windows, same order
    target_window          -- length of the last search target
    """

    def __init__(self, data_array):
        self.data_array = data_array
        self.e_dist_list = []
        self.n_smallest_e_dist_list = []
        self.n_smallest_index = []
        self.target_window = 0

    def search(self, search_target, n=3, p_change = True):
        """Find the ``n`` windows of ``data_array`` closest to ``search_target``.

        Parameters
        ----------
        search_target : 1-D sequence
            Pattern to look for; its length is the window length.
        n : int
            Number of windows to keep. Exactly ``min(n, number of windows)``
            are kept; ties go to the earlier window.
        p_change : bool
            True: windows are compared as-is. False: the target and every
            window are first standardised with ``TimeSeriesScalerMeanVariance``.
            Both modes use the Euclidean distance.

        Raises
        ------
        ValueError
            If ``data_array`` is shorter than ``search_target``.
        """
        self.search_target = search_target
        target_window = len(search_target)
        self.target_window = target_window
        data_array = self.data_array

        # The reference does not depend on the candidate window.
        if p_change:
            reference = np.array(search_target)
        else:
            reference = TimeSeriesScalerMeanVariance().fit_transform([search_target])[0]

        e_dist_list = []
        for i in range(len(data_array) - target_window + 1):
            compare_array = data_array[i:i+target_window]
            if not p_change:
                compare_array = TimeSeriesScalerMeanVariance().fit_transform([compare_array])[0]
            e_dist_list.append(np.linalg.norm(np.array(compare_array) - reference))
        self.e_dist_list = e_dist_list

        if len(e_dist_list) == 0:
            raise ValueError("data_array is shorter than search_target; no window to compare")

        # A stable sort picks exactly n windows even when distances tie, and
        # tolerates n >= number of windows.
        distances = np.asarray(e_dist_list)
        n_smallest_index = np.sort(np.argsort(distances, kind="stable")[:n])
        self.n_smallest_e_dist_list = distances[n_smallest_index]
        self.n_smallest_index = n_smallest_index

def plot(index_data_arr, index_date_arr, p_data_arr, p_date_arr, n_smallest_index, target_window, target_index):
    """Draw one series with the matched windows and the target window highlighted.

    The whole series is drawn first, then every window starting at a position
    in ``n_smallest_index`` is overdrawn in red and the window starting at
    ``target_index`` in yellow; all windows are ``target_window`` points long.
    ``p_data_arr`` and ``p_date_arr`` are accepted but not used.
    """
    def span(start):
        # Positions covered by a window that begins at `start`.
        return slice(start, start + target_window)

    plt.figure(figsize=(26, 9))
    whole_series = slice(None, len(index_data_arr))
    plt.plot(index_date_arr[whole_series], index_data_arr)

    for match_start in n_smallest_index:
        matched = span(match_start)
        plt.plot(index_date_arr[matched], index_data_arr[matched], 'r-')

    target = span(target_index)
    plt.plot(index_date_arr[target], index_data_arr[target], 'y-')

    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.title('Time Series Data')
    plt.show()

In [None]:
# Search the first part of every change series for windows similar to a window
# taken from the remaining part, then highlight the matches on the prices.
start_index = 0
window = 10
n = 5
per = False

# The train/test arrays used below were only ever defined in commented-out
# code, so this cell failed with NameError on a fresh kernel. They are rebuilt
# here with the same split position (requires equal-length series).
split_index = 960
training_asset_pchange_list = np.array(asset_pchange_list)[:, :split_index]
testing_asset_pchange_list = np.array(asset_pchange_list)[:, split_index:]

tsdm_result = []
for i in range(len(training_asset_pchange_list)):
    tsmd = TSMD(training_asset_pchange_list[i])
    tsmd.search(testing_asset_pchange_list[i][start_index:start_index+window], n)
    tsdm_result.append(tsmd.e_dist_list)

# Sum the per-asset distances and keep the n closest windows. A stable sort
# returns exactly n positions even when several distances are equal.
e_dist_list = np.asarray(reduce(np.add, tsdm_result))
n_smallest_index = np.sort(np.argsort(e_dist_list, kind="stable")[:n])
n_smallest_e_dist_list = e_dist_list[n_smallest_index]
print(n_smallest_index)
for i in range(len(asset_pchange_list)):
    # Position of the target window in the full (unsplit) series.
    target_index = split_index+start_index
    plot(asset_price_list[i], date_list, asset_pchange_list[i], p_date_list, n_smallest_index, window, target_index)


In [None]:
# Same search as the previous cell, but on standardised price windows
# (p_change=False). Reuses start_index, window and n from the previous cell.

# The train/test arrays used below were only ever defined in commented-out
# code, so this cell failed with NameError on a fresh kernel. They are rebuilt
# here with the same split position (requires equal-length series).
split_index = 960
training_asset_price_list = np.array(asset_price_list)[:, :split_index]
testing_asset_price_list = np.array(asset_price_list)[:, split_index:]

tsdm_result = []
for i in range(len(training_asset_price_list)):
    tsmd = TSMD(training_asset_price_list[i])
    tsmd.search(testing_asset_price_list[i][start_index:start_index+window], n, False)
    tsdm_result.append(tsmd.e_dist_list)

# Sum the per-asset distances and keep the n closest windows. A stable sort
# returns exactly n positions even when several distances are equal.
e_dist_list = np.asarray(reduce(np.add, tsdm_result))
n_smallest_index = np.sort(np.argsort(e_dist_list, kind="stable")[:n])
n_smallest_e_dist_list = e_dist_list[n_smallest_index]
print(n_smallest_index)
for i in range(len(asset_pchange_list)):
    # Position of the target window in the full (unsplit) series.
    target_index = split_index+start_index
    plot(asset_price_list[i], date_list, asset_pchange_list[i], p_date_list, n_smallest_index, window, target_index)