In [38]:
import numpy as np
import pandas as pd
import os
import logging
from progress.bar import Bar
import time
import matplotlib.pyplot as plt
import sys
from scipy.spatial import distance

# Configure the root logger: timestamp followed by the message, INFO level and above.
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

# Number of leading rows of each frame used as the "part 1" window (see the
# head(period) calls below); the rows after it form the follow-up window.
period = 20

In [39]:
def read_origin():
    """Load every per-stock result CSV found under ``RESULT_DIR``.

    ``RESULT_DIR`` is a module-level name that is not defined in the visible
    part of the notebook; it is presumably set in another cell -- confirm
    before running on a fresh kernel.

    Each entry of ``RESULT_DIR`` is treated as one stock directory. Inside it,
    files whose name starts with ``sample`` are read into the ``'sample'``
    slot (a later file overwrites an earlier one) and files whose name starts
    with ``answer`` are appended to the ``'answer'`` list. Other files are
    ignored.

    Returns:
        dict: ``{stock: {'sample': DataFrame, 'answer': [DataFrame, ...]}}``.
        The ``'sample'`` key is absent for a stock that has no sample file.
    """
    orig_result = {}

    stocks = os.listdir(RESULT_DIR)
    progress_bar = Bar('Loading stocks', max=len(stocks))
    progress_bar.check_tty = False
    samples_cnt = 0
    answers_cnt = 0
    start_time = time.time()
    for stock in stocks:
        # os.path.join keeps the loader portable; the previous hard-coded
        # '\\' separator only produced valid paths on Windows.
        stock_dir = os.path.join(RESULT_DIR, stock)

        orig_result[stock] = {'answer': []}

        for result_file in os.listdir(stock_dir):
            file_path = os.path.join(stock_dir, result_file)

            if result_file.startswith('sample'):
                orig_result[stock]['sample'] = pd.read_csv(file_path)
                samples_cnt += 1
            elif result_file.startswith('answer'):
                orig_result[stock]['answer'].append(pd.read_csv(file_path))
                answers_cnt += 1

        progress_bar.next()

    progress_bar.finish()
    logging.info('{} stock results loaded, including {} samples and {} answers'.format(len(stocks), samples_cnt, answers_cnt))

    # Avoid ZeroDivisionError when the directory is empty or holds no samples.
    average_answers = answers_cnt / samples_cnt if samples_cnt else float('nan')
    logging.info('average answers per sample is {}, {} seconds elapsed'.format(average_answers, time.time() - start_time))

    return orig_result



#orig_result

In [4]:
def chebyshev_fit(series, deg):
    """Smooth a 1-D sequence with a least-squares Chebyshev polynomial.

    The values are fitted against their positions ``0 .. len-1`` and the
    fitted polynomial is evaluated at those same positions.

    Args:
        series: 1-D array-like exposing ``.shape`` (ndarray or pandas Series).
        deg: degree of the fitted Chebyshev series.

    Returns:
        numpy.ndarray: fitted values, one per input position.
    """
    positions = np.arange(series.shape[0])
    fitted_poly = np.polynomial.Chebyshev.fit(positions, series, deg)
    return fitted_poly(positions)

def calculate_distance(sample_a, sample_b, method='cosine', reg=True, deg=9):
    """Distance between two price windows.

    For ``method='cosine'`` the ``Close`` column of both frames is compared
    with ``scipy.spatial.distance.cosine``; for ``method='chebyshev'`` the
    ``Change`` column is compared with ``scipy.spatial.distance.chebyshev``.
    Only the column that is actually compared is read.

    Args:
        sample_a, sample_b: DataFrames holding the column required by
            ``method``.
        method: ``'cosine'`` or ``'chebyshev'``.
        reg: when exactly ``True`` each column is first smoothed with
            ``chebyshev_fit``; any other value uses the raw values.
        deg: polynomial degree handed to ``chebyshev_fit``.

    Returns:
        float: the distance over the first ``min(len(a), len(b))`` points.

    Raises:
        ValueError: if ``method`` is not supported. This used to return
            ``None`` silently, which hid call sites that passed ``reg``
            positionally in the ``method`` slot.
    """
    if method == 'cosine':
        column, metric = 'Close', distance.cosine
    elif method == 'chebyshev':
        column, metric = 'Change', distance.chebyshev
    else:
        raise ValueError(
            "unsupported method {!r}; expected 'cosine' or 'chebyshev'".format(method))

    def _prepare(frame):
        # Pick the compared column, optionally smooth it, return a plain array.
        values = frame[column]
        if reg is True:
            values = chebyshev_fit(values, deg)
        return np.asarray(values)

    fit_a = _prepare(sample_a)
    fit_b = _prepare(sample_b)

    # Compare only the overlapping prefix when the windows differ in length.
    length = min(fit_a.shape[0], fit_b.shape[0])
    return metric(fit_a[:length], fit_b[:length])
    

In [7]:
def parse_close(df):
    """Largest and smallest relative move of ``Close`` versus its first row.

    For every row after the first, ``(close[i] - close[0]) / close[0]`` is
    computed; the maximum and minimum of those ratios are returned.

    Args:
        df: DataFrame with a ``Close`` column.

    Returns:
        tuple: ``(max_change, min_change)``. Both are ``nan`` when the frame
        has fewer than two rows, because there is then no move to measure
        (previously this raised ``ValueError`` from ``max()`` on an empty
        array).
    """
    close = df['Close']
    if close.shape[0] < 2:
        return np.nan, np.nan

    base = close.iloc[0]
    # Vectorised form of the per-row loop; same arithmetic per element.
    moves = (np.asarray(close.iloc[1:]) - base) / base
    return moves.max(), moves.min()

def parse_origin(reg=True):
    """Summarise every stock in the global ``orig_result`` into one table.

    Reads the module-level names ``orig_result`` and ``period``.
    ``orig_result`` is not assigned anywhere in the visible code; it is
    presumably the value returned by ``read_origin()`` -- confirm that a cell
    assigns it before this function is called.

    For each stock the sample frame and every answer frame are split into the
    first ``period`` rows (part 1) and the remaining rows (part 2). The table
    records the relative ``Close`` change over part 2, its max/min excursion
    (via ``parse_close``), and the distance between the sample's part 1 and
    each answer's part 1 (via ``calculate_distance`` with its default method).

    Args:
        reg: forwarded to ``calculate_distance`` as its ``reg`` keyword.

    Returns:
        pandas.DataFrame: one row per stock, labelled with the first value of
        the sample's ``Name`` column. ``sample_*`` cells are ``None`` when the
        sample has no part 2. Max/min excursions are ``nan`` for a part 2 of
        fewer than two rows and are skipped by the ``*_max_mean`` /
        ``*_min_mean`` aggregates.
    """
    columns = ['sample_change', 'sample_change_max', 'sample_change_min',
               'ans_dist_list', 'ans_dist_mean',
               'ans_change_list', 'ans_change_mean', 'ans_change_std', 'ans_cnt',
               'ans_change_max_list', 'ans_change_max_mean',
               'ans_change_min_list', 'ans_change_min_mean']

    def _extremes(part2):
        # parse_close needs at least two rows to have a move to measure.
        if part2.shape[0] > 1:
            return parse_close(part2)
        return np.nan, np.nan

    stock_ids = sorted(orig_result.keys())
    progress_bar = Bar('Parsing origin result', max=len(stock_ids))
    progress_bar.check_tty = False

    # Rows are collected and the frame is built once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    records = []
    row_names = []

    for stock_id in stock_ids:

        sample_df = orig_result[stock_id]['sample']
        sample_name = sample_df['Name'].iloc[0]

        sample_df_part1 = sample_df.head(period)

        sample_change = None
        sample_change_max = None
        sample_change_min = None
        if sample_df.shape[0] > period:
            sample_df_part2 = sample_df.iloc[period:]
            first_close = sample_df_part2['Close'].iloc[0]
            last_close = sample_df_part2['Close'].iloc[-1]
            sample_change = (last_close - first_close) / first_close
            sample_change_max, sample_change_min = _extremes(sample_df_part2)

        ans_change_list = []
        ans_dist_list = []
        ans_change_max_list = []
        ans_change_min_list = []
        for ans_df in orig_result[stock_id]['answer']:

            ans_df_part1 = ans_df.head(period)
            ans_df_part2 = ans_df.iloc[period:]

            # Total change rate over part 2; stays 0 when there is no part 2.
            total_change = 0
            if ans_df.shape[0] > period:
                first_close = ans_df_part2['Close'].iloc[0]
                last_close = ans_df_part2['Close'].iloc[-1]
                total_change = (last_close - first_close) / first_close
            ans_change_list.append(total_change)

            # `reg` must go by keyword: the third positional parameter of
            # calculate_distance is `method`, not `reg`.
            dist = calculate_distance(sample_df_part1, ans_df_part1, reg=reg)
            ans_dist_list.append(dist)

            ans_change_max, ans_change_min = _extremes(ans_df_part2)
            ans_change_max_list.append(ans_change_max)
            ans_change_min_list.append(ans_change_min)

        np_ans_change_list = np.array(ans_change_list)
        np_ans_dist_list = np.array(ans_dist_list)
        np_ans_change_max_list = np.array(ans_change_max_list)
        np_ans_change_min_list = np.array(ans_change_min_list)

        records.append({
            'sample_change': sample_change,
            'sample_change_max': sample_change_max,
            'sample_change_min': sample_change_min,

            'ans_dist_list': np_ans_dist_list,
            'ans_dist_mean': np_ans_dist_list.mean(),

            'ans_change_list': np_ans_change_list,
            'ans_change_mean': np_ans_change_list.mean(),
            'ans_change_std': np_ans_change_list.std(),
            'ans_cnt': np_ans_change_list.shape[0],

            # nanmean: answers with too short a part 2 contribute nan.
            'ans_change_max_list': np_ans_change_max_list,
            'ans_change_max_mean': np.nanmean(np_ans_change_max_list),

            'ans_change_min_list': np_ans_change_min_list,
            'ans_change_min_mean': np.nanmean(np_ans_change_min_list),
        })
        row_names.append(sample_name)

        progress_bar.next()
    progress_bar.finish()

    return pd.DataFrame(records, index=row_names, columns=columns)


def get_fit_x(df, reg, method, deg):
    """Select the series used for similarity and optionally smooth it.

    Args:
        df: DataFrame with a ``Close`` column (``method == 'cosine'``) or a
            ``Change`` column (any other ``method``). Only the selected
            column is read, so a frame without ``Change`` works in cosine
            mode (it used to raise ``KeyError``).
        reg: when exactly ``True`` the series is smoothed with
            ``chebyshev_fit``; any other value returns the raw column.
        method: ``'cosine'`` selects ``Close``; anything else selects
            ``Change``.
        deg: polynomial degree handed to ``chebyshev_fit``.

    Returns:
        The raw column (pandas Series) or the result of ``chebyshev_fit``.
    """
    column = 'Close' if method == 'cosine' else 'Change'
    values = df[column]

    if reg is True:
        return chebyshev_fit(values, deg)
    return values

def get_metrix(df):
    """Relative ``Close`` metrics of a window, measured against its first row.

    Args:
        df: DataFrame with a ``Close`` column, or ``None``.

    Returns:
        tuple: ``(total_change, max_change, min_change)`` where

        * ``total_change`` is ``(last - first) / first``;
        * ``max_change`` / ``min_change`` are the extremes of
          ``(close[i] - first) / first`` over the rows after the first.

        ``(None, None, None)`` is returned when ``df`` is ``None``.
        An empty frame yields three ``nan`` values and a single-row frame
        yields ``nan`` for the two extremes; these cases used to raise
        ``IndexError`` / ``ValueError`` respectively.
    """
    if df is None:
        return None, None, None

    close = df['Close']
    row_count = close.shape[0]
    if row_count == 0:
        return np.nan, np.nan, np.nan

    base = close.iloc[0]
    total_change = (close.iloc[-1] - base) / base
    if row_count < 2:
        # No rows after the first, so there is no excursion to report.
        return total_change, np.nan, np.nan

    moves = (np.asarray(close.iloc[1:]) - base) / base
    return total_change, moves.max(), moves.min()


def parse_origin_plus(reg=True, method='cosine', deg=9):
    """Per-stock pairwise-distance and follow-up metrics table.

    Reads the module-level names ``orig_result`` and ``period``.
    ``orig_result`` is not assigned anywhere in the visible code; it is
    presumably the value returned by ``read_origin()`` -- confirm that a cell
    assigns it before this function is called.

    Each frame is split into part 1 (first ``period`` rows) and part 2 (the
    next ``period`` rows). For every stock:

    * ``dist1`` is the mean pairwise distance (``scipy`` ``pdist``) between
      the part-1 series of the sample and of all answers;
    * ``dist2`` is the mean pairwise distance between the part-2 series of
      the answers only (the sample's part 2 is not included; the line that
      added it was already commented out in the original);
    * ``total_change1`` / ``max_raise1`` / ``max_drop1`` are ``get_metrix`` of
      the sample's part 2 (``None`` when the sample has no part 2);
    * the ``*2`` columns are the means of the same metrics over the answers.

    Args:
        reg: forwarded to ``get_fit_x``.
        method: column selector for ``get_fit_x`` and metric name for
            ``pdist``.
        deg: polynomial degree forwarded to ``get_fit_x``.

    Returns:
        pandas.DataFrame: one row per stock, labelled with the first value of
        the sample's ``Name`` column.
    """
    columns = ['dist1', 'dist2',
               'total_change1', 'total_change2',
               'max_raise1', 'max_raise2',
               'max_drop1', 'max_drop2']

    stock_ids = sorted(orig_result.keys())
    progress_bar = Bar('Parsing origin result', max=len(stock_ids))
    progress_bar.check_tty = False

    # Rows are collected and the frame is built once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    records = []
    row_names = []

    for stock_id in stock_ids:

        sample_df = orig_result[stock_id]['sample']
        sample_name = sample_df['Name'].iloc[0]

        sample_df_part1 = sample_df.head(period)
        sample_df_part2 = None
        if sample_df.shape[0] > period:
            # Was `.head(20)`; use `period` so it matches the answer split.
            sample_df_part2 = sample_df.iloc[period:2 * period]

        X_part1 = [get_fit_x(sample_df_part1, reg, method, deg).tolist()]
        X_part2 = []

        total_change1, max_raise1, max_drop1 = get_metrix(sample_df_part2)

        total_change2_list = []
        max_raise2_list = []
        max_drop2_list = []
        for ans_df in orig_result[stock_id]['answer']:

            # Split the answer into its two windows.
            ans_df_part1 = ans_df.head(period)
            ans_df_part2 = ans_df.iloc[period:2 * period]

            # Rows of the matrices handed to pdist.
            X_part1.append(get_fit_x(ans_df_part1, reg, method, deg).tolist())
            X_part2.append(get_fit_x(ans_df_part2, reg, method, deg).tolist())

            total_change2, max_raise2, max_drop2 = get_metrix(ans_df_part2)
            total_change2_list.append(total_change2)
            max_raise2_list.append(max_raise2)
            max_drop2_list.append(max_drop2)

        Y_part1 = distance.pdist(X_part1, method)
        Y_part2 = distance.pdist(X_part2, method)

        records.append({
            'dist1': Y_part1.mean(),
            'dist2': Y_part2.mean(),
            'total_change1': total_change1,
            'total_change2': np.array(total_change2_list).mean(),
            'max_raise1': max_raise1,
            'max_raise2': np.array(max_raise2_list).mean(),
            'max_drop1': max_drop1,
            'max_drop2': np.array(max_drop2_list).mean(),
        })
        row_names.append(sample_name)

        progress_bar.next()
    progress_bar.finish()

    return pd.DataFrame(records, index=row_names, columns=columns)

In [41]:
def plot_with_chebyshev(axs, x, y, deg, title):
    """Draw ``y`` against ``x`` plus a Chebyshev-fit overlay on ``axs``.

    The raw data is drawn with the default line style and the overlay as red
    dots. With ``deg == 0`` the overlay is the raw ``y`` itself rather than
    the fitted values. ``title`` is accepted but not used here.
    """
    fitted_values = np.polynomial.Chebyshev.fit(x, y, deg)(x)
    overlay = y if deg == 0 else fitted_values

    axs.plot(x, y, '', x, overlay, 'r.')
    

def _plot_column_with_range(axs, df, column, deg, title):
    """Plot ``df[column]`` with its Chebyshev overlay and a date-range title.

    The x axis is the row position. The title is ``title`` followed by the
    first and last value of the ``Date`` column when that column exists,
    otherwise by the first and last index entries formatted as
    ``%Y-%m-%d``.
    """
    positions = np.arange(0, df.shape[0], 1)
    plot_with_chebyshev(axs, positions, df[column], deg, title)

    if 'Date' in df.columns:
        dates = df['Date']
        begin_time, end_time = dates.iloc[0], dates.iloc[-1]
    else:
        begin_time = df.index[0].strftime('%Y-%m-%d')
        end_time = df.index[-1].strftime('%Y-%m-%d')
    axs.set_title('{} [{} {}]'.format(title, begin_time, end_time))


def plot_close(axs, df, deg, title):
    """Plot the ``Close`` column of ``df`` on ``axs`` with a date-range title."""
    _plot_column_with_range(axs, df, 'Close', deg, title)


def plot_change(axs, df, deg, title):
    """Plot the ``Change`` column of ``df`` on ``axs`` with a date-range title."""
    _plot_column_with_range(axs, df, 'Change', deg, title)
    
def plot_sample_and_ans(sample_and_ans, super_title=None, deg=9):
    """Draw a 1x4 figure for one frame: Close/Change of both windows.

    The first ``period`` rows (module-level ``period``) are drawn on axes 0
    and 1. The next ``period`` rows are drawn on axes 2 and 3, but only when
    the frame has at least ``2 * period`` rows; otherwise those axes stay
    empty.

    Args:
        sample_and_ans: DataFrame with ``Name``, ``Close`` and ``Change``
            columns.
        super_title: optional prefix for the figure title. When omitted the
            title is just the name (it used to render as ``"None <name>"``).
        deg: Chebyshev degree forwarded to the plotting helpers.
    """
    name = sample_and_ans['Name'].iloc[0]

    fig, axs = plt.subplots(1, 4, figsize=(15, 4))
    if super_title is None:
        fig.suptitle('{}'.format(name))
    else:
        fig.suptitle('{} {}'.format(super_title, name))

    # Split the frame into the leading window and the window that follows it.
    sample = sample_and_ans.head(period)
    ans = sample_and_ans.iloc[period:2 * period]

    plot_close(axs[0], sample, deg, 'Close')
    plot_change(axs[1], sample, deg, 'Change')

    # One branch covers both the "exactly" and "more than" 2 * period cases;
    # the slice above already trims to `period` rows (was a hard-coded 20).
    if sample_and_ans.shape[0] >= 2 * period:
        plot_close(axs[2], ans, deg, 'Close Answer')
        plot_change(axs[3], ans, deg, 'Change Answer')

    

# plot_sample_and_ans(orig_result['SH#600000']['sample'], 'Sample')
# plot_sample_and_ans(orig_result['SH#600000']['answer'][1], 'Answer')
# plot_sample_and_ans(orig_result['SH#600000']['answer'][2], 'Answer')
# plot_sample_and_ans(orig_result['SH#600000']['answer'][3], 'Answer')
# plot_sample_and_ans(orig_result['SH#600000']['answer'][4], 'Answer')
# plot_sample_and_ans(orig_result['SH#600000']['answer'][5], 'Answer')



In [6]:
def get_x(n, df, deg=9):
    """Print row number ``n`` of ``df`` and plot that stock's frames.

    The index label found at position ``n`` is used as the key into the
    module-level ``orig_result``. The sample frame is plotted first, then
    every answer frame. Nothing happens when no position equals ``n``.
    """
    not_found = object()
    stock_id = next(
        (label for position, label in enumerate(df.index) if position == n),
        not_found,
    )
    if stock_id is not_found:
        return

    print(df.loc[stock_id])

    plot_sample_and_ans(orig_result[stock_id]['sample'], 'Sample', deg)
    for answer_df in orig_result[stock_id]['answer']:
        plot_sample_and_ans(answer_df, 'Answer', deg)
    
def get_name(stock_id, deg=9):
    """Plot the sample frame and then every answer frame of ``stock_id``.

    ``stock_id`` is looked up in the module-level ``orig_result``.
    """
    plot_sample_and_ans(orig_result[stock_id]['sample'], 'Sample', deg)
    for answer_df in orig_result[stock_id]['answer']:
        plot_sample_and_ans(answer_df, 'Answer', deg)

def traverse(df):
    """Print a one-line summary per row of a ``parse_origin`` style table.

    Each line shows the row position, the index label, the mean answer
    distance, three ``[sample:answers]`` pairs (change, max, min) and the
    standard deviation of the answer changes. Numbers are rounded to five
    decimals.

    ``ans_change_std`` was previously passed to ``format`` without a matching
    placeholder and therefore never printed; it now has its own trailing
    field in both the header and the rows. ``None`` cells are printed as
    ``None`` instead of making ``round`` raise.

    Args:
        df: DataFrame with the columns ``ans_dist_mean``, ``sample_change``,
            ``ans_change_mean``, ``sample_change_max``,
            ``ans_change_max_mean``, ``sample_change_min``,
            ``ans_change_min_mean`` and ``ans_change_std``.
    """
    def _rounded(value):
        # Cells are None when there was nothing to measure for that row.
        return None if value is None else round(value, 5)

    fields = ['ans_dist_mean',
              'sample_change', 'ans_change_mean',
              'sample_change_max', 'ans_change_max_mean',
              'sample_change_min', 'ans_change_min_mean',
              'ans_change_std']
    row_format = '{} | {} | {} | [{}:{}] [{}:{}] [{}:{}] | {}'

    print('index|stock|ans_dist_mean|[sample_change:ans_change_mean]|[sample_change_max:ans_change_max]|[sample_change_min:ans_change_min]|ans_change_std')
    for position, stock_id in enumerate(df.index):
        # Positional lookup: fetch the row once and stay correct even if two
        # rows share the same label.
        row = df.iloc[position]
        print(row_format.format(position, stock_id,
                                *[_rounded(row[field]) for field in fields]))

        
        




In [47]:
# Scratch expression: evaluates min(1, 3); none of the code visible above uses the result.
min(1, 3)

1