In [None]:
import lib._util.visualplot as vp
import lib._util.fileproc as fp
import lib._util.dimreduce as dr
import lib._util.normalizer as nrm

In [None]:
import pandas as pd
import numpy as np

# Plotly
import plotly.express as px
import plotly.graph_objects as go

# Time measurement
import time
from datetime import timedelta

# Sound notification
import winsound

# Technical analysis
import ta

# Useful Functions

In [None]:
# Directory holding the raw tick CSV files (DAT_ASCII_<pair>_T_<period>.csv).
SOURCE_PATH_DATA = 'resources/data/'
# Directory passed as `out_path` to the plotting helpers of `vp`.
OUT_PATH_GRAPH = 'resources/output/eda/graph/'
# Directory the aggregated / feature CSV files are written to and re-read from.
OUT_PATH_FILE = 'resources/output/eda/file/'

def time_taken(seconds):
    """Print an elapsed duration and play a two-tone beep.

    Parameters
    ----------
    seconds : float
        Elapsed time in seconds; printed in ``timedelta`` string form.

    Notes
    -----
    Uses ``winsound``, so the beep is only available on Windows.
    """
    elapsed = timedelta(seconds=seconds)
    print(f'\nTime Taken: {str(elapsed)}')

    # (frequency in Hz, duration in ms) of the two notification tones.
    tones = ((1000, 100), (1500, 50))
    for tone_frequency, tone_duration in tones:
        winsound.Beep(frequency=tone_frequency, duration=tone_duration)

# Data Preparation

### Data Loading (Raw Data)

In [None]:
def load_data(currency_pair, periods):
    """Load raw tick quotes for one currency pair across several periods.

    One headerless CSV per period is read from
    ``{SOURCE_PATH_DATA}DAT_ASCII_{currency_pair}_T_{period}.csv``. The files
    are expected to hold the columns ``datetime, bid, ask, vol``; ``vol`` is
    not loaded.

    Parameters
    ----------
    currency_pair : str
        Pair code used in the file name, e.g. ``'EURUSD'``.
    periods : iterable of str
        Period codes used in the file name, e.g. ``'201901'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``datetime`` (parsed with ``'%Y%m%d %H%M%S%f'``), ``bid`` and
        ``ask``. Per-file frames are concatenated in the order of ``periods``
        and keep their per-file row labels.
    """
    df_list = []
    for period in periods:
        # Build the path from the shared constant instead of repeating the literal.
        source_file = f'{SOURCE_PATH_DATA}DAT_ASCII_{currency_pair}_T_{period}.csv'
        df_chunks   = pd.read_csv(source_file, sep=',',
                                  header=None, names=['datetime', 'bid', 'ask', 'vol'],
                                  usecols=['datetime', 'bid', 'ask'],
                                  dtype={'datetime': str},
                                  chunksize=50_000)
        df = pd.concat(df_chunks)

        # Parse timestamps after reading: `date_parser` is deprecated since
        # pandas 2.0, while an explicit to_datetime works on every version.
        df['datetime'] = pd.to_datetime(df['datetime'], format='%Y%m%d %H%M%S%f')
        df_list.append(df)

    return pd.concat(df_list)

In [None]:
# Load the raw tick files for the chosen pair and report the elapsed time.
EXEC_START = time.time()

currency_pair = 'EURUSD'
# Period codes '201901' .. '201912', zero-padded to two month digits.
periods       = [f'2019{str(x+1).zfill(2)}' for x in range(12)]

timeseries_df = load_data(currency_pair, periods)

EXEC_END = time.time()
time_taken(EXEC_END - EXEC_START)

In [None]:
# Summarise the raw tick frame with the project's `faststat` helper.
vp.faststat(timeseries_df)

### Time Series Aggregation

In [None]:
def aggregate(df, rule):
    """Resample tick quotes into OHLC bars for both bid and ask.

    Parameters
    ----------
    df : pandas.DataFrame
        Tick data with a datetime-typed ``datetime`` column and numeric
        ``bid`` / ``ask`` columns.
    rule : str
        Resampling rule forwarded to ``resample`` (e.g. ``'1D'``, ``'1Min'``).

    Returns
    -------
    pandas.DataFrame
        One row per bucket in which both bid and ask have data, with columns
        ``datetime`` (string, ``'%Y-%m-%d %H:%M:%S'``), ``open_bid``,
        ``high_bid``, ``low_bid``, ``bid`` (close), ``open_ask``, ``high_ask``,
        ``low_ask`` and ``ask`` (close). Prices are rounded to 5 decimals and
        the index is a fresh 0..n-1 range.
    """
    indexed = df.set_index('datetime')

    bid_ohlc = indexed['bid'].resample(rule).ohlc().rename(columns={
        'open': 'open_bid', 'high': 'high_bid', 'low': 'low_bid', 'close': 'bid'})
    ask_ohlc = indexed['ask'].resample(rule).ohlc().rename(columns={
        'open': 'open_ask', 'high': 'high_ask', 'low': 'low_ask', 'close': 'ask'})

    # Align bid and ask on the bucket timestamp *before* dropping empty
    # buckets. Dropping each side separately and recombining by position
    # pairs asks with the wrong datetime whenever a bucket has data on only
    # one side.
    new_df = bid_ohlc.join(ask_ohlc, how='inner').dropna().round(5)

    new_df = new_df.reset_index()
    new_df['datetime'] = new_df['datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')

    return new_df

In [None]:
# Aggregation (Daily): build daily bid/ask OHLC bars from the raw ticks and summarise them.
day_df = aggregate(timeseries_df, rule='1D')
vp.faststat(day_df)

In [None]:
# Aggregation (Hourly): build hourly bid/ask OHLC bars from the raw ticks and summarise them.
# NOTE(review): newer pandas releases warn on the upper-case 'H' alias - confirm against the installed version.
hour_df = aggregate(timeseries_df, rule='1H')
vp.faststat(hour_df)

In [None]:
# Aggregation (Minute): build one-minute bid/ask OHLC bars from the raw ticks and summarise them.
min_df = aggregate(timeseries_df, rule='1Min')
vp.faststat(min_df)

In [None]:
# Export the three aggregated frames to OUT_PATH_FILE (without the index) and time the export.
# The file names written here are the ones the aggregated-data loader reads back later.
EXEC_START = time.time()

fp.generate_csv(day_df, out_path=OUT_PATH_FILE,
                out_filename=f'DAT_ASCII_{currency_pair}_Day.csv', export_index=False)
fp.generate_csv(hour_df, out_path=OUT_PATH_FILE,
                out_filename=f'DAT_ASCII_{currency_pair}_Hour.csv', export_index=False)
fp.generate_csv(min_df, out_path=OUT_PATH_FILE,
                out_filename=f'DAT_ASCII_{currency_pair}_Minute.csv', export_index=False)

EXEC_END = time.time()
time_taken(EXEC_END - EXEC_START)

# Feature Engineering

### Data Loading (Aggregated Data)

In [None]:
def load_data(currency_pair, time_type):
    """Load one aggregated OHLC file previously exported to ``OUT_PATH_FILE``.

    NOTE: this definition reuses the name ``load_data`` and therefore replaces
    the raw-tick loader defined earlier in the notebook once this cell runs.

    Parameters
    ----------
    currency_pair : str
        Pair code used in the file name, e.g. ``'EURUSD'``.
    time_type : str
        One of ``'Day'``, ``'Hour'`` or ``'Minute'``.

    Returns
    -------
    pandas.DataFrame
        Content of ``DAT_ASCII_{currency_pair}_{time_type}.csv`` (read with
        ``;`` as separator) with ``datetime`` parsed using
        ``'%Y-%m-%d %H:%M:%S'``.

    Raises
    ------
    AssertionError
        If ``time_type`` is not one of the supported values.
    """
    time_types = ['Day', 'Hour', 'Minute']
    assert time_type in time_types, f'{time_type} not in valid list: {time_types}'

    filename    = f'DAT_ASCII_{currency_pair}_{time_type}.csv'
    source_file = f'{OUT_PATH_FILE}{filename}'
    df_chunks   = pd.read_csv(source_file, sep=';', chunksize=50_000)
    df = pd.concat(df_chunks)

    # Parse timestamps after reading: `date_parser` is deprecated since
    # pandas 2.0, while an explicit to_datetime works on every version.
    df['datetime'] = pd.to_datetime(df['datetime'], format='%Y-%m-%d %H:%M:%S')
    return df

In [None]:
# Reload the aggregated frames from the exported CSV files. `load_data` here is the
# aggregated-data loader defined in the preceding cell, not the raw-tick loader of the same name.
currency_pair = 'EURUSD'
day_df  = load_data(currency_pair, time_type='Day')
hour_df = load_data(currency_pair, time_type='Hour')
min_df  = load_data(currency_pair, time_type='Minute')

### Feature Engineering

In [None]:
def ta_features(df):
    """Add technical-analysis features to an aggregated OHLC frame.

    A constant ``volume`` column is added only so that
    ``ta.add_all_ta_features`` can run; every column whose name contains
    ``'volume'`` or ``'momentum_mfi'`` is dropped again afterwards.

    The result is trimmed twice: first to the first row where ``trend_trix``
    is available, then to the first row where either PSAR indicator
    (``trend_psar_up_indicator`` / ``trend_psar_down_indicator``) equals 1.
    Remaining N/A values are replaced by 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least ``open_bid``, ``high_bid``, ``low_bid`` and
        ``bid`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Trimmed feature frame with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``trend_trix`` is N/A on every row, or no PSAR indicator ever
        fires, so that no valid starting row exists.
    """
    new_df = df.copy()

    new_df['volume'] = 1
    new_df = ta.add_all_ta_features(new_df, 'open_bid', 'high_bid', 'low_bid', 'bid', 'volume')

    # Remove volume related features
    volume_columns = [x for x in new_df.columns if 'volume' in x or 'momentum_mfi' in x]
    new_df = new_df.drop(columns=volume_columns)

    # Original author's note: trend_trix has the highest number of leading
    # N/A values, so its first available row is used as the starting point.
    # Trimming is positional so it is correct for any input index.
    trix_available = new_df['trend_trix'].notna().to_numpy()
    if not trix_available.any():
        raise ValueError('trend_trix is N/A for every row; not enough data to build features')
    new_df = new_df.iloc[trix_available.argmax():].reset_index(drop=True)

    # Start from the first PSAR up/down signal. Either indicator is enough,
    # so a series in which only one of them ever fires is still handled.
    psar_signal = ((new_df['trend_psar_up_indicator'] == 1)
                   | (new_df['trend_psar_down_indicator'] == 1)).to_numpy()
    if not psar_signal.any():
        raise ValueError('no PSAR up/down signal found; not enough data to build features')
    new_df = new_df.iloc[psar_signal.argmax():].reset_index(drop=True)

    return new_df.fillna(0)

In [None]:
# Add TA features to the daily frame and show the resulting shape.
# NOTE(review): day_df is overwritten with its own transform, so re-running this cell
# applies ta_features to an already-engineered frame.
day_df = ta_features(day_df)
day_df.shape

In [None]:
# Add TA features to the hourly frame and show the resulting shape.
# NOTE(review): hour_df is overwritten with its own transform, so re-running this cell
# applies ta_features to an already-engineered frame.
hour_df = ta_features(hour_df)
hour_df.shape

In [None]:
# Add TA features to the minute frame and show the resulting shape.
# NOTE(review): min_df is overwritten with its own transform, so re-running this cell
# applies ta_features to an already-engineered frame.
min_df  = ta_features(min_df)
min_df.shape

### Histogram

In [None]:
def histogram(df, title):
    """Plot `df` through ``vp.histogram`` with the notebook's fixed layout.

    ``OUT_PATH_GRAPH`` is passed as ``out_path`` and the layout height is
    fixed at 4096.
    """
    plot_options = {
        'title': title,
        'out_path': OUT_PATH_GRAPH,
        'layout_height': 4096,
    }
    vp.histogram(df, **plot_options)

In [None]:
# Histograms for the daily feature frame.
histogram(day_df, title='Histogram - Day')

In [None]:
# Histograms for the hourly feature frame.
histogram(hour_df, title='Histogram - Hour')

In [None]:
# Histograms for the minute feature frame.
histogram(min_df, title='Histogram - Minute')

### Violin-Plot

In [None]:
def violinplot(df, title):
    """Plot `df` through ``vp.violinplot`` with the notebook's fixed layout.

    ``OUT_PATH_GRAPH`` is passed as ``out_path`` and the layout height is
    fixed at 4096.
    """
    plot_options = {
        'title': title,
        'out_path': OUT_PATH_GRAPH,
        'layout_height': 4096,
    }
    vp.violinplot(df, **plot_options)

In [None]:
# Violin plots for the daily feature frame.
violinplot(day_df, title='Violin-Plot - Day')

In [None]:
# Violin plots for the hourly feature frame.
violinplot(hour_df, title='Violin-Plot - Hour')

In [None]:
# Violin plots for the minute feature frame.
violinplot(min_df, title='Violin-Plot - Minute')

# Dimensionality Reduction

### Correlation Matrix - Original

In [None]:
def corrmatrix(df, title):
    """Plot a correlation matrix of `df` without the datetime and OHLC helper columns.

    Every column whose name contains ``'datetime'``, ``'_bid'`` or ``'_ask'``
    is removed first; the plain ``bid`` / ``ask`` columns are kept. The
    remaining frame is handed to ``vp.corrmatrix`` with ``OUT_PATH_GRAPH`` as
    ``out_path`` and a layout height of 1024.
    """
    excluded_markers = ('datetime', '_bid', '_ask')
    excluded_columns = [
        name for name in df.columns
        if any(marker in name for marker in excluded_markers)
    ]
    features = df.drop(columns=excluded_columns).copy()
    vp.corrmatrix(features, title=title, out_path=OUT_PATH_GRAPH, layout_height=1024)

In [None]:
# Correlation matrix of the daily features before any selection.
corrmatrix(day_df, title='Correlation Matrix - Day')

In [None]:
# Correlation matrix of the hourly features before any selection.
corrmatrix(hour_df, title='Correlation Matrix - Hour')

In [None]:
# Correlation matrix of the minute features before any selection.
corrmatrix(min_df, title='Correlation Matrix - Minute')

### Feature Selection

In [None]:
def dropcorr(df, corr_ratio):
    """Run ``dr.dropcorr`` on `df` without the datetime and OHLC helper columns.

    Every column whose name contains ``'datetime'``, ``'_bid'`` or ``'_ask'``
    is removed first; the plain ``bid`` / ``ask`` columns are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame; it is not modified.
    corr_ratio : float
        Forwarded to ``dr.dropcorr`` as ``corr_ratio``.

    Returns
    -------
    Whatever ``dr.dropcorr`` returns for the filtered frame.
    """
    excluded_markers = ('datetime', '_bid', '_ask')
    excluded_columns = [
        name for name in df.columns
        if any(marker in name for marker in excluded_markers)
    ]
    candidates = df.drop(columns=excluded_columns).copy()
    return dr.dropcorr(candidates, corr_ratio=corr_ratio)

In [None]:
# Feature selection on the daily frame with corr_ratio=0.9 (threshold semantics are defined by dr.dropcorr).
selft_day_df = dropcorr(day_df, corr_ratio=.9)

In [None]:
# Feature selection on the hourly frame with corr_ratio=0.9 (threshold semantics are defined by dr.dropcorr).
selft_hour_df = dropcorr(hour_df, corr_ratio=.9)

In [None]:
# Feature selection on the minute frame with corr_ratio=0.9 (threshold semantics are defined by dr.dropcorr).
selft_min_df = dropcorr(min_df, corr_ratio=.9)

### Correlation Matrix - After Selection

In [None]:
# Correlation matrix of the daily features that remain after selection.
corrmatrix(selft_day_df, title='Correlation Matrix - Day - Selected Features')

In [None]:
# Correlation matrix of the hourly features that remain after selection.
corrmatrix(selft_hour_df, title='Correlation Matrix - Hour - Selected Features')

In [None]:
# Correlation matrix of the minute features that remain after selection.
corrmatrix(selft_min_df, title='Correlation Matrix - Minute - Selected Features')

### Feature Extraction

In [None]:
# Feature scaling
def normalize(df):
    """Scale the pure feature columns of `df` with ``nrm.standard_scaler``.

    Every column whose name contains ``'datetime'``, ``'bid'`` or ``'ask'``
    is removed first (this also removes the plain ``bid`` / ``ask`` columns).
    The remaining columns are all passed to ``nrm.standard_scaler`` with
    ``drop=True``.

    Returns
    -------
    Whatever ``nrm.standard_scaler`` returns for the filtered frame.
    """
    excluded_markers = ('datetime', 'bid', 'ask')
    excluded_columns = [
        name for name in df.columns
        if any(marker in name for marker in excluded_markers)
    ]
    features = df.drop(columns=excluded_columns).copy()
    return nrm.standard_scaler(features, features.columns, drop=True)

In [None]:
# Scaled feature columns of the daily frame (input for the PCA steps below).
extft_day_df = normalize(day_df)

In [None]:
# Scaled feature columns of the hourly frame (input for the PCA steps below).
extft_hour_df = normalize(hour_df)

In [None]:
# Scaled feature columns of the minute frame (input for the PCA steps below).
extft_min_df = normalize(min_df)

In [None]:
# PCA Evaluation
def pca_evaluation(df, columns, title):
    """Plot the explained variance of a PCA that keeps every component.

    ``dr.pca_reduction`` is run on ``columns`` with as many components as
    there are selected columns; only its second return value is used and is
    forwarded to ``dr.expvar_evaluation`` together with ``title`` and
    ``OUT_PATH_GRAPH``.
    """
    _, feature_count = df[columns].shape
    _, explained_variances = dr.pca_reduction(
        df, columns, n_component=feature_count, drop=True
    )

    dr.expvar_evaluation(explained_variances, title=title, out_path=OUT_PATH_GRAPH)

In [None]:
# Explained-variance evaluation over all scaled daily features.
pca_evaluation(extft_day_df, extft_day_df.columns, title='PCA Evaluation - Day')

In [None]:
# Explained-variance evaluation over all scaled hourly features.
pca_evaluation(extft_hour_df, extft_hour_df.columns, title='PCA Evaluation - Hour')

In [None]:
# Explained-variance evaluation over all scaled minute features.
pca_evaluation(extft_min_df, extft_min_df.columns, title='PCA Evaluation - Minute')

In [None]:
# PCA Reduction
def pca_reduction(ori_df, df, columns, n_component):
    """Reduce `df` with ``dr.pca_reduction`` and re-attach the price columns.

    Parameters
    ----------
    ori_df : pandas.DataFrame
        Frame providing the columns to keep: every column whose name contains
        ``'datetime'``, ``'bid'`` or ``'ask'``.
    df : pandas.DataFrame
        Frame handed to ``dr.pca_reduction``.
    columns : sequence
        Columns of `df` to reduce.
    n_component : int
        Forwarded to ``dr.pca_reduction`` as ``n_component``.

    Returns
    -------
    pandas.DataFrame
        Inner join, on the row index, of the retained `ori_df` columns with
        the first value returned by ``dr.pca_reduction``.
    """
    reduced, _ = dr.pca_reduction(df, columns, n_component=n_component, drop=True)

    retained_markers = ('datetime', 'bid', 'ask')
    retain_columns = [
        name for name in ori_df.columns
        if any(marker in name for marker in retained_markers)
    ]
    retained = ori_df[retain_columns]

    return retained.merge(reduced, left_index=True, right_index=True, how='inner')

In [None]:
# Reduce the scaled daily features with n_component=15 and re-attach the datetime/price columns of day_df.
# NOTE(review): extft_day_df is both input and output, so re-running this cell feeds the reduced frame back in.
extft_day_df = pca_reduction(day_df, extft_day_df, extft_day_df.columns, 15)

In [None]:
# Reduce the scaled hourly features with n_component=15 and re-attach the datetime/price columns of hour_df.
# NOTE(review): extft_hour_df is both input and output, so re-running this cell feeds the reduced frame back in.
extft_hour_df = pca_reduction(hour_df, extft_hour_df, extft_hour_df.columns, 15)

In [None]:
# Reduce the scaled minute features with n_component=15 and re-attach the datetime/price columns of min_df.
# NOTE(review): extft_min_df is both input and output, so re-running this cell feeds the reduced frame back in.
extft_min_df = pca_reduction(min_df, extft_min_df, extft_min_df.columns, 15)

### Correlation Matrix - After Extraction

In [None]:
# Correlation matrix of the daily frame after PCA extraction.
corrmatrix(extft_day_df, title='Correlation Matrix - Day - Extracted Features')

In [None]:
# Correlation matrix of the hourly frame after PCA extraction.
corrmatrix(extft_hour_df, title='Correlation Matrix - Hour - Extracted Features')

In [None]:
# Correlation matrix of the minute frame after PCA extraction.
corrmatrix(extft_min_df, title='Correlation Matrix - Minute - Extracted Features')

### Pair-Plot

In [None]:
def pairplot(df, title):
    """Plot `df` through ``vp.pairplot`` on a fixed 2048 x 2048 layout.

    ``title`` is passed positionally and ``OUT_PATH_GRAPH`` as ``out_path``.
    """
    layout_options = {
        'out_path': OUT_PATH_GRAPH,
        'layout_height': 2048,
        'layout_width': 2048,
    }
    vp.pairplot(df, title, **layout_options)

In [None]:
# Pair plot of the daily frame after PCA extraction.
pairplot(extft_day_df, title='Pair-Plot - Day - Extracted Features')

In [None]:
# Pair plot of the hourly frame after PCA extraction.
pairplot(extft_hour_df, title='Pair-Plot - Hour - Extracted Features')

In [None]:
# Pair plot of the minute frame after PCA extraction.
pairplot(extft_min_df, title='Pair-Plot - Minute - Extracted Features')

### Feature Selection + Extraction

In [None]:
# Scale the selected daily features, then evaluate PCA explained variance on them.
selextft_day_df = normalize(selft_day_df)
pca_evaluation(selextft_day_df, selextft_day_df.columns, title='PCA Evaluation - Day - Selected Feature')

In [None]:
# Scale the selected hourly features, then evaluate PCA explained variance on them.
selextft_hour_df = normalize(selft_hour_df)
pca_evaluation(selextft_hour_df, selextft_hour_df.columns, title='PCA Evaluation - Hour - Selected Feature')

In [None]:
# Scale the selected minute features, then evaluate PCA explained variance on them.
selextft_min_df = normalize(selft_min_df)
pca_evaluation(selextft_min_df, selextft_min_df.columns, title='PCA Evaluation - Minute - Selected Feature')

### Export Feature

In [None]:
def export_feature(df, filename):
    """Write a rounded copy of `df` to ``OUT_PATH_FILE`` via ``fp.generate_csv``.

    Every column except ``datetime`` is rounded to 5 decimals on a copy, so
    `df` itself is left untouched. The file is written without the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame to export.
    filename : str
        Passed to ``fp.generate_csv`` as ``out_filename``.
    """
    rounded_df = df.copy()

    for column in list(rounded_df.columns):
        if column == 'datetime':
            continue
        rounded_df[column] = np.round(rounded_df[column], 5)

    fp.generate_csv(rounded_df, out_path=OUT_PATH_FILE, out_filename=filename, export_index=False)

In [None]:
# Export the PCA-extracted feature frames (rounded by export_feature) to OUT_PATH_FILE and time the export.
EXEC_START = time.time()

export_feature(extft_day_df, f'DAT_ASCII_{currency_pair}_Day_Feature.csv')
export_feature(extft_hour_df, f'DAT_ASCII_{currency_pair}_Hour_Feature.csv')
export_feature(extft_min_df, f'DAT_ASCII_{currency_pair}_Minute_Feature.csv')

EXEC_END = time.time()
time_taken(EXEC_END - EXEC_START)