In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

from scipy import optimize

import tensorflow as tf
import torch
import os
# Work from the parent of the directory the kernel was started in.
# NOTE(review): presumably the notebook lives one level below the project root
# and the relative paths / local imports in later cells expect the root — confirm.
# A bare os.chdir('..') climbs one more level every time this cell is re-run,
# so remember the starting directory once and always resolve the parent from it.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [2]:
import warnings
# Suppress every warning for the rest of the session (no category or module
# filter is given, so the 'ignore' action applies to all of them).
# NOTE(review): this also hides deprecation and data-handling warnings raised by
# the libraries imported above; consider narrowing the filter while developing.
warnings.filterwarnings('ignore')

In [3]:
from IPython.display import clear_output
import datetime
import matplotlib as mpl
from matplotlib import cm
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline, FeatureUnion

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [4]:
# `my_list` is the collection iterated by the processing loop further down,
# one `ticker` per element.
from download_daily_data import my_list
# Materialise it as a plain list (its original type is defined in
# download_daily_data and is not visible from this notebook).
my_list = list(my_list)

In [6]:
from supervised.utils import get_numeric_categoric, delta_dataframe_with_y_columns

In [10]:
def get_transfomed_combiner(df):
    """Fit a FeatureUnion of four scalers/normalisers on ``df`` and return it.

    The union holds, in this order: min-max scaling, max-abs scaling, a
    quantile transform with a uniform output distribution, and a Normalizer
    (labelled sample-wise L2). Per the original author's note, only the
    transforms that worked well in the autoencoder are included.

    Parameters
    ----------
    df
        Data the transformers are fitted on (whatever the scikit-learn
        transformers accept, e.g. a DataFrame of numeric columns).

    Returns
    -------
    FeatureUnion
        The union, already fitted on ``df``.
    """
    steps = []
    steps.append(('Data after min-max scaling', MinMaxScaler()))
    steps.append(('Data after max-abs scaling', MaxAbsScaler()))
    steps.append(('Data after quantile transformation (uniform pdf)',
                  QuantileTransformer(output_distribution='uniform')))
    steps.append(('Data after sample-wise L2 normalizing', Normalizer()))

    union = FeatureUnion(steps)
    union.fit(df)
    return union


def get_input_target(ticker):
    """Build transformed train/test features and targets for one ticker.

    Steps:
      1. Load the train/test frames and column lists via
         ``ohlc_train_df_test_df(ticker)``.
      2. Split the numeric columns into targets (``y_cols``), ignored columns
         and features, using ``ohlc_get_y_cols``.
      3. Keep only the first two target columns.
      4. Fit the transformer union on the training features only, then apply
         it to both train and test features and cast the result to float32.

    NOTE(review): ``ohlc_train_df_test_df`` and ``ohlc_get_y_cols`` are not
    defined or imported in the visible cells — confirm where they come from
    so the notebook survives a fresh-kernel run.

    Parameters
    ----------
    ticker
        Identifier passed straight through to ``ohlc_train_df_test_df``.

    Returns
    -------
    tuple
        ``(x_train_transformed, x_test_transformed, y_train, y_test)``, or
        ``(None, None, None, None)`` when no training frame is available for
        the ticker.
    """
    train_df_original, test_df_original, numeric_cols, _categoric_cols = ohlc_train_df_test_df(ticker)
    if train_df_original is None:
        return None, None, None, None

    y_cols, not_interested = ohlc_get_y_cols(numeric_cols)
    # sorted() already returns a list; sorting keeps the feature order deterministic.
    feature_cols = sorted(set(numeric_cols) - set(y_cols) - set(not_interested))

    train_df = train_df_original[feature_cols]
    test_df = test_df_original[feature_cols]

    # Keep only the first two target columns. Use a non-in-place drop: calling
    # drop(inplace=True) on a frame selected from another frame triggers
    # pandas' SettingWithCopyWarning.
    y_train = train_df_original[y_cols]
    y_train = y_train.drop(columns=y_train.columns[2:])
    y_test = test_df_original[y_cols]
    y_test = y_test.drop(columns=y_test.columns[2:])

    # Fit on the training split only, so the test split does not influence
    # the fitted scaling parameters.
    combined = get_transfomed_combiner(train_df)

    x_train_transformed = combined.transform(train_df).astype(np.float32)
    x_test_transformed = combined.transform(test_df).astype(np.float32)

    return x_train_transformed, x_test_transformed, y_train, y_test

In [9]:
import pickle
from collections import defaultdict

# Tickers whose pickle files have been written. The processing loop skips any
# ticker already present as a key, so re-running the loop resumes where it
# stopped. (A membership test on a defaultdict does not insert the key.)
ticker_dict = defaultdict(bool)

In [12]:
# Suffixes of the four per-ticker pickle files, in the same order as the tuple
# returned by get_input_target. Constant, so built once outside the loop.
file_names = ('_x_train', '_x_test',
              '_y_train', '_y_test')

# open(..., 'wb') does not create missing directories, and the write happens
# outside the try block below, so make sure the output folder exists up front.
output_dir = 'data/ohlc_processed/'
os.makedirs(output_dir, exist_ok=True)

for ticker in my_list:

    # Already written during this session: skip.
    if ticker in ticker_dict:
        continue

    print('Processing: {}...'.format(ticker))

    try:
        data_list = get_input_target(ticker)
    except Exception as e:
        # Best-effort batch: report the failure and move on to the next ticker.
        # repr() keeps the exception type, which a bare message often omits.
        print('Failed {}: {!r}'.format(ticker, e))
        continue

    # get_input_target returns a tuple of Nones when there is no training data.
    if data_list[0] is None:
        continue

    for file_name, data in zip(file_names, data_list):
        f_name = output_dir + ticker + file_name + '.pickle'
        with open(f_name, 'wb') as handle:
            pickle.dump(data, handle)

    # Mark as done only after all four files have been written.
    ticker_dict[ticker] = True