In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Prepend the directory two levels above this notebook to the import path;
# presumably that is where the `src` package imported in the next cell lives.
sys.path.insert(0, "./../../")

In [3]:
from src import utils

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [5]:
# Load the raw price/volume table (date, open, high, low, close, volume)
# from the parquet file two directories above this notebook.
df = pd.read_parquet("./../../data/qqq.parquet")

In [6]:
# Order rows chronologically (oldest first) so head() shows the earliest
# record and tail() the most recent one.
df = df.sort_values('date', ascending=True)

In [7]:
# Earliest rows after the date sort.
df.head()

Unnamed: 0,date,open,high,low,close,volume
6070,1999-11-01,131.5,133.1,130.6,130.8,4840900.0
6069,1999-11-02,131.5,133.1,130.4,130.9,6417400.0
6068,1999-11-03,132.8,134.3,132.4,133.5,9376300.0
6067,1999-11-04,135.4,135.6,133.6,135.0,10024300.0
6066,1999-11-05,137.8,138.4,136.4,136.4,7567300.0


In [8]:
# Most recent rows after the date sort.
df.tail()

Unnamed: 0,date,open,high,low,close,volume
4,2023-12-11,391.74,395.79,391.53,395.52,41198021.0
3,2023-12-12,395.56,398.79,394.41,398.67,39221152.0
2,2023-12-13,399.62,404.36,398.8399,403.74,55408129.0
1,2023-12-14,404.98,406.3,400.34,403.39,55447808.0
0,2023-12-15,404.18,406.5399,403.57,405.34,62896645.0


In [9]:
# Numeric columns that every transform in this notebook is applied to;
# the `date` column is not part of this list and is left untouched.
features_cols = ["open", "high", "low", "close", "volume"]

### (1) Log Transformation

In [10]:
# Replace every feature column with its natural logarithm.
#
# Guard first: np.log returns -inf for 0 and NaN for negative inputs without
# raising, and such values would silently corrupt the min/max range that the
# later scaling steps are fitted on.
# NOTE: this cell is not idempotent -- re-running it without reloading `df`
# applies the logarithm a second time.
has_non_positive = (df[features_cols] <= 0).any()
assert not has_non_positive.any(), (
    "log transform needs strictly positive values; offending columns: "
    f"{has_non_positive[has_non_positive].index.tolist()}"
)
df[features_cols] = np.log(df[features_cols])

In [11]:
# Same leading rows as before, now in log space.
df.head()

Unnamed: 0,date,open,high,low,close,volume
6070,1999-11-01,4.879007,4.891101,4.872139,4.873669,15.392611
6069,1999-11-02,4.879007,4.891101,4.870607,4.874434,15.674524
6068,1999-11-03,4.888844,4.900076,4.885828,4.894101,16.053696
6067,1999-11-04,4.908233,4.909709,4.89485,4.905275,16.120523
6066,1999-11-05,4.925803,4.930148,4.915592,4.915592,15.839347


### (2) Define Upper Limit

In [12]:
# One-row frame that will carry the synthetic upper limit of each feature.
# The far-future date string marks the row as a placeholder, not an observation.
df_tmp = pd.DataFrame({'date': ['9999-12-31']})

In [13]:
# Headroom factor: fraction of a feature's observed (max - min) range that is
# added on top of its maximum when the upper limit is computed.
expand_range = 0.3

In [14]:
def get_uppper_value(values, expand_range):
    """Return an upper bound placed above the largest entry of ``values``.

    The bound is ``max + (max - min) * expand_range``, i.e. the observed
    maximum padded by a fraction of the observed range.

    Args:
        values: Re-iterable collection of comparable numbers (list, pandas
            Series, ...). It is scanned once for the maximum and once for
            the minimum.
        expand_range: Fraction of the observed range to add as headroom;
            ``0`` returns the maximum itself.

    Returns:
        The padded upper bound.

    Raises:
        ValueError: If ``values`` is empty (raised by the built-in ``max``).
    """
    # Compute each extreme exactly once; the maximum is needed both for the
    # range and for the final offset.
    highest = max(values)
    lowest = min(values)
    return highest + (highest - lowest) * expand_range

In [15]:
# Fill the placeholder row with one padded upper limit per feature, computed
# from the (log-transformed) values currently held in `df`.
for feature in features_cols:
    upper_limit = get_uppper_value(df[feature], expand_range)
    df_tmp[feature] = [upper_limit]

In [16]:
# Inspect the computed upper limits (single placeholder row).
df_tmp

Unnamed: 0,date,open,high,low,close,volume
0,9999-12-31,6.909063,6.910341,6.905357,6.906526,21.749782


In [17]:
# Append the placeholder row as the LAST row of the data.
# Index labels are kept as-is, so label 0 now occurs twice (the newest real
# row and the placeholder); anything that needs to single out the placeholder
# must address it by position, not by label.
# NOTE: re-running this cell appends another placeholder row.
df = pd.concat([df, df_tmp], axis=0, ignore_index=False)

In [18]:
# The placeholder row (date 9999-12-31) should now be the final row.
df.tail()

Unnamed: 0,date,open,high,low,close,volume
3,2023-12-12,5.980302,5.988435,5.977391,5.988134,17.484727
2,2023-12-13,5.990514,6.002306,5.98856,6.000771,17.830237
1,2023-12-14,6.003838,6.007092,5.992314,5.999904,17.830953
0,2023-12-15,6.00186,6.007682,6.00035,6.004726,17.957003
0,9999-12-31,6.909063,6.910341,6.905357,6.906526,21.749782


### (3) Min-Max Scaling

In [19]:
# Maps feature name -> the scaler fitted on that feature's column.
scaler_dict = dict()

In [20]:
# Fit an independent min-max scaler per feature and overwrite the column with
# its scaled values. Because `df` includes the placeholder row at this point,
# that row defines the maximum and therefore maps to 1.0.
for feature in features_cols:
    feature_scaler = MinMaxScaler()
    # The scaler expects a 2-D array, hence the double-bracket selection.
    column_2d = df[[feature]].to_numpy()
    df[feature] = feature_scaler.fit_transform(column_2d)
    # Keep the fitted scaler so the scaling can be inverted later.
    scaler_dict[feature] = feature_scaler

In [21]:
# Scaled values; the placeholder row should read 1.0 in every feature column.
df.tail()

Unnamed: 0,date,open,high,low,close,volume
3,2023-12-12,0.76285,0.762912,0.763376,0.764985,0.348084
2,2023-12-13,0.765457,0.766479,0.766224,0.768219,0.400896
1,2023-12-14,0.768859,0.76771,0.767182,0.767997,0.401005
0,2023-12-15,0.768354,0.767862,0.769231,0.769231,0.420272
0,9999-12-31,1.0,1.0,1.0,1.0,1.0


In [22]:
# Persist the per-feature scalers so the scaling can be reproduced or inverted
# outside this notebook.
# NOTE(review): presumably `utils.save_object` pickles the object to the given
# path -- confirm in src/utils. The path is relative to the notebook's working
# directory; whether `./artifacts/` is created automatically is not visible here.
utils.save_object("./artifacts/feature_scaler.pkl", scaler_dict)

In [23]:
# Remove the placeholder row again now that the scalers are fitted.
# This is done by position (everything except the last row): the placeholder
# shares index label 0 with a real row, so a label-based drop would be wrong.
df_data = df.iloc[:-1, :].copy()

In [24]:
# Earliest rows of the final scaled data set.
df_data.head()

Unnamed: 0,date,open,high,low,close,volume
6070,1999-11-01,0.481644,0.480709,0.481546,0.479795,0.028304
6069,1999-11-02,0.481644,0.480709,0.481156,0.47999,0.071394
6068,1999-11-03,0.484156,0.483017,0.485037,0.485023,0.129351
6067,1999-11-04,0.489107,0.485494,0.487338,0.487883,0.139565
6066,1999-11-05,0.493593,0.49075,0.492627,0.490523,0.096587


In [25]:
# Latest rows of the final scaled data set; the placeholder row must be gone.
df_data.tail()

Unnamed: 0,date,open,high,low,close,volume
4,2023-12-11,0.760372,0.76097,0.761508,0.762955,0.355601
3,2023-12-12,0.76285,0.762912,0.763376,0.764985,0.348084
2,2023-12-13,0.765457,0.766479,0.766224,0.768219,0.400896
1,2023-12-14,0.768859,0.76771,0.767182,0.767997,0.401005
0,2023-12-15,0.768354,0.767862,0.769231,0.769231,0.420272


In [26]:
# Write the log-transformed, min-max scaled data (without the placeholder row)
# to disk; the path is relative to the notebook's working directory.
df_data.to_parquet('./artifacts/processed_data.parquet')

In [27]:
# Round-trip sanity check: undo the min-max scaling, then undo the log, and
# display the result -- it should reproduce the original close prices.
close_log = scaler_dict['close'].inverse_transform(df_data[['close']])
np.exp(close_log)

array([[130.8 ],
       [130.9 ],
       [133.5 ],
       ...,
       [403.74],
       [403.39],
       [405.34]])