In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import os
import glob


In [11]:
path = 'C:/Users/makov/OneDrive/Desktop/RL project/data/charts' 
levels_path = 'C:/Users/makov/OneDrive/Desktop/RL project/data/supports_resistances.data' 

actives_data = {}
for filename in glob.glob(os.path.join(path, '*.csv')):
   with open(os.path.join(os.getcwd(), filename), 'r') as f: 
      active_name = filename.split("\\")[-1].split(".")[0].split("_")[0]
      actives_data[active_name] = f.readlines()

with open(levels_path, 'r') as f: 
      levels_lines = f.readlines()
      levels_data = {}
      active = "init"
      tmp_arr = []
      for line in levels_lines:
          tmp = line.split("\n")[0].split(",")
          if len(tmp) == 1:
                pre_active = line.split("\n")[0].split(" ")[0]
                if pre_active == "":
                     continue 
                levels_data[active] = tmp_arr if  len(tmp_arr) == 0 else np.stack(tmp_arr) 
                active = pre_active
                tmp_arr = []
          else:
                tmp_arr.append(np.array(tmp))
      _ = levels_data.pop("init")
              
levels_data.keys()         


dict_keys(['POGR', 'FIVE', 'OZON', 'MAGN', 'MAIL', 'DSKY', 'TATN', 'TATNP', 'HYDR', 'HHRU', 'TCSG', 'PHOR', 'GLTR', 'GMKN', 'CBOM', 'RUAL', 'YNDX', 'NVTK', 'MOEX', 'ALRS', 'QIWI', 'POLY', 'RSTI', 'AFKS', 'IRAO', 'MGNT', 'VTBR', 'FEES', 'PIKK', 'NLMK', 'LKOH', 'PLZL', 'UPRO', 'LSRG', 'ROSN', 'GAZP', 'MTSS', 'CHMF', 'TRNFP', 'AFLT', 'SBERP', 'SBER', 'RTKM'])

In [13]:
active = actives_data["CBOM"]
levels = levels_data["CBOM"]
# active, levels 

In [14]:
def get_level_life_time(price, date, levels_df, band_for_level):
    observed_levels = levels_df[levels_df["date"] <= date].reset_index()
    best_locality = 0
    best_level_date = None

    for i in range(len(observed_levels)):
        level = observed_levels.loc[i]
        if np.abs((float(level["lower_bound"]) - price) / price) < band_for_level or np.abs((float(level["upper_bound"]) - price) / price) < band_for_level or float(level["lower_bound"]) < price < float(level["upper_bound"]):
            if int(level["locality"]) > best_locality and (date - level["date"]).days <= 365:
                best_locality = int(level["locality"])
                best_level_date = level["date"]

    result = 0.0
    if best_level_date is not None:
        result = (date - best_level_date).days / 365
    if result > 1.0:
        result = 0.0
    return result
            

def prepara_data_for_one_timestamp(lines, levels, band_for_level = 2 / 100, scaler_for_d = 10):
    data = []
    for line in lines:
        data.append(np.array(line.split("\n")[0].split(",")).astype("float"))

    levels_data = pd.DataFrame(levels, columns = ["date", "lower_bound", 
                                                  "upper_bound", "locality", 
                                                  "strength", "weakness"])
    levels_data = levels_data.drop(["strength", "weakness"], axis = 1)
    levels_data["date"] = pd.to_datetime(levels_data["date"], format = "%Y/%m/%d" )
    
    # current open price = O = P2
    # current close price = C = P3
    # current max price = H = Pmax
    # current min price = L = Pmin
    # previous close price = P1

    data = pd.DataFrame(data, columns = ["date", "time", "P2", "Pmax", "Pmin", "P3", "V"])
    data = data.sort_values(by = ["date"])
    data["date"] = pd.to_datetime(data["date"], format = "%Y%m%d" )
    data = data.drop("time", axis = 1)
    data["day_of_week"] = data["date"].dt.day_of_week
    data = data.drop(data[data["day_of_week"] >=5].index ).reset_index()
    data = data.join(pd.get_dummies(data["day_of_week"], prefix = "day_of_week"))

    level_life_times = []
    for i in range(len(data)):
        price = data.loc[i]["P3"]
        date = data.loc[i]["date"]
        level_life_times.append(get_level_life_time(price, date, levels_data, band_for_level))
   
    data["level_lifetime"] = level_life_times


    data["P1"]    = [data["P3"].values[0]] + list(data["P3"].values)[:-1]
    data["prevV"] = [data["V"].values[0]]  + list(data["V"].values)[:-1]

    data["D1"] = (data["P2"] - data["P1"]) / data["P1"] * scaler_for_d
    data["D2"] = (data["P3"] - data["P2"]) / data["P2"] * scaler_for_d
    data["D3"] = (data["Pmax"] - data["P2"]) / data["P2"] * scaler_for_d
    data["D4"] = (data["P2"] - data["Pmin"]) / data["Pmin"] * scaler_for_d
    data["D"]  = (data["P3"] - data["P1"]) / data["P1"] * scaler_for_d
    data["VD"] = (data["V"] - data["prevV"]) / data["prevV"]

    
    data = data.drop(["index", #"date", 
                      "day_of_week", 
                      "P1", "P2", "P3", 
                      "Pmin", "Pmax", 
                      "V", "prevV"], axis = 1)
    
    return data, levels_data

def prepare_active_data(lines, levels, N, M, band_for_level = 2 / 100, scaler_for_d = 10):

    prepared_oneday_active, _ = prepara_data_for_one_timestamp(lines, levels, band_for_level = 2 / 100, scaler_for_d = 10)
    
    VDs = prepared_oneday_active["VD"].values
    Ds = prepared_oneday_active["D"].values

    Ds = np.stack([np.array([None for i in range(N)]) for j in range(N)] + [Ds[i-N:i] for  i in range(N, len(Ds))])
    VDs = np.stack([np.array([None for i in range(M)]) for j in range(M)] + [VDs[i-M:i] for  i in range(M, len(VDs))])
    # print(Ds.shape, VDs.shape)

    prepared_oneday_active[["Ds_" + str(i) for i in range(N)]] = Ds
    prepared_oneday_active[["VDs_" + str(i) for i in range(M)]] = VDs
    return prepared_oneday_active[["date"] +
                                  ["day_of_week_" + str(i) for i in range(5)] +
                                  ["level_lifetime"] +
                                  ["VDs_" + str(i) for i in range(M)] +
                                  ["Ds_" + str(i) for i in range(N)] + 
                                  ["D3", "D4", "D1", "D2"]].dropna()
data = prepare_active_data(active, levels, N = 60, M = 10, band_for_level = 2 / 100, scaler_for_d = 10)

In [16]:
len(data.columns), len(data)

(81, 1996)

In [17]:
full_data_dict = {}
for key in tqdm(actives_data.keys()):
    try:
        active = actives_data[key]
        levels = levels_data[key]

        full_data_dict[key] = prepare_active_data(active, levels, N = 60, M = 10, band_for_level = 1 / 100, scaler_for_d = 10)
    except Exception as ex:
        print(key)
        print(ex)
        

 80%|████████  | 36/45 [08:39<03:04, 20.49s/it]

SNGSP
'SNGSP'
SNGS
'SNGS'


100%|██████████| 45/45 [10:14<00:00, 13.66s/it]


In [22]:
set(actives_data.keys()).difference(set(levels_data.keys())), set(levels_data.keys()).difference(set(actives_data.keys()))

({'SNGS', 'SNGSP'}, set())

In [23]:
len(actives_data.keys()), len(actives_data.keys())

(45, 45)

In [24]:
path_to_save = "data/prepocessed_charts/"
for key in full_data_dict.keys():
    full_data_dict[key].to_csv(path_to_save + key + ".csv")

In [25]:
len(full_data_dict.keys())

43

In [26]:
actives_for_test = ["LKOH", "GAZP", "ROSN"]
actives_for_train = set(full_data_dict.keys()) - set(actives_for_test)
len(actives_for_train), len(actives_for_test)

(40, 3)

In [27]:
test_year = 2022
with open("train.data", "w") as fin:
    for active in tqdm(actives_for_train):

        active_data = full_data_dict[active].copy()
        active_data = active_data[active_data["date"].dt.year < test_year]
        active_data = active_data.drop("date", axis = 1)

        fin.write(active + " D " + str(len(active_data)) + "\n" )
        for x in active_data.values:
            tmp1 = np.array2string(
                x[:5], 
                separator = "," , 
                max_line_width = 1e10,
                formatter={"all": lambda x: "%i" %float(x)}
                )[1:-1]
            tmp2 = np.array2string(
                    x[6:],
                    separator = "," ,
                    max_line_width = 1e10,
                    formatter={"all": lambda x: "%+.6f" %float(x)}
                    )[1:-1]
            fin.write(tmp1 + "," + "%.6f" %float(x[5]) + "," +  tmp2 + "\n")

100%|██████████| 40/40 [00:13<00:00,  3.01it/s]


In [28]:
for active in actives_for_test:
    with open(f"test_{active}.data", "w") as fin:
        active_data = full_data_dict[active].copy()
        # active_data = active_data[active_data["date"].dt.year > test_year]
        active_data = active_data.drop("date", axis = 1)

        fin.write(active + " D " + str(len(active_data)) + "\n" )
        for x in active_data.values:
            tmp1 = np.array2string(
                x[:5], 
                separator = "," , 
                max_line_width = 1e10,
                formatter={"all": lambda x: "%i" %float(x)}
                )[1:-1]
            tmp2 = np.array2string(
                    x[6:],
                    separator = "," ,
                    max_line_width = 1e10,
                    formatter={"all": lambda x: "%+.6f" %float(x)}
                    )[1:-1]
            fin.write(tmp1 + "," + "%.6f" %float(x[5]) + "," +  tmp2 + "\n")

    

In [29]:
for active in actives_for_test:
    with open(f"test_{active}_last.data", "w") as fin:
        active_data = full_data_dict[active].copy()
        active_data = active_data[active_data["date"].dt.year > test_year]
        active_data = active_data.drop("date", axis = 1)

        fin.write(active + " D " + str(len(active_data)) + "\n" )
        for x in active_data.values:
            tmp1 = np.array2string(
                x[:5], 
                separator = "," , 
                max_line_width = 1e10,
                formatter={"all": lambda x: "%i" %float(x)}
                )[1:-1]
            tmp2 = np.array2string(
                    x[6:],
                    separator = "," ,
                    max_line_width = 1e10,
                    formatter={"all": lambda x: "%+.6f" %float(x)}
                    )[1:-1]
            fin.write(tmp1 + "," + "%.6f" %float(x[5]) + "," +  tmp2 + "\n")

    