In [12]:
import os, glob
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
folder = "data/2021-03-21/"
files = glob.glob(os.path.join(folder, "*.csv"))
len(files)

483

In [82]:
def preprocess_df(data):
    # fix columns
    data.columns = list(map(lambda x: x.split('. ')[-1], data.columns))
    data = data[['open', 'close']]
    
    # sort
    data = data.sort_index(ascending=True)
    
    return data

def fix_data_for_date(df):
    df = preprocess_df(df)
    
    # remove boundaries with big gaps in data
    index = df.index
    diff = np.array(list(map(lambda x: x.seconds, index[1:] - index[:-1])))
    max_gap = 300 # in seconds 
    diff_index = np.where(diff > max_gap)[0] 
    try:
        start = diff_index[diff_index < df.shape[0]//2].max()
    except Exception:
        start = -1
    try:
        end = diff_index[diff_index > df.shape[0]//2].min()
    except Exception:
        end = df.shape[0]
    df = df.iloc[start+1:end] 

    # linear interpolation
    index = df.index
    diff = np.array(list(map(lambda x: x.seconds, index[1:] - index[:-1]))) / 60
    if diff.max() > 1:
        diff = np.array(list(map(lambda x: x.seconds, index - index.min()))) / 60
        x = np.arange(diff[0], diff[-1], 1)
        opening = np.interp(x, diff, df['open'])
        closing = np.interp(x, diff, df['close'])
        df = pd.DataFrame({"open": opening,
                           "close": closing
                          },
                          index=index.min() + x.astype(np.timedelta64)*10**9*60)
    return df
    
def fix_data(file):
    df = pd.read_csv(file, index_col=0, parse_dates=True)
    dates = list(set(list(map(lambda x: x.date(), df.index))))
    dates.sort()

    # interpolate the data 
    new_df = None
    if df.shape[0] > 4000:
        for date in dates:
            _data = df.loc[list(filter(lambda x: str(date) in x.__str__(), df.index))].sort_index(ascending=True)
            temp_df = fix_data_for_date(_data)
            if new_df is None:
                new_df = temp_df
            else:
                new_df = new_df.append(temp_df)
        print(file, df.shape, new_df.shape)
        new_df.to_csv("data/fixed_data/" + file.split('/')[-1])
        return True
    
count = 0
for file in files:
    if fix_data(file):
        count += 1
#     if count > 0:
#         break
print(count)

data/2021-03-21/TPR.csv (4004, 5) (3971, 2)
data/2021-03-21/FTI.csv (4610, 5) (4231, 2)
data/2021-03-21/BCS.csv (4127, 5) (3999, 2)
data/2021-03-21/C.csv (5142, 5) (4560, 2)
data/2021-03-21/HSBC.csv (4278, 5) (4016, 2)
data/2021-03-21/MOS.csv (4327, 5) (4118, 2)
data/2021-03-21/AM.csv (4116, 5) (3974, 2)
data/2021-03-21/BMY.csv (4334, 5) (4066, 2)
data/2021-03-21/AMWL.csv (4487, 5) (4186, 2)
data/2021-03-21/SPCE.csv (6939, 5) (6193, 2)
data/2021-03-21/BHC.csv (4032, 5) (3966, 2)
data/2021-03-21/UAA.csv (4047, 5) (3967, 2)
data/2021-03-21/TME.csv (4504, 5) (4064, 2)
data/2021-03-21/UNP.csv (4001, 5) (3936, 2)
data/2021-03-21/RIO.csv (4667, 5) (4225, 2)
data/2021-03-21/AMT.csv (4019, 5) (3968, 2)
data/2021-03-21/AA.csv (4658, 5) (4173, 2)
data/2021-03-21/BEKE.csv (4922, 5) (4166, 2)
data/2021-03-21/CNK.csv (4453, 5) (4104, 2)
data/2021-03-21/AZUL.csv (4114, 5) (3952, 2)
data/2021-03-21/TDOC.csv (4921, 5) (4362, 2)
data/2021-03-21/WFC.csv (5909, 5) (5290, 2)
data/2021-03-21/GNW.csv (4077,