In [7]:
! pip3 install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp310-cp310-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.0-cp310-cp310-macosx_12_0_arm64.whl (10.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading joblib-1.5.1-py3-none-any.whl (307 kB)
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.5.1 scikit-learn-1.7.0 threadpoolctl-3.6.0


In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns
import os
from datetime import datetime, timedelta
from decimal import Decimal
from itertools import compress
import pickle

In [13]:
# Load the preprocessed frame from a local pickle file and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; it will not resolve on another machine.
# NOTE(review): pickle.load executes arbitrary code from the file - only open pickles you trust.
PICKLE_PATH = '/Users/ijimin/Documents/GitHub/YOLO-Futures/data/processed/kospi200_preprocessed.pkl'
with open(PICKLE_PATH, 'rb') as pickle_file:
    df = pickle.load(pickle_file)
df.head()

Unnamed: 0,date,time,open,high,low,close,prevClose,vol
2010-02-16 09:01:00,20100216,901,207.55,207.65,207.5,207.6,207.5,3985.0
2010-02-16 09:02:00,20100216,902,207.6,207.65,207.25,207.55,207.5,5095.0
2010-02-16 09:03:00,20100216,903,207.55,207.8,207.5,207.6,207.5,2175.0
2010-02-16 09:04:00,20100216,904,207.55,207.85,207.55,207.8,207.5,1301.0
2010-02-16 09:05:00,20100216,905,207.8,208.15,207.8,208.05,207.5,3870.0


In [21]:
# Take the index of the first two rows and shift every timestamp forward by one minute.
head_index = df.iloc[0:2].index
print(head_index)
target_times = head_index + timedelta(minutes=1)

# Keep only the rows whose index value is one of the shifted timestamps.
row_mask = df.index.isin(target_times)
df_filtered = df[row_mask]

print(df_filtered)

DatetimeIndex(['2010-02-16 09:01:00', '2010-02-16 09:02:00'], dtype='datetime64[ns]', freq=None)
                         date time    open    high     low   close  prevClose  \
2010-02-16 09:02:00  20100216  902  207.60  207.65  207.25  207.55      207.5   
2010-02-16 09:03:00  20100216  903  207.55  207.80  207.50  207.60      207.5   

                        vol  
2010-02-16 09:02:00  5095.0  
2010-02-16 09:03:00  2175.0  


In [25]:
# Boolean array: True where the timestamp exactly one minute earlier is also present in the index.
idx = df.index
one_minute_later = idx + timedelta(minutes=1)
idx.isin(one_minute_later)

array([False,  True,  True, ...,  True,  True, False], shape=(941826,))

In [30]:
# String form of the first entry of `idx`.
str(idx[0])

'2010-02-16 09:01:00'

In [14]:
# Split the first 1,500 rows of `df` into five consecutive 300-row frames keyed 0..4.
# NOTE(review): the cuts are purely positional, so a chunk boundary can fall in the middle
# of a trading day - confirm that this is acceptable for the downstream per-date grouping.
df_dict = {
    chunk_no: df.iloc[300 * chunk_no:300 * (chunk_no + 1)]
    for chunk_no in range(5)
}

df_dict

{0:                          date  time    open    high     low   close  \
 2010-02-16 09:01:00  20100216   901  207.55  207.65  207.50  207.60   
 2010-02-16 09:02:00  20100216   902  207.60  207.65  207.25  207.55   
 2010-02-16 09:03:00  20100216   903  207.55  207.80  207.50  207.60   
 2010-02-16 09:04:00  20100216   904  207.55  207.85  207.55  207.80   
 2010-02-16 09:05:00  20100216   905  207.80  208.15  207.80  208.05   
 ...                       ...   ...     ...     ...     ...     ...   
 2010-02-16 13:56:00  20100216  1356  209.95  210.00  209.95  209.95   
 2010-02-16 13:57:00  20100216  1357  209.95  210.05  209.95  210.05   
 2010-02-16 13:58:00  20100216  1358  210.10  210.15  210.05  210.05   
 2010-02-16 13:59:00  20100216  1359  210.05  210.10  209.95  210.10   
 2010-02-16 14:00:00  20100216  1400  210.10  210.25  210.10  210.25   
 
                      prevClose     vol  
 2010-02-16 09:01:00      207.5  3985.0  
 2010-02-16 09:02:00      207.5  5095.0  
 2010

In [15]:
class TimeSeriesSegmenter:
    '''
    Cut per-day series into (input window, future target) samples.

    A sample is built around an "action row": the row that directly follows
    `window_size` consecutive rows of the same date. The input is that window,
    and the target is the value stamped exactly `target_step` minutes after the
    action row. Action rows whose target timestamp does not exist are skipped.

    Example:
        segmenter = TimeSeriesSegmenter(80, 10, 'close')
        segmented_df = segmenter(df_dict)
    '''

    def __init__(self, window_size: int, target_step: int, data_type: str, with_datetime: bool = True):
        '''
        Store the segmentation settings.

        Args:
            window_size: number of consecutive rows in each input window.
            target_step: how many minutes after the action row the target lies.
            data_type: name of the column the values are read from (e.g. 'close', 'open').
            with_datetime: when True, the result also carries a 'target_time' column.
        '''
        self.window_size = window_size
        self.target_step = target_step
        self.data_type = data_type
        self.with_datetime = with_datetime

    def __call__(self, dataset: dict) -> pd.DataFrame:
        '''
        Segment every DataFrame stored in `dataset` and stack the samples.

        The data is passed as a dict of DataFrames so that irregular sessions
        (circuit-breaker days, CSAT exam days, other unusual trading days) can
        easily be added or left out. Only the dict values are used.

        Args:
            dataset: mapping whose values are DataFrames accepted by
                `_segment_dataset`.

        Returns:
            pd.DataFrame with one row per sample:
            - columns 0 .. window_size-1: the input window (X)
            - 'target': the target value (y)
            - 'target_time': timestamp of the target (only if with_datetime)
        '''
        xs, ys, target_times = [], [], []

        for frame in dataset.values():
            X, y, t_time = self._segment_dataset(frame)
            xs.extend(X)
            ys.extend(y)
            target_times.extend(t_time)

        # With no samples, np.array([]) is 1-D and would lose the feature columns;
        # build an explicit (0, window_size) array so the column layout stays stable.
        features = np.array(xs) if xs else np.empty((0, self.window_size))
        segmented = pd.DataFrame(features)
        segmented['target'] = ys

        if self.with_datetime:
            segmented['target_time'] = target_times

        return segmented

    def _segment_dataset(self, df: pd.DataFrame) -> tuple[list, list, list]:
        '''
        Build the samples of a single DataFrame, one date at a time.

        Rows are grouped by the 'date' column so that no window crosses a day
        boundary. Timestamps are not assumed to be contiguous (e.g. around the
        close the stamps jump from 15:05 to 15:15), so the target of each action
        row is located by its timestamp, never by a fixed row offset. The window
        and the target of one sample therefore always belong to the same action
        row, also when rows are missing in between.

        Args:
            df: frame with a 'date' column, the `data_type` column and an index
                that supports adding a timedelta. Timestamps must be unique
                within a date (required by `Index.get_indexer`).

        Returns:
            (X, y, target_times): parallel lists holding the windows, the target
            values and the target timestamps.

        Example:
            X, y, t = self._segment_dataset(df)
        '''
        X, y, target_times = [], [], []
        step = timedelta(minutes=self.target_step)

        for _, group in df.groupby('date'):
            timeseries = group[self.data_type].values         # e.g. the 'close' column
            timestep_idx = group.index                        # timestamps of this date

            # Rows that have a complete window in front of them.
            action_pos = np.arange(self.window_size, len(timeseries))
            if action_pos.size == 0:
                continue

            # Position of the row stamped `target_step` minutes after each action row,
            # or -1 when that timestamp is not present on this date.
            target_pos = timestep_idx.get_indexer(timestep_idx[action_pos] + step)
            has_target = target_pos >= 0
            action_pos, target_pos = action_pos[has_target], target_pos[has_target]

            # The window is the `window_size` rows that end right before the action row.
            X.extend(timeseries[pos - self.window_size:pos] for pos in action_pos)
            y.extend(timeseries[target_pos])
            target_times.extend(timestep_idx[target_pos])

        return X, y, target_times
    
# Segment the chunked frames: 80-row windows over the 'close' column with target_step=1.
segmenter = TimeSeriesSegmenter(window_size=80, target_step=1, data_type='close')
ddf = segmenter(df_dict)
ddf
    

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,72,73,74,75,76,77,78,79,target,target_time
0,207.60,207.55,207.60,207.80,208.05,208.15,208.15,208.15,208.40,208.65,...,209.90,210.05,209.90,209.90,209.85,209.90,209.85,209.80,209.90,2010-02-16 10:22:00
1,207.55,207.60,207.80,208.05,208.15,208.15,208.15,208.40,208.65,208.70,...,210.05,209.90,209.90,209.85,209.90,209.85,209.80,209.85,209.75,2010-02-16 10:23:00
2,207.60,207.80,208.05,208.15,208.15,208.15,208.40,208.65,208.70,208.75,...,209.90,209.90,209.85,209.90,209.85,209.80,209.85,209.90,209.85,2010-02-16 10:24:00
3,207.80,208.05,208.15,208.15,208.15,208.40,208.65,208.70,208.75,208.65,...,209.90,209.85,209.90,209.85,209.80,209.85,209.90,209.75,209.85,2010-02-16 10:25:00
4,208.05,208.15,208.15,208.15,208.40,208.65,208.70,208.75,208.65,208.70,...,209.85,209.90,209.85,209.80,209.85,209.90,209.75,209.85,209.85,2010-02-16 10:26:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
823,209.10,209.15,209.20,209.10,209.15,209.00,209.05,209.15,209.05,209.05,...,208.50,208.50,208.55,208.60,208.60,208.60,208.75,208.75,208.70,2010-02-19 15:01:00
824,209.15,209.20,209.10,209.15,209.00,209.05,209.15,209.05,209.05,209.10,...,208.50,208.55,208.60,208.60,208.60,208.75,208.75,208.75,208.85,2010-02-19 15:02:00
825,209.20,209.10,209.15,209.00,209.05,209.15,209.05,209.05,209.10,209.05,...,208.55,208.60,208.60,208.60,208.75,208.75,208.75,208.70,208.75,2010-02-19 15:03:00
826,209.10,209.15,209.00,209.05,209.15,209.05,209.05,209.10,209.05,209.00,...,208.60,208.60,208.60,208.75,208.75,208.75,208.70,208.85,208.70,2010-02-19 15:04:00


In [74]:
# Entries of `index` that lie exactly 10 minutes after some entry at position >= 80.
# NOTE(review): `index` is not assigned in any cell visible here - presumably the
# DatetimeIndex of a single trading day; confirm before re-running on a fresh kernel.
index[index.isin(index[80:] + timedelta(minutes=10))]

DatetimeIndex(['2010-02-16 10:31:00', '2010-02-16 10:32:00',
               '2010-02-16 10:33:00', '2010-02-16 10:34:00',
               '2010-02-16 10:35:00', '2010-02-16 10:36:00',
               '2010-02-16 10:37:00', '2010-02-16 10:38:00',
               '2010-02-16 10:39:00', '2010-02-16 10:40:00',
               ...
               '2010-02-16 14:57:00', '2010-02-16 14:58:00',
               '2010-02-16 14:59:00', '2010-02-16 15:00:00',
               '2010-02-16 15:01:00', '2010-02-16 15:02:00',
               '2010-02-16 15:03:00', '2010-02-16 15:04:00',
               '2010-02-16 15:05:00', '2010-02-16 15:15:00'],
              dtype='datetime64[ns]', length=276, freq=None)

In [60]:
# Print every value of the 'action_time' column of `ddf`, one per line.
# NOTE(review): the TimeSeriesSegmenter defined in this notebook writes a 'target_time'
# column, not 'action_time' - this cell looks like it was run against an earlier version
# of the class and would raise KeyError on a fresh run; confirm which column is intended.
for i in ddf['action_time'].values:
    print(i)

2010-02-16T10:21:00.000000000
2010-02-16T10:22:00.000000000
2010-02-16T10:23:00.000000000
2010-02-16T10:24:00.000000000
2010-02-16T10:25:00.000000000
2010-02-16T10:26:00.000000000
2010-02-16T10:27:00.000000000
2010-02-16T10:28:00.000000000
2010-02-16T10:29:00.000000000
2010-02-16T10:30:00.000000000
2010-02-16T10:31:00.000000000
2010-02-16T10:32:00.000000000
2010-02-16T10:33:00.000000000
2010-02-16T10:34:00.000000000
2010-02-16T10:35:00.000000000
2010-02-16T10:36:00.000000000
2010-02-16T10:37:00.000000000
2010-02-16T10:38:00.000000000
2010-02-16T10:39:00.000000000
2010-02-16T10:40:00.000000000
2010-02-16T10:41:00.000000000
2010-02-16T10:42:00.000000000
2010-02-16T10:43:00.000000000
2010-02-16T10:44:00.000000000
2010-02-16T10:45:00.000000000
2010-02-16T10:46:00.000000000
2010-02-16T10:47:00.000000000
2010-02-16T10:48:00.000000000
2010-02-16T10:49:00.000000000
2010-02-16T10:50:00.000000000
2010-02-16T10:51:00.000000000
2010-02-16T10:52:00.000000000
2010-02-16T10:53:00.000000000
2010-02-16

In [31]:
class imeSeriesSegmenter:
    """Positional sliding-window splitter over the per-date 'close' series.

    For every 'date' group of `data`, each run of `window_size` consecutive
    'close' values is paired with the `max_trg_num` values that follow it.
    Rows are matched purely by position; timestamps are only copied through.
    """

    def __init__(self, data, window_size, max_trg_num):
        """Keep the source frame and the window / target lengths.

        Args:
            data: DataFrame with 'date', 'time' and 'close' columns.
            window_size: number of values in each input window.
            max_trg_num: number of values that follow each window as targets.
        """
        self.data = data
        self.window_size = window_size
        self.max_trg_num = max_trg_num

    def _slices(self, offset, length):
        """Collect, per date, the slice [i+offset, i+offset+length) for every window start i.

        Returns a DataFrame holding the slices plus a 'date' column and a 'time'
        column, the latter being the 'time' value of the row right after window i.
        """
        rows, dates, start_time = [], [], []

        for date, group in self.data.groupby('date'):
            closes = group['close'].values
            clock = group['time'].values

            # Number of window starts that leave room for the window and all targets.
            n_windows = len(closes) - self.max_trg_num - self.window_size + 1
            for i in range(n_windows):
                begin = i + offset
                rows.append(closes[begin:begin + length])
                dates.append(date)
                start_time.append(clock[i + self.window_size])

        frame = pd.DataFrame(np.array(rows))
        frame['date'] = dates
        frame['time'] = start_time
        return frame

    def _make_x(self):
        """Input windows: `window_size` values starting at each window start."""
        return self._slices(0, self.window_size)

    def _make_y(self):
        """Targets: the `max_trg_num` values that directly follow each window."""
        return self._slices(self.window_size, self.max_trg_num)

    def make_xy(self):
        """Build both frames, cache them on `self.x` / `self.y`, and return them as (x, y)."""
        self.x = self._make_x()
        self.y = self._make_y()
        return self.x, self.y

    def select_by_interval(self, x, y, interval):
        """Return every `interval`-th row of `x` and of `y`, starting at row 0."""
        step = slice(None, None, interval)
        return x.iloc[step, :], y.iloc[step, :]
    
# Build the positional X / y frames from `df`: 80-value windows, one target value per window.
ss = imeSeriesSegmenter(data=df, window_size=80, max_trg_num=1)
ss.make_xy()

(             0       1       2       3       4       5       6       7  \
 0       207.60  207.55  207.60  207.80  208.05  208.15  208.15  208.15   
 1       207.55  207.60  207.80  208.05  208.15  208.15  208.15  208.40   
 2       207.60  207.80  208.05  208.15  208.15  208.15  208.40  208.65   
 3       207.80  208.05  208.15  208.15  208.15  208.40  208.65  208.70   
 4       208.05  208.15  208.15  208.15  208.40  208.65  208.70  208.75   
 ...        ...     ...     ...     ...     ...     ...     ...     ...   
 740991  229.10  229.20  228.85  228.90  229.20  229.25  228.95  228.90   
 740992  229.20  228.85  228.90  229.20  229.25  228.95  228.90  228.80   
 740993  228.85  228.90  229.20  229.25  228.95  228.90  228.80  228.65   
 740994  228.90  229.20  229.25  228.95  228.90  228.80  228.65  229.10   
 740995  229.20  229.25  228.95  228.90  228.80  228.65  229.10  229.05   
 
              8       9  ...      72      73      74      75      76      77  \
 0       208.40  2

In [80]:
# Scratch dict with a single entry.
dicts = dict(kk=10)
