In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


'''
None || 20000
'''
nb = None
nb_trains = nb
nb_valids = nb
nb_tests = nb

In [2]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [3]:
# cd drive/MyDrive/Projects/JPX_Tokyo_Stock/working

#### Load data

In [4]:
from data_utils import *

In [5]:
prices = pd.read_csv('../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv', parse_dates=True, nrows = nb_trains)
sprices = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/secondary_stock_prices.csv", parse_dates=True, nrows = nb_trains)

supplemental_prices = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv", parse_dates=True, nrows = nb_valids)
supplemental_sprices = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/supplemental_files/secondary_stock_prices.csv", parse_dates=True, nrows = nb_valids)

In [6]:
train_df = pd.concat([prices, sprices]).reset_index(drop= True)
valid_df = supplemental_sprices.reset_index(drop= True)
test_df = supplemental_prices.reset_index(drop= True)
official_test = pd.read_csv('../input/jpx-tokyo-stock-exchange-prediction/example_test_files/stock_prices.csv', nrows = nb_tests, parse_dates=['Date'])

In [7]:
print(train_df.shape)
print(valid_df.shape)
print(test_df.shape) 

(4717106, 12)
(242704, 12)
(229958, 12)


In [8]:
print(train_df.columns)
print(valid_df.columns)
print(test_df.columns)

Index(['RowId', 'Date', 'SecuritiesCode', 'Open', 'High', 'Low', 'Close',
       'Volume', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag',
       'Target'],
      dtype='object')
Index(['RowId', 'Date', 'SecuritiesCode', 'Open', 'High', 'Low', 'Close',
       'Volume', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag',
       'Target'],
      dtype='object')
Index(['RowId', 'Date', 'SecuritiesCode', 'Open', 'High', 'Low', 'Close',
       'Volume', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag',
       'Target'],
      dtype='object')


#### Get CloseT1, CloseT2; remove missing value

In [9]:
def fillCloseT1T2(_df: pd.DataFrame):
    assert all([x in _df.columns for x in ['Target', 'CloseT1', 'CloseT2']])

    def calCloseT2fromT1(target, closeT1):
        '''
        target = ( close(t + 2) - close(t + 1) )/close(t + 1)
        => close(t + 2) = target * close(t + 1) + close(t + 1)
        '''
        return target * closeT1 + closeT1

    df = _df.copy()

    sorted_tmp = pd.DataFrame()
    for code in df['SecuritiesCode'].unique():
        df_tmp = df[['SecuritiesCode', 'Date', 'CloseT1']][df['SecuritiesCode'] == code]
        df_tmp = df_tmp.sort_values(by=['Date'], ascending= True)
        df_tmp = fillInterpolate(df_tmp, ['CloseT1'])
        sorted_tmp = sorted_tmp.append(df_tmp)

    # left or right 
    df = df.drop('CloseT1', axis = 1).merge(
        sorted_tmp,
        how='left', on=['SecuritiesCode', 'Date']
    )

    df = df.fillna({"CloseT1": df['Close']})

    df['CloseT2'] = df.apply(lambda x: calCloseT2fromT1(x['Target'], x['CloseT1']) if np.isnan(x['CloseT2']) else x['CloseT2'], axis = 1)
    return df

In [10]:
def DataPreprocessing_for_Train(_df: pd.DataFrame):
    df = _df.copy() 
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.dropna(axis=0, subset=['Target'] )

    df_return = df[['Date', 'SecuritiesCode', 'Volume', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag', 'Close', 'Target']]
    df_return['ExpectedDividend'] = df_return['ExpectedDividend'].fillna(-1)

    # add close of t + 1 and t + 2; use as Target
    t = df_return[['SecuritiesCode', 'Date', 'Close']].shift(-1)
    t['Date'] = t['Date'] - pd.Timedelta("1 day")
    t.columns = t.columns.str.replace('Close', 'CloseT1')
    t = t.iloc[:-1]
    df_return = pd.merge(df_return, t, how='left', on= ['SecuritiesCode', 'Date'])

    t = df_return[['SecuritiesCode', 'Date', 'Close']].shift(-2)
    t['Date'] = t['Date'] - pd.Timedelta("2 day")
    t.columns = t.columns.str.replace('Close', 'CloseT2')
    t = t.iloc[:-2]
    df_return = pd.merge(df_return, t, how='left', on= ['SecuritiesCode', 'Date'])

    df_return = df_return.drop('Close', axis= 1)

    sorted_tmp = pd.DataFrame()
    for code in df_return['SecuritiesCode'].unique():
        df_tmp = df[['SecuritiesCode', 'Date', 'Open', 'High', 'Low', 'Close']][df['SecuritiesCode'] == code]
        df_tmp = df_tmp.sort_values(by=['Date'], ascending= True)

        sorted_tmp = sorted_tmp.append(df_tmp)

    df_return = df_return.merge(
        sorted_tmp,
        how='left', on=['SecuritiesCode', 'Date']
    )
    
    df_return.dropna(subset = df_return.columns.drop(['CloseT1', 'CloseT2']), axis = 0, inplace = True)    

    df_return = fillCloseT1T2(df_return)
    return df_return

In [11]:
def DataPreprocessing_for_Valid(_df: pd.DataFrame, df_train: pd.DataFrame):
    df = _df.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.dropna(axis=0, subset=['Target'] )

    df_return = df[['Date', 'SecuritiesCode', 'Volume', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag', 'Close', 'Target']]
    df_return['ExpectedDividend'] = df_return['ExpectedDividend'].fillna(-1)

    # add close of t + 1 and t + 2; use as Target
    t = df_return[['SecuritiesCode', 'Date', 'Close']].shift(-1)
    t['Date'] = t['Date'] - pd.Timedelta("1 day")
    t.columns = t.columns.str.replace('Close', 'CloseT1')
    t = t.iloc[:-1]
    df_return = pd.merge(df_return, t, how='left', on= ['SecuritiesCode', 'Date'])

    t = df_return[['SecuritiesCode', 'Date', 'Close']].shift(-2)
    t['Date'] = t['Date'] - pd.Timedelta("2 day")
    t.columns = t.columns.str.replace('Close', 'CloseT2')
    t = t.iloc[:-2]
    df_return = pd.merge(df_return, t, how='left', on= ['SecuritiesCode', 'Date'])

    df_return = df_return.drop('Close', axis= 1)

    sorted_tmp = pd.DataFrame()
    for code in df['SecuritiesCode'].unique():
        df_tmp = df_train[['SecuritiesCode', 'Date', 'Open', 'High', 'Low', 'Close']][df_train['SecuritiesCode'] == code].sort_values(by=['Date'], ascending= True).iloc[-15:].append(
                    df[['SecuritiesCode', 'Date', 'Open', 'High', 'Low', 'Close']][df['SecuritiesCode'] == code])     
        df_tmp = df_tmp.sort_values(by=['Date'], ascending= True)

        sorted_tmp = sorted_tmp.append(df_tmp)
    
    df_return = df_return.merge(
        sorted_tmp,
        how='left', on=['SecuritiesCode', 'Date']
    )

    df_return.dropna(subset = df_return.columns.drop(['CloseT1', 'CloseT2']), axis = 0, inplace = True)    

    df_return = fillCloseT1T2(df_return)
    return df_return

In [12]:
def DataPreprocessing_for_TestData(_df: pd.DataFrame, df_train: pd.DataFrame):
    df = _df.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.dropna(axis=0, subset=['Target'] )  

    df_return = df[['Date', 'SecuritiesCode', 'Volume', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag', 'Close', 'Target']]
    df_return['ExpectedDividend'] = df_return['ExpectedDividend'].fillna(-1)

    # add close of t + 1 and t + 2; use as Target
    t = df_return[['SecuritiesCode', 'Date', 'Close']].shift(-1)
    t['Date'] = t['Date'] - pd.Timedelta("1 day")
    t.columns = t.columns.str.replace('Close', 'CloseT1')
    t = t.iloc[:-1]
    df_return = pd.merge(df_return, t, how='left', on= ['SecuritiesCode', 'Date'])

    t = df_return[['SecuritiesCode', 'Date', 'Close']].shift(-2)
    t['Date'] = t['Date'] - pd.Timedelta("2 day")
    t.columns = t.columns.str.replace('Close', 'CloseT2')
    t = t.iloc[:-2]
    df_return = pd.merge(df_return, t, how='left', on= ['SecuritiesCode', 'Date'])

    df_return = df_return.drop('Close', axis= 1)

    sorted_tmp = pd.DataFrame()
    for code in df['SecuritiesCode'].unique():
        df_tmp = df_train[['SecuritiesCode', 'Date', 'Open', 'High', 'Low', 'Close']][df_train['SecuritiesCode'] == code].sort_values(by=['Date'], ascending= True).iloc[-15:].append(
            df[['SecuritiesCode', 'Date', 'Open', 'High', 'Low', 'Close']][df['SecuritiesCode'] == code])
        df_tmp = df_tmp.sort_values(by=['Date'], ascending= True)
        df_tmp = fillInterpolate(df_tmp, ['Open', 'High', 'Low', 'Close'])

        sorted_tmp = sorted_tmp.append(df_tmp)

    df_return = df_return.merge(
        sorted_tmp,
        how='left', on=['SecuritiesCode', 'Date']
    )

    df_return = fillCloseT1T2(df_return)
    
    y_true = df_return[['SecuritiesCode', 'Date', 'CloseT1', 'CloseT2', 'Target']]
    df_return = df_return.drop(['CloseT1', 'CloseT2', 'Target'], axis = 1)
    return df_return, y_true

In [13]:
train_df2 = DataPreprocessing_for_Train(train_df)
train_df2.to_csv('../input/1_after_datapreprocessing/train_full.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [14]:
valid_df2 = DataPreprocessing_for_Valid(valid_df, train_df2)
valid_df2.to_csv('../input/1_after_datapreprocessing/valid_full.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [15]:
test_df2, y_test = DataPreprocessing_for_TestData(test_df, train_df2)
test_df2.to_csv('../input/1_after_datapreprocessing/test_full.csv', index = False)
y_test.to_csv('../input/1_after_datapreprocessing/y_test_full.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [16]:
off_test2 = DataPreprocessing_for_official_test(official_test, train_df2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_return['ExpectedDividend'] = df_return['ExpectedDividend'].fillna(-1)


In [17]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4717106 entries, 0 to 4717105
Data columns (total 12 columns):
 #   Column            Dtype  
---  ------            -----  
 0   RowId             object 
 1   Date              object 
 2   SecuritiesCode    int64  
 3   Open              float64
 4   High              float64
 5   Low               float64
 6   Close             float64
 7   Volume            int64  
 8   AdjustmentFactor  float64
 9   ExpectedDividend  float64
 10  SupervisionFlag   bool   
 11  Target            float64
dtypes: bool(1), float64(7), int64(2), object(2)
memory usage: 400.4+ MB


In [18]:
train_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4617191 entries, 0 to 4617190
Data columns (total 13 columns):
 #   Column            Dtype         
---  ------            -----         
 0   Date              datetime64[ns]
 1   SecuritiesCode    int64         
 2   Volume            int64         
 3   AdjustmentFactor  float64       
 4   ExpectedDividend  float64       
 5   SupervisionFlag   bool          
 6   Target            float64       
 7   CloseT2           float64       
 8   Open              float64       
 9   High              float64       
 10  Low               float64       
 11  Close             float64       
 12  CloseT1           float64       
dtypes: bool(1), datetime64[ns](1), float64(9), int64(2)
memory usage: 462.3 MB


In [19]:
train_df[train_df['SecuritiesCode'] == 1387][['Date', 'SecuritiesCode', 'Open', 'High', 'Low', 'Close']]

Unnamed: 0,Date,SecuritiesCode,Open,High,Low,Close
2332571,2017-01-04,1387,13930.0,14000.0,13930.0,14000.0
2334463,2017-01-05,1387,14010.0,14010.0,13930.0,14000.0
2336355,2017-01-06,1387,,,,
2338247,2017-01-10,1387,,,,
2340139,2017-01-11,1387,14010.0,14010.0,14010.0,14010.0
...,...,...,...,...,...,...
4706701,2021-11-29,1387,18980.0,19415.0,18980.0,19415.0
4708789,2021-11-30,1387,19265.0,19265.0,19265.0,19265.0
4710878,2021-12-01,1387,,,,
4712967,2021-12-02,1387,18525.0,18865.0,18525.0,18845.0


In [20]:
train_df2[train_df2['SecuritiesCode'] == 1387][['Date', 'SecuritiesCode', 'Open', 'High', 'Low', 'Close']]

Unnamed: 0,Date,SecuritiesCode,Open,High,Low,Close
2324963,2017-01-04,1387,13930.0,14000.0,13930.0,14000.0
2326805,2017-01-05,1387,14010.0,14010.0,13930.0,14000.0
2332344,2017-01-11,1387,14010.0,14010.0,14010.0,14010.0
2343304,2017-01-19,1387,13800.0,13800.0,13800.0,13800.0
2348720,2017-01-24,1387,13720.0,13720.0,13700.0,13700.0
...,...,...,...,...,...,...
4600947,2021-11-24,1387,19830.0,19830.0,19830.0,19830.0
4605003,2021-11-26,1387,19580.0,19580.0,19540.0,19540.0
4607039,2021-11-29,1387,18980.0,19415.0,18980.0,19415.0
4609091,2021-11-30,1387,19265.0,19265.0,19265.0,19265.0


In [21]:
train_df.head(5)

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
0,20170104_1301,2017-01-04,1301,2734.0,2755.0,2730.0,2742.0,31400,1.0,,False,0.00073
1,20170104_1332,2017-01-04,1332,568.0,576.0,563.0,571.0,2798500,1.0,,False,0.012324
2,20170104_1333,2017-01-04,1333,3150.0,3210.0,3140.0,3210.0,270800,1.0,,False,0.006154
3,20170104_1376,2017-01-04,1376,1510.0,1550.0,1510.0,1550.0,11300,1.0,,False,0.011053
4,20170104_1377,2017-01-04,1377,3270.0,3350.0,3270.0,3330.0,150800,1.0,,False,0.003026


In [22]:
train_df2.head(5)

Unnamed: 0,Date,SecuritiesCode,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,CloseT2,Open,High,Low,Close,CloseT1
0,2017-01-04,1301,31400,1.0,-1.0,False,0.00073,2740.0,2734.0,2755.0,2730.0,2742.0,2738.0
1,2017-01-04,1332,2798500,1.0,-1.0,False,0.012324,575.0,568.0,576.0,563.0,571.0,568.0
2,2017-01-04,1333,270800,1.0,-1.0,False,0.006154,3270.0,3150.0,3210.0,3140.0,3210.0,3250.0
3,2017-01-04,1376,11300,1.0,-1.0,False,0.011053,1555.0,1510.0,1550.0,1510.0,1550.0,1538.0
4,2017-01-04,1377,150800,1.0,-1.0,False,0.003026,3315.0,3270.0,3350.0,3270.0,3330.0,3305.0


In [23]:
valid_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 233748 entries, 0 to 233747
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Date              233748 non-null  datetime64[ns]
 1   SecuritiesCode    233748 non-null  int64         
 2   Volume            233748 non-null  int64         
 3   AdjustmentFactor  233748 non-null  float64       
 4   ExpectedDividend  233748 non-null  float64       
 5   SupervisionFlag   233748 non-null  bool          
 6   Target            233748 non-null  float64       
 7   CloseT2           233748 non-null  float64       
 8   Open              233748 non-null  float64       
 9   High              233748 non-null  float64       
 10  Low               233748 non-null  float64       
 11  Close             233748 non-null  float64       
 12  CloseT1           233748 non-null  float64       
dtypes: bool(1), datetime64[ns](1), float64(9), int64(2)
memory 

In [24]:
valid_df.head(5)

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
0,20211206_1305,2021-12-06,1305,2061.5,2065.0,2041.5,2048.0,104900,1.0,,False,0.005013
1,20211206_1306,2021-12-06,1306,2037.0,2041.0,2018.5,2025.0,1427480,1.0,,False,0.005793
2,20211206_1308,2021-12-06,1308,2016.0,2019.5,1997.5,2004.5,110000,1.0,,False,0.005367
3,20211206_1309,2021-12-06,1309,43400.0,43710.0,43030.0,43190.0,323,1.0,,False,0.005927
4,20211206_1311,2021-12-06,1311,952.1,953.4,943.5,947.5,2030,1.0,,False,0.0063


In [25]:
valid_df2.head(5)

Unnamed: 0,Date,SecuritiesCode,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,CloseT2,Open,High,Low,Close,CloseT1
0,2021-12-06,1305,104900,1.0,-1.0,False,0.005013,2105.0,2061.5,2065.0,2041.5,2048.0,2094.5
1,2021-12-06,1306,1427480,1.0,-1.0,False,0.005793,2083.5,2037.0,2041.0,2018.5,2025.0,2071.5
2,2021-12-06,1308,110000,1.0,-1.0,False,0.005367,2060.5,2016.0,2019.5,1997.5,2004.5,2049.5
3,2021-12-06,1309,323,1.0,-1.0,False,0.005927,44130.0,43400.0,43710.0,43030.0,43190.0,43870.0
4,2021-12-06,1311,2030,1.0,-1.0,False,0.0063,974.4,952.1,953.4,943.5,947.5,968.3


In [26]:
test_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 229952 entries, 0 to 229951
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Date              229952 non-null  datetime64[ns]
 1   SecuritiesCode    229952 non-null  int64         
 2   Volume            229952 non-null  int64         
 3   AdjustmentFactor  229952 non-null  float64       
 4   ExpectedDividend  229952 non-null  float64       
 5   SupervisionFlag   229952 non-null  bool          
 6   Open              229952 non-null  float64       
 7   High              229952 non-null  float64       
 8   Low               229952 non-null  float64       
 9   Close             229952 non-null  float64       
dtypes: bool(1), datetime64[ns](1), float64(6), int64(2)
memory usage: 17.8 MB


In [27]:
test_df.head(5)

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
0,20211206_1301,2021-12-06,1301,2982.0,2982.0,2965.0,2971.0,8900,1.0,,False,-0.003263
1,20211206_1332,2021-12-06,1332,592.0,599.0,588.0,589.0,1360800,1.0,,False,-0.008993
2,20211206_1333,2021-12-06,1333,2368.0,2388.0,2360.0,2377.0,125900,1.0,,False,-0.009963
3,20211206_1375,2021-12-06,1375,1230.0,1239.0,1224.0,1224.0,81100,1.0,,False,-0.015032
4,20211206_1376,2021-12-06,1376,1339.0,1372.0,1339.0,1351.0,6200,1.0,,False,0.002867


In [28]:
test_df2.head(5)

Unnamed: 0,Date,SecuritiesCode,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Open,High,Low,Close
0,2021-12-06,1301,8900,1.0,-1.0,False,2982.0,2982.0,2965.0,2971.0
1,2021-12-06,1332,1360800,1.0,-1.0,False,592.0,599.0,588.0,589.0
2,2021-12-06,1333,125900,1.0,-1.0,False,2368.0,2388.0,2360.0,2377.0
3,2021-12-06,1375,81100,1.0,-1.0,False,1230.0,1239.0,1224.0,1224.0
4,2021-12-06,1376,6200,1.0,-1.0,False,1339.0,1372.0,1339.0,1351.0


In [29]:
official_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   RowId             4000 non-null   object        
 1   Date              4000 non-null   datetime64[ns]
 2   SecuritiesCode    4000 non-null   int64         
 3   Open              3990 non-null   float64       
 4   High              3990 non-null   float64       
 5   Low               3990 non-null   float64       
 6   Close             3990 non-null   float64       
 7   Volume            4000 non-null   int64         
 8   AdjustmentFactor  4000 non-null   float64       
 9   ExpectedDividend  0 non-null      float64       
 10  SupervisionFlag   4000 non-null   bool          
dtypes: bool(1), datetime64[ns](1), float64(6), int64(2), object(1)
memory usage: 316.5+ KB


In [30]:
off_test2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000 entries, 0 to 3999
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Date              4000 non-null   datetime64[ns]
 1   SecuritiesCode    4000 non-null   int64         
 2   Volume            4000 non-null   int64         
 3   AdjustmentFactor  4000 non-null   float64       
 4   ExpectedDividend  4000 non-null   float64       
 5   SupervisionFlag   4000 non-null   bool          
 6   Open              4000 non-null   float64       
 7   High              4000 non-null   float64       
 8   Low               4000 non-null   float64       
 9   Close             4000 non-null   float64       
 10  Year              4000 non-null   int64         
 11  Month             4000 non-null   int64         
 12  Day               4000 non-null   int64         
dtypes: bool(1), datetime64[ns](1), float64(6), int64(5)
memory usage: 410.2 KB


In [31]:
official_test.head(5)

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag
0,20211206_1301,2021-12-06,1301,2982.0,2982.0,2965.0,2971.0,8900,1.0,,False
1,20211206_1332,2021-12-06,1332,592.0,599.0,588.0,589.0,1360800,1.0,,False
2,20211206_1333,2021-12-06,1333,2368.0,2388.0,2360.0,2377.0,125900,1.0,,False
3,20211206_1375,2021-12-06,1375,1230.0,1239.0,1224.0,1224.0,81100,1.0,,False
4,20211206_1376,2021-12-06,1376,1339.0,1372.0,1339.0,1351.0,6200,1.0,,False


In [32]:
off_test2.head(5)

Unnamed: 0,Date,SecuritiesCode,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Open,High,Low,Close,Year,Month,Day
0,2021-12-06,1301,8900,1.0,-1.0,False,2982.0,2982.0,2965.0,2971.0,2021,12,6
1,2021-12-06,1332,1360800,1.0,-1.0,False,592.0,599.0,588.0,589.0,2021,12,6
2,2021-12-06,1333,125900,1.0,-1.0,False,2368.0,2388.0,2360.0,2377.0,2021,12,6
3,2021-12-06,1375,81100,1.0,-1.0,False,1230.0,1239.0,1224.0,1224.0,2021,12,6
4,2021-12-06,1376,6200,1.0,-1.0,False,1339.0,1372.0,1339.0,1351.0,2021,12,6


#### Mini TrainSet

In [33]:
miniset = train_df2[train_df2['Date'] > '2021-11-01'].reset_index()
miniset

Unnamed: 0,index,Date,SecuritiesCode,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,CloseT2,Open,High,Low,Close,CloseT1
0,2281056,2021-11-02,1301,17800,1.0,-1.0,False,-0.043077,3250.000000,3085.0,3100.0,3065.0,3080.0,3090.000000
1,2281057,2021-11-02,1332,1137500,1.0,-1.0,False,0.040945,635.000000,651.0,656.0,646.0,649.0,653.000000
2,2281058,2021-11-02,1333,106500,1.0,-1.0,False,-0.001917,2608.000000,2630.0,2634.0,2608.0,2614.0,2610.333333
3,2281059,2021-11-02,1375,46200,1.0,-1.0,False,-0.015972,1440.000000,1439.0,1443.0,1430.0,1430.0,1425.666667
4,2281060,2021-11-02,1376,1400,1.0,-1.0,False,-0.010554,1516.000000,1518.0,1518.0,1510.0,1510.0,1506.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88320,4617186,2021-12-03,9980,44400,1.0,-1.0,False,0.007692,131.000000,129.0,131.0,129.0,130.0,130.000000
88321,4617187,2021-12-03,9986,3800,1.0,-1.0,False,0.004213,1903.987362,1900.0,1900.0,1891.0,1896.0,1896.000000
88322,4617188,2021-12-03,9995,43400,1.0,-1.0,False,0.030534,406.030534,390.0,395.0,388.0,394.0,394.000000
88323,4617189,2021-12-03,9996,500,1.0,-1.0,False,0.022150,1523.003257,1490.0,1500.0,1490.0,1500.0,1490.000000


In [34]:
miniset.to_csv('../input/1_after_datapreprocessing/mini_train.csv', index = False)