In [54]:
import yfinance as yf
import pandas as pd
from sklearn.preprocessing import RobustScaler
import numpy as np
import matplotlib.pyplot as plt
from typing import List, Union
from datetime import datetime, timedelta

from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor

from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

In [55]:
ticker_market = yf.Ticker('BMRI.JK')

df = ticker_market.history(period='5y')
df.index = pd.to_datetime(df.index, format='%Y-%m-%d')

dates = df.index.strftime('%Y-%m-%d').tolist()

close_actual = df['Close'].values.tolist()

In [56]:
df = df.drop(['Dividends','Stock Splits'], axis=1)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-10-31 00:00:00+07:00,2803.372678,2803.372678,2743.726451,2793.431641,78160000
2019-11-01 00:00:00+07:00,2793.431882,2793.431882,2753.667727,2773.549805,24366800
2019-11-04 00:00:00+07:00,2783.490589,2793.431627,2733.785400,2733.785400,56584800
2019-11-05 00:00:00+07:00,2763.608660,2863.019043,2753.667621,2863.019043,94253000
2019-11-06 00:00:00+07:00,2853.078114,2863.019153,2713.903572,2773.549805,113175200
...,...,...,...,...,...
2024-10-25 00:00:00+07:00,6975.000000,7050.000000,6975.000000,7050.000000,34637500
2024-10-28 00:00:00+07:00,7075.000000,7075.000000,6800.000000,6825.000000,114500800
2024-10-29 00:00:00+07:00,6800.000000,6850.000000,6725.000000,6825.000000,91268300
2024-10-30 00:00:00+07:00,6575.000000,6775.000000,6575.000000,6750.000000,122132700


In [57]:
def add_lag(dataframe, days):
    # period = round((days * 5)/7)
    # dataframe['lag'] = dataframe['Close'].shift(periods=period, freq='B')
    # target_map = dataframe['Close'].to_dict()
    # dataframe['lag'] = (dataframe.index - pd.Timedelta('364 days')).map(target_map)
    
    df_copy = dataframe.copy()
    lag_offset = pd.offsets.BDay(days)
    lagged_index = df_copy.index - lag_offset

    for feature in ['Open', 'High', 'Low', 'Volume']:
        target_map = dict(zip(df_copy.index, df_copy[feature]))
        df_copy[f'{feature.lower()}_lag'] = lagged_index.map(target_map)
    
    return df_copy

In [58]:
df = add_lag(df, 30)
features = df.drop(columns=['Close'],axis=1)
target = df['Close']
split_data = int(len(df)*0.9)

# X_train, X_test = features[:split_data], features[split_data:]
# y_train, y_test = target[:split_data].values.ravel(), target[split_data:].values.ravel()

X = features
y = target.values.ravel()

In [59]:
model_xgb = XGBRegressor(n_estimators=1000,
                early_stopping_rounds=50,
                eta=0.01,
                max_depth=5,
                subsample=0.3)

In [60]:
# model_xgb.fit(X_train, y_train, eval_set=[(X_train,y_train),(X_test,y_test)], verbose=100)
model_xgb.fit(X, y, eval_set=[(X,y)], verbose=50)

[0]	validation_0-rmse:1542.31853
[50]	validation_0-rmse:950.55317
[100]	validation_0-rmse:588.35745
[150]	validation_0-rmse:366.00570
[200]	validation_0-rmse:229.89166
[250]	validation_0-rmse:146.19603
[300]	validation_0-rmse:96.02424
[350]	validation_0-rmse:66.44174
[400]	validation_0-rmse:49.71857
[450]	validation_0-rmse:40.32874
[500]	validation_0-rmse:35.34456
[550]	validation_0-rmse:32.54720
[600]	validation_0-rmse:30.86721
[650]	validation_0-rmse:29.84348
[700]	validation_0-rmse:28.94221
[750]	validation_0-rmse:28.34304
[800]	validation_0-rmse:27.76163
[850]	validation_0-rmse:27.16449
[900]	validation_0-rmse:26.64656
[950]	validation_0-rmse:26.13309
[999]	validation_0-rmse:25.67470


In [None]:
    model = XGBRegressor(n_estimators=250,
                    eta=0.01,
                    max_depth=5,
                    subsample=0.3)

In [62]:
# model_xgb.fit(X_train, y_train)
model.fit(X, y)

In [63]:
# y_pred = model_xgb.predict(X_test)
y_pred = model.predict(X)

In [64]:
# print(f'R2: {r2_score(y_test,y_pred)}')
print(f'R2: {r2_score(y,y_pred)}')

R2: 0.991031132748306


In [65]:
last_dates = df.index.max()

In [66]:
days = 30
future = pd.date_range(start=last_dates + pd.Timedelta(days=1), periods=days, freq='B')
future_df = pd.DataFrame(index=future)
future_df['isFuture'] = True
df['isFuture'] = False

combined_df = pd.concat([df, future_df])
combined_df = add_lag(combined_df, days)
combined_df

Unnamed: 0,Open,High,Low,Close,Volume,open_lag,high_lag,low_lag,volume_lag,isFuture
2019-10-31 00:00:00+07:00,2803.372678,2803.372678,2743.726451,2793.431641,78160000.0,,,,,False
2019-11-01 00:00:00+07:00,2793.431882,2793.431882,2753.667727,2773.549805,24366800.0,,,,,False
2019-11-04 00:00:00+07:00,2783.490589,2793.431627,2733.785400,2733.785400,56584800.0,,,,,False
2019-11-05 00:00:00+07:00,2763.608660,2863.019043,2753.667621,2863.019043,94253000.0,,,,,False
2019-11-06 00:00:00+07:00,2853.078114,2863.019153,2713.903572,2773.549805,113175200.0,,,,,False
...,...,...,...,...,...,...,...,...,...,...
2024-12-06 00:00:00+07:00,,,,,,6975.0,7050.0,6975.0,34637500.0,True
2024-12-09 00:00:00+07:00,,,,,,7075.0,7075.0,6800.0,114500800.0,True
2024-12-10 00:00:00+07:00,,,,,,6800.0,6850.0,6725.0,91268300.0,True
2024-12-11 00:00:00+07:00,,,,,,6575.0,6775.0,6575.0,122132700.0,True


In [67]:
combined_df_future = combined_df.query('isFuture').copy()
combined_df_future = combined_df_future[['open_lag','high_lag','low_lag','volume_lag']]
combined_df_future.columns = ['Open','High','Low','Volume']
combined_df_future

Unnamed: 0,Open,High,Low,Volume
2024-11-01 00:00:00+07:00,7400.0,7425.0,7300.0,97901700.0
2024-11-04 00:00:00+07:00,7325.0,7450.0,7300.0,79675600.0
2024-11-05 00:00:00+07:00,7400.0,7475.0,7375.0,81817100.0
2024-11-06 00:00:00+07:00,7350.0,7350.0,7050.0,192197100.0
2024-11-07 00:00:00+07:00,7175.0,7200.0,7100.0,173044000.0
2024-11-08 00:00:00+07:00,7025.0,7125.0,7000.0,161405200.0
2024-11-11 00:00:00+07:00,7000.0,7025.0,6875.0,166203500.0
2024-11-12 00:00:00+07:00,7000.0,7075.0,6950.0,82690100.0
2024-11-13 00:00:00+07:00,7025.0,7100.0,6925.0,91076400.0
2024-11-14 00:00:00+07:00,7000.0,7075.0,6900.0,86264900.0


In [69]:
y_pred_future = model.predict(combined_df_future[['Open','High','Low','Volume']])
y_pred_future

ValueError: feature_names mismatch: ['Open', 'High', 'Low', 'Volume', 'open_lag', 'high_lag', 'low_lag', 'volume_lag'] ['Open', 'High', 'Low', 'Volume']
expected high_lag, open_lag, low_lag, volume_lag in input data