In [181]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
import yfinance as yf
import matplotlib.pyplot as plt

In [164]:
def add_lag(dataframe, days):
    df_copy = dataframe.copy()
    
    for feature in ['Open', 'High', 'Low']:
        df_copy[f'{feature.lower()}_lag'] = dataframe[feature].shift(periods=days)
    
    return df_copy

In [165]:
def prepare_future_data(df, days_to_predict):
    last_date = pd.to_datetime(df.index.max())
        
    future_dates = pd.date_range(start=last_date+pd.Timedelta(days=1), periods=days_to_predict, freq='B')
    
    future_df = pd.DataFrame(index=future_dates, columns=df.columns)
    
    combined_df = pd.concat([df, future_df])
    combined_df = add_lag(combined_df, days=days_to_predict)
    
    future_feature_df = combined_df.loc[future_dates]
    features_pred = ['open_lag','high_lag','low_lag']
    
    return future_feature_df[features_pred], future_dates

In [187]:
ticker = 'ARTO.JK'

In [230]:
ticker_market = yf.Ticker(ticker)
df = ticker_market.history(period='5y')
df.index = pd.to_datetime(df.index, format='%Y-%m-%d')
df = df.drop(['Volume','Dividends', 'Stock Splits'], axis=1)

In [231]:
df

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-11-26 00:00:00+07:00,262.039581,263.287415,237.083450,245.194199
2019-11-27 00:00:00+07:00,245.194199,245.194199,243.322479,244.570282
2019-11-28 00:00:00+07:00,237.707352,242.074677,237.083450,237.083450
2019-11-29 00:00:00+07:00,237.083450,237.707352,237.083450,237.083450
2019-12-02 00:00:00+07:00,237.083450,237.083450,224.605362,225.229279
...,...,...,...,...
2024-11-20 00:00:00+07:00,2600.000000,2730.000000,2580.000000,2660.000000
2024-11-21 00:00:00+07:00,2670.000000,2750.000000,2660.000000,2660.000000
2024-11-22 00:00:00+07:00,2680.000000,2760.000000,2670.000000,2730.000000
2024-11-25 00:00:00+07:00,2740.000000,2770.000000,2680.000000,2740.000000


In [232]:
df_with_lags = add_lag(df, days=7)
df_with_lags.dropna()

Unnamed: 0_level_0,Open,High,Low,Close,open_lag,high_lag,low_lag
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-12-05 00:00:00+07:00,243.946381,293.234772,243.946381,268.278625,262.039581,263.287415,237.083450
2019-12-06 00:00:00+07:00,273.269867,298.226013,268.278625,275.765472,245.194199,245.194199,243.322479
2019-12-09 00:00:00+07:00,282.004517,282.004517,269.526428,274.517670,237.707352,242.074677,237.083450
2019-12-10 00:00:00+07:00,272.022064,272.022064,267.030823,269.526428,237.083450,237.707352,237.083450
2019-12-11 00:00:00+07:00,267.030823,280.756714,267.030823,272.022064,237.083450,237.083450,224.605362
...,...,...,...,...,...,...,...
2024-11-20 00:00:00+07:00,2600.000000,2730.000000,2580.000000,2660.000000,2600.000000,2630.000000,2400.000000
2024-11-21 00:00:00+07:00,2670.000000,2750.000000,2660.000000,2660.000000,2500.000000,2530.000000,2410.000000
2024-11-22 00:00:00+07:00,2680.000000,2760.000000,2670.000000,2730.000000,2450.000000,2600.000000,2420.000000
2024-11-25 00:00:00+07:00,2740.000000,2770.000000,2680.000000,2740.000000,2570.000000,2600.000000,2480.000000


In [233]:
features = ['open_lag','high_lag','low_lag']
target = ['Close']

X = df_with_lags[features]
y = df_with_lags[target].values.ravel()

In [249]:
data = {
    'BBCA.JK': [{
        'n_estimators': 100,
        'max_depth': 12,
        'max_features': 7,
        'min_samples_leaf': 2,
        'min_samples_split': 5,
    },{
        'n_estimators': 300,
        'eta': 0.1,
        'max_depth': 3,
        'subsample': 1,
    },{'weights':[2,1]}],
    'ARTO.JK': [{
        'n_estimators': 100,
        'max_depth': 10,
        'max_features': 8,
        'min_samples_leaf': 2,
        'min_samples_split': 5,
    },{
        'n_estimators': 250,
        'eta': 0.05,
        'max_depth': 3,
        'subsample': 0.7,
    },{'weights':[1,2]}],
    'BMRI.JK': [{
        'n_estimators': 100,
        'max_depth': 15,
        'max_features': 8,
        'min_samples_leaf': 2,
        'min_samples_split': 2,
    },{
        'n_estimators': 300,
        'eta': 0.15,
        'max_depth': 3,
        'subsample': 0.3,
    },{'weights':[1,2]}],
    'BBNI.JK': [{
        'n_estimators': 150,
        'max_depth': 12,
        'max_features': 6,
        'min_samples_leaf': 2,
        'min_samples_split': 2,
    },{
        'n_estimators': 100,
        'eta': 0.1,
        'max_depth': 3,
        'subsample': 1,
    },{'weights':[2,1]}],
    'BBRI.JK': [{
        'n_estimators': 250,
        'max_depth': 12,
        'max_features': 7,
        'min_samples_leaf': 2,
        'min_samples_split': 5,
    },{
        'n_estimators': 300,
        'eta': 0.05,
        'max_depth': 7,
        'subsample': 0.3,
    },{'weights':[2,1]}],
    'BBTN.JK': [{
        'n_estimators': 150,
        'max_depth': 10,
        'max_features': 7,
        'min_samples_leaf': 2,
        'min_samples_split': 2,
    },{
        'n_estimators': 100,
        'eta': 0.15,
        'max_depth': 10,
        'subsample': 0.5,
    },{'weights':[1,2]}],
    'BRIS.JK': [{
        'n_estimators': 150,
        'max_depth': 20,
        'max_features': 7,
        'min_samples_leaf': 2,
        'min_samples_split': 5,
    },{
        'n_estimators': 100,
        'eta': 0.1,
        'max_depth': 9,
        'subsample': 0.7,
    },{'weights':[1,2]}]
}

rf = RandomForestRegressor(**(data[ticker][0]))
xgb = XGBRegressor(**(data[ticker][1]))

model = VotingRegressor(estimators=[
    ('rf', rf),
    ('xgb', xgb)
], **(data[ticker][2]))

In [250]:
model.fit(X, y)

In [255]:
cv_scores = cross_val_score(model, X, y, cv=100, scoring='neg_root_mean_squared_error')

In [256]:
print(f"CV Scores (Negative Root Mean Squared Error): {cv_scores}")
print(f"Root Mean CV Score: {np.mean(cv_scores)}")
print(f"Standard Deviation of CV Scores: {np.std(cv_scores)}")

CV Scores (Negative Root Mean Squared Error): [-8474.57903447  -163.88961849   -96.50218781   -75.29380058
   -60.83502259  -157.00764197  -315.96696381  -239.17930941
   -91.5845508    -63.30828624  -252.91357137  -611.49531834
  -104.06198282  -103.72342057   -91.18311124   -84.14298039
   -76.72850092  -120.93619783  -107.70355993  -398.53014914
  -476.89801154  -663.00284852 -1261.78281912 -2042.76910133
 -1945.89645842  -736.12907134  -585.76937559  -527.21465892
  -463.78951274 -1728.08837171 -1880.70430705 -1051.54931658
 -2120.87617822 -1122.99360775 -1170.51602123 -1536.34056763
 -2231.0561982  -2678.75015121  -763.43491155  -468.07321573
  -423.2657419  -1911.70189433 -1966.41159861 -1576.84588729
 -1547.97294171  -486.58066548 -1464.43971008 -1529.47067343
 -3892.29298933 -1099.45176652  -511.23189188  -813.52638963
  -957.21704522  -660.92058418 -1284.95709754 -1265.26221588
 -1302.9192642  -1927.92426297  -558.92905986 -1119.20683611
 -1717.76830157  -638.96792126  -594.11

In [210]:
y_pred = model.predict(X)

In [211]:
ss_res = np.sum((y - y_pred)**2)
ss_tot = np.sum((y - np.mean(y))**2)

rmse = np.sqrt(np.mean((y - y_pred)**2))
mae = np.mean(np.abs(y - y_pred))
r2 = (1 - (ss_res / ss_tot))

rmse, mae, r2

(np.float64(420.8051019592307),
 np.float64(268.2932660679778),
 np.float64(0.9935386354287677))

In [27]:
days_to_predict = 7
future_features, future_dates = prepare_future_data(df, days_to_predict)

  combined_df = pd.concat([df, future_df])


In [28]:
prediction_features = ['open_lag', 'high_lag', 'low_lag']
future_features[prediction_features] = future_features[prediction_features].astype('float64')
y_pred_future = model.predict(future_features[prediction_features])

In [31]:
future_predictions = pd.DataFrame(
    y_pred_future,
    index=future_dates,
    columns=['Close']
)

historical_data = pd.DataFrame(df['Close'])
combined_df = pd.concat([historical_data, future_predictions])

combined_dates = combined_df.index
combined_close = combined_df['Close'].values

In [None]:
plt.figure(figsize=(15,12))
plt.plot(combined_dates, combined_close)
plt.show()

In [None]:
# ticker_market = yf.Ticker('BBCA.JK')

# df = ticker_market.history(period='5y')
# df.index = pd.to_datetime(df.index, format='%Y-%m-%d')

# dates = df.index.strftime('%Y-%m-%d').tolist()

# close_actual = df['Close'].values.tolist()

In [None]:
# df = df.drop(['Dividends','Stock Splits'], axis=1)
# df

In [None]:
# def add_lag(dataframe, n_past):
#     # period = round((days * 5)/7)
#     # dataframe['lag'] = dataframe['Close'].shift(periods=period, freq='B')
#     # target_map = dataframe['Close'].to_dict()
#     # dataframe['lag'] = (dataframe.index - pd.Timedelta('364 days')).map(target_map)
    
#     df_copy = dataframe.copy()
#     # lag_offset = pd.offsets.BDay(days)
#     # lagged_index = df_copy.index - lag_offset

#     # for feature in ['Open', 'High', 'Low', 'Volume']:
#     #     target_map = dict(zip(df_copy.index, df_copy[feature]))
#     #     df_copy[f'{feature.lower()}_lag'] = lagged_index.map(target_map)
#     df_copy = df_copy.dropna()
    
#     price_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
#     for column in price_columns:
#         for lag in range(1, n_past + 1):
#             df_copy[f'{column}_lag{lag}'] = df_copy[column].shift(periods=lag, freq='B')
    
#     df_copy = df_copy.dropna()
    
#     return df_copy

In [None]:
# df_featured = add_lag(df,7)
# df_featured
# features = df_featured.drop(columns=['Close'], axis=1)
# target = df_featured[['Close']]

# X = features
# y = target.values.ravel()

In [None]:
# rf = RandomForestRegressor(n_estimators=100,
#                                 max_depth=20,
#                                 max_features=4,
#                                 min_samples_leaf=2,
#                                 min_samples_split=2)
# xgb = XGBRegressor(n_estimators=100,
#                 eta=0.2,
#                 max_depth=6,
#                 subsample=0.3)
        
# model = VotingRegressor(estimators=[
#             ('rf', rf),
#             ('xgb', xgb)
#         ], weights=None)

In [None]:
model.fit(X,y)

In [None]:
# n_past = 30
# days = 30

# last_data = df.tail(n_past + days).copy()
# predictions = []

# last_date = pd.to_datetime(df_featured.index[-1])
# dates_future = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=days, freq='B')

# for _ in range(days):
#     features_df = add_lag(last_data, n_past)
#     last_features = features_df.iloc[[-1]]
    
    
#     # Make prediction
#     pred = model.predict(X)
#     predictions.append(pred)
    
#     # Update the data for the next prediction
#     new_row = last_data.iloc[-1].copy()
#     new_row['Close'] = pred
#     last_data = pd.concat([last_data, pd.DataFrame([new_row])])
    
# predictions = np.array(predictions)