In [204]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error, mean_absolute_percentage_error

import plotly.express as px
import plotly.graph_objects as go
from datetime import timedelta

In [205]:
def load_and_merge(coin):
    """Load the daily price history of a coin and left-join its GitHub signals.

    Parameters
    ----------
    coin : str
        Either 'BTC' or 'ETH' (case-sensitive).

    Returns
    -------
    pandas.DataFrame or None
        One row per price row, with the GitHub columns attached by calendar
        day (the join key 'date' is dropped). Returns None, after printing a
        message, when `coin` is not recognised.
    """
    # (symbol, price CSV, GitHub signals CSV)
    known_sources = (
        ('BTC', 'btc-usd-max.csv', 'bitcoin_bitcoin_github_social_signals.csv'),
        ('ETH', 'eth-usd-max.csv', 'ethereum_go-ethereum_github_social_signals.csv'),
    )
    match = next((entry for entry in known_sources if coin == entry[0]), None)
    if match is None:
        print('Salah nama koin')
        return
    _, price_file, github_file = match

    price = pd.read_csv(price_file, parse_dates=['snapped_at'])
    github = pd.read_csv(github_file)

    # Strip the timezone and truncate to midnight so both sides join on the day.
    price_days = pd.to_datetime(price['snapped_at'])
    price['snapped_at'] = price_days.dt.tz_localize(None).dt.normalize()
    github_days = pd.to_datetime(github['date'], errors='coerce')
    github['date'] = github_days.dt.tz_localize(None).dt.normalize()

    price = price.dropna(subset=['snapped_at'])
    github = github.dropna(subset=['date'])

    # Keep only GitHub rows that fall inside the price history (inclusive).
    first_day = price['snapped_at'].min()
    last_day = price['snapped_at'].max()
    github = github[github['date'].between(first_day, last_day)]

    merged = price.merge(github, left_on='snapped_at', right_on='date', how='left')
    return merged.drop(columns=['date'])

In [206]:
def preprocess(df):
    """Clean the merged price/GitHub frame and aggregate the GitHub signals.

    Steps:
      * sort rows by 'snapped_at',
      * coerce 'price' to numeric (unparseable values become NaN),
      * forward-fill missing 'market_cap',
      * replace the issue columns by their row sum 'issue_activity',
      * replace the pull-request columns by their row sum 'pull_activity',
      * drop 'stars' and 'forks' when present.

    The input frame is left untouched: all work happens on the sorted copy
    returned by `sort_values`. (Previously the 'price' column of the caller's
    frame was overwritten before the copy was made.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'snapped_at', 'price' and 'market_cap'.

    Returns
    -------
    pandas.DataFrame
        A new, time-sorted frame.
    """
    # sort_values returns a new DataFrame, so every assignment below only
    # affects `out`, never the caller's `df`.
    out = df.sort_values('snapped_at')

    # Make sure the price is numeric.
    out['price'] = pd.to_numeric(out['price'], errors='coerce')

    # Fill missing market cap with the previous known value.
    out['market_cap'] = out['market_cap'].ffill()

    # Collapse the raw GitHub counters into one activity column per group.
    groups = (
        ('issue_activity', ['issues_opened', 'issues_closed', 'issue_comments']),
        ('pull_activity', ['pulls_opened', 'pulls_merged', 'pulls_closed']),
    )
    for activity_col, raw_cols in groups:
        present = [col for col in raw_cols if col in out.columns]
        if present:
            out[activity_col] = out[present].sum(axis=1)
            out = out.drop(columns=present)

    # Drop stars and forks.
    out = out.drop(columns=[col for col in ['stars', 'forks'] if col in out.columns])

    return out

In [207]:
def calculate_metrics(y_true, y_pred, model_name="Model"):
    """Compute and print RMSE, MAE, MAPE (in percent) and R2 for a prediction.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values. When their lengths differ, both are
        truncated to the shorter length and a warning is printed.
    model_name : str
        Label used in the printed report.

    Returns
    -------
    dict
        Keys 'rmse', 'mae', 'mape', 'r2'. All values are NaN when there is
        nothing to evaluate (empty `y_true`, or empty after truncation).
    """
    # Convert the inputs to NumPy arrays for consistency.
    y_true_np = np.array(y_true)
    y_pred_np = np.array(y_pred)

    # Up-front checks for emptiness and mismatched lengths.
    if len(y_true_np) == 0:
        print(f"  Peringatan: y_true kosong untuk {model_name}. Metrik tidak dihitung.")
        return {'rmse': np.nan, 'mae': np.nan, 'mape': np.nan, 'r2': np.nan}
    if len(y_true_np) != len(y_pred_np):
        min_len = min(len(y_true_np), len(y_pred_np))
        # The metrics ARE still computed on the overlapping prefix, so the
        # warning says the data is truncated rather than that nothing is computed.
        print(f"  Peringatan: Panjang y_true ({len(y_true_np)}) dan y_pred ({len(y_pred_np)}) tidak cocok untuk {model_name}. Data dipotong menjadi {min_len} elemen pertama.")
        y_true_np = y_true_np[:min_len]
        y_pred_np = y_pred_np[:min_len]
        if min_len == 0:
            print(f"  Peringatan: Setelah penyesuaian panjang, data untuk {model_name} kosong. Metrik tidak dihitung.")
            return {'rmse': np.nan, 'mae': np.nan, 'mape': np.nan, 'r2': np.nan}

    rmse_val = root_mean_squared_error(y_true_np, y_pred_np)
    mae_val = mean_absolute_error(y_true_np, y_pred_np)
    r2_val = r2_score(y_true_np, y_pred_np)
    # mean_absolute_percentage_error returns a fraction; report it as a percentage.
    mape_val = mean_absolute_percentage_error(y_true_np, y_pred_np) * 100

    print(f"\n--- Metrik Evaluasi untuk {model_name} ---")
    print(f"RMSE: {rmse_val:.4f}")
    print(f"MAE: {mae_val:.4f}")
    if not np.isnan(mape_val):
        print(f"MAPE: {mape_val:.2f}%")
    else:
        print("MAPE: Tidak dapat dihitung")
    print(f"R2 Score: {r2_val:.4f}")

    return {'rmse': rmse_val, 'mae': mae_val, 'mape': mape_val, 'r2': r2_val}

In [208]:
# Fungsi untuk memuat dan memproses data aktual dari CSV untuk evaluasi akhir
def load_actual_data_for_evaluation(csv_file_path, n_periods, expected_start_date):
    """Load actual prices for the forecast window from a CSV file.

    The date column ('snapped_at' or 'Date') is parsed, stripped of its
    timezone and truncated to the day. The first price column found among
    'Close', 'price', 'Price', 'close' is renamed to 'Actual_Price'.

    The whole file is sorted by date and then filtered to the window
    [expected_start_date, expected_start_date + n_periods - 1 days].
    (Previously only the last `n_periods` raw rows were considered, which lost
    in-window rows when the file was unsorted or extended past the window.)

    Parameters
    ----------
    csv_file_path : str
        Path to the CSV with the actual prices.
    n_periods : int
        Number of days in the forecast window.
    expected_start_date : datetime-like
        First day of the forecast window.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'date_norm' with a single 'Actual_Price' column. An empty
        DataFrame is returned (after printing the reason) when the file is
        missing or cannot be processed.
    """
    try:
        df_raw = pd.read_csv(csv_file_path)

        if 'snapped_at' in df_raw.columns:
            date_col_actual = 'snapped_at'
        elif 'Date' in df_raw.columns:
            date_col_actual = 'Date'
        else:
            raise KeyError("Kolom tanggal ('snapped_at' atau 'Date') tidak ditemukan di CSV aktual.")

        possible_price_cols = ['Close', 'price', 'Price', 'close']
        price_col_actual = next((col for col in possible_price_cols if col in df_raw.columns), None)
        if price_col_actual is None:
            raise KeyError(f"Kolom harga (e.g., 'Close', 'price') tidak ditemukan. Kolom: {df_raw.columns.tolist()}")

        date_norm = pd.to_datetime(df_raw[date_col_actual]).dt.tz_localize(None).dt.normalize()
        df_actual = (
            df_raw.assign(date_norm=date_norm)
            .set_index('date_norm')[[price_col_actual]]
            .rename(columns={price_col_actual: 'Actual_Price'})
            .sort_index()
        )

        # Keep only the days that belong to the forecast window.
        expected_end_date = expected_start_date + timedelta(days=n_periods - 1)
        in_window = (df_actual.index >= expected_start_date) & (df_actual.index <= expected_end_date)
        df_actual_eval = df_actual[in_window]

        if len(df_actual_eval) < n_periods:
            print(f"Peringatan: Data aktual hanya memiliki {len(df_actual_eval)} poin untuk periode {n_periods} hari yang diharapkan.")
            print(f"Periode yang diharapkan: {expected_start_date.strftime('%Y-%m-%d')} hingga {expected_end_date.strftime('%Y-%m-%d')}")
            # min()/max() of an empty index are NaT, which cannot be formatted.
            if not df_actual.empty:
                print(f"Data aktual tersedia dari {df_actual.index.min().strftime('%Y-%m-%d')} hingga {df_actual.index.max().strftime('%Y-%m-%d')}")

        return df_actual_eval
    except FileNotFoundError:
        print(f"ERROR: File '{csv_file_path}' tidak ditemukan.")
        return pd.DataFrame()
    except Exception as e:
        # Deliberate best-effort: the notebook keeps running and the caller
        # checks for an empty frame.
        print(f"ERROR saat memuat data aktual: {e}")
        return pd.DataFrame()

In [209]:
# Plot Matriks Korelasi
def plot_corr(df, title):
    """Show the correlation matrix of the numeric columns of `df` as a heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are correlated pairwise.
    title : str
        Title placed above the figure.
    """
    numeric_part = df.select_dtypes(include=[np.number])
    heatmap = px.imshow(
        numeric_part.corr(),
        labels={'x': "Fitur", 'y': "Fitur", 'color': "Korelasi"},
        color_continuous_scale='RdBu',
        zmin=-1,
        zmax=1,
        aspect="auto",
        text_auto=True,
    )
    heatmap.update_layout(title=title, width=800, height=700)
    heatmap.show()

In [None]:
def evaluate_and_plot_prediction(model_name, df_predictions, target_column, actual_data_path, n_days, start_date, coin_name):
    """
    Evaluate a forecast against actual prices and show a comparison plot.

    Parameters:
    - model_name: str, model label used in the printed report and the figure.
    - df_predictions: DataFrame, forecast values indexed by date.
    - target_column: str, name of the forecast column in df_predictions.
    - actual_data_path: str, path to the CSV file holding the actual prices.
    - n_days: int, number of forecast days to evaluate.
    - start_date: datetime, first forecast date.
    - coin_name: str, coin label used in the figure title (and forwarded to
      calculate_metrics_with_storage when that function exists).

    Returns None. Prints a message and returns early when there is nothing
    to compare.
    """
    print(f"\n--- Prediksi {n_days} Hari ke Depan ({model_name}) ---")
    print(df_predictions[[target_column]])

    print(f"\n--- Evaluasi Prediksi {model_name} dengan Data Aktual ---")
    actual_data = load_actual_data_for_evaluation(actual_data_path, n_days, start_date)

    if actual_data.empty or df_predictions[target_column].isnull().all():
        print(f"Data aktual tidak tersedia atau semua prediksi {model_name} adalah NaN.")
        return

    # Only dates present on both sides are compared.
    comparison = df_predictions.join(actual_data, how='inner')
    needed_cols = ['Actual_Price', target_column]

    if comparison.empty or any(col not in comparison.columns for col in needed_cols):
        print(f"Gagal menggabungkan prediksi {model_name} dengan data aktual atau kolom tidak ditemukan.")
        return

    comparison = comparison.dropna(subset=needed_cols)

    if comparison.empty:
        print(f"Tidak ada data yang cocok untuk evaluasi {model_name} setelah dropna.")
        return

    # Prefer the metric function that also stores its results, when the
    # notebook has defined one.
    metric_label = f"{model_name} Pred vs Actual"
    actual_values = comparison['Actual_Price']
    predicted_values = comparison[target_column]
    if 'calculate_metrics_with_storage' in globals():
        calculate_metrics_with_storage(actual_values, predicted_values,
                                       metric_label, coin_name, "Multivariate Time Series")
    else:
        calculate_metrics(actual_values, predicted_values, metric_label)

    # (column, legend label, marker style, line style): actual first, forecast second.
    trace_specs = (
        ('Actual_Price', 'Harga Aktual',
         dict(symbol='circle', size=8), dict(color='blue')),
        (target_column, f'Prediksi {model_name}',
         dict(symbol='x', size=8), dict(color='red', dash='dash')),
    )

    fig = go.Figure()
    for column, legend_label, marker_style, line_style in trace_specs:
        fig.add_trace(go.Scatter(
            x=comparison.index,
            y=comparison[column],
            mode='lines+markers',
            name=legend_label,
            marker=marker_style,
            line=line_style
        ))

    fig.update_layout(
        title_text=f'Perbandingan Harga Prediksi {model_name} dengan Harga Aktual {coin_name} dalam {n_days} Hari ke Depan',
        xaxis_title='Tanggal',
        yaxis_title='Harga',
        legend_title_text='Legenda',
        hovermode="x unified"
    )

    fig.show()


# BTC

In [211]:
# Load the BTC price history joined with the bitcoin/bitcoin GitHub signals
# and display the merged frame.
btc = load_and_merge('BTC')
btc

Unnamed: 0,snapped_at,price,market_cap,total_volume,stars,forks,issues_opened,issues_closed,pulls_opened,pulls_merged,pulls_closed,commits,issue_comments
0,2013-04-28,135.300000,1.500518e+09,0.000000e+00,0,0,1,1,3,1,0,4,4
1,2013-04-29,141.960000,1.575032e+09,0.000000e+00,0,0,1,4,2,5,1,9,4
2,2013-04-30,135.300000,1.501657e+09,0.000000e+00,0,0,3,3,3,1,0,5,2
3,2013-05-01,117.000000,1.298952e+09,0.000000e+00,0,0,0,0,5,0,0,4,5
4,2013-05-02,103.430000,1.148668e+09,0.000000e+00,0,0,1,1,5,1,2,5,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4386,2025-05-03,96855.568134,1.923252e+12,2.327638e+10,0,0,2,3,5,0,0,1,1
4387,2025-05-04,95922.868424,1.904895e+12,1.379755e+10,0,0,0,0,2,0,1,1,1
4388,2025-05-05,94326.620485,1.872812e+12,1.490312e+10,0,0,0,0,4,3,4,3,1
4389,2025-05-06,94758.823711,1.882511e+12,2.408646e+10,0,0,2,16,6,5,0,7,1


In [212]:
# Correlation heatmap of the raw merged BTC columns (before aggregation).
plot_corr(btc, 'Bitcoin: Sebelum Feature Engineering')

In [213]:
# Preprocess BTC, then discard the aggregated issue/pull activity columns so
# that 'commits' is the only GitHub signal kept for the BTC models.
btc_pre = preprocess(btc).drop(columns=['issue_activity', 'pull_activity'])
btc_pre

Unnamed: 0,snapped_at,price,market_cap,total_volume,commits
0,2013-04-28,135.300000,1.500518e+09,0.000000e+00,4
1,2013-04-29,141.960000,1.575032e+09,0.000000e+00,9
2,2013-04-30,135.300000,1.501657e+09,0.000000e+00,5
3,2013-05-01,117.000000,1.298952e+09,0.000000e+00,4
4,2013-05-02,103.430000,1.148668e+09,0.000000e+00,5
...,...,...,...,...,...
4386,2025-05-03,96855.568134,1.923252e+12,2.327638e+10,1
4387,2025-05-04,95922.868424,1.904895e+12,1.379755e+10,1
4388,2025-05-05,94326.620485,1.872812e+12,1.490312e+10,3
4389,2025-05-06,94758.823711,1.882511e+12,2.408646e+10,7


In [214]:
# Correlation heatmap of the columns that remain after preprocessing.
plot_corr(btc_pre, 'Bitcoin: Setelah Feature Engineering')

In [215]:
# Prepare the main frame: datetime index on 'snapped_at', sorted by time.
df_full = (
    btc_pre
    .assign(snapped_at=pd.to_datetime(btc_pre['snapped_at']))
    .set_index('snapped_at')
    .sort_index()
)

# Forecast configuration
TARGET_COL = 'price'
N_FUTURE_PERIODS = 21
LAST_HISTORICAL_DATE = df_full.index.max()
# The forecast starts on the day after the last historical observation.
start_date = pd.to_datetime(LAST_HISTORICAL_DATE + timedelta(days=1))

print(f"Data historis terakhir pada: {LAST_HISTORICAL_DATE.strftime('%Y-%m-%d')}")
print(f"Jumlah baris data historis: {len(df_full)}")
print(f"Prediksi dimulai dari: {start_date.strftime('%Y-%m-%d')}")

Data historis terakhir pada: 2025-05-07
Jumlah baris data historis: 4391
Prediksi dimulai dari: 2025-05-08


In [216]:
# Columns available for modelling ('snapped_at' is now the index).
df_full.columns

Index(['price', 'market_cap', 'total_volume', 'commits'], dtype='object')

## Multivariate Time Series

In [217]:
# Exogenous features used to predict the BTC price.
X_features = ['market_cap', 'total_volume', 'commits']

### XGBoost

In [218]:
# 1. Configuration
best_params = {
    'n_estimators': 1000,
    'learning_rate': 0.7,
    'objective': 'reg:squarederror',
    'max_depth': 11,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 3,
    'lambda': 3,
    'alpha': 3,
    'min_child_weight': 10
}

# 2. Fit the price model on the exogenous features (the features themselves
#    are not modelled; they are extrapolated linearly in step 3).
df_price = df_full.copy()

X = df_price[X_features]
y = df_price['price']

model_price = xgb.XGBRegressor(**best_params)
model_price.fit(X, y)

# 3. Estimate a linear trend per feature from the recent history
def calculate_trend(series, window=30):
    """Return the slope of a least-squares line through the last `window` points.

    Returns 0 when fewer than two points are available.
    """
    recent_data = series.tail(window)
    if len(recent_data) < 2:
        return 0

    positions = np.arange(len(recent_data))
    # Degree-1 polyfit returns [slope, intercept]; only the slope is needed.
    return np.polyfit(positions, recent_data.values, 1)[0]

# Trend (change per day) of every exogenous feature
trends = {feat: calculate_trend(df_full[feat]) for feat in X_features}

print("\nTrend yang dihitung:")
for feat, trend in trends.items():
    print(f"  {feat}: {trend:.6f}")

# Starting point: the last observed value of every exogenous feature
initial_values = df_full[X_features].iloc[-1].copy()
print("\nNilai awal fitur:")
for feat in X_features:
    print(f"  {feat}: {initial_values[feat]:.2f}")

# 4. Extrapolate the features and predict the price for the whole horizon.
#    The features are assembled into one DataFrame with the training column
#    names and predicted in a single call, instead of predicting row by row
#    from an unnamed nested list.
future_dates = pd.date_range(start=start_date, periods=N_FUTURE_PERIODS)
horizon_steps = np.arange(1, N_FUTURE_PERIODS + 1)
future_features = pd.DataFrame(
    {feat: initial_values[feat] + trends[feat] * horizon_steps for feat in X_features},
    index=future_dates,
)

future_predictions = list(model_price.predict(future_features))
# Kept as a list of per-day feature rows, as before.
predicted_features = [future_features.loc[day] for day in future_dates]

# 5. Result frame: predicted price followed by the extrapolated features
predicted_future_X = pd.DataFrame({
    'price_pred_xgb': future_predictions
}, index=future_dates).join(future_features)

predicted_future_X


Trend yang dihitung:
  market_cap: 13167165580.770658
  total_volume: -1075236863.141995
  commits: 0.071190

Nilai awal fitur:
  market_cap: 1923643840814.60
  total_volume: 23348052607.07
  commits: 17.00


Unnamed: 0,price_pred_xgb,market_cap,total_volume,commits
2025-05-08,97916.960938,1936811000000.0,22272820000.0,17.07119
2025-05-09,97500.609375,1949978000000.0,21197580000.0,17.14238
2025-05-10,101049.359375,1963145000000.0,20122340000.0,17.213571
2025-05-11,100590.820312,1976313000000.0,19047110000.0,17.284761
2025-05-12,100233.78125,1989480000000.0,17971870000.0,17.355951
2025-05-13,100385.734375,2002647000000.0,16896630000.0,17.427141
2025-05-14,103768.390625,2015814000000.0,15821390000.0,17.498331
2025-05-15,104054.992188,2028981000000.0,14746160000.0,17.569522
2025-05-16,103938.460938,2042148000000.0,13670920000.0,17.640712
2025-05-17,103954.140625,2055315000000.0,12595680000.0,17.711902


In [219]:
# predicted_future_X.to_csv('Prediction/btc-xgb-21d-price.csv', index=True, sep=',')

In [220]:
# Disabled alternative: reload the forecast previously saved to CSV and
# rebuild its date index.
# df_future_dates = pd.read_csv('Prediction/btc-xgb-21d-price.csv')
# df_future_dates['date'] = pd.date_range(start='2025-05-08', periods=len(df_future_dates))
# df_future_dates.set_index('date', inplace=True)

# Evaluate the in-memory forecast produced by the previous cell.
df_future_dates = predicted_future_X.copy()

# The first forecast date marks the start of the evaluation window.
start_date = df_future_dates.index[0]
evaluate_and_plot_prediction('XGBoost', df_future_dates, 'price_pred_xgb', 'btc-usd-max_21days.csv', N_FUTURE_PERIODS, start_date, coin_name='BTC')


--- Prediksi 21 Hari ke Depan (XGBoost) ---
            price_pred_xgb
2025-05-08    97916.960938
2025-05-09    97500.609375
2025-05-10   101049.359375
2025-05-11   100590.820312
2025-05-12   100233.781250
2025-05-13   100385.734375
2025-05-14   103768.390625
2025-05-15   104054.992188
2025-05-16   103938.460938
2025-05-17   103954.140625
2025-05-18   103892.281250
2025-05-19   103883.078125
2025-05-20   103857.554688
2025-05-21   103907.929688
2025-05-22   104160.601562
2025-05-23   104279.460938
2025-05-24   104253.648438
2025-05-25   104139.007812
2025-05-26   104049.812500
2025-05-27   103996.101562
2025-05-28   103991.921875

--- Evaluasi Prediksi XGBoost dengan Data Aktual ---

--- Metrik Evaluasi untuk XGBoost Pred vs Actual ---
RMSE: 3605.0687
MAE: 2969.7590
MAPE: 2.78%
R2 Score: -0.2690


### Random Forest

In [221]:
# 1. Configuration
rf_params = {
    # NOTE(review): this was `True`. ccp_alpha is a non-negative float
    # (cost-complexity pruning strength), and `True` is accepted as 1.0, so
    # 1.0 keeps the fitted model identical. Confirm the intended value.
    'ccp_alpha': 1.0,
    'oob_score': True,
    'random_state': 42
}

# 2. Fit the price model on the exogenous features (the features themselves
#    are not modelled; they are extrapolated linearly in step 3).
df_price = df_full.dropna(subset=X_features + ['price'])

X = df_price[X_features]
y = df_price['price']

model_price = RandomForestRegressor(**rf_params)
model_price.fit(X, y)

# 3. Estimate a linear trend per feature from the recent history
def calculate_trend_rf(series, window=30):
    """Return the slope of a least-squares line through the last `window` points.

    Returns 0 when fewer than two points are available.
    """
    recent_data = series.tail(window)
    if len(recent_data) < 2:
        return 0

    positions = np.arange(len(recent_data))
    # Degree-1 polyfit returns [slope, intercept]; only the slope is needed.
    return np.polyfit(positions, recent_data.values, 1)[0]

# Trends and starting values come from the same cleaned frame the model was
# fitted on, so a row with missing values can never seed the extrapolation.
trends = {feat: calculate_trend_rf(df_price[feat]) for feat in X_features}

# Starting point: the last observed value of every exogenous feature
initial_values = df_price[X_features].iloc[-1].copy()

# 4. Extrapolate the features and predict the whole horizon in one call
future_dates = pd.date_range(start=start_date, periods=N_FUTURE_PERIODS)
horizon_steps = np.arange(1, N_FUTURE_PERIODS + 1)
future_features = pd.DataFrame(
    {feat: initial_values[feat] + trends[feat] * horizon_steps for feat in X_features},
    index=future_dates,
)
future_predictions = list(model_price.predict(future_features))

# 5. Result frame
predicted_future_X = pd.DataFrame({
    'price_pred_rf': future_predictions
}, index=future_dates)

predicted_future_X

Unnamed: 0,price_pred_rf
2025-05-08,97832.372286
2025-05-09,98449.549253
2025-05-10,99219.534193
2025-05-11,99970.891174
2025-05-12,100381.070201
2025-05-13,101371.159487
2025-05-14,101568.932107
2025-05-15,102366.030703
2025-05-16,102859.61819
2025-05-17,103927.746109


In [222]:
# predicted_future_X.to_csv('Prediction/btc-rf-21d-price.csv', index=True, sep=',')

In [223]:
# Disabled alternative: reload the forecast previously saved to CSV and
# rebuild its date index.
# df_future_dates = pd.read_csv('Prediction/btc-rf-21d-price.csv')
# df_future_dates['date'] = pd.date_range(start='2025-05-08', periods=len(df_future_dates))
# df_future_dates.set_index('date', inplace=True)

# Evaluate the in-memory forecast produced by the previous cell.
df_future_dates = predicted_future_X.copy()

# The first forecast date marks the start of the evaluation window.
start_date = df_future_dates.index[0]
evaluate_and_plot_prediction('Random Forest', df_future_dates, 'price_pred_rf', 'btc-usd-max_21days.csv', N_FUTURE_PERIODS, start_date, 'BTC')


--- Prediksi 21 Hari ke Depan (Random Forest) ---
            price_pred_rf
2025-05-08   97832.372286
2025-05-09   98449.549253
2025-05-10   99219.534193
2025-05-11   99970.891174
2025-05-12  100381.070201
2025-05-13  101371.159487
2025-05-14  101568.932107
2025-05-15  102366.030703
2025-05-16  102859.618190
2025-05-17  103927.746109
2025-05-18  104293.579552
2025-05-19  104768.483100
2025-05-20  105655.763047
2025-05-21  105646.531703
2025-05-22  105696.975799
2025-05-23  105696.975799
2025-05-24  105696.975799
2025-05-25  105696.975799
2025-05-26  105696.975799
2025-05-27  105696.975799
2025-05-28  105696.975799

--- Evaluasi Prediksi Random Forest dengan Data Aktual ---

--- Metrik Evaluasi untuk Random Forest Pred vs Actual ---
RMSE: 2913.8578
MAE: 2439.6414
MAPE: 2.29%
R2 Score: 0.1710


### Decision Tree

In [224]:
# 1. Configuration
dt_params = {
    'random_state': 42
}

# 2. Fit the price model on the exogenous features (the features themselves
#    are not modelled; they are extrapolated linearly in step 3).
df_price = df_full.dropna(subset=X_features + ['price'])

X = df_price[X_features]
y = df_price['price']

model_price = DecisionTreeRegressor(**dt_params)
model_price.fit(X, y)

# 3. Estimate a linear trend per feature from the recent history
def calculate_trend_dt(series, window=30):
    """Return the slope of a least-squares line through the last `window` points.

    Returns 0 when fewer than two points are available.
    """
    recent_data = series.tail(window)
    if len(recent_data) < 2:
        return 0

    positions = np.arange(len(recent_data))
    # Degree-1 polyfit returns [slope, intercept]; only the slope is needed.
    return np.polyfit(positions, recent_data.values, 1)[0]

# Trends and starting values come from the same cleaned frame the model was
# fitted on, so a row with missing values can never seed the extrapolation.
trends = {feat: calculate_trend_dt(df_price[feat]) for feat in X_features}

# Starting point: the last observed value of every exogenous feature
initial_values = df_price[X_features].iloc[-1].copy()

# 4. Extrapolate the features and predict the whole horizon in one call
future_dates = pd.date_range(start=start_date, periods=N_FUTURE_PERIODS)
horizon_steps = np.arange(1, N_FUTURE_PERIODS + 1)
future_features = pd.DataFrame(
    {feat: initial_values[feat] + trends[feat] * horizon_steps for feat in X_features},
    index=future_dates,
)
future_predictions = list(model_price.predict(future_features))

# 5. Result frame
predicted_future_X = pd.DataFrame({
    'price_pred_dtree': future_predictions
}, index=future_dates)

predicted_future_X

Unnamed: 0,price_pred_dtree
2025-05-08,97836.188561
2025-05-09,98364.589466
2025-05-10,99344.954174
2025-05-11,99344.954174
2025-05-12,100674.787625
2025-05-13,101764.908602
2025-05-14,101764.908602
2025-05-15,102552.248743
2025-05-16,102552.248743
2025-05-17,103718.979398


In [225]:
# predicted_future_X.to_csv('Prediction/btc-dtree-21d-price.csv', index=True, sep=',')

In [226]:
# Disabled alternative: reload the forecast previously saved to CSV and
# rebuild its date index.
# df_future_dates = pd.read_csv('Prediction/btc-dtree-21d-price.csv')
# df_future_dates['date'] = pd.date_range(start='2025-05-08', periods=len(df_future_dates))
# df_future_dates.set_index('date', inplace=True)

# Evaluate the in-memory forecast produced by the previous cell.
df_future_dates = predicted_future_X.copy()

# The first forecast date marks the start of the evaluation window.
start_date = df_future_dates.index[0]
evaluate_and_plot_prediction('Decision Tree', df_future_dates, 'price_pred_dtree', 'btc-usd-max_21days.csv', N_FUTURE_PERIODS, start_date, 'BTC')


--- Prediksi 21 Hari ke Depan (Decision Tree) ---
            price_pred_dtree
2025-05-08      97836.188561
2025-05-09      98364.589466
2025-05-10      99344.954174
2025-05-11      99344.954174
2025-05-12     100674.787625
2025-05-13     101764.908602
2025-05-14     101764.908602
2025-05-15     102552.248743
2025-05-16     102552.248743
2025-05-17     103718.979398
2025-05-18     104334.615757
2025-05-19     104796.040777
2025-05-20     106034.913403
2025-05-21     106034.913403
2025-05-22     106034.913403
2025-05-23     106034.913403
2025-05-24     106034.913403
2025-05-25     106034.913403
2025-05-26     106034.913403
2025-05-27     106034.913403
2025-05-28     106034.913403

--- Evaluasi Prediksi Decision Tree dengan Data Aktual ---

--- Metrik Evaluasi untuk Decision Tree Pred vs Actual ---
RMSE: 2800.1849
MAE: 2309.0185
MAPE: 2.17%
R2 Score: 0.2344


# ETH

In [227]:
# Load the ETH price history joined with the ethereum/go-ethereum GitHub
# signals and display the merged frame.
eth = load_and_merge('ETH')
eth

Unnamed: 0,snapped_at,price,market_cap,total_volume,stars,forks,issues_opened,issues_closed,pulls_opened,pulls_merged,pulls_closed,commits,issue_comments
0,2015-08-07,2.831620,0.000000e+00,9.062200e+04,0,0,9,2,5,6,0,12,2
1,2015-08-08,1.330750,8.033948e+07,3.680700e+05,0,0,3,0,2,1,0,2,2
2,2015-08-10,0.687586,4.155631e+07,4.004641e+05,0,0,3,3,2,0,0,2,2
3,2015-08-11,1.067379,6.453901e+07,1.518998e+06,0,0,1,2,4,3,0,6,2
4,2015-08-12,1.256613,7.601326e+07,2.073893e+06,0,0,4,1,3,2,0,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3556,2025-05-03,1841.427591,2.222690e+11,1.168630e+10,0,0,0,1,0,1,1,1,0
3557,2025-05-04,1834.501912,2.214950e+11,6.704990e+09,0,0,0,0,3,2,1,2,0
3558,2025-05-05,1808.156094,2.182163e+11,7.479846e+09,0,0,1,3,6,6,5,8,0
3559,2025-05-06,1820.004460,2.197699e+11,1.054523e+10,0,0,1,3,5,2,4,2,0


In [228]:
# Correlation heatmap of the raw merged ETH columns (before aggregation).
plot_corr(eth, 'Ethereum: Sebelum Feature Engineering')

In [229]:
# eth.to_csv('eth-github.csv', sep=',')

In [230]:
# Preprocess ETH, then discard 'issue_activity' and 'commits' so that
# 'pull_activity' is the only GitHub signal kept for the ETH models.
eth_pre = preprocess(eth).drop(columns=['issue_activity', 'commits'])
plot_corr(eth_pre, 'Ethereum: Setelah Feature Engineering')

In [231]:
# Prepare the main frame: datetime index on 'snapped_at', sorted by time.
df_full = eth_pre.copy()
df_full['snapped_at'] = pd.to_datetime(df_full['snapped_at'])
df_full = df_full.set_index('snapped_at')
df_full = df_full.sort_index()

# Recompute the forecast anchor dates from the ETH data. Without this the
# cell reported (and the ETH models reused) the values left over from the
# BTC section.
LAST_HISTORICAL_DATE = df_full.index.max()
start_date = pd.to_datetime(LAST_HISTORICAL_DATE + timedelta(days=1))

print(f"Data historis terakhir pada: {LAST_HISTORICAL_DATE.strftime('%Y-%m-%d')}")
print(f"Jumlah baris data historis: {len(df_full)}")
print(f"Prediksi dimulai dari: {start_date.strftime('%Y-%m-%d')}")

Data historis terakhir pada: 2025-05-07
Jumlah baris data historis: 3561


In [232]:
# Inspect the ETH modelling frame (indexed by 'snapped_at').
df_full

Unnamed: 0_level_0,price,market_cap,total_volume,pull_activity
snapped_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-08-07,2.831620,0.000000e+00,9.062200e+04,11
2015-08-08,1.330750,8.033948e+07,3.680700e+05,3
2015-08-10,0.687586,4.155631e+07,4.004641e+05,2
2015-08-11,1.067379,6.453901e+07,1.518998e+06,7
2015-08-12,1.256613,7.601326e+07,2.073893e+06,5
...,...,...,...,...
2025-05-03,1841.427591,2.222690e+11,1.168630e+10,2
2025-05-04,1834.501912,2.214950e+11,6.704990e+09,6
2025-05-05,1808.156094,2.182163e+11,7.479846e+09,17
2025-05-06,1820.004460,2.197699e+11,1.054523e+10,11


In [233]:
# Exogenous features used to predict the ETH price.
X_features = ['market_cap', 'total_volume', 'pull_activity']

### XGBoost

In [234]:
# 1. Configuration
best_params = {
    'n_estimators': 1000,
    'learning_rate': 0.6,
    'objective': 'reg:squarederror'
}

# 2. Fit the price model on the exogenous features (the features themselves
#    are not modelled; they are extrapolated linearly in step 3).
df_price = df_full.copy()

X = df_price[X_features]
y = df_price['price']

# Debug info
print(f"Rentang harga historis: ${y.min():.2f} - ${y.max():.2f}")
print(f"Harga rata-rata: ${y.mean():.2f}")
print(f"Harga terakhir: ${y.iloc[-1]:.2f}")
print(f"Jumlah data: {len(y)}")

model_price = xgb.XGBRegressor(**best_params)
model_price.fit(X, y)

# 3. Estimate a linear trend per feature from the recent history
def calculate_trend_eth(series, window=30):
    """Return the slope of a least-squares line through the last `window` points.

    Returns 0 when fewer than two points are available.
    """
    recent_data = series.tail(window)
    if len(recent_data) < 2:
        return 0

    positions = np.arange(len(recent_data))
    # Degree-1 polyfit returns [slope, intercept]; only the slope is needed.
    return np.polyfit(positions, recent_data.values, 1)[0]

# Trend (change per day) of every exogenous feature
trends = {feat: calculate_trend_eth(df_full[feat]) for feat in X_features}

# Starting point: the last observed value of every exogenous feature
initial_values = df_full[X_features].iloc[-1].copy()

# 4. Extrapolate the features and predict the price for the whole horizon.
#    The first forecast day is derived from the ETH history itself rather
#    than from a `start_date` left behind by the BTC cells.
forecast_start = df_full.index.max() + timedelta(days=1)
future_dates = pd.date_range(start=forecast_start, periods=N_FUTURE_PERIODS)
horizon_steps = np.arange(1, N_FUTURE_PERIODS + 1)
# One DataFrame with the training column names, predicted in a single call,
# instead of predicting row by row from an unnamed nested list.
future_features = pd.DataFrame(
    {feat: initial_values[feat] + trends[feat] * horizon_steps for feat in X_features},
    index=future_dates,
)

future_predictions = list(model_price.predict(future_features))
# Kept as a list of per-day feature rows, as before.
predicted_features = [future_features.loc[day] for day in future_dates]

# 5. Result frame: predicted price followed by the extrapolated features
predicted_future_X = pd.DataFrame({
    'price_pred_xgb': future_predictions
}, index=future_dates).join(future_features)

predicted_future_X

Rentang harga historis: $0.43 - $4815.00
Harga rata-rata: $1194.77
Harga terakhir: $1816.17
Jumlah data: 3561


Unnamed: 0,price_pred_xgb,market_cap,total_volume,pull_activity
2025-05-08,1838.557617,220737400000.0,11415520000.0,12.051168
2025-05-09,1849.613281,222168900000.0,10880640000.0,12.102336
2025-05-10,1852.602905,223600500000.0,10345770000.0,12.153504
2025-05-11,1864.271729,225032000000.0,9810893000.0,12.204672
2025-05-12,1894.092163,226463500000.0,9276018000.0,12.25584
2025-05-13,1892.550049,227895000000.0,8741143000.0,12.307008
2025-05-14,1907.452026,229326600000.0,8206268000.0,12.358176
2025-05-15,1920.932983,230758100000.0,7671393000.0,12.409344
2025-05-16,1917.759644,232189600000.0,7136518000.0,12.460512
2025-05-17,1934.904419,233621200000.0,6601643000.0,12.51168


In [235]:
# predicted_future_X.to_csv('Prediction/eth-xgb-21d-price.csv', index=True, sep=',')

In [236]:
# Disabled alternative: reload the forecast previously saved to CSV and
# rebuild its date index.
# df_future_dates = pd.read_csv('Prediction/eth-xgb-21d-price.csv')
# df_future_dates['date'] = pd.date_range(start='2025-05-08', periods=len(df_future_dates))
# df_future_dates.set_index('date', inplace=True)

# Evaluate the in-memory forecast produced by the previous cell.
df_future_dates = predicted_future_X.copy()

# The first forecast date marks the start of the evaluation window.
start_date = df_future_dates.index[0]

evaluate_and_plot_prediction('XGBoost', df_future_dates, 'price_pred_xgb', 'eth-usd-max_21days.csv', N_FUTURE_PERIODS, start_date, 'ETH')


--- Prediksi 21 Hari ke Depan (XGBoost) ---
            price_pred_xgb
2025-05-08     1838.557617
2025-05-09     1849.613281
2025-05-10     1852.602905
2025-05-11     1864.271729
2025-05-12     1894.092163
2025-05-13     1892.550049
2025-05-14     1907.452026
2025-05-15     1920.932983
2025-05-16     1917.759644
2025-05-17     1934.904419
2025-05-18     1937.165039
2025-05-19     1954.012695
2025-05-20     1955.132080
2025-05-21     2006.637329
2025-05-22     2005.054565
2025-05-23     2005.249634
2025-05-24     2038.985229
2025-05-25     2053.320312
2025-05-26     2052.926025
2025-05-27     2088.852539
2025-05-28     2091.466797

--- Evaluasi Prediksi XGBoost dengan Data Aktual ---

--- Metrik Evaluasi untuk XGBoost Pred vs Actual ---
RMSE: 559.0217
MAE: 539.1795
MAPE: 21.29%
R2 Score: -8.2614


### Random Forest

In [237]:
# 1. Konfigurasi
rf_params = {
    'random_state': 42
}

# 2. Model price berdasarkan fitur exogenous (tanpa prediksi fitur exogenous)
df_price = df_full.copy()
df_price.dropna(subset=X_features + ['price'], inplace=True)

X = df_price[X_features]
y = df_price['price']

model_price = RandomForestRegressor(**rf_params)
model_price.fit(X, y)

# 3. For the future forecast, estimate a linear trend from the historical data
def calculate_trend_rf(series, window=30):
    """Return the slope of a least-squares line fitted to the last ``window`` points.

    Parameters
    ----------
    series : pandas.Series
        Historical values; the last ``window`` entries are used.
    window : int, default 30
        Number of trailing observations included in the fit.

    Returns
    -------
    float or int
        Slope expressed in units of ``series`` per row. ``0`` is returned
        when the window holds fewer than two non-missing observations.

    Notes
    -----
    Missing values are excluded from the fit but keep their position on the
    x axis, so a gap does not compress the time scale. For windows without
    missing values the result is identical to a plain ``np.polyfit`` fit.
    """
    recent_data = series.tail(window)
    if len(recent_data) < 2:
        return 0

    # np.polyfit cannot cope with NaN (it yields NaN or fails to converge),
    # so fit only on the observed points.
    valid = recent_data.notna().to_numpy()
    if valid.sum() < 2:
        return 0

    x = np.arange(len(recent_data))[valid]
    y = np.asarray(recent_data.values[valid], dtype=float)

    # Degree-1 fit: the leading coefficient is the slope.
    slope = np.polyfit(x, y, 1)[0]
    return slope

# Linear trend (slope per row) of every exogenous feature
trends = {feat: calculate_trend_rf(df_full[feat]) for feat in X_features}

# Starting point: the last row of the exogenous features
initial_values = df_full[X_features].iloc[-1].copy()

# 4. Forecast the price for the next N_FUTURE_PERIODS dates by extrapolating
#    every feature along its trend.
future_dates = pd.date_range(start=start_date, periods=N_FUTURE_PERIODS)

# Step k (1-based) projects each feature to `initial + slope * k`.
# Building the whole feature matrix at once lets the model predict all
# horizons in a single call instead of one single-row DataFrame per step.
steps = np.arange(1, N_FUTURE_PERIODS + 1)
future_features = pd.DataFrame(
    {feat: initial_values[feat] + trends[feat] * steps for feat in X_features},
    index=future_dates,
    columns=X_features,
)

# An empty horizon yields an empty forecast rather than a predict() error.
future_predictions = (
    list(model_price.predict(future_features)) if len(future_features) else []
)

# 5. Collect the forecast in a date-indexed DataFrame
predicted_future_X = pd.DataFrame(
    {'price_pred_rf': future_predictions},
    index=future_dates,
)

predicted_future_X

Unnamed: 0,price_pred_rf
2025-05-08,1832.392869
2025-05-09,1846.410144
2025-05-10,1879.292557
2025-05-11,1870.476266
2025-05-12,1882.153454
2025-05-13,1894.611584
2025-05-14,1909.95453
2025-05-15,1919.567559
2025-05-16,1932.153437
2025-05-17,1943.107328


In [238]:
# predicted_future_X.to_csv('Prediction/eth-rf-21d-price.csv', index=True, sep=',')

In [239]:
# Alternative input (disabled): reload the saved Random Forest forecast from
# disk instead of using the in-memory `predicted_future_X`.
# (The path previously pointed at the XGBoost file; the save cell for this
# section writes 'Prediction/eth-rf-21d-price.csv'.)
# df_future_dates = pd.read_csv('Prediction/eth-rf-21d-price.csv')
# df_future_dates['date'] = pd.date_range(start='2025-05-08', periods=len(df_future_dates))
# df_future_dates.set_index('date', inplace=True)

# Evaluate a copy of the in-memory forecast frame.
df_future_dates = predicted_future_X.copy()

# First index label of the forecast frame; later cells reuse `start_date`.
start_date = df_future_dates.index[0]

evaluate_and_plot_prediction(
    'Random Forest',
    df_future_dates,
    'price_pred_rf',
    'eth-usd-max_21days.csv',
    N_FUTURE_PERIODS,
    start_date,
    'ETH',
)


--- Prediksi 21 Hari ke Depan (Random Forest) ---
            price_pred_rf
2025-05-08    1832.392869
2025-05-09    1846.410144
2025-05-10    1879.292557
2025-05-11    1870.476266
2025-05-12    1882.153454
2025-05-13    1894.611584
2025-05-14    1909.954530
2025-05-15    1919.567559
2025-05-16    1932.153437
2025-05-17    1943.107328
2025-05-18    1947.802353
2025-05-19    1969.242880
2025-05-20    1981.427882
2025-05-21    1980.536988
2025-05-22    2001.261055
2025-05-23    2011.097935
2025-05-24    2022.543046
2025-05-25    2030.684977
2025-05-26    2050.931004
2025-05-27    2071.489886
2025-05-28    2074.891740

--- Evaluasi Prediksi Random Forest dengan Data Aktual ---

--- Metrik Evaluasi untuk Random Forest Pred vs Actual ---
RMSE: 558.7926
MAE: 539.0452
MAPE: 21.28%
R2 Score: -8.2538


### Decision Tree

In [240]:
# 1. Configuration: fixed seed for the tree
dt_params = {'random_state': 42}

# 2. Price model fitted directly on the exogenous features
#    (the exogenous features themselves are not forecast by a model).
# Rows missing any feature or the target are discarded before fitting.
df_price = df_full.dropna(subset=X_features + ['price']).copy()

X, y = df_price[X_features], df_price['price']

model_price = DecisionTreeRegressor(**dt_params).fit(X, y)

# 3. For the future forecast, estimate a linear trend from the historical data
def calculate_trend_dt_eth(series, window=30):
    """Return the slope of a least-squares line fitted to the last ``window`` points.

    Parameters
    ----------
    series : pandas.Series
        Historical values; the last ``window`` entries are used.
    window : int, default 30
        Number of trailing observations included in the fit.

    Returns
    -------
    float or int
        Slope expressed in units of ``series`` per row. ``0`` is returned
        when the window holds fewer than two non-missing observations.

    Notes
    -----
    Missing values are excluded from the fit but keep their position on the
    x axis, so a gap does not compress the time scale. For windows without
    missing values the result is identical to a plain ``np.polyfit`` fit.
    """
    recent_data = series.tail(window)
    if len(recent_data) < 2:
        return 0

    # np.polyfit cannot cope with NaN (it yields NaN or fails to converge),
    # so fit only on the observed points.
    valid = recent_data.notna().to_numpy()
    if valid.sum() < 2:
        return 0

    x = np.arange(len(recent_data))[valid]
    y = np.asarray(recent_data.values[valid], dtype=float)

    # Degree-1 fit: the leading coefficient is the slope.
    slope = np.polyfit(x, y, 1)[0]
    return slope

# Linear trend (slope per row) of every exogenous feature
trends = {feat: calculate_trend_dt_eth(df_full[feat]) for feat in X_features}

# Starting point: the last row of the exogenous features
initial_values = df_full[X_features].iloc[-1].copy()

# 4. Forecast the price for the next N_FUTURE_PERIODS dates by extrapolating
#    every feature along its trend.
future_dates = pd.date_range(start=start_date, periods=N_FUTURE_PERIODS)

# Step k (1-based) projects each feature to `initial + slope * k`.
# Building the whole feature matrix at once lets the model predict all
# horizons in a single call instead of one single-row DataFrame per step.
steps = np.arange(1, N_FUTURE_PERIODS + 1)
future_features = pd.DataFrame(
    {feat: initial_values[feat] + trends[feat] * steps for feat in X_features},
    index=future_dates,
    columns=X_features,
)

# An empty horizon yields an empty forecast rather than a predict() error.
future_predictions = (
    list(model_price.predict(future_features)) if len(future_features) else []
)

# 5. Collect the forecast in a date-indexed DataFrame
predicted_future_X = pd.DataFrame(
    {'price_pred_dtree': future_predictions},
    index=future_dates,
)

predicted_future_X

Unnamed: 0,price_pred_dtree
2025-05-08,1834.173057
2025-05-09,1846.064264
2025-05-10,1910.726952
2025-05-11,1870.565223
2025-05-12,1883.242585
2025-05-13,1893.714274
2025-05-14,1907.212322
2025-05-15,1923.03599
2025-05-16,1932.796406
2025-05-17,1939.785688


In [241]:
# predicted_future_X.to_csv('Prediction/eth-dtree-21d-price.csv', index=True, sep=',')

In [242]:
# Alternative input (disabled): reload the saved Decision Tree forecast from
# disk instead of using the in-memory `predicted_future_X`.
# df_future_dates = pd.read_csv('Prediction/eth-dtree-21d-price.csv')
# df_future_dates['date'] = pd.date_range(start='2025-05-08', periods=len(df_future_dates))
# df_future_dates.set_index('date', inplace=True)

# Evaluate a copy of the in-memory forecast frame.
df_future_dates = predicted_future_X.copy()

# First index label of the forecast frame; later cells reuse `start_date`.
start_date = df_future_dates.index[0]

evaluate_and_plot_prediction(
    'Decision Tree',
    df_future_dates,
    'price_pred_dtree',
    'eth-usd-max_21days.csv',
    N_FUTURE_PERIODS,
    start_date,
    'ETH',
)


--- Prediksi 21 Hari ke Depan (Decision Tree) ---
            price_pred_dtree
2025-05-08       1834.173057
2025-05-09       1846.064264
2025-05-10       1910.726952
2025-05-11       1870.565223
2025-05-12       1883.242585
2025-05-13       1893.714274
2025-05-14       1907.212322
2025-05-15       1923.035990
2025-05-16       1932.796406
2025-05-17       1939.785688
2025-05-18       1953.781128
2025-05-19       1964.529383
2025-05-20       1979.770545
2025-05-21       1979.770545
2025-05-22       2001.047023
2025-05-23       2011.126721
2025-05-24       2030.000506
2025-05-25       2025.888698
2025-05-26       2046.646350
2025-05-27       2077.535665
2025-05-28       2077.535665

--- Evaluasi Prediksi Decision Tree dengan Data Aktual ---

--- Metrik Evaluasi untuk Decision Tree Pred vs Actual ---
RMSE: 557.4505
MAE: 537.4567
MAPE: 21.21%
R2 Score: -8.2094
