In [None]:
!pip install yfinance
!pip install pandas_market_calendars
!pip install statsmodels
!pip install xgboost
!pip install matplotlib-venn

Collecting pandas_market_calendars
  Downloading pandas_market_calendars-5.1.1-py3-none-any.whl.metadata (9.7 kB)
Collecting exchange-calendars>=3.3 (from pandas_market_calendars)
  Downloading exchange_calendars-4.11.1-py3-none-any.whl.metadata (38 kB)
Collecting pyluach (from exchange-calendars>=3.3->pandas_market_calendars)
  Downloading pyluach-2.3.0-py3-none-any.whl.metadata (4.3 kB)
Collecting korean_lunar_calendar (from exchange-calendars>=3.3->pandas_market_calendars)
  Downloading korean_lunar_calendar-0.3.1-py3-none-any.whl.metadata (2.8 kB)
Downloading pandas_market_calendars-5.1.1-py3-none-any.whl (127 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.4/127.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading exchange_calendars-4.11.1-py3-none-any.whl (208 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m208.9/208.9 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading korean_lunar_calendar-0.3.1-py3-none-any.whl 

In [None]:
!pip install pandas_market_calendars
!pip install yfinance scikit-learn xgboost matplotlib
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import pandas_market_calendars as mcal
import matplotlib.pyplot as plt

# ไลบรารีสำหรับ Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# ------------------------------
# 0. Define the full list of tickers
# ------------------------------
tickers = ["AAPL", "AMD", "APP", "AVGO", "GOOG", "GOOGL", "META", "MSFT", "NVDA", "PLTR"]

# Date range to download.
# NOTE(review): yfinance documents `end` as exclusive, so 2024-12-31 itself is
# presumably not part of the download — confirm this is intended.
start_date = datetime(2022, 1, 1)
end_date   = datetime(2024, 12, 31)

# Columns kept for every ticker (those that are missing are simply skipped).
wanted_cols = ['Date', 'Close', 'High', 'Low', 'Open', 'Symbol']

all_data_list = []


def clean_columns(cols):
    """Normalise flattened column labels.

    Any label containing 'Date' becomes 'Date'; every other label is reduced to
    its first whitespace-separated word (e.g. 'Close AAPL' -> 'Close').
    """
    cleaned = []
    for col in cols:
        if 'Date' in col:
            cleaned.append('Date')
        else:
            cleaned.append(col.split()[0])
    return cleaned


# ------------------------------
# 1. Download the price history of every ticker
# ------------------------------
for ticker in tickers:
    print(f"📥 Downloading {ticker} ...")
    data = yf.download(ticker, start=start_date.strftime('%Y-%m-%d'), end=end_date.strftime('%Y-%m-%d'))

    if data.empty:
        print(f"❌ No data for {ticker}")
        continue

    # Turn the date index into a regular column (new frame, no in-place mutation).
    data = data.reset_index()

    # Flatten MultiIndex column labels into single strings such as 'Close AAPL'.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = [' '.join(col).strip() if isinstance(col, tuple) else col for col in data.columns.values]

    data.columns = clean_columns(data.columns)
    data['Symbol'] = ticker.upper()

    data = data[[col for col in wanted_cols if col in data.columns]]

    all_data_list.append(data)

# pd.concat raises an uninformative error on an empty list, so fail with a
# clear message when no ticker returned any rows.
if not all_data_list:
    raise RuntimeError("No price data was downloaded for any ticker; cannot build flat_df.")

# Concatenate all per-ticker dataframes into a single long dataframe
flat_df = pd.concat(all_data_list, ignore_index=True)






In [None]:
# Display the combined long-format price table built from all downloaded tickers.
flat_df

In [None]:
import pandas as pd
import pandas_market_calendars as mcal

# 1. Build a continuous daily calendar spanning the first to the last observed date
first_day = flat_df['Date'].min()
last_day = flat_df['Date'].max()
all_dates = pd.DataFrame({'Date': pd.date_range(first_day, last_day)})

# 2. Replicate that calendar once per symbol so every symbol has a row for every day
symbols = flat_df['Symbol'].unique()
expanded_list = [all_dates.assign(Symbol=sym) for sym in symbols]

# Stack the per-symbol calendars into one frame
all_dates_symbols = pd.concat(expanded_list, ignore_index=True)

# 3. Left-join the prices onto the calendar (days without a quote get NaN prices)
# 4. and keep only the columns used downstream, in a fixed order
full_df = all_dates_symbols.merge(
    flat_df, on=['Date', 'Symbol'], how='left'
)[['Date', 'Symbol', 'Open', 'High', 'Low', 'Close']]




In [None]:
# Display the calendar-expanded table (one row per Date and Symbol; prices are NaN where no quote was merged).
full_df

In [None]:
# ----------------------------
# Fill missing prices (Close, Open, High, Low) separately for each Symbol
# ----------------------------
# NOTE(review): interpolation fills a gap from the known prices around it, so
# filled rows presumably depend on later observations — confirm this is
# acceptable for features/targets used in forecasting.
price_cols = ['Close', 'Open', 'High', 'Low']


def _fill_price_gaps(series):
    """Cubic-spline interpolate one symbol's price series, then linearly fill any NaN left over."""
    cubic_filled = series.interpolate(method='spline', order=3)
    return cubic_filled.interpolate(method='linear')


for col in price_cols:
    full_df[col] = full_df.groupby('Symbol')[col].transform(_fill_price_gaps)

In [None]:
# Count the remaining missing values per column within each Symbol.
full_df.groupby('Symbol').apply(lambda g: g.isnull().sum())

In [None]:
# ----------------------------
# 2) Build lags 1..7 of Close, computed separately for each Symbol
# ----------------------------
lag_cols = [f'Close_lag{lag}' for lag in range(1, 8)]

for lag, col_name in enumerate(lag_cols, start=1):
    # Shift within each Symbol so one symbol's prices never spill into another's
    full_df[col_name] = full_df.groupby('Symbol')['Close'].shift(lag)

# Report how many NaN values remain in Close and in every lag column
print("จำนวน NaN ที่เหลือในแต่ละคอลัมน์:")
nan_counts = full_df[['Close', *lag_cols]].isna().sum()
print(nan_counts)

# Show a sample of the data
cols_to_show = ['Date', 'Symbol', 'Close', *lag_cols]
print(full_df[cols_to_show].head(15))


In [None]:
# Drop rows that are missing Close or any of the lag features listed in lag_cols.
full_df = full_df.dropna(subset=['Close'] + lag_cols)

In [None]:
# Display the table after rows with incomplete lag features were removed.
full_df

In [None]:
# ----------------------------
# Build the target: Close of the following row within the same Symbol
# ----------------------------
next_close = full_df.groupby('Symbol')['Close'].shift(-1)
full_df = full_df.assign(Close_next=next_close)

# Confirm that the new column exists
print(full_df[['Symbol','Close','Close_next']].head(10))

# The last row of every Symbol has no following Close, so remove it
full_df = full_df[full_df['Close_next'].notna()]

In [None]:
# 3) Define the feature matrix (the seven Close lags) and the target
lag_cols = list(map('Close_lag{}'.format, range(1, 8)))
X_cols = lag_cols
X, y = full_df[X_cols], full_df['Close_next']

# 4) Print a sample of both
for heading, sample in (("Feature Matrix X:", X), ("\nTarget y:", y)):
    print(heading)
    print(sample.head(10))

In [None]:
# Temporarily put Symbol, X and y side by side
data_ml = pd.concat([full_df[['Symbol']], X, y], axis=1)

# Remove every row that has a NaN in X or y
data_ml_clean = data_ml.dropna().reset_index(drop=True)

# Dictionary holding the X and y of each Symbol, both re-indexed from 0
symbol_data = {
    symbol: {
        'X': group[X_cols].reset_index(drop=True),
        'y': group['Close_next'].reset_index(drop=True),
    }
    for symbol, group in data_ml_clean.groupby('Symbol')
}

# Print a sample for the first symbol
first_symbol = [*symbol_data][0]
first_entry = symbol_data[first_symbol]
print(f"Symbol: {first_symbol}")
print("Feature Matrix X:")
print(first_entry['X'].head(10))
print("\nTarget y:")
print(first_entry['y'].head(10))


In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

# ================================
# 1. SCALE DATA FOR ALL SYMBOLS
# ================================

# Share of each symbol's rows (taken from the start) used to FIT the scaler.
# It mirrors the 70% training share used by split_data_70_15_15, so the
# scaler's mean/std never see rows that later become validation or test data.
SCALER_FIT_RATIO = 0.70

# Dictionary holding the scaled X of each Symbol
symbol_scaled_data = {}

for symbol, data in symbol_data.items():
    X_sym = data['X']
    y_sym = data['y']

    # Fit the scaler on the leading (training) rows only, then apply it to all rows.
    fit_end = int(len(X_sym) * SCALER_FIT_RATIO)
    X_fit = X_sym.iloc[:fit_end] if fit_end > 0 else X_sym  # tiny inputs: fall back to all rows
    scaler = StandardScaler()
    scaler.fit(X_fit)
    X_scaled = scaler.transform(X_sym)

    # Convert the scaled array back into a DataFrame with the feature names
    X_scaled_df = pd.DataFrame(X_scaled, columns=X_cols)

    # Store the result for this symbol
    symbol_scaled_data[symbol] = {
        'X': X_scaled_df,
        'y': y_sym,
        'scaler': scaler  # kept so the same transform can be reused later
    }

# Print a sample for the first symbol
first_symbol = list(symbol_scaled_data.keys())[0]
print(f"Symbol: {first_symbol}")
print("Feature Matrix X (scaled):")
print(symbol_scaled_data[first_symbol]['X'].head(10))
print("\nTarget y:")
print(symbol_scaled_data[first_symbol]['y'].head(10))

# ================================
# 2. SPLIT DATA FOR WALK-FORWARD (70-15-15)
# ================================

def split_data_70_15_15(X, y, train_ratio=0.70, val_ratio=0.15):
    """
    Split time-ordered data into consecutive train / validation / test parts.

    The order of the rows is preserved (no shuffling): the first
    ``train_ratio`` share is the training part, the next ``val_ratio`` share
    is the validation part and whatever remains is the test part. With the
    default ratios this is a 70-15-15 split.

    Parameters
    ----------
    X, y : sliceable sequences of equal length (DataFrame, Series, list, array)
        Features and targets, already sorted in time order.
    train_ratio : float, default 0.70
        Share of rows assigned to the training part.
    val_ratio : float, default 0.15
        Share of rows assigned to the validation part.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises
    ------
    ValueError
        If a ratio is negative or the two ratios add up to more than 1.
    """
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1"
        )

    total_samples = len(X)
    # Split points are floored, so any rounding remainder ends up in the test part.
    train_end_idx = int(total_samples * train_ratio)
    val_end_idx = int(total_samples * (train_ratio + val_ratio))

    # Slice features and targets at the same two split points
    X_train = X[:train_end_idx]
    X_val = X[train_end_idx:val_end_idx]
    X_test = X[val_end_idx:]

    y_train = y[:train_end_idx]
    y_val = y[train_end_idx:val_end_idx]
    y_test = y[val_end_idx:]

    return X_train, X_val, X_test, y_train, y_val, y_test

# Split the scaled data of every symbol into train / validation / test parts
symbol_split_data = {}

for symbol, data in symbol_scaled_data.items():
    split_parts = split_data_70_15_15(data['X'], data['y'])
    X_train, X_val, X_test, y_train, y_val, y_test = split_parts

    # Keep the six parts together with the scaler fitted for this symbol
    symbol_split_data[symbol] = dict(
        X_train=X_train,
        X_val=X_val,
        X_test=X_test,
        y_train=y_train,
        y_val=y_val,
        y_test=y_test,
        scaler=data['scaler'],
    )

    # Report the size of each part
    print(f"\n📊 {symbol} Data Split:")
    print(f"  Train: {len(X_train)} samples (70%)")
    print(f"  Val:   {len(X_val)} samples (15%)")
    print(f"  Test:  {len(X_test)} samples (15%)")

In [None]:
# ================================
# 3. DEFINE MODELS
# ================================

# Candidate regressors keyed by display name; the insertion order is the
# order in which they are trained and reported.
models = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(alpha=1.0),
    Lasso=Lasso(alpha=0.1),
    RandomForest=RandomForestRegressor(n_estimators=100, random_state=42),
    XGBoost=XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        random_state=42,
        objective='reg:squarederror',
    ),
)

print(f"\n🤖 Models to evaluate: {list(models.keys())}")

In [None]:
# ================================
# 4. WALK-FORWARD VALIDATION FUNCTION
# ================================

def walk_forward_validation_single_stock(X_train, y_train, X_val, y_val, X_test, y_test, window_size, models):
    """
    Walk-forward validation for a single stock (features are expected to be scaled already).

    Validation
        Every model is fit once on the last ``window_size`` rows of the
        TRAINING split only and scored on the whole validation split, so the
        validation metrics are out-of-sample.

    Test
        For each test row, every model is refit on the most recent
        ``window_size`` rows of the history (train + validation + test rows
        already processed) and predicts that single row. The row and its true
        target are then appended to the history for the next step.

    Parameters
    ----------
    X_train, X_val, X_test : pandas.DataFrame
        Feature splits in time order.
    y_train, y_val, y_test : pandas.Series
        Matching target splits.
    window_size : int
        Number of most recent rows used for each fit.
    models : dict
        Maps a model name to an estimator exposing ``fit`` and ``predict``.
        The estimators are refit in place.

    Returns
    -------
    results_df : pandas.DataFrame
        One row per model with ``Model``, ``Val_RMSE``, ``Val_MAE``,
        ``Val_R2`` and, for RMSE/MAE/R2, ``Test_<metric>`` and
        ``Test_<metric>_std``. The test columns are the mean and standard
        deviation of the metric recomputed cumulatively after every test step
        (from the second usable prediction on), not a single score over the
        whole test split. Missing values are NaN.
    predictions : dict
        Model name -> list with exactly one float per test row; NaN marks a
        step where the model could not be fit or failed to predict.
    val_predictions : dict
        Model name -> array of validation predictions (empty list on failure).
    """
    metric_names = ('RMSE', 'MAE', 'R2')

    def _regression_scores(y_true, y_pred):
        """Return RMSE, MAE and R2 of one set of predictions as plain floats."""
        return {
            'RMSE': float(np.sqrt(mean_squared_error(y_true, y_pred))),
            'MAE': float(mean_absolute_error(y_true, y_pred)),
            'R2': float(r2_score(y_true, y_pred)),
        }

    predictions = {model_name: [] for model_name in models}
    val_predictions = {model_name: [] for model_name in models}
    val_scores = {model_name: {} for model_name in models}
    test_score_history = {
        model_name: {metric: [] for metric in metric_names} for model_name in models
    }

    # ---- Validation: fit on the tail of the training split only ----
    val_fit_start = max(0, len(X_train) - window_size)
    X_val_fit = X_train.iloc[val_fit_start:]
    y_val_fit = y_train.iloc[val_fit_start:]

    if len(X_val_fit) > 0 and len(X_val) > 0:
        X_val_fit_array = X_val_fit.values
        X_val_array = X_val.values
        for model_name, model in models.items():
            try:
                model.fit(X_val_fit_array, y_val_fit)
                y_val_pred = model.predict(X_val_array)
                val_scores[model_name] = _regression_scores(y_val, y_val_pred)
                val_predictions[model_name] = y_val_pred
            except Exception as e:
                print(f"⚠️ Model {model_name} failed on validation: {str(e)}")

    # ---- Test: rolling refit, one prediction per test row ----
    X_history = pd.concat([X_train, X_val])
    y_history = pd.concat([y_train, y_val])
    y_test_array = np.asarray(y_test, dtype=float)

    for i in range(len(X_test)):
        # Most recent `window_size` rows of everything seen so far
        window_start = max(0, len(X_history) - window_size)
        X_window = X_history.iloc[window_start:]
        y_window = y_history.iloc[window_start:]
        X_point = X_test.iloc[[i]]

        # Features are already scaled, so they are used as-is
        X_window_array = X_window.values
        X_point_array = X_point.values

        for model_name, model in models.items():
            # NaN placeholder keeps every prediction list aligned with y_test
            y_test_pred = np.nan
            if len(X_window) > 0:
                try:
                    model.fit(X_window_array, y_window)
                    # float() normalises numpy scalar types (e.g. float32)
                    y_test_pred = float(model.predict(X_point_array)[0])
                except Exception as e:
                    print(f"⚠️ Model {model_name} failed at iteration {i}: {str(e)}")
            predictions[model_name].append(y_test_pred)

            # Cumulative test metrics over the usable predictions made so far;
            # at least two points are required for a meaningful R2.
            preds_so_far = np.asarray(predictions[model_name], dtype=float)
            actual_so_far = y_test_array[:i + 1]
            usable = np.isfinite(preds_so_far) & np.isfinite(actual_so_far)
            if usable.sum() > 1:
                try:
                    step_scores = _regression_scores(actual_so_far[usable], preds_so_far[usable])
                except Exception as e:
                    print(f"⚠️ Model {model_name} failed at iteration {i}: {str(e)}")
                    continue
                for metric in metric_names:
                    test_score_history[model_name][metric].append(step_scores[metric])

        # Reveal the true value of this test row to the next iteration
        X_history = pd.concat([X_history, X_point])
        y_history = pd.concat([y_history, y_test.iloc[[i]]])

    # ---- Aggregate one summary row per model ----
    final_results = []
    for model_name in models:
        row = {'Model': model_name}
        for metric in metric_names:
            row[f'Val_{metric}'] = val_scores[model_name].get(metric, np.nan)
        for metric in metric_names:
            history = test_score_history[model_name][metric]
            row[f'Test_{metric}'] = np.mean(history) if history else np.nan
            row[f'Test_{metric}_std'] = np.std(history) if history else np.nan
        final_results.append(row)

    return pd.DataFrame(final_results), predictions, val_predictions

In [None]:
# ================================
# 5. RUN WALK-FORWARD VALIDATION FOR ALL SYMBOLS
# ================================

# Number of most recent rows each model is refit on at every step.
# The table holds one row per calendar day per symbol, so 60 rows is about
# two months of history (not a year).
window_size = 60

# Results and predictions of every symbol, keyed by symbol
all_results = {}
all_predictions = {}

print("🚀 Starting Walk-Forward Validation for all symbols...")
print("="*80)

for symbol, data in symbol_split_data.items():
    print(f"\n📈 Processing {symbol}...")

    # Run walk-forward validation on this symbol's train/val/test parts
    wf_results, wf_test_predictions, wf_val_predictions = walk_forward_validation_single_stock(
        data['X_train'], data['y_train'],
        data['X_val'], data['y_val'],
        data['X_test'], data['y_test'],
        window_size, models
    )

    # Store the metrics table and the predictions next to their true values
    all_results[symbol] = wf_results
    all_predictions[symbol] = {
        'test_predictions': wf_test_predictions,
        'val_predictions': wf_val_predictions,
        'actual_test': data['y_test'],
        'actual_val': data['y_val']
    }

    print(f"✅ {symbol} completed!")
    print(wf_results.round(4))


In [None]:
# all_predictions is a plain dict and has no to_csv(); what 'all_results.csv'
# is meant to hold is the per-symbol metrics, so stack the result tables of
# all symbols into one frame (tagged with their Symbol) and write that.
if all_results:
    combined_results = pd.concat(
        [results_df.assign(Symbol=symbol) for symbol, results_df in all_results.items()],
        ignore_index=True,
    )
    # Put the Symbol column first for readability
    combined_results = combined_results[
        ['Symbol'] + [col for col in combined_results.columns if col != 'Symbol']
    ]
    combined_results.to_csv('all_results.csv', index=False)
else:
    print("No walk-forward results to save.")

In [None]:
# ================================
# 6. SUMMARY RESULTS ACROSS ALL SYMBOLS
# ================================

print("\n🎯 SUMMARY: Best Models by Symbol")
print("="*80)

summary_results = []

for symbol, results_df in all_results.items():
    val_rmse_col = results_df['Val_RMSE']
    test_rmse_col = results_df['Test_RMSE']
    val_r2_col = results_df['Val_R2']
    test_r2_col = results_df['Test_R2']

    # Winning model per criterion: lowest RMSE, highest R2
    best_val_rmse = results_df.loc[val_rmse_col.idxmin(), 'Model']
    best_test_rmse = results_df.loc[test_rmse_col.idxmin(), 'Model']
    best_val_r2 = results_df.loc[val_r2_col.idxmax(), 'Model']
    best_test_r2 = results_df.loc[test_r2_col.idxmax(), 'Model']

    summary_row = {
        'Symbol': symbol,
        'Best_Val_RMSE_Model': best_val_rmse,
        'Val_RMSE': val_rmse_col.min(),
        'Best_Test_RMSE_Model': best_test_rmse,
        'Test_RMSE': test_rmse_col.min(),
        'Best_Val_R2_Model': best_val_r2,
        'Val_R2': val_r2_col.max(),
        'Best_Test_R2_Model': best_test_r2,
        'Test_R2': test_r2_col.max(),
    }
    summary_results.append(summary_row)

summary_df = pd.DataFrame(summary_results)
print(summary_df.round(4))

# ================================
# 7. SAVE RESULTS
# ================================

# One metrics file per symbol
for symbol, results_df in all_results.items():
    results_path = f'walk_forward_results_{symbol}.csv'
    results_df.to_csv(results_path, index=False)

# The cross-symbol summary
summary_df.to_csv('walk_forward_summary_all_symbols.csv', index=False)

# One predictions file per symbol: a column per model plus the true values
for symbol, pred_data in all_predictions.items():
    predictions_df = pd.DataFrame(pred_data['test_predictions']).assign(
        Actual=pred_data['actual_test'].values
    )
    predictions_path = f'walk_forward_predictions_{symbol}.csv'
    predictions_df.to_csv(predictions_path, index=False)

print(f"\n💾 Results saved for {len(all_results)} symbols")
print("✅ Walk-Forward Validation completed for all symbols!")

In [None]:
import matplotlib.pyplot as plt

# ================================
# 8. PLOT ACTUAL vs PREDICTED
# ================================
for symbol, preds in all_predictions.items():
    print(f"\n📊 Plotting Actual vs Predicted for {symbol}...")

    # Only the model with the lowest Test_RMSE for this symbol is plotted
    results_df = all_results[symbol]
    best_model_name = results_df.loc[results_df['Test_RMSE'].idxmin(), 'Model']

    print(f"🏆 Best Model for {symbol}: {best_model_name}")

    actual = np.asarray(preds['actual_test'], dtype=float)

    # Convert to a float array instead of filtering with isinstance(): numpy
    # scalars such as float32 are not instances of Python float, so an
    # isinstance filter would silently drop every such prediction.
    y_pred = np.asarray(preds['test_predictions'][best_model_name], dtype=float)

    if y_pred.size == 0:
        # actual[-0:] would be the whole array, so handle "no predictions" explicitly
        print(f"⚠️ No predictions available for {symbol} ({best_model_name}); skipping plot.")
        continue

    # Make both series the same length (aligned on the most recent values)
    actual = actual[-len(y_pred):]

    # Plot; NaN predictions simply show up as gaps in the line
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(actual, label='Actual', color='black', linewidth=2)
    ax.plot(y_pred, label='Predicted', color='red', linestyle='--', linewidth=1.8)

    ax.set_title(f'Actual vs Predicted Prices - {symbol} ({best_model_name})', fontsize=14)
    ax.set_xlabel('Time Steps', fontsize=12)
    ax.set_ylabel('Close Price', fontsize=12)
    ax.legend()
    ax.grid(alpha=0.3)
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure so a long symbol list does not pile up memory
