In [None]:
!pip install yfinance scikit-learn



In [None]:
!pip install pandas_market_calendars

Collecting pandas_market_calendars
  Downloading pandas_market_calendars-5.1.1-py3-none-any.whl.metadata (9.7 kB)
Collecting exchange-calendars>=3.3 (from pandas_market_calendars)
  Downloading exchange_calendars-4.11.1-py3-none-any.whl.metadata (38 kB)
Collecting pyluach (from exchange-calendars>=3.3->pandas_market_calendars)
  Downloading pyluach-2.2.0-py3-none-any.whl.metadata (4.3 kB)
Collecting korean_lunar_calendar (from exchange-calendars>=3.3->pandas_market_calendars)
  Downloading korean_lunar_calendar-0.3.1-py3-none-any.whl.metadata (2.8 kB)
Downloading pandas_market_calendars-5.1.1-py3-none-any.whl (127 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.4/127.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading exchange_calendars-4.11.1-py3-none-any.whl (208 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m208.9/208.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading korean_lunar_calendar-0.3.1-py3-none-any.whl (

"GOOG", "T", "CHTR", "CMCSA", "EA", "FOXA", "FOX", "IPG", "LYV",
           "MTCH", "META", "NFLX", "NWSA", "NWS", "OMC", "PARA", "TMUS", "TTWO", "TKO",
           "VZ", "DIS", "WBD

In [None]:
# นำเข้าไลบรารีที่จำเป็น
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import pandas_market_calendars as mcal

# ไลบรารีสำหรับ Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ------------------------------
# 1. Download price data
# ------------------------------
tickers = ["GOOGL"]

end_date = datetime.today()
start_date = end_date - timedelta(days=5*365)  # roughly five years of history


def clean_columns(cols):
    """Reduce downloaded column labels to their bare field names.

    Any label containing 'Date' becomes 'Date'; every other label keeps only
    its first whitespace-separated token (e.g. 'Close GOOGL' -> 'Close').

    Parameters
    ----------
    cols : iterable of str
        Column labels after any MultiIndex has been flattened.

    Returns
    -------
    list of str
        Cleaned labels, in the same order as ``cols``.
    """
    return ['Date' if 'Date' in col else col.split()[0] for col in cols]


# Columns kept for every ticker (only those actually present are selected).
wanted_cols = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume', 'Symbol']

all_data_list = []

for ticker in tickers:
    print(f"📥 Downloading {ticker} ...")
    # auto_adjust is passed explicitly so the price basis does not depend on
    # the library default (the implicit default is what triggers the warning
    # shown in this cell's output).
    data = yf.download(
        ticker,
        start=start_date.strftime('%Y-%m-%d'),
        end=end_date.strftime('%Y-%m-%d'),
        auto_adjust=True,
    )

    if data.empty:
        print(f"❌ No data for {ticker}")
        continue

    # Move the date index into a regular column.
    data = data.reset_index()

    # Flatten (field, ticker) MultiIndex columns into 'field ticker' strings.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = [' '.join(col).strip() if isinstance(col, tuple) else col for col in data.columns.values]

    data.columns = clean_columns(data.columns)
    data['Symbol'] = ticker.upper()

    data = data[[col for col in wanted_cols if col in data.columns]]

    all_data_list.append(data)

# Fail with an explicit message when every download came back empty, instead
# of pd.concat's generic "No objects to concatenate".
if not all_data_list:
    raise ValueError("No price data was downloaded for any ticker; cannot build flat_df.")

# Stack all tickers into one long frame, normalise dtypes, and order rows by
# symbol and then by date.
flat_df = pd.concat(all_data_list, ignore_index=True)
flat_df['Date'] = pd.to_datetime(flat_df['Date'])
flat_df['Symbol'] = flat_df['Symbol'].astype(str).str.upper()
flat_df = flat_df.sort_values(by=['Symbol', 'Date']).reset_index(drop=True)

# ------------------------------
# 2. Holiday-effect features
# ------------------------------
nyse = mcal.get_calendar('NYSE')

# Exchange holiday dates as a DatetimeIndex.
holidays = pd.to_datetime(nyse.holidays().holidays)

# Neighbouring weekdays of each row's date. Stepping by weekday (Mon-Fri)
# instead of by calendar day matters when a holiday touches a weekend:
# "holiday - 1 day" for a Monday holiday is a Sunday, so the preceding Friday
# would never be flagged, and likewise for the Monday after a Friday holiday.
next_weekday = flat_df['Date'] + pd.offsets.BDay(1)
prev_weekday = flat_df['Date'] - pd.offsets.BDay(1)

# Dummy variables:
#   holiday      - the row's date is itself an exchange holiday (expected to be
#                  0 for downloaded trading rows; kept for the feature list)
#   pre_holiday  - the next weekday is an exchange holiday
#   post_holiday - the previous weekday was an exchange holiday
flat_df['holiday'] = flat_df['Date'].isin(holidays).astype(int)
flat_df['pre_holiday'] = next_weekday.isin(holidays).astype(int)
flat_df['post_holiday'] = prev_weekday.isin(holidays).astype(int)

# ------------------------------
# 3. Target variable: daily return
# ------------------------------
# Per-symbol percentage change of the close: (Close_t - Close_{t-1}) / Close_{t-1}
close_by_symbol = flat_df.groupby('Symbol')['Close']
flat_df['Return'] = close_by_symbol.pct_change()

# Discard every row that still contains a NaN (e.g. each symbol's first date,
# which has no previous close to compare against).
flat_df = flat_df.dropna()

# ------------------------------
# 4. Select features and target
# ------------------------------
feature_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'holiday', 'pre_holiday', 'post_holiday']

# The target is the NEXT session's return. 'Return' on a row is computed from
# that same row's Close, so regressing it on same-day Open/High/Low/Close leaks
# the answer into the features. Shifting within each symbol pairs day-t
# features with the day-(t+1) return instead. flat_df itself is left untouched.
next_return = flat_df.groupby('Symbol')['Return'].shift(-1)

# Each symbol's last row has no following session and is excluded.
has_target = next_return.notna()

X = flat_df.loc[has_target, feature_cols]
y = next_return[has_target]

# ------------------------------
# 5. Train/test split
# ------------------------------
# shuffle=False keeps row order, so the final 20% of rows form the test set.
# NOTE(review): rows are ordered by Symbol and then Date, so with more than one
# ticker this tail split is not purely chronological - confirm before adding
# tickers.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

# ------------------------------
# 6. Build and train the models
# ------------------------------
# Linear regression (fit() returns the fitted estimator itself)
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Random forest with a fixed seed for repeatable results
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# ------------------------------
# 7. Model evaluation
# ------------------------------
def evaluate(y_true, y_pred, model_name="Model"):
    """Print RMSE, MAE and R² for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    model_name : str, optional
        Label shown in the report header.

    Returns
    -------
    None
        The report is written to stdout, followed by a 30-dash separator.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    absolute_error = mean_absolute_error(y_true, y_pred)
    determination = r2_score(y_true, y_pred)

    # All metrics are computed before anything is printed.
    report_lines = [
        f"📊 {model_name} Performance:",
        f"RMSE = {np.sqrt(squared_error):.6f}",
        f"MAE  = {absolute_error:.6f}",
        f"R²   = {determination:.6f}",
        "-"*30,
    ]
    for report_line in report_lines:
        print(report_line)

# Score both fitted models against the same hold-out targets.
for model_label, predictions in (
    ("Linear Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
):
    evaluate(y_test, predictions, model_label)

# ------------------------------
# 8. Preview the frame with features and target
# ------------------------------
print(flat_df.head())


📥 Downloading GOOGL ...


  data = yf.download(ticker, start=start_date.strftime('%Y-%m-%d'), end=end_date.strftime('%Y-%m-%d'))
[*********************100%***********************]  1 of 1 completed


📊 Linear Regression Performance:
RMSE = 0.014344
MAE  = 0.009887
R²   = 0.460894
------------------------------
📊 Random Forest Performance:
RMSE = 0.023344
MAE  = 0.015634
R²   = -0.427929
------------------------------
        Date      Close       High        Low       Open    Volume Symbol  \
1 2020-08-28  81.482819  81.578247  80.795944  80.987794  22418000  GOOGL   
2 2020-08-31  80.990776  81.734811  80.782029  81.688587  26422000  GOOGL   
3 2020-09-01  82.260658  82.466425  80.990778  81.121494  22652000  GOOGL   
4 2020-09-02  85.357574  85.790479  82.514624  82.903292  49522000  GOOGL   
5 2020-09-03  80.989784  84.493271  79.906278  84.469412  63726000  GOOGL   

   holiday  pre_holiday  post_holiday    Return  
1        0            0             0  0.006699  
2        0            0             0 -0.006039  
3        0            0             0  0.015679  
4        0            0             0  0.037648  
5        0            0             0 -0.051171  


In [None]:
import pandas as pd
import pandas_market_calendars as mcal

# 1. Every calendar day between the first and last observed date
first_day = flat_df['Date'].min()
last_day = flat_df['Date'].max()
all_dates = pd.DataFrame({'Date': pd.date_range(first_day, last_day)})

# 2. One full copy of that calendar per symbol
symbols = flat_df['Symbol'].unique()
expanded_list = [all_dates.assign(Symbol=sym) for sym in symbols]
all_dates_symbols = pd.concat(expanded_list, ignore_index=True)

# 3. Attach the price rows; days without a matching price row keep NaN
full_df = all_dates_symbols.merge(
    flat_df,
    how='left',
    on=['Date','Symbol'],
)

# 4. Non-trading-day and weekend flags
nyse = mcal.get_calendar('NYSE')

# The pre/post flags below look one day beyond each date, so the schedule is
# padded by a day on both sides. Without the padding the day before the first
# date and the day after the last date are never in business_set, which marks
# the first row as post_holiday and the last row as pre_holiday regardless of
# the real calendar.
one_day = pd.Timedelta(days=1)
schedule = nyse.schedule(
    start_date=full_df['Date'].min() - one_day,
    end_date=full_df['Date'].max() + one_day,
)
business_days = schedule.index.normalize()
business_set = set(business_days)

# In this frame 'holiday' means "not an exchange session", so weekends count.
full_df['holiday'] = (~full_df['Date'].isin(business_set)).astype(int)
full_df['weekend'] = full_df['Date'].dt.dayofweek.isin([5,6]).astype(int)

# 5. pre_holiday / post_holiday
def compute_pre_holiday(date):
    """Return 1 if the calendar day after ``date`` is not an exchange session, else 0."""
    return int((date + pd.Timedelta(days=1)) not in business_set)

def compute_post_holiday(date):
    """Return 1 if the calendar day before ``date`` is not an exchange session, else 0."""
    return int((date - pd.Timedelta(days=1)) not in business_set)

full_df['pre_holiday'] = full_df['Date'].apply(compute_pre_holiday)
full_df['post_holiday'] = full_df['Date'].apply(compute_post_holiday)

# 6. Sanity check
print(full_df[['Date','Symbol','Close','holiday','pre_holiday','post_holiday','weekend']].head(30))

         Date Symbol      Close  holiday  pre_holiday  post_holiday  weekend
0  2020-08-28  GOOGL  81.482819        0            1             1        0
1  2020-08-29  GOOGL        NaN        1            1             0        1
2  2020-08-30  GOOGL        NaN        1            0             1        1
3  2020-08-31  GOOGL  80.990776        0            0             1        0
4  2020-09-01  GOOGL  82.260658        0            0             0        0
5  2020-09-02  GOOGL  85.357574        0            0             0        0
6  2020-09-03  GOOGL  80.989784        0            0             0        0
7  2020-09-04  GOOGL  78.589180        0            1             0        0
8  2020-09-05  GOOGL        NaN        1            1             0        1
9  2020-09-06  GOOGL        NaN        1            1             1        1
10 2020-09-07  GOOGL        NaN        1            0             1        0
11 2020-09-08  GOOGL  75.725845        0            0             1        0

In [None]:
# Write the calendar-expanded frame to 'full_df.csv' in the working directory,
# omitting the row index.
full_df.to_csv('full_df.csv', index=False)

In [None]:
from scipy.stats import ttest_ind

holiday_features = ['holiday', 'pre_holiday', 'post_holiday', 'weekend']

# Welch's t-test (equal_var=False) of returns on flagged vs. unflagged days.
for col in holiday_features:
    group1 = full_df[full_df[col]==1]['Return'].dropna()
    group2 = full_df[full_df[col]==0]['Return'].dropna()

    # Rows without a price have no Return, so a flag that is only set on such
    # rows leaves group1 empty. The test needs at least two observations per
    # group to estimate a variance; report the gap instead of printing nan.
    if min(len(group1), len(group2)) < 2:
        print(f"{col}: skipped (n_flag1={len(group1)}, n_flag0={len(group2)}; need >= 2 returns in each group)")
        continue

    t_stat, p_val = ttest_ind(group1, group2, equal_var=False)
    print(f"{col}: t-stat={t_stat:.4f}, p-value={p_val:.4f}")
