In [None]:
import yfinance as yf
import pandas as pd
from pandas_datareader import data as pdr
import datetime as dt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
color_pal = sns.color_palette()
!pip install ta pandas_ta
import ta
import xgboost as xgb

# 1. Grab Data

In [None]:
from sklearn.metrics import mean_squared_error
# The palette is captured before the matplotlib style is switched below.
# NOTE(review): seaborn's color_palette() with no arguments presumably reflects the
# currently active colour cycle, so moving this line after plt.style.use() may change
# the colours used by later plots — confirm before reordering.
color_pal = sns.color_palette()
# Apply the 'fivethirtyeight' style to every figure created after this cell.
plt.style.use('fivethirtyeight')

In [None]:
# Query window: roughly five years of history, ending one week before "now".
endDate = dt.datetime.now() - dt.timedelta(days = 7)
startDate = endDate - dt.timedelta(days = 365*5)

stock = 'AAPL'

df = yf.download(stock, start = startDate, end = endDate)
# Depending on the yfinance version/options the columns are either a two-level
# (field, ticker) MultiIndex or already flat. Only drop the ticker level when it
# exists; calling droplevel(1) on a flat Index raises.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.droplevel(1)
# Lower-case the column names; later cells refer to 'close', 'open' and 'volume'.
df.columns = df.columns.str.lower()


df

In [None]:
# Line chart of the closing price, drawn through the Axes returned by pandas.
close_ax = df['close'].plot(
    figsize=(18, 6),
    color=color_pal[2],
    title='AAPL Closing Price',
)
close_ax.set_xlabel('Date')
close_ax.set_ylabel('Price (USD)')
close_ax.grid(True)
plt.show()

In [None]:
# Print the class of the frame's index.
index_type = type(df.index)
print(index_type)

In [None]:
# Per-column count of missing values.
df.isna().sum()

In [None]:
# Tabulate the distinct gaps between consecutive index entries.
index_gaps = df.index.to_series().diff()
index_gaps.value_counts()

In [None]:
# Re-index onto a 1-day grid, forward-filling each new date from the previous row.
daily_bins = df.resample('1D')
df = daily_bins.ffill()

# 2. Features

In [None]:
# Lagged closes (1, 3 and 7 rows back) followed by 7-row rolling statistics.
close_px = df['close']
for lag in (1, 3, 7):
    df[f'lag_{lag}'] = close_px.shift(lag)

weekly_window = close_px.rolling(window=7)
df['rolling_mean_7'] = weekly_window.mean()
df['rolling_std_7'] = weekly_window.std()

In [None]:
# Weekend flag: day-of-week 5 and 6 are Saturday and Sunday.
day_number = df.index.dayofweek
df['is_weekend'] = day_number >= 5
# Row-over-row relative change in traded volume.
df['volume_change_1'] = df['volume'].pct_change(periods=1)
# 14-period RSI computed on the closing price.
rsi_indicator = ta.momentum.RSIIndicator(close=df['close'], window=14)
df['rsi'] = rsi_indicator.rsi()
# Prediction target: the close of the following row.
df['target'] = df['close'].shift(periods=-1)

In [None]:
!pip install pytrends
from pytrends.request import TrendReq

In [None]:
kw_list = ["Apple stock", "buy stocks", "market crash"]
def get_google_trends(keyword, start, end):
  """Fetch Google Trends interest-over-time for the given search terms.

  Args:
    keyword: A single search term (str) or an iterable of search terms.
    start: Start of the query window; must provide strftime('%Y-%m-%d').
    end: End of the query window; must provide strftime('%Y-%m-%d').

  Returns:
    The DataFrame returned by pytrends' interest_over_time(), with the
    'isPartial' column removed when it is present.
  """
  # Query the terms that were actually passed in rather than the module-level
  # kw_list, and accept a bare string for convenience.
  keywords = [keyword] if isinstance(keyword, str) else list(keyword)
  pytrends = TrendReq(hl='en-US', tz=360)
  start_date_str = start.strftime('%Y-%m-%d')
  end_date_str = end.strftime('%Y-%m-%d')
  timeframe = f"{start_date_str} {end_date_str}"
  pytrends.build_payload(keywords, cat=0, timeframe= timeframe, geo='US', gprop='')
  data = pytrends.interest_over_time()
  # Drop the unused bookkeeping column. errors='ignore' keeps this from raising
  # when the column is absent (e.g. an empty result).
  data = data.drop(columns=['isPartial'], errors='ignore')
  return data

In [None]:
# Search-interest series for the configured keywords over the price-history window.
google_trends_data = get_google_trends(keyword=kw_list, start=startDate, end=endDate)
google_trends_data

In [None]:
# Start from the trends frame fetched in the previous cell. `data` is not defined at
# this point on a fresh kernel (it is only a local inside get_google_trends), so using
# it directly raised NameError on Restart & Run All. Copy so that re-running this cell
# never mutates google_trends_data.
data = google_trends_data.copy()
data.index = pd.to_datetime(data.index)

# Forward-fill onto a daily grid so the series can be joined to the daily price frame.
data = data.resample('D').ffill()

In [None]:
# Make sure the price index is datetime-typed, then left-join the trend columns on it.
df.index = pd.to_datetime(df.index)

df = pd.merge(df, data, how='left', left_index=True, right_index=True)

In [None]:
def create_features(df,google_trend_data):
    """Build the model's feature columns and next-row target from a price frame.

    Args:
        df: Price frame indexed by a DatetimeIndex with at least 'open', 'close'
            and 'volume' columns. It is copied, not modified.
        google_trend_data: Frame whose columns are left-joined onto the result by
            index. Columns of the same name already present in ``df`` are replaced
            rather than duplicated with ``_x``/``_y`` suffixes.

    Returns:
        A new DataFrame containing the input columns plus lag, rolling, Bollinger,
        calendar, RSI and MACD features, a 'target' column holding the next row's
        close, and the joined ``google_trend_data`` columns.
    """
    df = df.copy()
    close = df['close']

    # 7-row rolling statistics and short lags of the close.
    week_window = close.rolling(7)
    df['rolling_mean_7'] = week_window.mean()
    df['rolling_std_7'] = week_window.std()
    df['volume_change_1'] = df['volume'].pct_change(1)
    df['lag_1'] = close.shift(1)
    df['lag_3'] = close.shift(3)
    df['lag_7'] = close.shift(7)
    df['is_weekend'] = df.index.dayofweek >= 5

    # Distance from the 7-row mean and 20-row Bollinger bands (mean +/- 2 std).
    # The 20-row mean/std are computed once and reused for both bands.
    df['ma_gap'] = close - df['rolling_mean_7']
    month_window = close.rolling(20)
    bb_mid = month_window.mean()
    bb_dev = 2 * month_window.std()
    df['bb_upper'] = bb_mid + bb_dev
    df['bb_lower'] = bb_mid - bb_dev
    df['bb_width'] = df['bb_upper'] - df['bb_lower']

    # Return volatility, intraday move and calendar features.
    df['volatility_7'] = close.pct_change().rolling(7).std()
    df['close_open_diff'] = close - df['open']
    df['day_of_week'] = df.index.dayofweek
    df['is_month_end'] = df.index.is_month_end
    df['is_quarter_end'] = df.index.is_quarter_end
    df['month'] = df.index.month
    df['year'] = df.index.year

    # Target is the close of the following row; the last row therefore has NaN.
    df['target'] = close.shift(-1)

    # Momentum indicators from the `ta` package.
    df['rsi'] = ta.momentum.RSIIndicator(close=close, window=14).rsi()
    macd = ta.trend.MACD(close)
    df['macd'] = macd.macd()
    df['macd_signal'] = macd.macd_signal()
    df['macd_diff'] = macd.macd_diff()

    # If the caller's frame already carries the trend columns, drop them first so the
    # merge keeps the plain column names that FEATURES refers to.
    already_present = [col for col in google_trend_data.columns if col in df.columns]
    if already_present:
        df = df.drop(columns=already_present)
    df = df.merge(google_trend_data, how='left', left_index=True, right_index=True)

    return df

In [None]:
# The training cell selects engineered columns (ma_gap, bb_*, macd*, volatility_7,
# close_open_diff, calendar fields) that only create_features() produces, so apply it
# to the training frame here; without this the FEATURES lookup raises KeyError on a
# clean run. The trend columns merged earlier are dropped first so that the merge
# inside create_features() does not create duplicate suffixed columns.
df = create_features(df.drop(columns=data.columns, errors='ignore'), data)
df.columns

# 3. Time Series Cross Validation

In [None]:
from sklearn.model_selection import TimeSeriesSplit

In [None]:
# Order rows chronologically (ascending index) before building time-series splits.
df = df.sort_index(axis=0, ascending=True)

In [None]:
# Five folds, 90 validation rows each, with one row left out between train and validation.
tss = TimeSeriesSplit(
    n_splits=5,
    test_size=90,
    gap=1,
)

In [None]:
# One stacked panel per fold, all sharing the date axis.
fig, axs = plt.subplots(5, 1, figsize=(14, 12), sharex=True)

fold = 0
for panel, (train_idx, val_idx) in zip(axs, tss.split(df)):
    train, test = df.iloc[train_idx], df.iloc[val_idx]

    train['close'].plot(
        ax=panel,
        label='Training Set',
        title=f'Data Train/Test Split Fold {fold}',
    )
    test['close'].plot(ax=panel, label='Test Set')

    # Dashed line marks where the validation window begins.
    panel.axvline(test.index.min(), color='black', ls='--')
    panel.legend()
    fold += 1

plt.tight_layout()
plt.show()

# 4. Train Using Cross Validation

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

tss = TimeSeriesSplit(n_splits=5, test_size=90, gap=1)
df = df.sort_index()

# Model inputs (column order is kept fixed) and the column being predicted.
FEATURES = ['lag_1', 'lag_3', 'lag_7', 'rolling_mean_7', 'rolling_std_7',
            'volume_change_1', 'rsi', 'ma_gap', 'bb_upper', 'bb_lower',
            'bb_width', 'macd', 'macd_signal', 'macd_diff', 'volatility_7',
            'close_open_diff', 'day_of_week', 'is_weekend',
            'is_month_end', 'is_quarter_end', 'month', 'year','Apple stock', 'buy stocks', 'market crash']
TARGET = 'target'

# Hyper-parameters shared by the regressor fitted in every fold.
XGB_PARAMS = dict(
    base_score=0.5,
    booster='gbtree',
    n_estimators=1000,
    early_stopping_rounds=50,
    objective='reg:squarederror',
    max_depth=3,
    learning_rate=0.01,
    random_state=42,
)

fold = 0
preds, scores = [], []

for train_idx, val_idx in tss.split(df):
    train, test = df.iloc[train_idx], df.iloc[val_idx]
    X_train, y_train = train[FEATURES], train[TARGET]
    X_test, y_test = test[FEATURES], test[TARGET]

    # A fresh model per fold. The held-out fold is the last entry of eval_set.
    # NOTE(review): that fold is presumably what early stopping monitors, which
    # would make the reported RMSE optimistic — confirm.
    reg = xgb.XGBRegressor(**XGB_PARAMS)
    reg.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=100,
    )

    y_pred = reg.predict(X_test)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    preds.append(y_pred)
    scores.append(score)
    print(f'Fold {fold} RMSE: {score:.4f}')

    fold += 1
    last_val_idx = val_idx


print(f'Average RMSE: {np.mean(scores):.4f}')

In [None]:
# Summarise the per-fold RMSE values collected during cross-validation.
mean_rmse = np.mean(scores)
print(f'Score across folds {mean_rmse:0.4f}')
print(f'Fold scores:{scores}')

In [None]:
# Summary statistics of the closing price, for judging the scale of the RMSE.
close_series = df['close']
close_series.describe()

In [None]:
# Actual vs predicted target for the last fold processed by the training loop.
pred_fig, pred_ax = plt.subplots(figsize=(10, 4))
pred_ax.plot(y_test.values, label='Actual')
pred_ax.plot(y_pred, label='Predicted')
pred_ax.legend()
pred_ax.set_title('Prediction vs Actual')
plt.show()


In [None]:
# Importance of each input feature according to the last fitted regressor.
fi = pd.DataFrame(
    {'importance': reg.feature_importances_},
    index=reg.feature_names_in_,
)

# Horizontal bars, least important at the bottom.
importance_ax = fi.sort_values('importance').plot(
    kind='barh',
    title='Feature Importance',
    figsize=(14, 8),
)
importance_ax.set_xlabel("Importance")
importance_ax.set_ylabel("Feature")
importance_ax.grid(True)
plt.tight_layout()
plt.show()

# 5. Backtest using pandas

In [None]:
ticker = "AAPL"
# The engineered features need history before they produce values: the Bollinger
# bands use rolling(20), and MACD with the `ta` defaults needs a few dozen rows
# (NOTE(review): 26-period slow EMA plus 9-period signal — confirm against the ta
# docs). A bare 30-calendar-day download (~21 trading rows) leaves those columns
# almost entirely NaN, so fetch extra warm-up history ahead of the backtest window.
BACKTEST_DAYS = 30
WARMUP_DAYS = 90
end_date = dt.datetime.now()
start_date = end_date - dt.timedelta(days = BACKTEST_DAYS + WARMUP_DAYS)


sp = yf.download(ticker, start=start_date, end=end_date)
# Columns may be a (field, ticker) MultiIndex or already flat depending on the
# yfinance version/options; only drop the ticker level when it exists.
if isinstance(sp.columns, pd.MultiIndex):
    sp.columns = sp.columns.droplevel(1)
sp.columns = sp.columns.str.lower()

print(sp.head())

In [None]:
# Search-interest series covering the backtest download window.
data = get_google_trends(keyword=kw_list, start=start_date, end=end_date)
data

In [None]:
# Engineer the model features for the backtest frame and join the trend columns.
sp = create_features(df=sp, google_trend_data=data)

In [None]:
# Display the backtest frame returned by create_features().
sp

In [None]:
# List the backtest frame's columns to check them against FEATURES.
sp.columns

In [None]:
# Same feature list (and order) used when fitting the model.
FEATURES = ['lag_1', 'lag_3', 'lag_7', 'rolling_mean_7', 'rolling_std_7',
            'volume_change_1', 'rsi', 'ma_gap', 'bb_upper', 'bb_lower',
            'bb_width', 'macd', 'macd_signal', 'macd_diff', 'volatility_7',
            'close_open_diff', 'day_of_week', 'is_weekend',
            'is_month_end', 'is_quarter_end', 'month', 'year','Apple stock', 'buy stocks', 'market crash']

# Inputs and true next-row closes for the last seven rows.
last_seven = slice(-7, None)
X_backtest = sp[FEATURES].iloc[last_seven]
y_backtest_true = sp['target'].iloc[last_seven]

In [None]:
# Backtest over (up to) the last 30 rows of the feature frame.
X_backtest = sp[FEATURES].iloc[-30:]
dates = sp.index[-30:]
actual_prices = sp['close'].iloc[-30:].values

# The model was trained on target = next row's close, so predicted_prices[i] is the
# forecast of actual_prices[i + 1] made with information available at row i.
predicted_prices = reg.predict(X_backtest)

# Realised return from row i to row i + 1.
daily_returns = np.diff(actual_prices) / actual_prices[:-1]

# Strategy: hold the asset over (i -> i + 1) only when the forecast for i + 1 is above
# the close at i. Comparing predicted_prices[i + 1] with predicted_prices[i] instead
# would use features from row i + 1 to decide a trade entered at row i (look-ahead bias).
go_long = predicted_prices[:-1] > actual_prices[:-1]
strategy_returns = np.where(go_long, daily_returns, 0.0).tolist()
market_returns = daily_returns.tolist()


strategy_cum = (1 + pd.Series(strategy_returns)).cumprod()
market_cum = (1 + pd.Series(market_returns)).cumprod()

# visualization

plt.figure(figsize=(10, 6))
plt.plot(dates[1:], strategy_cum, label='Strategy Return', linewidth=2)
plt.plot(dates[1:], market_cum, label='Market Return', linewidth=2, linestyle='--')
plt.title("Backtest Comparison: Strategy vs Market")
plt.xlabel("Date")
plt.ylabel("Cumulative Return")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
def max_drawdown(cum_returns):
    """Return the largest peak-to-trough decline of a cumulative-return curve.

    Args:
        cum_returns: Array-like of cumulative return levels (e.g. a cumprod of 1 + r).

    Returns:
        The minimum of (level / running maximum - 1): zero when the curve never
        falls below an earlier level, negative otherwise.
    """
    running_high = np.maximum.accumulate(cum_returns)
    relative_drop = cum_returns / running_high - 1
    return relative_drop.min()


strategy_returns = np.array(strategy_returns)

# Rebuild the strategy's cumulative curve, indexed by the trailing backtest dates.
growth_factors = 1 + strategy_returns
curve_index = sp.index[-len(strategy_returns):]
strategy_cum = pd.Series(growth_factors.cumprod(), index=curve_index)

mdd = max_drawdown(strategy_cum)
print(f"Max Drawdown: {mdd:.2%}")


In [None]:
# Sharpe ratio of the strategy's per-period returns, scaled by sqrt(252)
# (no risk-free rate is subtracted).
returns_std = np.std(strategy_returns)
if returns_std > 0:
    sharpe_ratio = np.sqrt(252) * np.mean(strategy_returns) / returns_std
else:
    # Zero (or undefined) volatility, e.g. the strategy never took a position:
    # the ratio is undefined, so report NaN instead of dividing by zero.
    sharpe_ratio = float('nan')
print(f"Sharpe Ratio: {sharpe_ratio:.2f}")


In [None]:
# Print the per-period strategy returns (0 where no position was held).
print(strategy_returns)