In [None]:
import yfinance as yf
import pandas as pd
import numpy as np

# Download daily GOOGL price bars from Yahoo Finance.
TICKER = "GOOGL"
df = yf.download(TICKER, start="2020-04-01", end="2025-04-01", interval="1d")

# yfinance can return MultiIndex columns of (price field, ticker) even for a
# single symbol. If that is written out unchanged, to_csv() emits a second
# header row containing only ticker names, which a plain read_csv() then parses
# as a bogus first data row. Collapse to the price-field level so the rest of
# the cell (and the saved CSV) uses ordinary column names such as "Close".
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0)
    df = df.rename_axis(columns=None)

# Drop incomplete bars, then move the date index into a regular "Date" column.
df = df.dropna().reset_index()

# Add price-derived feature columns to the data.
close = df["Close"]
df["log_return"] = np.log(close / close.shift(1))
df["price_change"] = close.diff()  # same as close - close.shift(1)
df["MA_5"] = close.rolling(5).mean()
df["MA_10"] = close.rolling(10).mean()
# Note: these are rolling standard deviations of the closing price itself,
# not of the returns.
df["volatility_5"] = close.rolling(5).std()
df["volatility_10"] = close.rolling(10).std()

# Add datetime features.
df["dayofweek"] = df["Date"].dt.dayofweek
df["is_month_end"] = df["Date"].dt.is_month_end.astype(int)

# Regression target: the next trading day's closing price.
df["target_regression"] = close.shift(-1)

# Classification target: 1 if the next close is higher than today's, else 0.
# The final row has no next close (NaN target) and is removed by the dropna below.
df["target_direction"] = (df["target_regression"] > close).astype(int)

# Step 5: drop rows with missing values (rolling-window warm-up rows and the
# last row, which has no next-day target).
df = df.dropna().reset_index(drop=True)

# Step 6: save the processed table with a single header row.
df.to_csv("GOOGL_processed.csv", index=False)
print("already down：GOOGL_processed.csv")


[*********************100%***********************]  1 of 1 completed

✅ 文件已生成：GOOGL_processed.csv





In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler


# Load the feature table written by the previous cell.
df = pd.read_csv("GOOGL_processed.csv")

# Features to standardize; each gets a companion "<name>_scaled" column.
cols_to_scale = ["price_change", "MA_5", "MA_10", "volatility_5", "volatility_10"]

# Standardize the selected features.
# NOTE(review): the scaler is fitted on every row of the file; if a train/test
# split happens later, confirm that fitting on the full history is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[cols_to_scale])

# Wrap the scaled array in a frame with suffixed column names and append it
# to the right of the original columns.
scaled_column_names = [f"{col}_scaled" for col in cols_to_scale]
scaled_df = pd.DataFrame(scaled_values, columns=scaled_column_names)
df_scaled = pd.concat([df, scaled_df], axis=1)

# Save the combined table (original + scaled columns).
df_scaled.to_csv("GOOGL_processed_scaled.csv", index=False)
print("already down: GOOGL_processed_scaled.csv")
