In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("white")
plt.rcParams['axes.unicode_minus'] = False

plt.rcParams['font.family'] = 'KaiTi'  
plt.rcParams['font.size'] = 10  


df = pd.read_csv("./all_companies_long_sorted.csv")
df['Dates'] = pd.to_datetime(df['Dates'], errors='coerce')

missing_ratio = df.isnull().mean()
df = df.drop(columns=missing_ratio[missing_ratio > 0.3].index)
print(f"删除高缺失列后剩余列数: {df.shape[1]}")

num_cols = df.select_dtypes(include='number').columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

df = df.dropna(subset=['Dates', 'Company'])

df = df.sort_values(by=['Company', 'Dates']).reset_index(drop=True)

df.to_csv("./cleaned_all_companies_long.csv", index=False)
print("已保存清洗后的数据：./cleaned_all_companies_long.csv")

删除高缺失列后剩余列数: 65
已保存清洗后的数据：./cleaned_all_companies_long.csv


In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("./cleaned_all_companies_long.csv")
df['Dates'] = pd.to_datetime(df['Dates'], errors='coerce')
df = df.sort_values(['Company', 'Dates'])

df['return_lag1'] = df.groupby('Company')['PX_LAST'].pct_change(1)
df['momentum_5d'] = df.groupby('Company')['PX_LAST'].pct_change(5)
df['momentum_10d'] = df.groupby('Company')['PX_LAST'].pct_change(10)
df['volatility_5d'] = df.groupby('Company')['PX_LAST'].rolling(5).std().reset_index(level=0, drop=True)
df['volatility_10d'] = df.groupby('Company')['PX_LAST'].rolling(10).std().reset_index(level=0, drop=True)
df['PE_log'] = np.log1p(df['PE_RATIO'])
df['VIX_PE_interact'] = df['VIX Index'] * df['PE_RATIO']
df['USDJPY_ret'] = df['USDJPY Curncy'].pct_change(1)

train_companies = [
    "ADBE", "AMD", "AMZN", "AVGO", "CSCO", "GOOGL", "IBM",
    "INTC", "META", "MSFT", "NVDA", "CRM"
]  # AAPL, ORCL, TXN 保留用于泛化验证

train_df = df[
    (df['Company'].isin(train_companies)) &
    (df['Dates'].dt.year <= 2022)
].copy()

train_df['log_return'] = train_df.groupby('Company')['PX_LAST'].transform(lambda x: np.log(x).diff())
train_df['return'] = (
    train_df.groupby('Company')['log_return']
    .shift(-1)
    .rolling(10)
    .mean()
    .shift(-9)
    .clip(-0.3, 0.3)
)
train_df = train_df.dropna(subset=['return'])

exclude_cols = ['Dates', 'Company', 'PX_LAST', 'PX_OPEN', 'PX_HIGH', 'PX_LOW',
                'PX_VOLUME.1', 'log_return', 'return']
features = [col for col in train_df.columns if col not in exclude_cols]

X = train_df[features].fillna(0)
y = train_df['return'].fillna(0)
X_scaled = StandardScaler().fit_transform(X)

rf = RandomForestRegressor(n_estimators=200, max_depth=6, random_state=42)
rf.fit(X_scaled, y)

importances = pd.Series(rf.feature_importances_, index=features).sort_values(ascending=False)
top_20_factors = importances.head(20)

print("行业统一选出的 Top20 因子：")
print(top_20_factors)

top_20_factors.to_frame(name='importance').to_csv("./selected_factors.csv", index=True)
print("已保存 Top 20 因子到 ./selected_factors.csv")

# 保存带有衍生变量的训练数据，供后续 IC 检验或建模用
train_df.to_csv("./with_features_train_data.csv", index=False)
print("衍生变量训练数据已保存到：./with_features_train_data.csv")

行业统一选出的 Top20 因子：
GT2 Govt            0.158134
SPX Index           0.097571
INDU Index          0.091712
EPUCGLOB Index      0.076829
NDX Index           0.053654
GT10 Govt           0.045825
EPUCGLCP Index      0.045414
VIX Index           0.035592
USDJPY Curncy       0.031044
RTY Index           0.022385
CL1 Comdty          0.021674
VIX_PE_interact     0.021445
BCOM Index          0.018948
EURUSD Curncy       0.018021
DXY Curncy          0.017290
CALL_IMP_VOL_30D    0.016674
XAU Curncy          0.014179
PUT_IMP_VOL_30D     0.013277
CPURNSA Index       0.012042
BS_TOT_LIAB2        0.011384
dtype: float64
已保存 Top 20 因子到 ./selected_factors.csv
衍生变量训练数据已保存到：./with_features_train_data.csv


In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import os

df = pd.read_csv("./cleaned_all_companies_long.csv")
df['Dates'] = pd.to_datetime(df['Dates'])
df = df.sort_values(['Company', 'Dates'])

train_companies = ["ADBE", "AMD", "AMZN", "AVGO", "CSCO", "GOOGL", "IBM", "INTC", "META", "MSFT", "NVDA"]
all_companies = train_companies + ["CRM", "ORCL", "QCOM", "TXN"]
df = df[df['Company'].isin(all_companies)]

factors_df = pd.read_csv("./selected_factors.csv", index_col=0)
features = factors_df.index.tolist()

df['log_return'] = df.groupby('Company')['PX_LAST'].transform(lambda x: np.log(x).diff())
df['market_return'] = df.groupby('Dates')['log_return'].transform('mean')
df['excess_return'] = df['log_return'] - df['market_return']
df['target'] = (
    df.groupby('Company')['excess_return']
    .transform(lambda x: x.shift(-1).ewm(span=20).mean())
    .shift(-9).clip(-0.3, 0.3)
)

if 'VIX_PE_interact' in features and 'VIX_PE_interact' not in df.columns:
    df['VIX_PE_interact'] = df['VIX Index'] * df['PE_RATIO']
if 'volatility_10d' in features and 'volatility_10d' not in df.columns:
    df['volatility_10d'] = df.groupby('Company')['PX_LAST'].rolling(10).std().reset_index(level=0, drop=True)
if 'return_lag1' in features and 'return_lag1' not in df.columns:
    df['return_lag1'] = df.groupby('Company')['log_return'].shift(1)

df = df.dropna(subset=features + ['target'])

train_df = df[(df['Company'].isin(train_companies)) & (df['Dates'].dt.year <= 2023)]
test_df = df[df['Dates'].dt.year == 2024]

scaler = StandardScaler()
target_scaler = MinMaxScaler(feature_range=(-1, 1))
X_train = scaler.fit_transform(train_df[features])
X_test = scaler.transform(test_df[features])
y_train = target_scaler.fit_transform(train_df[['target']]).flatten()
y_test = target_scaler.transform(test_df[['target']]).flatten()

WINDOW_SIZE = 30
def create_sequences(X, y, window=30):
    return np.array([X[i:i+window] for i in range(len(X)-window)]), np.array([y[i+window] for i in range(len(y)-window)])
X_train_seq, y_train_seq = create_sequences(X_train, y_train, WINDOW_SIZE)
X_test_seq, y_test_seq = create_sequences(X_test, y_test, WINDOW_SIZE)

In [8]:
X_train_seq.shape, y_train_seq.shape, X_test_seq.shape, y_test_seq.shape

((11443, 30, 20), (11443,), (3891, 30, 20), (3891,))

In [10]:
# 转换为传统机器学习可以处理的数据
X_train = X_train_seq.reshape(X_train_seq.shape[0], -1)
X_test = X_test_seq.reshape(X_test_seq.shape[0], -1)
X_train.shape, X_test.shape

((11443, 600), (3891, 600))

In [12]:
X_train[:, 0]

array([-0.25229018, -0.25229018, -0.27400394, ...,  1.47602133,
        1.43776376,  1.46257948])

In [14]:
y_train[0]

0.16662842033588504

In [16]:
import xgboost as xgb

best_model_xgb = xgb.XGBRegressor()  
best_model_xgb.fit(X_train, y_train_seq)

y_xgb_train = best_model_xgb.predict(X_train)
y_xgb_test = best_model_xgb.predict(X_test)

In [18]:
import os, matplotlib.pyplot as plt
model_path = "./result/"
result_dir = "./result/"
os.makedirs(model_path, exist_ok=True)
os.makedirs(result_dir, exist_ok=True)
# model.save(os.path.join(model_path, "lstm_final_mixed_model.h5"))

company_metrics = []
for company in test_df['Company'].unique():
    df_c = test_df[test_df['Company'] == company].copy().sort_values('Dates')
    X_c = scaler.transform(df_c[features])
    y_c = df_c['target'].values
    y_c_scaled = target_scaler.transform(y_c.reshape(-1, 1)).flatten()
    X_c_seq, y_c_seq = create_sequences(X_c, y_c_scaled, WINDOW_SIZE)
    X_c_seq = X_c_seq.reshape(X_c_seq.shape[0], -1)

    if len(X_c_seq) == 0: continue
    y_pred_scaled = best_model_xgb.predict(X_c_seq)# .flatten()
    y_pred = target_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()
    y_true = target_scaler.inverse_transform(np.array(y_c_seq).reshape(-1, 1)).flatten()

    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    acc = np.mean(np.sign(y_true) == np.sign(y_pred))
    company_metrics.append({'Company': company, 'MAE': mae, 'R2': r2, 'Direction_Accuracy': acc})

    plt.figure(figsize=(10, 4))
    plt.plot(y_true, label="Real", linewidth=1.5, color='royalblue')
    plt.plot(y_pred, label="Predicted", linewidth=1.5, color='darkorange')
    plt.fill_between(range(len(y_true)), y_true, y_pred, color='gray', alpha=0.2)
    plt.title(f"Mixed XGBoost Prediction vs Real - {company}", fontsize=12)
    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(result_dir, f"{company}_final_mixed_prediction.png"))
    plt.close()

company_metrics

findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font family 'KaiTi' not found.
findfont: Font f

[{'Company': 'ADBE',
  'MAE': 0.004674061759190569,
  'R2': -0.43597550305026145,
  'Direction_Accuracy': 0.3103448275862069},
 {'Company': 'AMD',
  'MAE': 0.005255132660860113,
  'R2': -1.2041719228589707,
  'Direction_Accuracy': 0.41379310344827586},
 {'Company': 'AMZN',
  'MAE': 0.003146258194630102,
  'R2': -0.7003322206690361,
  'Direction_Accuracy': 0.5086206896551724},
 {'Company': 'AVGO',
  'MAE': 0.0037545232123876024,
  'R2': -0.002769336555036528,
  'Direction_Accuracy': 0.6594827586206896},
 {'Company': 'CRM',
  'MAE': 0.003338101533299344,
  'R2': -0.26337147889928825,
  'Direction_Accuracy': 0.4698275862068966},
 {'Company': 'CSCO',
  'MAE': 0.0024606528624477432,
  'R2': -0.28434250742981715,
  'Direction_Accuracy': 0.5646551724137931},
 {'Company': 'GOOGL',
  'MAE': 0.0037124256122161485,
  'R2': -0.6270670197890167,
  'Direction_Accuracy': 0.47413793103448276},
 {'Company': 'IBM',
  'MAE': 0.0030703362174026886,
  'R2': -0.11943505374384644,
  'Direction_Accuracy': 0.5

In [20]:
# 保存评估结果
pd.DataFrame(company_metrics).to_csv(os.path.join(result_dir, "company_final_mixed_metrics.csv"), index=False)
print(f"最终模型与图像结果已保存：{result_dir}")

最终模型与图像结果已保存：./result/


In [22]:
!jupyter nbconvert --to script m5609.ipynb

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr