In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import os

# Load the long-format company panel, parse the date column and order the rows
# by company, then chronologically within each company.
df = (
    pd.read_csv(r"C:\Users\29415\Desktop\金融计量学\Project\all_companies_long_sorted(1).csv")
    .assign(Dates=lambda frame: pd.to_datetime(frame['Dates']))
    .sort_values(['Company', 'Dates'])
)

# Tickers whose pre-2024 rows are used to fit the model; the full universe adds
# four more tickers that only ever appear in the 2024 evaluation window.
train_companies = ["ADBE", "AMD", "AMZN", "AVGO", "CSCO", "GOOGL", "IBM", "INTC", "META", "MSFT", "NVDA"]
all_companies = [*train_companies, "CRM", "ORCL", "QCOM", "TXN"]
in_universe = df['Company'].isin(all_companies)
df = df[in_universe]

# The selected factor names are stored as the index of the factor file.
factors_df = pd.read_csv(r"C:\Users\29415\Desktop\金融计量学\Project\selected_factors.csv", index_col=0)
features = factors_df.index.to_list()

# Daily log return of each company's PX_LAST series (first row per company is NaN).
df['log_return'] = df.groupby('Company')['PX_LAST'].transform(lambda x: np.log(x).diff())
# Cross-sectional benchmark: mean log return over the companies kept in df on each date.
df['market_return'] = df.groupby('Dates')['log_return'].transform('mean')
df['excess_return'] = df['log_return'] - df['market_return']

# Target for row t: the span-20 exponentially weighted mean of excess returns,
# read 10 rows ahead (shift -1 before smoothing, shift -9 after), clipped to
# [-0.3, 0.3].
# Both shifts must happen inside the per-company transform: df is ordered by
# Company then Dates, so shifting the flat column would fill the last 9 rows of
# every company with values taken from the next company's first rows. Done per
# company, those rows are NaN instead and are removed by the later dropna on
# 'target'.
df['target'] = (
    df.groupby('Company')['excess_return']
    .transform(lambda x: x.shift(-1).ewm(span=20).mean().shift(-9))
    .clip(-0.3, 0.3)
)

# Some selected factors are not raw columns of the data file; build each of
# them only when it was selected and is not already present.
missing_features = set(features).difference(df.columns)

if 'VIX_PE_interact' in missing_features:
    # Interaction term: VIX level times the price/earnings ratio.
    df['VIX_PE_interact'] = df['VIX Index'] * df['PE_RATIO']

if 'volatility_10d' in missing_features:
    # 10-row rolling standard deviation of PX_LAST, computed within each company.
    df['volatility_10d'] = (
        df.groupby('Company')['PX_LAST']
        .transform(lambda prices: prices.rolling(10).std())
    )

if 'return_lag1' in missing_features:
    # Previous row's log return of the same company.
    df['return_lag1'] = df.groupby('Company')['log_return'].shift(periods=1)

# Keep only rows where every model input and the target are available.
required_columns = [*features, 'target']
df = df.dropna(subset=required_columns)

# Training rows: training tickers only, calendar years up to and including 2023.
# Test rows: every remaining ticker in df (training or not), calendar year 2024.
# NOTE(review): the target looks several rows ahead, so the latest 2023
# training targets are built from early-2024 returns - confirm this overlap
# with the test window is acceptable.
row_year = df['Dates'].dt.year
is_train_company = df['Company'].isin(train_companies)
train_df = df[is_train_company & (row_year <= 2023)]
test_df = df[row_year == 2024]

# Both scalers are fitted on the training rows only and then applied to the
# test rows, so no test statistics enter the scaling.
scaler = StandardScaler()
scaler.fit(train_df[features])
X_train = scaler.transform(train_df[features])
X_test = scaler.transform(test_df[features])

# The target is rescaled to [-1, 1] based on the training range.
target_scaler = MinMaxScaler(feature_range=(-1, 1))
target_scaler.fit(train_df[['target']])
y_train = target_scaler.transform(train_df[['target']]).flatten()
y_test = target_scaler.transform(test_df[['target']]).flatten()

# Random forest hyperparameters, gathered in one place for readability.
rf_params = {
    'n_estimators': 200,       # number of trees in the forest
    'max_depth': 10,           # cap on the depth of each tree
    'min_samples_split': 5,    # minimum samples required to split a node
    'min_samples_leaf': 2,     # minimum samples required in a leaf
    'max_features': 'sqrt',    # features considered at each split
    'random_state': 42,        # fixed seed for reproducible fits
    'n_jobs': -1,              # use all available cores
    'verbose': 1,              # emit joblib progress messages
}
model = RandomForestRegressor(**rf_params)

# Fit on the standardised training features and the rescaled target.
model.fit(X_train, y_train)


# Everything this script writes goes under one base folder: the pickled model
# in one sub-folder, metrics and plots in another.
output_base_dir = r"C:\Users\29415\Desktop\金融计量学\Project\output"
model_path = os.path.join(output_base_dir, "final_mixed_model")
result_dir = os.path.join(output_base_dir, "final_mixed_company_result")

# Create both folders up front; existing folders are left untouched.
for directory in (model_path, result_dir):
    os.makedirs(directory, exist_ok=True)

# Persist the fitted forest so it can be reloaded without retraining.
import joblib
model_file = os.path.join(model_path, "random_forest_final_mixed_model.pkl")
joblib.dump(model, model_file)


# Pair every feature name with the importance reported by the fitted forest,
# rank from most to least important, and export the ranking.
importance_table = pd.DataFrame({
    'Feature': features,
    'Importance': model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)
importance_csv = os.path.join(result_dir, "feature_importance.csv")
feature_importance.to_csv(importance_csv, index=False)

# Evaluate the fitted model separately for each company in the test window:
# collect MAE, R2 and directional accuracy in original target units, and save
# one "real vs predicted" plot per company.
company_metrics = []
for company in test_df['Company'].unique():
    df_c = test_df[test_df['Company'] == company].sort_values('Dates')

    # Ground truth is the unscaled target column itself. Scaling it and then
    # inverse-scaling it again would only add floating-point noise.
    y_true = df_c['target'].to_numpy()

    # The model was trained on the scaled target, so predictions are mapped
    # back to original units before any metric is computed. Predictions and
    # truth come from the same rows, so their lengths always match.
    y_pred_scaled = model.predict(scaler.transform(df_c[features]))
    y_pred = target_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # Share of rows where prediction and truth have the same sign.
    acc = np.mean(np.sign(y_true) == np.sign(y_pred))
    company_metrics.append({'Company': company, 'MAE': mae, 'R2': r2, 'Direction_Accuracy': acc})

    # Use an explicit figure handle so exactly this figure is saved and closed.
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(y_true, label="Real", linewidth=1.5, color='royalblue')
    ax.plot(y_pred, label="Predicted", linewidth=1.5, color='darkorange')
    # Shade the gap between the two curves to make the error visible.
    ax.fill_between(range(len(y_true)), y_true, y_pred, color='gray', alpha=0.2)
    ax.set_title(f"Random Forest Prediction vs Real - {company}", fontsize=12)
    ax.legend()
    ax.grid(alpha=0.3)
    fig.tight_layout()
    fig.savefig(os.path.join(result_dir, f"{company}_final_mixed_prediction.png"))
    plt.close(fig)

pd.DataFrame(company_metrics).to_csv(os.path.join(result_dir, "company_final_mixed_metrics.csv"), index=False)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    0.5s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Do