In [15]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [25]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# --- Data ---------------------------------------------------------------
# Read the dataset and name the column the model should predict.
df = pd.read_csv("data_with_spread.csv")
target = "monthly_return"

# Features: every numeric column except the target. Labels: the target column.
X = df.select_dtypes(include=["number"]).drop(columns=[target], errors="ignore")
y = df[target]

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Model --------------------------------------------------------------
# fit() returns the estimator itself, so construction and fitting can be chained.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# In-sample and out-of-sample predictions.
y_train_pred_rf = rf_model.predict(X_train)
y_test_pred_rf = rf_model.predict(X_test)

# --- Evaluation ---------------------------------------------------------
# Same three metrics on both splits: the training scores show how well the
# forest fits the data it saw, the test scores how well it generalises.
mse_rf_train, mae_rf_train, r2_rf_train = (
    mean_squared_error(y_train, y_train_pred_rf),
    mean_absolute_error(y_train, y_train_pred_rf),
    r2_score(y_train, y_train_pred_rf),
)
mse_rf_test, mae_rf_test, r2_rf_test = (
    mean_squared_error(y_test, y_test_pred_rf),
    mean_absolute_error(y_test, y_test_pred_rf),
    r2_score(y_test, y_test_pred_rf),
)

print("\n📊 Random Forest (General Model) - Training Set:")
print(f"MSE: {mse_rf_train:.4f}, MAE: {mae_rf_train:.4f}, R²: {r2_rf_train:.4f}")

print("\n📊 Random Forest (General Model) - Test Set:")
print(f"MSE: {mse_rf_test:.4f}, MAE: {mae_rf_test:.4f}, R²: {r2_rf_test:.4f}")


📊 Random Forest (General Model) - Training Set:
MSE: 0.0052, MAE: 0.0426, R²: 0.8961

📊 Random Forest (General Model) - Test Set:
MSE: 0.0283, MAE: 0.1129, R²: 0.3216


In [26]:
# Train one Random Forest per sector and collect train/test metrics for each.
#
# Reads from earlier cells: `df` (full dataset) and `target` (label column name).
# Produces:
#   sector_models      - sector name -> {"rf_model": fitted forest, "X_train": training features}
#   sector_results_rf  - list of per-sector metric records
#   rf_results_df      - the same records as a DataFrame (also written to CSV)
sector_models = {}
sector_results_rf = []

# Rows with a missing Sector are skipped: NaN never compares equal to itself, so
# filtering on it would select zero rows and train_test_split would raise.
for sector in df["Sector"].dropna().unique():
    print(f"\n🌍 Training Random Forest for Sector: {sector}")

    # Filter data for this sector
    sector_data = df[df["Sector"] == sector]

    # Features are the numeric columns minus the target; labels are the target column.
    X_sector = sector_data.select_dtypes(include=["number"]).drop(columns=[target], errors="ignore")
    y_sector = sector_data[target]

    # Sector-specific names are used for the split so this loop does not overwrite
    # the general model's X_train / X_test / y_train / y_test from the earlier cell.
    X_train_sector, X_test_sector, y_train_sector, y_test_sector = train_test_split(
        X_sector, y_sector, test_size=0.2, random_state=42
    )

    # Train Random Forest Model
    rf_sector = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_sector.fit(X_train_sector, y_train_sector)

    # Keep the fitted model and its training features for the feature-importance cell.
    sector_models[sector] = {"rf_model": rf_sector, "X_train": X_train_sector}

    # Predictions - in-sample and out-of-sample
    y_train_pred_rf_sector = rf_sector.predict(X_train_sector)
    y_test_pred_rf_sector = rf_sector.predict(X_test_sector)

    # Evaluate Model - In-Sample (Training)
    mse_rf_train_sector = mean_squared_error(y_train_sector, y_train_pred_rf_sector)
    mae_rf_train_sector = mean_absolute_error(y_train_sector, y_train_pred_rf_sector)
    r2_rf_train_sector = r2_score(y_train_sector, y_train_pred_rf_sector)

    # Evaluate Model - Out-of-Sample (Testing)
    mse_rf_test_sector = mean_squared_error(y_test_sector, y_test_pred_rf_sector)
    mae_rf_test_sector = mean_absolute_error(y_test_sector, y_test_pred_rf_sector)
    r2_rf_test_sector = r2_score(y_test_sector, y_test_pred_rf_sector)

    print(f"\n📊 Sector: {sector} → Training Set:")
    print(f"MSE: {mse_rf_train_sector:.4f}, MAE: {mae_rf_train_sector:.4f}, R²: {r2_rf_train_sector:.4f}")

    print(f"\n📊 Sector: {sector} → Test Set:")
    print(f"MSE: {mse_rf_test_sector:.4f}, MAE: {mae_rf_test_sector:.4f}, R²: {r2_rf_test_sector:.4f}")

    # Store results
    sector_results_rf.append({
        "Sector": sector,
        "MSE_Train": mse_rf_train_sector,
        "MAE_Train": mae_rf_train_sector,
        "R²_Train": r2_rf_train_sector,
        "MSE_Test": mse_rf_test_sector,
        "MAE_Test": mae_rf_test_sector,
        "R²_Test": r2_rf_test_sector
    })

# Build the results frame once from the collected records, then persist it.
rf_results_df = pd.DataFrame(sector_results_rf)
rf_results_df.to_csv("random_forest_sector_results_initial.csv", index=False)

print("\n✅ 📂 Sector-Specific Random Forest Models Trained & Stored Successfully!")



🌍 Training Random Forest for Sector: Health Care

📊 Sector: Health Care → Training Set:
MSE: 0.0043, MAE: 0.0470, R²: 0.8610

📊 Sector: Health Care → Test Set:
MSE: 0.0269, MAE: 0.1281, R²: 0.1680

🌍 Training Random Forest for Sector: Information Technology

📊 Sector: Information Technology → Training Set:
MSE: 0.0056, MAE: 0.0519, R²: 0.8967

📊 Sector: Information Technology → Test Set:
MSE: 0.0410, MAE: 0.1411, R²: 0.1858

🌍 Training Random Forest for Sector: Financials

📊 Sector: Financials → Training Set:
MSE: 0.0026, MAE: 0.0351, R²: 0.9281

📊 Sector: Financials → Test Set:
MSE: 0.0221, MAE: 0.1003, R²: 0.4796

🌍 Training Random Forest for Sector: Consumer Staples

📊 Sector: Consumer Staples → Training Set:
MSE: 0.0030, MAE: 0.0394, R²: 0.8547

📊 Sector: Consumer Staples → Test Set:
MSE: 0.0135, MAE: 0.0888, R²: -0.1087

🌍 Training Random Forest for Sector: Industrials

📊 Sector: Industrials → Training Set:
MSE: 0.0120, MAE: 0.0485, R²: 0.8640

📊 Sector: Industrials → Test Set:
M

In [28]:
import pandas as pd

# Combine the general-model metrics with the per-sector metrics into one table,
# save it to CSV, and read it back as a sanity check.

# Fail early with a clear message if the sector-training cell has not been run.
if "rf_results_df" not in globals():
    raise ValueError("🚨 `rf_results_df` is missing. Make sure the sector-based results are stored correctly.")

# One-row frame for the general model. Its keys are the same column names the
# sector loop uses for its records, so the two frames concatenate without any
# renaming step.
general_rf_results = {
    "Sector": ["General Model"],
    "MSE_Train": [mse_rf_train],
    "MAE_Train": [mae_rf_train],
    "R²_Train": [r2_rf_train],
    "MSE_Test": [mse_rf_test],
    "MAE_Test": [mae_rf_test],
    "R²_Test": [r2_rf_test]
}

general_rf_df = pd.DataFrame(general_rf_results)

# General model first, then every sector; ignore_index gives a fresh 0..n-1 index.
full_rf_results_df = pd.concat([general_rf_df, rf_results_df], ignore_index=True)

# Save all results
full_rf_results_df.to_csv("random_forest_all_results_initial.csv", index=False)

print("\n✅ 📂 Initial Random Forest results (with in-sample and out-of-sample metrics) saved successfully!")

# Reload the file that was just written to confirm it round-trips.
df_check = pd.read_csv("random_forest_all_results_initial.csv")
print("\n📊 Merged Results Preview:")
print(df_check.head())



✅ 📂 Initial Random Forest results (with in-sample and out-of-sample metrics) saved successfully!

📊 Merged Results Preview:
                   Sector  MSE_Train  MAE_Train  R²_Train  MSE_Test  MAE_Test  \
0           General Model   0.005176   0.042584  0.896052  0.028256  0.112904   
1             Health Care   0.004335   0.047012  0.860980  0.026873  0.128071   
2  Information Technology   0.005640   0.051891  0.896744  0.040978  0.141100   
3              Financials   0.002629   0.035142  0.928080  0.022137  0.100258   
4        Consumer Staples   0.003013   0.039352  0.854687  0.013550  0.088757   

    R²_Test  
0  0.321601  
1  0.167992  
2  0.185823  
3  0.479578  
4 -0.108669  


In [17]:
# Rank the general model's features from most to least important.
general_feature_importance = pd.Series(
    rf_model.feature_importances_, index=X.columns
).sort_values(ascending=False)

# Bar chart of the 15 highest-ranked features, drawn on an explicit figure/axes pair.
top_general_features = general_feature_importance.head(15)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_general_features.values, y=top_general_features.index, ax=ax)
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Features")
ax.set_title("Top 15 Feature Importances - General Model")

# Write the chart to disk, then close the figure so it is not rendered inline.
fig.savefig("general_model_feature_importance.png", bbox_inches="tight")
plt.close(fig)

print("\n✅ 📂 General Model Feature Importance Graph Saved Successfully!")


✅ 📂 General Model Feature Importance Graph Saved Successfully!


In [19]:
import pandas as pd

# Rank the general model's features from most to least important.
general_feature_importance = pd.Series(
    rf_model.feature_importances_, index=X.columns
).sort_values(ascending=False)

# Two-column table: the feature names become "Feature", the scores "Importance".
general_feature_importance_df = (
    general_feature_importance
    .rename_axis("Feature")
    .reset_index(name="Importance")
)

# Persist the ranking without the positional index.
general_feature_importance_df.to_csv("general_feature_importance.csv", index=False)

print("\n✅ 📂 General Model Feature Importance CSV Saved Successfully!")


✅ 📂 General Model Feature Importance CSV Saved Successfully!


In [14]:
import os

# Output directory for the per-sector charts; created on first use.
save_path = "sector_feature_importance"
os.makedirs(save_path, exist_ok=True)

# sector name -> feature importances sorted from highest to lowest
sector_feature_importance = {}

# One chart per trained sector model.
for sector, model_data in sector_models.items():
    print(f"\n📊 Computing Feature Importance for Sector: {sector}")

    # The stored training features are only needed here for their column names.
    rf_sector = model_data["rf_model"]
    X_sector = model_data["X_train"]

    # Rank this sector's features and keep the full ranking for later analysis.
    feature_importance_sector = pd.Series(
        rf_sector.feature_importances_, index=X_sector.columns
    ).sort_values(ascending=False)
    sector_feature_importance[sector] = feature_importance_sector

    # Bar chart of the 15 highest-ranked features on an explicit figure/axes pair.
    top_sector_features = feature_importance_sector.head(15)
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=top_sector_features.values, y=top_sector_features.index, ax=ax)
    ax.set_xlabel("Feature Importance")
    ax.set_ylabel("Features")
    ax.set_title(f"Top 15 Feature Importances - {sector}")

    # Save, then close the figure so open figures do not pile up across iterations.
    fig.savefig(f"{save_path}/{sector}_feature_importance.png", bbox_inches="tight")
    plt.close(fig)

print("\n✅ 📂 Feature Importance Graphs for Each Sector Saved Successfully!")



📊 Computing Feature Importance for Sector: Health Care

📊 Computing Feature Importance for Sector: Information Technology

📊 Computing Feature Importance for Sector: Financials

📊 Computing Feature Importance for Sector: Consumer Staples

📊 Computing Feature Importance for Sector: Industrials

📊 Computing Feature Importance for Sector: Utilities

📊 Computing Feature Importance for Sector: Materials

📊 Computing Feature Importance for Sector: Real Estate

📊 Computing Feature Importance for Sector: Consumer Discretionary

📊 Computing Feature Importance for Sector: Energy

📊 Computing Feature Importance for Sector: Communication Services

✅ 📂 Feature Importance Graphs for Each Sector Saved Successfully!
