In [1]:
import pandas as pd

# Load dataset with spread return included
df = pd.read_csv("data_with_spread.csv")

# Check first few rows
print(df.head())

  fiscalDateEnding reportedCurrency_x   totalAssets  totalCurrentAssets  \
0       2018-07-31                USD  8.349000e+09        3.667000e+09   
1       2019-01-31                USD  8.952000e+09        3.712000e+09   
2       2019-04-30                USD  9.022000e+09        3.812000e+09   
3       2019-07-31                USD  8.625000e+09        3.457000e+09   
4       2019-10-31                USD  9.452000e+09        3.189000e+09   

   cashAndCashEquivalentsAtCarryingValue  cashAndShortTermInvestments  \
0                           2.131000e+09                 2.131000e+09   
1                           2.057000e+09                 2.057000e+09   
2                           2.155000e+09                 2.155000e+09   
3                           1.765000e+09                 1.765000e+09   
4                           1.382000e+09                 1.407000e+09   

     inventory  currentNetReceivables  totalNonCurrentAssets  \
0  623000000.0            733000000.0         

In [2]:
# Check available columns
print(df.columns)

# Check unique sectors
print("Unique Sectors:", df["Sector"].unique())

# Ensure only numerical columns are used for ML
df_numeric = df.select_dtypes(include=["number"])

print(df_numeric.head())

Index(['fiscalDateEnding', 'reportedCurrency_x', 'totalAssets',
       'totalCurrentAssets', 'cashAndCashEquivalentsAtCarryingValue',
       'cashAndShortTermInvestments', 'inventory', 'currentNetReceivables',
       'totalNonCurrentAssets', 'propertyPlantEquipment',
       ...
       'monthly_return', 'volatility_6m', 'momentum_3m', 'momentum_6m',
       'momentum_12m', 'momentum_rank', 'portfolio', 'spread_return_x',
       'spread_return_y', 'spread_return'],
      dtype='object', length=121)
Unique Sectors: ['Health Care' 'Information Technology' 'Financials' 'Consumer Staples'
 'Industrials' 'Utilities' 'Materials' 'Real Estate'
 'Consumer Discretionary' 'Energy' 'Communication Services']
    totalAssets  totalCurrentAssets  cashAndCashEquivalentsAtCarryingValue  \
0  8.349000e+09        3.667000e+09                           2.131000e+09   
1  8.952000e+09        3.712000e+09                           2.057000e+09   
2  9.022000e+09        3.812000e+09                           2

In [3]:
# Define target variable (Y) - Monthly Returns
target = "monthly_return"
y = df[target]

# Drop non-relevant columns (e.g., Ticker, Date)
X = df_numeric.drop(columns=[target, "Ticker"], errors="ignore")

# Check dataset shape
print(f"Feature Matrix (X) Shape: {X.shape}")
print(f"Target Variable (Y) Shape: {y.shape}")

Feature Matrix (X) Shape: (7046, 111)
Target Variable (Y) Shape: (7046,)


In [4]:
from sklearn.model_selection import train_test_split

# Create a dictionary to store sector-specific data
sector_models = {}

# Loop through each sector and train separate models
for sector in df["Sector"].unique():
    print(f"📊 Training Model for Sector: {sector}")

    # Filter dataset for the current sector
    df_sector = df[df["Sector"] == sector]

    # Define X and y
    X_sector = df_sector.select_dtypes(include=["number"]).drop(columns=[target, "Ticker"], errors="ignore")
    y_sector = df_sector[target]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_sector, y_sector, test_size=0.2, random_state=42)

    # Store training data for model training
    sector_models[sector] = {
        "X_train": X_train, "X_test": X_test,
        "y_train": y_train, "y_test": y_test
    }

📊 Training Model for Sector: Health Care
📊 Training Model for Sector: Information Technology
📊 Training Model for Sector: Financials
📊 Training Model for Sector: Consumer Staples
📊 Training Model for Sector: Industrials
📊 Training Model for Sector: Utilities
📊 Training Model for Sector: Materials
📊 Training Model for Sector: Real Estate
📊 Training Model for Sector: Consumer Discretionary
📊 Training Model for Sector: Energy
📊 Training Model for Sector: Communication Services


In [60]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Store results
results = []

for sector, data in sector_models.items():
    print(f"📊 Training Linear Regression for Sector: {sector}")

    # Train model
    lr = LinearRegression()
    lr.fit(data["X_train"], data["y_train"])

    # Make predictions (In-Sample)
    y_train_pred = lr.predict(data["X_train"])

    # Make predictions (Out-of-Sample)
    y_test_pred = lr.predict(data["X_test"])

    # Evaluate model - In-Sample
    mse_train = mean_squared_error(data["y_train"], y_train_pred)
    mae_train = mean_absolute_error(data["y_train"], y_train_pred)
    r2_train = r2_score(data["y_train"], y_train_pred)

    # Evaluate model - Out-of-Sample
    mse_test = mean_squared_error(data["y_test"], y_test_pred)
    mae_test = mean_absolute_error(data["y_test"], y_test_pred)
    r2_test = r2_score(data["y_test"], y_test_pred)

    # Store results
    results.append([sector, mse_train, mae_train, r2_train, mse_test, mae_test, r2_test])

# Convert results to DataFrame
results_df = pd.DataFrame(results, columns=["Sector", "MSE_Train", "MAE_Train", "R²_Train", "MSE_Test", "MAE_Test", "R²_Test"])
print(results_df)

📊 Training Linear Regression for Sector: Health Care
📊 Training Linear Regression for Sector: Information Technology
📊 Training Linear Regression for Sector: Financials
📊 Training Linear Regression for Sector: Consumer Staples
📊 Training Linear Regression for Sector: Industrials
📊 Training Linear Regression for Sector: Utilities
📊 Training Linear Regression for Sector: Materials
📊 Training Linear Regression for Sector: Real Estate
📊 Training Linear Regression for Sector: Consumer Discretionary
📊 Training Linear Regression for Sector: Energy
📊 Training Linear Regression for Sector: Communication Services
                    Sector  MSE_Train  MAE_Train  R²_Train   MSE_Test  \
0              Health Care   0.020964   0.107004  0.327640  18.503915   
1   Information Technology   0.034047   0.132153  0.376671   0.046128   
2               Financials   0.023173   0.110077  0.366081   0.037486   
3         Consumer Staples   0.009064   0.071286  0.562814   0.261582   
4              Industria

In [61]:
# Train on entire dataset (ignoring sector differences)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lr_general = LinearRegression()
lr_general.fit(X_train, y_train)

# Predictions - In-Sample
y_train_pred_general = lr_general.predict(X_train)

# Predictions - Out-of-Sample
y_test_pred_general = lr_general.predict(X_test)

# Evaluate General Model - In-Sample
mse_general_train = mean_squared_error(y_train, y_train_pred_general)
mae_general_train = mean_absolute_error(y_train, y_train_pred_general)
r2_general_train = r2_score(y_train, y_train_pred_general)

# Evaluate General Model - Out-of-Sample
mse_general_test = mean_squared_error(y_test, y_test_pred_general)
mae_general_test = mean_absolute_error(y_test, y_test_pred_general)
r2_general_test = r2_score(y_test, y_test_pred_general)

print(f"\n📊 General Model - Training Set:")
print(f"MSE: {mse_general_train:.4f}, MAE: {mae_general_train:.4f}, R²: {r2_general_train:.4f}")

print(f"\n📊 General Model - Test Set:")
print(f"MSE: {mse_general_test:.4f}, MAE: {mae_general_test:.4f}, R²: {r2_general_test:.4f}")



📊 General Model - Training Set:
MSE: 0.0446, MAE: 0.1370, R²: 0.1040

📊 General Model - Test Set:
MSE: 0.0412, MAE: 0.1383, R²: 0.0111


In [62]:
import pandas as pd

# General model results
general_results = pd.DataFrame([["General Model", mse_general_train, mae_general_train, r2_general_train,
                                 mse_general_test, mae_general_test, r2_general_test]],
                               columns=["Sector", "MSE_Train", "MAE_Train", "R²_Train", "MSE_Test", "MAE_Test", "R²_Test"])

# Concatenate sector-specific results with general model
comparison_df = pd.concat([results_df, general_results], ignore_index=True)

# Save as CSV
comparison_df.to_csv("linear_sector_vs_general_comparison.csv", index=False)

print("✅ Linear Regression results saved as 'linear_sector_vs_general_comparison.csv'.")


✅ Linear Regression results saved as 'linear_sector_vs_general_comparison.csv'.


First part of ML Above - Linear model with just base parameters

Conclusions are that some of the sector based linear models outperform the general model, but it isn't reliable yet.
Additionally, some of the sector based models are really bad and have crazy errors, so, I will need to fix that.

I will do some testing to figure out what can be done to improve the R^2 and MAE for some of the sectoral models

In [21]:
# Convert R² to numeric to avoid sorting issues
comparison_df["R²"] = pd.to_numeric(comparison_df["R²"])

# Get the 3 worst sectors based on R² (most negative)
worst_r2_sectors = comparison_df.nsmallest(3, "R²")

print("📉 Worst Performing Sectors (R² Lowest to Highest):")
print(worst_r2_sectors)

📉 Worst Performing Sectors (R² Lowest to Highest):
                   Sector      MSE     MAE        R²
0             Health Care  18.5039  0.4629 -571.8933
3        Consumer Staples   0.2616  0.1934  -20.4031
8  Consumer Discretionary   1.3930  0.3609  -20.3180


In [22]:
# Count the number of observations per sector
sector_counts = df["Sector"].value_counts()
print("📊 Number of Data Points Per Sector:")
print(sector_counts)

📊 Number of Data Points Per Sector:
Sector
Financials                1181
Industrials               1135
Health Care                912
Information Technology     876
Consumer Discretionary     600
Utilities                  503
Real Estate                458
Consumer Staples           405
Materials                  376
Energy                     342
Communication Services     258
Name: count, dtype: int64


In [27]:
for sector in worst_r2_sectors["Sector"]:
    print(f"\n📊 Feature Correlation for {sector}:\n")

    # Filter data for the current sector
    sector_data = df[df["Sector"] == sector]

    # Ensure only numeric columns are selected for correlation
    sector_numeric = sector_data.select_dtypes(include=["number"])

    # Compute correlation matrix
    correlation_matrix = sector_numeric.corr()

    # Check if "monthly_return" exists in correlation matrix
    if "monthly_return" in correlation_matrix:
        # Show top correlated features with `monthly_return`
        print(correlation_matrix["monthly_return"].sort_values(ascending=False).head(10))
    else:
        print(f"⚠️ 'monthly_return' column not found in correlation matrix for {sector}.")


📊 Feature Correlation for Health Care:

monthly_return                       1.000000
Close                                0.139721
Low                                  0.134044
High                                 0.109304
Open                                 0.099428
surprise                             0.095337
proceedsFromIssuanceOfCommonStock    0.075736
momentum_rank                        0.063583
momentum_6m                          0.052286
assetTurnover                        0.049684
Name: monthly_return, dtype: float64

📊 Feature Correlation for Consumer Staples:

monthly_return                  1.000000
changeInOperatingLiabilities    0.156021
Close                           0.135261
Low                             0.131765
operatingCashFlowMargin         0.124999
surprise                        0.107429
returnOnAssets                  0.101319
comprehensiveIncomeNetOfTax     0.100476
High                            0.092556
momentum_rank                   0.083948
Name: 

In [29]:
for sector, data in sector_models.items():
    train_r2 = lr.score(data["X_train"], data["y_train"])
    test_r2 = lr.score(data["X_test"], data["y_test"])

    print(f"📊 Sector: {sector} → Training R²: {train_r2:.4f}, Testing R²: {test_r2:.4f}")

📊 Sector: Health Care → Training R²: -4082290.4860, Testing R²: -196272696.9845
📊 Sector: Information Technology → Training R²: -27660.8296, Testing R²: -298.5996
📊 Sector: Financials → Training R²: -620155181.0069, Testing R²: -907313397.7721
📊 Sector: Consumer Staples → Training R²: -493407.2034, Testing R²: -78.9968
📊 Sector: Industrials → Training R²: -79004875.6581, Testing R²: -341681.1849
📊 Sector: Utilities → Training R²: -129737290.3218, Testing R²: -11.0392
📊 Sector: Materials → Training R²: -3263995.2436, Testing R²: -27.6758
📊 Sector: Real Estate → Training R²: -111977409.8102, Testing R²: -3946756216.4277
📊 Sector: Consumer Discretionary → Training R²: -107326684.0786, Testing R²: -8294.1615
📊 Sector: Energy → Training R²: -8686161.3475, Testing R²: -42563398.8725
📊 Sector: Communication Services → Training R²: 0.5447, Testing R²: -2.0677


Ok, so, the issue may be overfitting, do I will try and use Lasso and then Ridge to try and see improved results. However, I think that the main issue is not linearity, which I will try and fix later

In [63]:
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

# Store results
lasso_results = []

for sector, data in sector_models.items():
    print(f"📊 Training Lasso Regression for Sector: {sector}")

    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(data["X_train"])
    X_test_scaled = scaler.transform(data["X_test"])

    # Train Lasso with Cross-Validation to Find Best Alpha
    lasso = LassoCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1, 10], cv=5)
    lasso.fit(X_train_scaled, data["y_train"])

    # Best alpha value
    best_alpha = lasso.alpha_

    # Make predictions - In-Sample
    y_train_pred_lasso = lasso.predict(X_train_scaled)

    # Make predictions - Out-of-Sample
    y_test_pred_lasso = lasso.predict(X_test_scaled)

    # Evaluate Model - In-Sample
    mse_train = mean_squared_error(data["y_train"], y_train_pred_lasso)
    mae_train = mean_absolute_error(data["y_train"], y_train_pred_lasso)
    r2_train = r2_score(data["y_train"], y_train_pred_lasso)

    # Evaluate Model - Out-of-Sample
    mse_test = mean_squared_error(data["y_test"], y_test_pred_lasso)
    mae_test = mean_absolute_error(data["y_test"], y_test_pred_lasso)
    r2_test = r2_score(data["y_test"], y_test_pred_lasso)

    # Store results
    lasso_results.append([sector, mse_train, mae_train, r2_train, mse_test, mae_test, r2_test, best_alpha])

# Convert results to DataFrame
lasso_results_df = pd.DataFrame(lasso_results, columns=["Sector", "MSE_Train", "MAE_Train", "R²_Train", "MSE_Test", "MAE_Test", "R²_Test", "Best Alpha"])
print(lasso_results_df)


📊 Training Lasso Regression for Sector: Health Care


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


📊 Training Lasso Regression for Sector: Information Technology


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(


📊 Training Lasso Regression for Sector: Financials


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(


📊 Training Lasso Regression for Sector: Consumer Staples


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


📊 Training Lasso Regression for Sector: Industrials


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


📊 Training Lasso Regression for Sector: Utilities


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


📊 Training Lasso Regression for Sector: Materials


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


📊 Training Lasso Regression for Sector: Real Estate


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


📊 Training Lasso Regression for Sector: Consumer Discretionary


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


📊 Training Lasso Regression for Sector: Energy


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


📊 Training Lasso Regression for Sector: Communication Services


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


                    Sector  MSE_Train  MAE_Train  R²_Train  MSE_Test  \
0              Health Care   0.022520   0.110687  0.277721  8.231302   
1   Information Technology   0.036356   0.136044  0.334394  0.046237   
2               Financials   0.024451   0.112451  0.331126  0.035862   
3         Consumer Staples   0.017186   0.097986  0.171008  0.012337   
4              Industrials   0.088364   0.151334  0.000000  0.042282   
5                Utilities   0.027370   0.100150  0.000000  0.013855   
6                Materials   0.042622   0.156203  0.000000  0.038051   
7              Real Estate   0.029025   0.126637  0.000000  0.034480   
8   Consumer Discretionary   0.049365   0.161245  0.161657  0.880659   
9                   Energy   0.068019   0.178424  0.193128  0.083556   
10  Communication Services   0.044250   0.155816  0.000000  0.056706   

    MAE_Test     R²_Test  Best Alpha  
0   0.342279 -253.846465       0.001  
1   0.149119    0.081341       0.001  
2   0.131722    0.

In [67]:
# Standardize Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Lasso with Cross-Validation
lasso_general = LassoCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1, 10], cv=5)
lasso_general.fit(X_train_scaled, y_train)

# Get best alpha
best_alpha_general = lasso_general.alpha_

# Make Predictions - In-Sample
y_train_pred_lasso_general = lasso_general.predict(X_train_scaled)

# Make Predictions - Out-of-Sample
y_test_pred_lasso_general = lasso_general.predict(X_test_scaled)

# Evaluate General Lasso Model - In-Sample
mse_lasso_general_train = mean_squared_error(y_train, y_train_pred_lasso_general)
mae_lasso_general_train = mean_absolute_error(y_train, y_train_pred_lasso_general)
r2_lasso_general_train = r2_score(y_train, y_train_pred_lasso_general)

# Evaluate General Lasso Model - Out-of-Sample
mse_lasso_general_test = mean_squared_error(y_test, y_test_pred_lasso_general)
mae_lasso_general_test = mean_absolute_error(y_test, y_test_pred_lasso_general)
r2_lasso_general_test = r2_score(y_test, y_test_pred_lasso_general)

# Store General Model Results in a List
general_lasso_results = [
    ["General Model", mse_lasso_general_train, mae_lasso_general_train, r2_lasso_general_train,
     mse_lasso_general_test, mae_lasso_general_test, r2_lasso_general_test, best_alpha_general]
]

# Convert to DataFrame
general_lasso_results_df = pd.DataFrame(general_lasso_results,
                                        columns=["Sector", "MSE_Train", "MAE_Train", "R²_Train",
                                                 "MSE_Test", "MAE_Test", "R²_Test", "Best Alpha"])

# Display Results
print("\n📊 General Lasso Regression Results:")
print(general_lasso_results_df)

# Save to CSV
general_lasso_results_df.to_csv("general_lasso_results.csv", index=False)
print("✅ General Lasso results saved as 'general_lasso_results.csv'.")


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(



📊 General Lasso Regression Results:
          Sector  MSE_Train  MAE_Train  R²_Train  MSE_Test  MAE_Test  R²_Test  \
0  General Model   0.045451   0.138054  0.087158  0.038804  0.138251  0.06834   

   Best Alpha  
0       0.001  
✅ General Lasso results saved as 'general_lasso_results.csv'.


In [69]:
import pandas as pd

# Convert `general_lasso_results` to a DataFrame (if not already)
if not isinstance(general_lasso_results, pd.DataFrame):
    general_lasso_results_df = pd.DataFrame(general_lasso_results,
                                            columns=["Sector", "MSE_Train", "MAE_Train", "R²_Train",
                                                     "MSE_Test", "MAE_Test", "R²_Test", "Best Alpha"])
else:
    general_lasso_results_df = general_lasso_results

# Ensure both DataFrames exist
if "lasso_results_df" not in globals():
    raise ValueError("🚨 `lasso_results_df` is missing. Make sure the sector-specific Lasso results are stored correctly.")

# Merge sector-specific Lasso results with general model
lasso_comparison_df = pd.concat([lasso_results_df, general_lasso_results_df], ignore_index=True)

# Format numbers for better readability
for col in ["MSE_Train", "MAE_Train", "R²_Train", "MSE_Test", "MAE_Test", "R²_Test", "Best Alpha"]:
    lasso_comparison_df[col] = lasso_comparison_df[col].apply(lambda x: f"{x:.6f}")

# Save as CSV
lasso_comparison_df.to_csv("lasso_sector_vs_general.csv", index=False)

# Display results
print("\n✅ 📂 Lasso Regression results saved successfully!")
print(lasso_comparison_df)



✅ 📂 Lasso Regression results saved successfully!
                    Sector MSE_Train MAE_Train  R²_Train  MSE_Test  MAE_Test  \
0              Health Care  0.022520  0.110687  0.277721  8.231302  0.342279   
1   Information Technology  0.036356  0.136044  0.334394  0.046237  0.149119   
2               Financials  0.024451  0.112451  0.331126  0.035862  0.131722   
3         Consumer Staples  0.017186  0.097986  0.171008  0.012337  0.088547   
4              Industrials  0.088364  0.151334  0.000000  0.042282  0.145759   
5                Utilities  0.027370  0.100150  0.000000  0.013855  0.087539   
6                Materials  0.042622  0.156203  0.000000  0.038051  0.151391   
7              Real Estate  0.029025  0.126637  0.000000  0.034480  0.130853   
8   Consumer Discretionary  0.049365  0.161245  0.161657  0.880659  0.264638   
9                   Energy  0.068019  0.178424  0.193128  0.083556  0.206180   
10  Communication Services  0.044250  0.155816  0.000000  0.056706  0.

Now, lets compare Lasso vs the base linear model

We will analyze the differences in performance here

In [71]:
import pandas as pd

# Ensure sector-specific Lasso and Linear results exist
if "results_df" not in globals():
    raise ValueError("🚨 `results_df` (Linear Regression results) is missing.")
if "lasso_results_df" not in globals():
    raise ValueError("🚨 `lasso_results_df` (Lasso Regression results) is missing.")

# 🛠️ Merge sector-specific Linear & Lasso results
comparison_df = results_df.merge(lasso_results_df, on="Sector", suffixes=("_Linear", "_Lasso"))

# Convert general model results to DataFrame if stored as lists
if not isinstance(general_results, pd.DataFrame):
    general_results_df = pd.DataFrame(general_results,
                                      columns=["Sector", "MSE_Train", "MAE_Train", "R²_Train",
                                               "MSE_Test", "MAE_Test", "R²_Test"])
else:
    general_results_df = general_results

if not isinstance(general_lasso_results, pd.DataFrame):
    general_lasso_results_df = pd.DataFrame(general_lasso_results,
                                            columns=["Sector", "MSE_Train", "MAE_Train", "R²_Train",
                                                     "MSE_Test", "MAE_Test", "R²_Test", "Best Alpha"])
else:
    general_lasso_results_df = general_lasso_results

# Add a column to identify general model results
general_results_df["Sector"] = "General Model"
general_lasso_results_df["Sector"] = "General Model"

# Merge general model results
general_comparison_df = general_results_df.merge(general_lasso_results_df, on="Sector", suffixes=("_Linear", "_Lasso"))

# Append general model results to sector comparison
comparison_df = pd.concat([comparison_df, general_comparison_df], ignore_index=True)

# Format numbers for better readability
for col in ["MSE_Train_Linear", "MSE_Test_Linear", "MAE_Train_Linear", "MAE_Test_Linear", "R²_Train_Linear", "R²_Test_Linear",
            "MSE_Train_Lasso", "MSE_Test_Lasso", "MAE_Train_Lasso", "MAE_Test_Lasso", "R²_Train_Lasso", "R²_Test_Lasso", "Best Alpha"]:
    comparison_df[col] = comparison_df[col].apply(lambda x: f"{x:.6f}")

# Save comparison results to CSV
comparison_df.to_csv("lasso_vs_linear_comparison.csv", index=False)

# Display formatted table
print("\n✅ 📂 Lasso vs. Linear Regression results saved successfully!")
print(comparison_df)



✅ 📂 Lasso vs. Linear Regression results saved successfully!
                    Sector MSE_Train_Linear MAE_Train_Linear R²_Train_Linear  \
0              Health Care         0.020964         0.107004        0.327640   
1   Information Technology         0.034047         0.132153        0.376671   
2               Financials         0.023173         0.110077        0.366081   
3         Consumer Staples         0.009064         0.071286        0.562814   
4              Industrials         0.040323         0.141815        0.543673   
5                Utilities         0.021266         0.109180        0.223017   
6                Materials         0.022225         0.112640        0.478555   
7              Real Estate         0.011184         0.083230        0.614698   
8   Consumer Discretionary         0.035562         0.140059        0.396075   
9                   Energy         0.039391         0.140137        0.532727   
10  Communication Services         0.020149         0.09789

In [73]:
import pandas as pd

# Ensure R² values are numeric
comparison_df["R²_Train_Linear"] = pd.to_numeric(comparison_df["R²_Train_Linear"], errors="coerce")
comparison_df["R²_Test_Linear"] = pd.to_numeric(comparison_df["R²_Test_Linear"], errors="coerce")
comparison_df["R²_Train_Lasso"] = pd.to_numeric(comparison_df["R²_Train_Lasso"], errors="coerce")
comparison_df["R²_Test_Lasso"] = pd.to_numeric(comparison_df["R²_Test_Lasso"], errors="coerce")

# Compute the difference in R² (Lasso - Linear) for both in-sample & out-of-sample
comparison_df["R²_Train Difference"] = comparison_df["R²_Train_Lasso"] - comparison_df["R²_Train_Linear"]
comparison_df["R²_Test Difference"] = comparison_df["R²_Test_Lasso"] - comparison_df["R²_Test_Linear"]

# Format numbers for better readability
for col in ["R²_Train_Linear", "R²_Test_Linear", "R²_Train_Lasso", "R²_Test_Lasso",
            "R²_Train Difference", "R²_Test Difference"]:
    comparison_df[col] = comparison_df[col].apply(lambda x: f"{x:.6f}")

# Save the table to CSV
comparison_df.to_csv("r2_difference_comparison.csv", index=False)

# Display the table
print("\n✅ 📂 R² Differences (Lasso vs. Linear) saved successfully!")
print(comparison_df[["Sector", "R²_Train_Linear", "R²_Test_Linear",
                     "R²_Train_Lasso", "R²_Test_Lasso",
                     "R²_Train Difference", "R²_Test Difference"]])



✅ 📂 R² Differences (Lasso vs. Linear) saved successfully!
                    Sector R²_Train_Linear R²_Test_Linear R²_Train_Lasso  \
0              Health Care        0.327640    -571.893267       0.277721   
1   Information Technology        0.376671       0.083508       0.334394   
2               Financials        0.366081       0.118752       0.331126   
3         Consumer Staples        0.562814     -20.403111       0.171008   
4              Industrials        0.543673      -0.951346       0.000000   
5                Utilities        0.223017      -1.358016       0.000000   
6                Materials        0.478555      -2.417344       0.000000   
7              Real Estate        0.614698      -4.862215       0.000000   
8   Consumer Discretionary        0.396075     -20.317981       0.161657   
9                   Energy        0.532727      -1.334859       0.193128   
10  Communication Services        0.544658      -2.067705       0.000000   
11           General Model   

Now, I am going to do the Ridge Regression Model to see if it improves performance



In [74]:
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Store results
ridge_results = []

# Define range of alpha values for cross-validation
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]

# Loop through each sector and train Ridge Regression
for sector, data in sector_models.items():
    print(f"📊 Training Ridge Regression for Sector: {sector}")

    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(data["X_train"])
    X_test_scaled = scaler.transform(data["X_test"])

    # Train Ridge with Cross-Validation to Find Best Alpha
    ridge = RidgeCV(alphas=alphas, store_cv_values=True)
    ridge.fit(X_train_scaled, data["y_train"])

    # Best alpha value
    best_alpha = ridge.alpha_
    print(f"✅ Best Alpha for {sector}: {best_alpha}")

    # Make predictions - In-Sample
    y_train_pred_ridge = ridge.predict(X_train_scaled)

    # Make predictions - Out-of-Sample
    y_test_pred_ridge = ridge.predict(X_test_scaled)

    # Evaluate Model - In-Sample
    mse_train = mean_squared_error(data["y_train"], y_train_pred_ridge)
    mae_train = mean_absolute_error(data["y_train"], y_train_pred_ridge)
    r2_train = r2_score(data["y_train"], y_train_pred_ridge)

    # Evaluate Model - Out-of-Sample
    mse_test = mean_squared_error(data["y_test"], y_test_pred_ridge)
    mae_test = mean_absolute_error(data["y_test"], y_test_pred_ridge)
    r2_test = r2_score(data["y_test"], y_test_pred_ridge)

    # Store results
    ridge_results.append([sector, mse_train, mae_train, r2_train, mse_test, mae_test, r2_test, best_alpha])

# Convert results to DataFrame
ridge_results_df = pd.DataFrame(ridge_results, columns=["Sector", "MSE_Train", "MAE_Train", "R²_Train",
                                                         "MSE_Test", "MAE_Test", "R²_Test", "Best Alpha"])
print(ridge_results_df)


📊 Training Ridge Regression for Sector: Health Care
✅ Best Alpha for Health Care: 1.0
📊 Training Ridge Regression for Sector: Information Technology
✅ Best Alpha for Information Technology: 100.0
📊 Training Ridge Regression for Sector: Financials
✅ Best Alpha for Financials: 1.0
📊 Training Ridge Regression for Sector: Consumer Staples
✅ Best Alpha for Consumer Staples: 100.0
📊 Training Ridge Regression for Sector: Industrials
✅ Best Alpha for Industrials: 100.0
📊 Training Ridge Regression for Sector: Utilities
✅ Best Alpha for Utilities: 10.0
📊 Training Ridge Regression for Sector: Materials
✅ Best Alpha for Materials: 100.0
📊 Training Ridge Regression for Sector: Real Estate
✅ Best Alpha for Real Estate: 100.0
📊 Training Ridge Regression for Sector: Consumer Discretionary
✅ Best Alpha for Consumer Discretionary: 100.0
📊 Training Ridge Regression for Sector: Energy
✅ Best Alpha for Energy: 100.0
📊 Training Ridge Regression for Sector: Communication Services
✅ Best Alpha for Communicati

In [75]:
# Standardize Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Ridge with Cross-Validation
ridge_general = RidgeCV(alphas=alphas, store_cv_values=True)
ridge_general.fit(X_train_scaled, y_train)

# Get best alpha
best_alpha_general = ridge_general.alpha_
print(f"✅ Best Alpha for General Model: {best_alpha_general}")

# Make Predictions - In-Sample
y_train_pred_ridge_general = ridge_general.predict(X_train_scaled)

# Make Predictions - Out-of-Sample
y_test_pred_ridge_general = ridge_general.predict(X_test_scaled)

# Evaluate General Ridge Model - In-Sample
mse_ridge_general_train = mean_squared_error(y_train, y_train_pred_ridge_general)
mae_ridge_general_train = mean_absolute_error(y_train, y_train_pred_ridge_general)
r2_ridge_general_train = r2_score(y_train, y_train_pred_ridge_general)

# Evaluate General Ridge Model - Out-of-Sample
mse_ridge_general_test = mean_squared_error(y_test, y_test_pred_ridge_general)
mae_ridge_general_test = mean_absolute_error(y_test, y_test_pred_ridge_general)
r2_ridge_general_test = r2_score(y_test, y_test_pred_ridge_general)

# Store General Model Results in a List
general_ridge_results = [
    ["General Model", mse_ridge_general_train, mae_ridge_general_train, r2_ridge_general_train,
     mse_ridge_general_test, mae_ridge_general_test, r2_ridge_general_test, best_alpha_general]
]

# Convert to DataFrame
general_ridge_results_df = pd.DataFrame(general_ridge_results,
                                        columns=["Sector", "MSE_Train", "MAE_Train", "R²_Train",
                                                 "MSE_Test", "MAE_Test", "R²_Test", "Best Alpha"])

# Display Results
print("\n📊 General Ridge Regression Results:")
print(general_ridge_results_df)


✅ Best Alpha for General Model: 100.0

📊 General Ridge Regression Results:
          Sector  MSE_Train  MAE_Train  R²_Train  MSE_Test  MAE_Test  \
0  General Model   0.045701    0.13918   0.08215  0.038927  0.139553   

    R²_Test  Best Alpha  
0  0.065395       100.0  


In [76]:
# Merge sector-specific Ridge results with general model
ridge_comparison_df = pd.concat([ridge_results_df, general_ridge_results_df], ignore_index=True)

# Format numbers for better readability
for col in ["MSE_Train", "MSE_Test", "MAE_Train", "MAE_Test", "R²_Train", "R²_Test", "Best Alpha"]:
    ridge_comparison_df[col] = ridge_comparison_df[col].apply(lambda x: f"{x:.6f}")

# Save as CSV
ridge_comparison_df.to_csv("ridge_sector_vs_general.csv", index=False)

# Display results
print("\n✅ 📂 Ridge Regression results saved successfully!")
print(ridge_comparison_df)



✅ 📂 Ridge Regression results saved successfully!
                    Sector MSE_Train MAE_Train  R²_Train   MSE_Test  MAE_Test  \
0              Health Care  0.021143  0.107833  0.321883  18.995195  0.463167   
1   Information Technology  0.041825  0.145689  0.234268   0.051823  0.160547   
2               Financials  0.023335  0.110060  0.361654   0.037248  0.134292   
3         Consumer Staples  0.014672  0.090602  0.292302   0.013974  0.091318   
4              Industrials  0.056790  0.140975  0.357321   0.052062  0.160332   
5                Utilities  0.012148  0.078724  0.556173   0.015663  0.094546   
6                Materials  0.033061  0.138658  0.224324   0.044032  0.151376   
7              Real Estate  0.019537  0.108030  0.326885   0.036726  0.138423   
8   Consumer Discretionary  0.044762  0.153567  0.239829   2.070910  0.325423   
9                   Energy  0.065059  0.175243  0.228248   0.092940  0.219302   
10  Communication Services  0.035097  0.133362  0.206848   

In [77]:
import pandas as pd

# Ensure both DataFrames exist before merging
if "lasso_results_df" not in globals():
    raise ValueError("🚨 `lasso_results_df` (Lasso Regression results) is missing.")
if "ridge_results_df" not in globals():
    raise ValueError("🚨 `ridge_results_df` (Ridge Regression results) is missing.")

# Merge Lasso and Ridge results on Sector
comparison_df = lasso_results_df.merge(ridge_results_df, on="Sector", suffixes=("_Lasso", "_Ridge"))

# Convert R² values to numeric (handling potential errors)
for col in ["R²_Train_Lasso", "R²_Test_Lasso", "R²_Train_Ridge", "R²_Test_Ridge"]:
    comparison_df[col] = pd.to_numeric(comparison_df[col], errors="coerce")

# Compute the difference in R² (Ridge - Lasso) for both in-sample & out-of-sample
comparison_df["R²_Train Difference"] = comparison_df["R²_Train_Ridge"] - comparison_df["R²_Train_Lasso"]
comparison_df["R²_Test Difference"] = comparison_df["R²_Test_Ridge"] - comparison_df["R²_Test_Lasso"]

# Format numbers for better readability
for col in ["R²_Train_Lasso", "R²_Test_Lasso", "R²_Train_Ridge", "R²_Test_Ridge",
            "R²_Train Difference", "R²_Test Difference"]:
    comparison_df[col] = comparison_df[col].apply(lambda x: f"{x:.6f}" if pd.notnull(x) else "NaN")

# Save to CSV
comparison_df.to_csv("r2_difference_lasso_vs_ridge.csv", index=False)

# Display table
print("\n✅ 📂 R² Differences (Lasso vs. Ridge) saved successfully!")
print(comparison_df[["Sector", "R²_Train_Lasso", "R²_Test_Lasso",
                     "R²_Train_Ridge", "R²_Test_Ridge",
                     "R²_Train Difference", "R²_Test Difference"]])



✅ 📂 R² Differences (Lasso vs. Ridge) saved successfully!
                    Sector R²_Train_Lasso R²_Test_Lasso R²_Train_Ridge  \
0              Health Care       0.277721   -253.846465       0.321883   
1   Information Technology       0.334394      0.081341       0.234268   
2               Financials       0.331126      0.156931       0.361654   
3         Consumer Staples       0.171008     -0.009410       0.292302   
4              Industrials       0.000000     -0.010823       0.357321   
5                Utilities       0.000000     -0.080267       0.556173   
6                Materials       0.000000     -0.029009       0.224324   
7              Real Estate       0.000000     -0.015954       0.326885   
8   Consumer Discretionary       0.161657    -12.477685       0.239829   
9                   Energy       0.193128     -0.034909       0.228248   
10  Communication Services       0.000000     -0.012437       0.206848   

   R²_Test_Ridge R²_Train Difference R²_Test Differen

Now, I am going to train the Non-Linear Models, to see if I can get improvements comapre to the linear models, I am done with the Linear Models for now