# Data Loading and Merging
Following the approach in train_1, we add Chen-Zimmerman data as features by merging it with the main dataset.

In [9]:
import pandas as pd

# Load main data
main_df = pd.read_csv("merged_compustat_crsp.csv")

# Load Chen-Zimmerman data
cz_df = pd.read_csv("Chen-Zimmerman-PredictorLSretWide-feds-2021037.csv")

In [10]:
# Convert date columns (if present)
main_df['date'] = pd.to_datetime(main_df['date'], errors='coerce')
cz_df['date'] = pd.to_datetime(cz_df['date'], errors='coerce')

In [11]:
# Merge on common keys (e.g., 'date' and 'PERMNO')
# Adjust keys as needed for your data
merged_df = pd.merge(main_df, cz_df, on=['date'], how='left')

In [12]:
# Columns to exclude from features
exclude_columns = [
    'datadate', 'gvkey', 'cusip', 'cusip8', 'cik',
    'PERMNO', 'PERMCO', 'CUSIP', 'HdrCUSIP',
    'Ticker', 'TradingSymbol', 'MthCalDt', 'sprtrn',
    'SICCD', 'NAICS', 'date'
]

# Split into features (X) and target (y)
X = merged_df.drop(columns=exclude_columns + ['MthRet']).fillna(0)
y = merged_df['MthRet']

In [13]:
# Remove rows where the target variable y is NaN
valid_mask = ~y.isna()

X = X[valid_mask]
y = y[valid_mask]

merged_df = merged_df[valid_mask]

In [14]:
# Create train, validation, and test masks based on date
train_mask = merged_df['date'] < '2017-01-01'
val_mask = (merged_df['date'] >= '2017-01-01') & (merged_df['date'] < '2021-01-01')
test_mask = merged_df['date'] >= '2021-01-01'

# Split the data
X_train = X[train_mask]
y_train = y[train_mask]

X_val = X[val_mask]
y_val = y[val_mask]

X_test = X[test_mask]
y_test = y[test_mask]

# Model Training and Evaluation
Compare the effect of adding Chen-Zimmerman features using the same methods as in train_1.

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Validation prediction
y_val_pred = lr_model.predict(X_val)

# Evaluation metrics
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
r2 = r2_score(y_val, y_val_pred)

print(f"Validation RMSE (Linear Regression): {rmse:.5f}")
print(f"Validation R² (Linear Regression): {r2:.5f}")

Validation RMSE (Linear Regression): 394483170127.46637
Validation R² (Linear Regression): -8310907151909766983319552.00000


In [16]:
from sklearn.neural_network import MLPRegressor

# MLP Regressor (same structure as in train_1)
mlp = MLPRegressor(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42
)
mlp.fit(X_train, y_train)

# Validation prediction
y_val_pred = mlp.predict(X_val)

# Evaluation metrics
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
r2 = r2_score(y_val, y_val_pred)

print(f"Validation RMSE (MLP): {rmse:.5f}")
print(f"Validation R² (MLP): {r2:.5f}")

Validation RMSE (MLP): 0.52163
Validation R² (MLP): -13.53160
