# Chen-Zimmerman Data Processing
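This notebook processes the Chen-Zimmerman dataset: keeping observations dated 2005-01-01 or later, dropping columns with 30% or more missing values, winsorizing numeric columns at the 1st and 99th percentiles, removing low-variance and highly correlated columns, min-max normalizing numeric columns to [-1, 1], selecting the 20 highest-variance predictors, and saving the result to CSV.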

In [3]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Read the Chen-Zimmerman CSV from the working directory
cz = pd.read_csv("Chen-Zimmerman-PredictorLSretWide-feds-2021037.csv")

# Parse the date column; entries that cannot be parsed become NaT
cz['date'] = pd.to_datetime(cz['date'], errors='coerce')
# Keep rows dated 2005-01-01 or later (NaT rows compare False, so they are dropped too)
cz = cz.loc[cz['date'] >= '2005-01-01']

# Per-column share of missing values, measured on the date-filtered sample
missing_ratio = cz.isna().mean()
keep_mask = missing_ratio < 0.3
cz_filtered = cz.loc[:, keep_mask]
n_dropped_missing = (missing_ratio >= 0.3).sum()
print('Dropped columns due to high missing ratio:', n_dropped_missing)

# Winsorize numeric columns
def winsorize_df(df, lower_pct=0.01, upper_pct=0.99):
    """Return a copy of ``df`` with each numeric column clipped to its own quantiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    lower_pct : float, default 0.01
        Quantile used as the lower clipping bound of each numeric column.
    upper_pct : float, default 0.99
        Quantile used as the upper clipping bound of each numeric column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every numeric column has been clipped to
        ``[quantile(lower_pct), quantile(upper_pct)]``. Non-numeric columns
        are carried over unchanged.
    """
    result = df.copy()
    numeric_names = df.select_dtypes(include=[np.number]).columns
    for name in numeric_names:
        series = df[name]
        # Bounds are computed per column from the original (unclipped) data
        lo, hi = series.quantile(lower_pct), series.quantile(upper_pct)
        result[name] = series.clip(lo, hi)
    return result

# Clip every numeric column to its own quantile bounds (function defaults: 0.01 / 0.99)
cz_winsorized = winsorize_df(cz_filtered)

# Drop numeric columns whose variance falls below 1e-5.
# Only numeric dtypes are considered, so the datetime column is never a candidate.
variances = cz_winsorized.select_dtypes(include=[np.number]).var()
low_var_cols = variances.loc[variances.lt(1e-5)].index
cz_var_filtered = cz_winsorized.drop(columns=low_var_cols)
print('Dropped columns due to low variance:', len(low_var_cols))

# Drop one column from every highly correlated pair (|corr| > threshold)
threshold = 0.9
corr_matrix = cz_var_filtered.select_dtypes(include=[np.number]).corr().abs()

# Keep only the strict upper triangle so each pair is inspected once and the
# diagonal (self-correlation of 1) is ignored
upper_triangle = np.triu(corr_matrix, k=1)
upper_df = pd.DataFrame(upper_triangle, index=corr_matrix.index, columns=corr_matrix.columns)

# A column is dropped when it exceeds the threshold against any column that
# precedes it in column order
to_drop = upper_df.columns[(upper_df > threshold).any(axis=0)].tolist()
cz_corr_filtered = cz_var_filtered.drop(columns=to_drop)
print('Dropped columns due to high correlation:', len(to_drop))

# Rescale each numeric column linearly so its observed minimum maps to -1 and
# its observed maximum maps to 1. Note that 0 is the midpoint of that output
# range, not necessarily the image of an original value of 0.
cz_normalized = cz_corr_filtered.copy()

# Numeric columns only; a datetime 'date' column is not matched by np.number
numeric_cols = list(cz_normalized.select_dtypes(include=[np.number]).columns)

scaler = MinMaxScaler(feature_range=(-1, 1))
cz_normalized[numeric_cols] = scaler.fit_transform(cz_normalized[numeric_cols])

# Example predictor selection: keep the 20 columns with the largest variance after scaling
scaled_variances = cz_normalized[numeric_cols].var()
top_predictors = scaled_variances.sort_values(ascending=False).index[:20].tolist()

# Carry the date column along when it is present
if 'date' in cz_normalized.columns:
    selected_columns = top_predictors + ['date']
else:
    selected_columns = top_predictors

cz_selected = cz_normalized[selected_columns].reset_index(drop=True)
cz_selected.head()

Dropped columns due to high missing ratio: 7
Dropped columns due to low variance: 0
Dropped columns due to high correlation: 15


Unnamed: 0,dNoa,IntMom,Spinoff,OScore,DelCOA,IndIPO,OptionVolume1,AbnormalAccruals,CompositeDebtIssuance,ChNNCOA,...,ShareIss1Y,grcapx,BookLeverage,GP,RDAbility,ChInv,NOA,GrSaleToGrOverhead,TotalAccruals,date
0,-0.00771,-0.059838,-0.515194,0.044252,-0.260386,0.336454,-0.304321,-0.507803,0.206545,0.081388,...,0.572072,-0.631207,-0.44644,0.145601,-0.108376,-0.199546,-0.206052,-0.350475,-0.809379,2005-01-31
1,-0.283252,-0.01634,-0.374925,0.271221,-0.523036,0.112113,-0.500234,0.036703,0.463645,0.428868,...,0.29717,-0.686103,-0.230533,-0.039105,-0.008462,-0.615287,-0.734171,0.034236,-0.729258,2005-02-28
2,-0.289969,0.132446,0.565202,0.518543,-0.469466,0.137732,0.294001,-0.524254,-0.025251,-0.066436,...,0.693108,-0.328309,-0.433232,0.082817,0.675049,-0.257839,-0.537627,0.007722,-0.700574,2005-03-31
3,-0.518364,0.311541,-0.039902,-0.02439,-0.526198,0.338549,-0.225437,-0.591327,-0.363917,-0.5234,...,0.368097,-0.733788,-0.51247,-0.000417,-0.149699,-0.59772,-0.473162,-0.446894,-0.806965,2005-04-29
4,-0.484885,-0.314734,0.975985,-0.113398,-0.365344,-0.215064,-0.134237,-0.365664,-0.553105,-0.145688,...,0.069511,-0.072046,0.194487,0.255973,-0.407752,0.0217,-0.071783,0.663,-0.33062,2005-05-31


In [4]:
# Write the selected, normalized frame to disk in the working directory.
# index=False keeps the positional row index out of the CSV.
cz_selected.to_csv(path_or_buf="Chen-Zimmerman-processed.csv", index=False)
print("Saved processed Chen-Zimmerman data to Chen-Zimmerman-processed.csv")

Saved processed Chen-Zimmerman data to Chen-Zimmerman-processed.csv
