In [1]:
############################################################
# 0. Problem Definition & Setup
# ----------------------------------------------------------
# Task: Predict/understand firms' dividend behavior (dv, div_dummy)
#       using Compustat balance sheet, income statement and
#       cash flow variables for panel data (gvkey, fyear).
#
# This script implements:
#   - Data cleaning (formats, duplicates, missing values, outliers)
#   - Transformations (standardization, log)
#   - Feature engineering (ratios, cash-flow features, lags)
#   - Filter-based feature selection (correlation, VIF, chi-square)
############################################################

############################################################
# 0. Setup and Initial Checks + dtype fixes
############################################################
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# --- Temporal split configuration -------------------------------------------
# Rows whose label_year is <= this value form the train/validation pool;
# rows with a later label_year form the test set.
TRAIN_CUTOFF_LABEL_YEAR = 2022
# Number of most recent label-years of the pool that are held out as validation.
VAL_YEARS = 1

# --- Cross-validation configuration -----------------------------------------
# Presumably the number of rolling, year-based folds inside the training pool;
# not referenced by the cells visible here.
N_SPLITS_TIME_CV = 5

# --- Winsorization configuration --------------------------------------------
# Presumably the lower/upper quantiles for outlier clipping; not referenced by
# the cells visible here.
WINSOR_LOWER_Q, WINSOR_UPPER_Q = 0.01, 0.99
# Load the raw extract in one pass (low_memory=False) so pandas infers a single
# dtype per column instead of guessing chunk by chunk.
df = pd.read_csv("data.csv", low_memory=False)

# 0.1 Dates
# ---------
# Parse datadate into datetime; values that cannot be parsed become NaT.
df['datadate'] = pd.to_datetime(df['datadate'], errors='coerce')

# 0.2 Price columns
# -----------------
# Force prcc_c / prcc_f to numeric when present; unparseable entries become NaN.
for price_col in ('prcc_c', 'prcc_f'):
    if price_col not in df.columns:
        continue
    df[price_col] = pd.to_numeric(df[price_col], errors='coerce')

# 0.3 Panel keys
# --------------
# Store the integer-valued keys with the nullable Int64 dtype so missing values
# do not force a float representation. These three columns are required.
for key_col in ('gvkey', 'fyear', 'ismod'):
    df[key_col] = df[key_col].astype('Int64')

# 0.4 Format flags
# ----------------
# Convert the format/consolidation flags to categoricals when present.
for flag_col in ('indfmt', 'datafmt', 'consol'):
    if flag_col not in df.columns:
        continue
    df[flag_col] = df[flag_col].astype('category')

# The company name (conm) is intentionally left as a plain object/string column.

# Quick dtype overview after the conversions above.
print(df.dtypes)
dtype_summaries = (
    ("Numeric columns:", [np.number]),
    ("Categorical columns:", ['category']),
    ("Datetime columns:", ['datetime']),
)
for summary_label, dtype_filter in dtype_summaries:
    print(summary_label, len(df.select_dtypes(include=dtype_filter).columns))

FileNotFoundError: [Errno 2] No such file or directory: 'data.csv'

In [None]:

# Summary statistics for the datetime-typed columns; the trailing expression is
# what the notebook displays.
datetime_columns = df.select_dtypes(include=['datetime'])
datetime_columns.describe()

In [None]:


############################################################
# 1. Data Cleaning (Chapter 2: Data Cleaning)
# ----------------------------------------------------------
# Section outline:
# - Standardize formats (numeric, categorical, date)
# - Remove duplicates
# - Diagnose missing values
# - Basic missing-value imputation
# - Outlier detection & handling (winsorization)
# - Transformations (standardization, log)
# This cell performs the first two steps.
############################################################

# 1.1 Standardize formats
# -----------------------

# Make sure datadate is a datetime column (unparseable values become NaT).
if 'datadate' in df.columns:
    df['datadate'] = pd.to_datetime(df['datadate'], errors='coerce')

# Integer firm identifier: one code per distinct gvkey, in order of first appearance.
firm_codes, _ = pd.factorize(df['gvkey'])
df['firm_id'] = firm_codes

# 1.2 Order the panel and remove duplicates
# -----------------------------------------
# Sort by firm and fiscal year, then drop rows that are exact duplicates across
# all columns; the index is renumbered after each step.
df = (
    df.sort_values(['firm_id', 'fyear'])
      .reset_index(drop=True)
      .drop_duplicates()
      .reset_index(drop=True)
)

# 1.3 Missing value diagnostics
# -----------------------------
# Build one record per column that has at least one missing value, then rank
# the columns by share of missing rows.
total_rows = len(df)
missing_counts = df.isna().sum()

missing_report = [
    {
        "variable": col,
        "dtype": str(df[col].dtype),
        "n_missing": n_miss,
        "pct_missing": n_miss / total_rows * 100,
    }
    for col, n_miss in missing_counts.items()
    if n_miss > 0
]

# Passing the column names explicitly keeps the frame well-formed (and sortable)
# even when no column has missing values and missing_report is empty.
missing_df = pd.DataFrame(
    missing_report,
    columns=["variable", "dtype", "n_missing", "pct_missing"],
).sort_values("pct_missing", ascending=False)

print("******** Missing Values Report (top 20) ********")
print(missing_df.head(20))


In [10]:
# =============================================================================
# 2. TRAIN / VALIDATION / TEST SPLIT (Temporal, based on label_year)
# =============================================================================
# label_year is defined as fyear + 1. Rows with label_year up to and including
# TRAIN_CUTOFF_LABEL_YEAR form the training pool; later label-years form the
# test set. The last VAL_YEARS distinct label-years of the pool are held out as
# validation.
#
# NOTE(review): rows with a missing fyear get a missing label_year; they appear
# to fall into neither the pool nor the test set - confirm this is intended.
# NOTE(review): train/val/test are copies taken here, so columns that later
# cells add to df (log_*, z_*, ratios, lags) are not part of these copies unless
# the split is redone afterwards.
df['label_year'] = df['fyear'] + 1

train_pool = df[df['label_year'] <= TRAIN_CUTOFF_LABEL_YEAR].copy()
test = df[df['label_year'] > TRAIN_CUTOFF_LABEL_YEAR].copy()

if train_pool.empty:
    raise ValueError("Training pool is empty after applying label_year cutoff. Check TRAIN_CUTOFF_LABEL_YEAR.")
if VAL_YEARS < 0:
    raise ValueError("VAL_YEARS must be non-negative.")

unique_label_years = np.sort(train_pool['label_year'].dropna().unique())
# Slice from an explicit start index: a plain [-VAL_YEARS:] would select every
# year when VAL_YEARS == 0. If fewer than VAL_YEARS years exist, all of them are
# used for validation (same fallback as before).
val_start = max(len(unique_label_years) - VAL_YEARS, 0)
val_years = unique_label_years[val_start:]

is_val_row = train_pool['label_year'].isin(val_years)
val = train_pool[is_val_row].copy()
train = train_pool[~is_val_row].copy()

print("----- Split Summary (based on label_year = fyear+1) -----")
print(f"Train label_year max: {train['label_year'].max()} | n={len(train)}")
# Convert to plain ints so the list prints as [2022] rather than [np.int64(2022)].
print(f"Val   label_years: {[int(y) for y in val_years]} | n={len(val)}")
print(f"Test  label_year min: {test['label_year'].min()} | n={len(test)}")

----- Split Summary (based on label_year = fyear+1) -----
Train label_year max: 2021 | n=48458
Val   label_years: [np.int64(2022)] | n=6851
Test  label_year min: 2023 | n=19696


In [None]:
############################################################
# 1.6 Log Transformations
############################################################
# Add log_<col> for size-related variables to reduce skewness. The log is only
# taken where the value is strictly positive; zero, negative, missing or
# non-numeric entries yield NaN.
# 'sale' is included so that the 'log_sale' entry of the feature list used in
# the feature-selection cell actually exists.
for col in ['at', 'mkvalt', 'sale']:
    if col in df.columns:
        values = pd.to_numeric(df[col], errors='coerce').astype('float64')
        # where() blanks out non-positive values before the log, so np.log never
        # sees an invalid argument.
        df[f"log_{col}"] = np.log(values.where(values > 0))

In [None]:


# 1.7 Standardization (z-scores)
# ------------------------------
# Add z_<col> = (col - mean) / std for the listed variables.
# The mean and (population, ddof=0) standard deviation are estimated on the
# training rows only and then applied to every row, so validation/test-period
# values do not leak into the scaling parameters.
z_vars = ['ib', 'at', 'dltt', 'che', 're', 'seq', 'xrd', 'dv', 'sale', 'ni',
          'oancf', 'ivncf', 'fincf']

# train is the training subset created by the temporal-split cell; it shares
# its index labels with df.
is_train_row = df.index.isin(train.index)

for col in z_vars:
    if col in df.columns:
        train_values = df.loc[is_train_row, col]
        mean_val = train_values.mean()
        std_val = train_values.std(ddof=0)
        if pd.notna(std_val) and std_val > 0:
            df[f"z_{col}"] = (df[col] - mean_val) / std_val
        else:
            # Constant or entirely missing in the training rows: the z-score is undefined.
            df[f"z_{col}"] = np.nan


In [None]:


############################################################
# 2. Feature Engineering (Chapter 2: Feature Engineering)
# ----------------------------------------------------------
# - Financial ratios (profitability, leverage, liquidity, payout, innovation)
# - Cash-flow based features
# - Lagged features for time-series structure
############################################################

# Each entry is (new column, numerator column, denominator column). The ratio is
# kept only where the denominator is strictly positive and is NaN otherwise.
positive_denominator_ratios = [
    # Profitability
    ('roa', 'ib', 'at'),                # income before extraord. / assets
    ('profit_margin', 'ni', 'sale'),
    # Leverage
    ('leverage', 'dltt', 'at'),
    # Liquidity
    ('cash_ratio', 'che', 'at'),
    ('current_ratio', 'act', 'lct'),
    # Retained earnings capacity
    ('re_ratio', 're', 'seq'),
    # Innovation / investment
    ('rd_ratio', 'xrd', 'at'),
    ('capx_ratio', 'capx', 'at'),
    # Cash-flow structure (operating / investing / financing CF scaled by assets)
    ('cf_oancf_at', 'oancf', 'at'),
    ('cf_ivncf_at', 'ivncf', 'at'),
    ('cf_fincf_at', 'fincf', 'at'),
]

for ratio_name, numer_col, denom_col in positive_denominator_ratios:
    denom_is_positive = df[denom_col] > 0
    df[ratio_name] = np.where(denom_is_positive, df[numer_col] / df[denom_col], np.nan)

# Dividend behavior
# div_dummy: 1.0 when dv > 0, 0.0 when dv is observed but not positive, NaN when dv is missing.
dv_observed = df['dv'].notna()
dv_positive = (df['dv'] > 0).astype(int)
df['div_dummy'] = np.where(dv_observed, dv_positive, np.nan)

# Payout relative to ni: defined whenever ni is non-zero (so it can be negative).
ni_nonzero = df['ni'] != 0
df['payout_ratio_ni'] = np.where(ni_nonzero, df['dv'] / df['ni'], np.nan)

# Payout relative to assets: defined only for strictly positive assets.
assets_positive = df['at'] > 0
df['payout_ratio_at'] = np.where(assets_positive, df['dv'] / df['at'], np.nan)

# 2.1 Lagged Features (time-series / panel aspect)
# ------------------------------------------------
# lag_<var> holds the firm's value of <var> from the previous fiscal year.
# This relies on df being ordered by firm_id and fyear (done by the earlier
# sort_values call).
lag_base_vars = ['roa', 'leverage', 'cash_ratio',
                 'cf_oancf_at', 'cf_ivncf_at', 'cf_fincf_at', 'payout_ratio_ni']

# groupby(...).shift(1) is positional: it returns the previous *row* of the firm,
# which is only the previous *year* when fyear advanced by exactly one. Rows
# following a gap (or a repeated fyear) therefore get NaN instead of a value
# taken from the wrong year.
prev_fyear = df.groupby('firm_id')['fyear'].shift(1)
has_prior_year = (df['fyear'] - prev_fyear).eq(1).fillna(False).astype(bool)

for var in lag_base_vars:
    if var in df.columns:
        df[f"lag_{var}"] = df.groupby('firm_id')[var].shift(1).where(has_prior_year)

# Keep lags only for fyear >= 2023, if desired (align with Stata logic).
# NOTE(review): with TRAIN_CUTOFF_LABEL_YEAR = 2022 and label_year = fyear + 1,
# every train/validation row has fyear <= 2021, so this blanks all lag values
# for those rows - confirm that this is really intended.
lag_columns = [c for c in df.columns if c.startswith('lag_')]
df.loc[df['fyear'] < 2023, lag_columns] = np.nan

In [None]:

############################################################
# 3. Feature Selection Prep (Chapter 2: Feature Selection)
# ----------------------------------------------------------
# Filter methods:
#   - Correlation analysis for numeric features vs target dv
#   - VIF for multicollinearity diagnostics
#   - Chi-square for categorical vs target (sic_clean vs div_dummy)
############################################################

requested_features = [
    'roa', 'leverage', 'cash_ratio', 'current_ratio', 're_ratio',
    'rd_ratio', 'capx_ratio', 'profit_margin', 'log_at', 'log_sale',
    'cf_oancf_at', 'cf_ivncf_at', 'cf_fincf_at', 'payout_ratio_ni',
    'payout_ratio_at'
]

# Keep only the features that exist in df, but say which ones were dropped so a
# feature that was never engineered does not disappear unnoticed.
feature_vars = [v for v in requested_features if v in df.columns]
missing_features = [v for v in requested_features if v not in df.columns]
if missing_features:
    print("Skipping features not found in df:", missing_features)

# 3.1 Correlation with target dv (filter method)
# ---------------------------------------------
# Pairwise Pearson correlations; only the dv row is reported.
# NOTE(review): payout_ratio_ni and payout_ratio_at are computed from dv itself,
# so their correlation with dv is not an independent signal - confirm whether
# they belong in this list.
print("----- Correlation of Features with Target Variable (dv) -----")
corr_matrix = df[['dv'] + feature_vars].corr()

for var in feature_vars:
    r = corr_matrix.loc['dv', var]
    print(f"{var:<20} r = {r: .4f}")

# 3.2 VIF (Variance Inflation Factor) – pure NumPy
# -----------------------------------------------
# Build the design matrix for the VIF computation: the selected feature columns,
# restricted to rows where every one of them is observed.
vif_vars = feature_vars
var_names = list(vif_vars)

X_df = df.loc[:, vif_vars].dropna(axis=0, how='any')
X = X_df.to_numpy()

print("\n----- VIF for selected features -----")

def compute_vif(X, j):
    """Variance inflation factor of column ``j`` of ``X``.

    Column ``j`` is regressed by ordinary least squares on an intercept plus all
    other columns of ``X``; the result is ``1 / (1 - R^2)`` of that regression.

    Parameters
    ----------
    X : array-like of shape (n_rows, n_features)
        Numeric design matrix; it is converted to a float array.
    j : int
        Index of the column whose VIF is computed.

    Returns
    -------
    float
        The VIF. ``np.inf`` when the column is perfectly explained by the
        intercept and/or the other columns (including a constant column, which
        is collinear with the intercept). ``np.nan`` when ``X`` has no rows.
    """
    X = np.asarray(X, dtype=float)
    if X.shape[0] == 0:
        # No observations: the regression is undefined.
        return np.nan

    y = X[:, j]
    ss_tot = np.sum((y - y.mean()) ** 2)
    if ss_tot == 0:
        # A constant column is reproduced exactly by the intercept alone.
        return np.inf

    X_other = np.delete(X, j, axis=1)
    X_other_const = np.column_stack([np.ones(X_other.shape[0]), X_other])
    beta, _, _, _ = np.linalg.lstsq(X_other_const, y, rcond=None)

    ss_res = np.sum((y - X_other_const @ beta) ** 2)
    r2 = 1.0 - ss_res / ss_tot
    if r2 >= 1:
        return np.inf
    return 1.0 / (1.0 - r2)

# Report one VIF per feature, in the same order as the columns of X.
for col_idx in range(len(var_names)):
    vif_val = compute_vif(X, col_idx)
    feature_label = var_names[col_idx]
    print(f"{feature_label:<20} VIF = {vif_val: .4f}")

# 3.3 Variability checks (IQR & Std. Dev.)
# ----------------------------------------
# For each feature, report the interquartile range and the sample standard
# deviation (ddof=1) of its non-missing values.
print("\n----- Variability Checks (IQR & Std. Dev.) -----")
for var in feature_vars:
    observed_values = df[var].dropna()
    if len(observed_values) == 0:
        # Nothing observed for this feature: print placeholders instead of numbers.
        print(f"{var:<20} IQR: NA, Std. Dev.: NA")
    else:
        lower_quartile = observed_values.quantile(0.25)
        upper_quartile = observed_values.quantile(0.75)
        spread_iqr = upper_quartile - lower_quartile
        spread_std = observed_values.std(ddof=1)
        print("----------------------------------------")
        print(f"{var} IQR: {spread_iqr:.6f}")
        print(f"{var} Std. Dev.: {spread_std:.6f}")

# 3.4 Chi-Square: sic_clean vs div_dummy (categorical vs target)
# --------------------------------------------------------------
# Pearson chi-square statistic computed by hand from the contingency table of
# sic_clean against div_dummy, using rows where both are observed.
# NOTE(review): sic_clean is not created in the cells shown here - confirm it is
# present in the input data or built elsewhere.
chi_df = df[['sic_clean', 'div_dummy']].dropna()
contingency_table = pd.crosstab(chi_df['sic_clean'], chi_df['div_dummy'])

observed = contingency_table.values
total = observed.sum()
row_sums = observed.sum(axis=1, keepdims=True)   # shape (n_rows, 1)
col_sums = observed.sum(axis=0, keepdims=True)   # shape (1, n_cols)

# Expected count under independence: (row total * column total) / grand total.
expected = row_sums * col_sums / total

cell_contributions = (observed - expected) ** 2 / expected
chi2_stat = cell_contributions.sum()

n_rows, n_cols = observed.shape
dof = (n_rows - 1) * (n_cols - 1)

print("\n----- Chi-Square Test: sic_clean vs div_dummy -----")
print("Chi2 statistic:", chi2_stat)
print("Degrees of freedom:", dof)
print("\nContingency table:")
print(contingency_table)

