In [2]:
import pandas as pd
from scipy.stats.mstats import winsorize
import numpy as np
from tqdm import tqdm
import glob
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [4]:
# ---------------------------------
# Loading Firmquarter (PRisk) data
# ---------------------------------

# Load the firm-quarter CSV that carries the `permno` identifier
prisk_df = pd.read_csv("firmquarter_permno.csv")

# Drop rows without a permno, then store the identifier as a trimmed string
prisk_df = prisk_df.dropna(subset=['permno'])
prisk_df['permno'] = prisk_df['permno'].astype(str).str.strip()

# Count unique permnos in the PRisk file
unique_permnos_PRisk = prisk_df['permno'].nunique()
print(f"Unique permnos PRisk (cleaned): {unique_permnos_PRisk}")
# Output of a previous run:
# Unique permnos PRisk (cleaned): 5096

# ---------------------------------
# Loading Volatility Document
# ---------------------------------

vol_raw = pd.read_csv("vol_raw.csv", parse_dates=['date'])
print("vol_raw loaded from CSV")
unique_permnos_vol = vol_raw['PERMNO'].nunique()
# This count comes from vol_raw, not from the PRisk file; the message used to
# repeat the PRisk label, which made the two printed counts indistinguishable.
print(f"Unique permnos vol_raw: {unique_permnos_vol}")

# Output of a previous run (with the corrected label):
# vol_raw loaded from CSV
# Unique permnos vol_raw: 21651

Unique permnos PRisk (cleaned): 5096
vol_raw loaded from CSV
Unique permnos PRisk (cleaned): 21651


##### Working only with Firmquarter PERMNOS data extracted from vol_raw

In [5]:
# ---------------------------------
# Keep only the vol_raw rows whose PERMNO appears in the Firmquarter file.
# vol_raw is streamed in chunks so the full file is never held in memory.
# ---------------------------------

# PRisk side: normalize permno to an integer-like string (e.g. "10001")
permno_as_text = prisk_df['permno'].astype(float).astype(int).astype(str)
prisk_df['permno'] = permno_as_text
needed_permnos = set(prisk_df['permno'].unique())
print(f"Need volatility for {len(needed_permnos)} permnos")


# Volatility side: read 500k rows at a time and keep the matching rows
chunk_reader = pd.read_csv("vol_raw.csv", parse_dates=['date'], chunksize=500_000)
chunks = []

for piece in chunk_reader:
    # Compare PERMNO as a string, the same representation used for needed_permnos
    piece = piece.assign(PERMNO=piece['PERMNO'].astype(str))
    is_needed = piece['PERMNO'].isin(needed_permnos)
    chunks.append(piece.loc[is_needed].copy())

vol_raw_filtered = pd.concat(chunks, ignore_index=True)
print(f"Filtered vol_raw shape: {vol_raw_filtered.shape}")

# Persist the filtered rows so later cells can start from the smaller file
vol_raw_filtered.to_csv("vol_raw_needed.csv", index=False)
print("Saved: vol_raw_needed.csv")

# ---------------------------------
# Read the saved file back and count its distinct PERMNOs
# ---------------------------------

df = pd.read_csv("vol_raw_needed.csv").assign(
    PERMNO=lambda frame: frame['PERMNO'].astype(str)
)
unique_permnos = df['PERMNO'].unique()
print(f"Unique permnos: {len(unique_permnos)}")

Need volatility for 5096 permnos
Filtered vol_raw shape: (14939431, 4)
Saved: vol_raw_needed.csv
Unique permnos: 5096


##### Creating realized volatility as a variable

In [6]:
# -------------------------------
# Load the filtered daily-returns file and prepare it:
#   * PERMNO as string
#   * RET coerced to numeric (non-numeric entries become NaN) and scaled by 100
#   * rows without a usable RET dropped
#   * calendar quarter label derived from `date`, e.g. "2002Q1"
# -------------------------------
vol_raw = (
    pd.read_csv("vol_raw_needed.csv", parse_dates=['date'])
    .assign(
        PERMNO=lambda d: d['PERMNO'].astype(str),
        RET=lambda d: pd.to_numeric(d['RET'], errors='coerce') * 100,  # convert to percent
    )
    .dropna(subset=['RET'])
    .assign(quarter=lambda d: d['date'].dt.to_period('Q').astype(str).str.upper())
)

# -------------------------------
# Per PERMNO–quarter: number of daily observations and standard deviation of RET
# -------------------------------
agg = (
    vol_raw.groupby(['PERMNO', 'quarter'])['RET']
    .agg(['count', 'std'])
    .rename(columns={'count': 'n_obs', 'std': 'volatility_raw'})
    .reset_index()
)

# Keep only PERMNO–quarters with at least 60 daily observations
agg = agg.loc[agg['n_obs'].ge(60)].copy()

# Winsorize: clamp the raw volatility to its own 1st and 99th percentiles
p01, p99 = agg['volatility_raw'].quantile([0.01, 0.99])
agg['volatility_winz'] = agg['volatility_raw'].clip(lower=p01, upper=p99)

# Standardize: subtract the mean and divide by the standard deviation
mean_ = agg['volatility_winz'].mean()
std_ = agg['volatility_winz'].std()
agg['volatility_std'] = agg['volatility_winz'].sub(mean_).div(std_)

# Save only the keys and the standardized volatility
volatility_df = agg[['PERMNO', 'quarter', 'volatility_std']]
volatility_df.to_csv("volatility_hassan_style.csv", index=False)
print("File saved as 'volatility_hassan_style.csv'")


File saved as 'volatility_hassan_style.csv'


##### Adding Assets to the panel_volatility_merged

Reason: all specifications include the log of firm assets as a control (see page 17 of Hassan's technical file).

In [7]:
# -------------------------------
# Loading the PRisk, volatility and Cap Inv files
# -------------------------------
prisk_df = pd.read_csv("firmquarter_permno.csv")
volatility = pd.read_csv("volatility_hassan_style.csv")  # Includes Volatility variable
capinv = pd.read_csv("capinv.csv")  # File with ATQ (Assets)

# Normalize the merge keys on both sides: PERMNO as an integer-like string,
# quarter as a trimmed upper-case string.
prisk_df = prisk_df.dropna(subset=['permno']).copy()
prisk_df['PERMNO'] = prisk_df['permno'].astype(float).astype(int).astype(str)
prisk_df['quarter'] = prisk_df['quarter'].astype(str).str.strip().str.upper()

volatility['PERMNO'] = volatility['PERMNO'].astype(str)
volatility['quarter'] = volatility['quarter'].astype(str).str.strip().str.upper()

panel = pd.merge( #Merging PRisk + Vol
    prisk_df,
    volatility[['PERMNO', 'quarter', 'volatility_std']],
    on=['PERMNO', 'quarter'],
    how='left'
)

# -------------------------------
# Calculating Log(Assets)
# -------------------------------
capinv['datadate'] = pd.to_datetime(capinv['datadate'], errors='coerce')
capinv = capinv.dropna(subset=['datadate', 'atq'])
capinv = capinv[capinv['atq'] > 0].copy()  # log is only defined for positive assets

capinv['quarter'] = capinv['datadate'].dt.to_period('Q').astype(str).str.strip().str.upper()
capinv['log_assets'] = np.log(capinv['atq'])
capinv['gvkey'] = capinv['gvkey'].astype(int).astype(str)

# Several capinv rows can map to the same (gvkey, quarter) once datadate is
# collapsed to a calendar quarter. A left merge against such duplicates would
# silently duplicate panel rows, so keep a single row per key: the one with
# the latest datadate (stable sort so ties keep their file order).
capinv_subset = (
    capinv
    .sort_values('datadate', kind='mergesort')
    .drop_duplicates(subset=['gvkey', 'quarter'], keep='last')
    [['gvkey', 'quarter', 'log_assets']]
)

panel['gvkey'] = panel['gvkey'].astype(int).astype(str)
panel['quarter'] = panel['quarter'].astype(str).str.strip().str.upper()

# -------------------------------
# Adding Log(Assets) to panel
# -------------------------------
# validate='many_to_one' makes pandas raise if the right-hand keys are not
# unique, so this merge can never add rows to the panel.
panel = pd.merge(
    panel,
    capinv_subset,
    on=['gvkey', 'quarter'],
    how='left',
    validate='many_to_one'
)
panel.to_csv("panel_volatility_with_assets.csv", index=False)
print("File saved as 'panel_volatility_with_assets.csv'")


File saved as 'panel_volatility_with_assets.csv'


In [9]:
# sanity checks

key_cols = ['PERMNO', 'quarter']

# Reload both inputs from disk and normalize the merge keys
# (PERMNO as an integer-like string, quarter upper-cased and trimmed).
prisk_df = (
    pd.read_csv("firmquarter_permno.csv")
    .dropna(subset=['permno'])
    .assign(
        PERMNO=lambda d: d['permno'].astype(float).astype(int).astype(str),
        quarter=lambda d: d['quarter'].astype(str).str.upper().str.strip(),
    )
)

volatility = pd.read_csv("volatility_hassan_style.csv").assign(
    PERMNO=lambda d: d['PERMNO'].astype(str),
    quarter=lambda d: d['quarter'].astype(str).str.upper().str.strip(),
)

# -------------------------------
# Before merging: distinct PERMNO–quarter pairs in each source
# -------------------------------
n_prisk = len(prisk_df[key_cols].drop_duplicates())
n_vol = len(volatility[key_cols].drop_duplicates())

print(f"PERMNO–quarter matches en PRisk: {n_prisk:,}")
print(f"PERMNO–quarter matches in Volatility: {n_vol:,}")

# -------------------------------
# After merging: rows of the left-joined frame that have a volatility value
# -------------------------------
merged = prisk_df.merge(
    volatility[key_cols + ['volatility_std']],
    on=key_cols,
    how='left',
)

n_merged = len(merged.dropna(subset=key_cols + ['volatility_std']))
print(f"Observations PRisk + Volatility: {n_merged:,}")


PERMNO–quarter matches en PRisk: 166,477
PERMNO–quarter matches in Volatility: 235,124
Observations PRisk + Volatility: 156,430
