Merge the processed Compustat data (`processed_data_compustat.csv`, produced by the `preprocess_compustat_data.ipynb` notebook, with new features and filled gaps) with the processed returns data (`stock_returns_with_smoothing.csv`, produced by `preprocess_returns.ipynb`).

In [None]:
# Mount Google Drive using Colab
from google.colab import drive
import os
import pandas as pd
# Mount Google Drive; force_remount=True remounts even if a mount already exists.
mount_point = '/content/gdrive'
drive.mount(mount_point, force_remount=True)
# Drive folder that holds the input CSVs and receives the merged output.
folder = os.path.join(mount_point, 'My Drive', 'datasets_mlfin')

Mounted at /content/gdrive


In [None]:
# Load both datasets from the Colab-mounted Drive folder.
# The identifier columns are read as strings: CUSIPs can start with zeros, and
# letting pandas infer a numeric dtype (for the whole column or for individual
# low-memory chunks) would silently strip them before the merge key is built.
returns_df = pd.read_csv(
    os.path.join(folder, 'stock_returns_with_smoothing.csv'),
    # Covers either spelling of the column; dtype keys that are absent are ignored.
    dtype={'CUSIP': str, 'cusip': str},
)
compustat_df = pd.read_csv(
    os.path.join(folder, 'processed_data_compustat.csv'),
    dtype={'cusip': str},
)

# Optional: load from local disk instead when the machine has enough RAM.
# NOTE(review): 'PostProcesed' below is spelled differently from 'PostProcessed'
# on the line above it - confirm the actual directory name before enabling.
# returns_df = pd.read_csv('datasets/PostProcessed/Returns/stock_returns_with_smoothing.csv')
# compustat_df = pd.read_csv('datasets/PostProcesed/processed_data_compustat.csv')

In [None]:
import pandas as pd
import numpy as np


# Harmonise the join-key name: an upper-case 'CUSIP' column in the returns data
# is renamed to the lower-case 'cusip' used by the Compustat data.
returns_df = returns_df.rename(columns={'CUSIP': 'cusip'})

# Apply the same normalisation to both frames: the identifier becomes a stripped
# string cut to its first 8 characters, and the date column is parsed to datetime
# (a sanity check in case it was not already parsed upstream).
for frame, date_col in ((returns_df, 'MthCalDt'), (compustat_df, 'datadate')):
    frame['cusip'] = frame['cusip'].astype(str).str.strip().str.slice(0, 8)
    frame[date_col] = pd.to_datetime(frame[date_col])

# Sanity checks: sizes, covered date ranges and identifier overlap.
returns_dates = returns_df['MthCalDt']
compustat_dates = compustat_df['datadate']
shared_cusips = set(returns_df['cusip']).intersection(compustat_df['cusip'])

print("Returns dataset shape:", returns_df.shape)
print("Compustat dataset shape:", compustat_df.shape)
print("Returns date range:", returns_dates.min(), "to", returns_dates.max())
print("Compustat date range:", compustat_dates.min(), "to", compustat_dates.max())
print("Common CUSIPs:", len(shared_cusips))

# As-of merge: for every returns row, attach the most recent Compustat row of the
# same cusip whose datadate is on or before MthCalDt and at most 7 days older.
# Returns rows without such a match keep NaN in all Compustat columns.
# NOTE(review): the match key is 'datadate'; if Compustat figures only become
# public later, confirm that this is the intended key.
asof_options = dict(
    left_on='MthCalDt',
    right_on='datadate',
    by='cusip',
    direction='backward',
    tolerance=pd.Timedelta(days=7),
)
# merge_asof requires both inputs to be sorted on their date key.
merged_df = pd.merge_asof(
    returns_df.sort_values('MthCalDt'),
    compustat_df.sort_values('datadate'),
    **asof_options,
)

# Report the size of the merged result.
print("\nMerged dataset shape:", merged_df.shape)
print("Merged dataset unique CUSIPs:", merged_df['cusip'].nunique())

# A row found a Compustat match exactly when its 'datadate' was filled in.
merged_df['has_compustat'] = ~merged_df['datadate'].isna()
matched = merged_df['has_compustat']
print("Rows with Compustat data:", matched.sum())
print("Percentage with Compustat data:", matched.mean() * 100, "%")

# Candidate predictor columns; only those present in the merged frame are used.
candidate_predictors = (
    'epspxy', 'oiadpy', 'saley', 'earnings_growth',
    'revenue_growth', 'eps_surprise', 'dividend_change',
    'repurchase_intensity',
)
predictors = [name for name in candidate_predictors if name in merged_df.columns]

# Rows without a monthly return cannot be used, so discard them first.
merged_df = merged_df.dropna(subset=['MthRet'])

# Share of missing values per predictor, in percent.
missing_percentages = merged_df[predictors].isna().mean().mul(100)
print("\nMissing value percentages for each predictor:")
for name, share in missing_percentages.items():
    print(f"{name}: {share:.2f}%")

# Strict dataset: matched to Compustat and complete on every predictor.
merged_df_strict = merged_df.loc[merged_df['has_compustat']].dropna(subset=predictors)
print("\nStrict dataset (no missing values) shape:", merged_df_strict.shape)

# Drop helper columns that are not needed in the saved dataset.
# DataFrame.drop returns a new frame, so the result must be assigned back; the
# previous code discarded it and also targeted compustat_df instead of the frame
# that is written to disk. errors='ignore' skips a column that is absent.
# NOTE(review): the saved CSV no longer contains these two columns - confirm
# that no downstream step reads them.
merged_df_strict = merged_df_strict.drop(
    columns=['available_date', 'has_compustat'],
    errors='ignore',
)

# Save the merged dataset next to the inputs on the mounted Drive folder.
merged_df_strict.to_csv(os.path.join(folder, 'merged_compustat_returns_cleaned_data_mv_avg.csv'), index=False)

# Preview the key columns and the predictors of the strict dataset.
preview_columns = ['cusip', 'MthCalDt', 'MthRet', 'datadate', *predictors]
print("\nSample of strict dataset:")
print(merged_df_strict[preview_columns].head())

# Final size of the dataset.
print("\nFinal dataset shapes:")
print(merged_df_strict.shape)


Returns dataset shape: (4566488, 18)
Compustat dataset shape: (3848428, 43)
Returns date range: 1925-12-31 00:00:00 to 2024-12-31 00:00:00
Compustat date range: 1961-03-31 00:00:00 to 2025-04-30 00:00:00
Common CUSIPs: 23200

Merged dataset shape: (4566488, 60)
Merged dataset unique CUSIPs: 52553
Rows with Compustat data: 1608612
Percentage with Compustat data: 35.22645849501849 %

Missing value percentages for each predictor:
epspxy: 64.92%
oiadpy: 65.96%
saley: 68.19%
earnings_growth: 68.86%
revenue_growth: 69.57%
eps_surprise: 65.07%
dividend_change: 88.77%
repurchase_intensity: 73.47%

Strict dataset (no missing values) shape: (490639, 61)

Sample of strict dataset:
           cusip   MthCalDt    MthRet   datadate  epspxy  oiadpy   saley  \
150706  86666510 1973-01-31  0.018182 1973-01-31    0.27   1.382   8.453   
157898  86666510 1973-02-28 -0.080357 1973-02-28    0.27   1.382   8.453   
168493  86666510 1973-04-30 -0.106383 1973-04-30    0.32   1.382  10.704   
174122  86666510 

Then, we merge the moving-average returns and Compustat dataset with the sentiment analysis data.

In [None]:
from preprocessing.merge_sentiment_moving_average import (
    create_custom_financial_dataset_with_sentiment_base,
)

# File names handed to the sentiment-merge helper: two inputs and one output.
# NOTE(review): these are bare file names, whereas the merged CSV is written
# under `folder` earlier in this notebook - confirm the helper resolves them
# against the intended directory.
first_df_path, second_df_path, output_path = (
    'merged_compustat_returns_cleaned_data_mv_avg.csv',
    'merged_datasets_with_sentiment.csv',
    'final_dataset_compustat_ma_sentiment.csv',
)

# Rebinds merged_df to whatever the helper returns (presumably the combined
# dataset - verify against the helper's implementation).
merged_df = create_custom_financial_dataset_with_sentiment_base(
    first_df_path,
    second_df_path,
    output_path,
)
