Preprocess the Compustat data (i.e., CompFirmCharac.csv) to fill NaNs and gaps, and interpolate quarterly data to monthly data using forward fill (ffill).

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset folder is reachable.
import os

import pandas as pd
from google.colab import drive

# force_remount=True re-mounts even when the drive is already mounted.
drive.mount('/content/gdrive', force_remount=True)

# Drive folder used as the base directory for the CSV files in this notebook.
folder = '/content/gdrive/My Drive/datasets_mlfin'

In [None]:
# Colab file path to the Compustat CSV. Optionally switch to the local path
# below if your own computer has enough RAM to run the preprocessing.
filepath = os.path.join(folder, 'CompFirmCharac.csv')
# filepath = 'datasets/Predictors/CompFirmCharac.csv'

In [None]:
# Preprocess the Compustat file in Colab using CompustatPreprocessor.
# NOTE(review): the steps below appear to modify the preprocessor in place
# (get_data() is read once before and once after interpolation), so the order
# of the calls should be kept as is.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import interpolate
import seaborn as sns
from preprocessing.preprocessing_compustat_bis import CompustatPreprocessor

# Initialize the preprocessor class from preprocessing/preprocessing_compustat_bis,
# using 'datadate' as the date column and 'cusip' as the firm identifier column.
preprocessor1 = CompustatPreprocessor(date_column='datadate', id_column='cusip')

# Load the data from `filepath` (defined in an earlier cell)
# Replace with your actual file path
preprocessor1.load(filepath)

# Explore the raw data before any cleaning
preprocessor1.explore_data()

# Visualize missing values
preprocessor1.plot_missing_values(top_n=100)

# Convert dates to proper format
preprocessor1.convert_dates()

# Load the list of CUSIPs taken from the returns data, so that only CUSIPs
# present in both the returns and the Compustat data are kept.
cusips = pd.read_csv((os.path.join(folder, 'cusip_list.csv')))

# Filter Compustat by these CUSIPs
preprocessor1.filter_by_returns_cusips(cusips)

# Check column coverage and save to file
coverage = preprocessor1.check_column_coverage()
print(coverage)

# Get columns with good coverage (at least min_coverage % of data available)
recommended_columns = preprocessor1.recommend_columns_for_sentiment(min_coverage=30)
print("Recommended columns:", recommended_columns)

# Select important financial columns
preprocessor1.select_important_columns()

# Sort data by date
preprocessor1.global_sort_by_date()

# Fix data types and standardize them
preprocessor1.fix_data_types()

# Remove duplicates
preprocessor1.handle_duplicates()

# Clean string 'nan' values
preprocessor1.clean_string_nan_values()

# Replace remaining NaNs in string columns with empty strings
preprocessor1.handle_missing_values(method='empty_strings')

# Filter columns by coverage (keep only columns with at least 30% coverage)
preprocessor1.filter_columns_by_coverage(min_coverage=30)

# Get the processed quarterly data before interpolation
quarterly_data = preprocessor1.get_data()
print("\n=== Quarterly Data Summary (Before Interpolation) ===")
print(f"Shape: {quarterly_data.shape}")
print("\nSample of quarterly data:")
print(quarterly_data.head())

# Optional: export the cleaned quarterly data without forward fill
# preprocessor1.export_data((os.path.join(folder, 'quarterly_data.csv')), format='csv')

# Interpolate quarterly data to monthly frequency using forward fill (ffill)
print("Starting Quarterly to Monthly Interpolation")
preprocessor1.interpolate_quarterly_to_monthly(min_coverage=30)

# Create financial indicators for sentiment analysis
preprocessor1.create_sentiment_indicators()

# Get the processed monthly data after interpolation
monthly_data = preprocessor1.get_data()
print("\n=== Monthly Data Summary (After Interpolation) ===")
print(f"Shape: {monthly_data.shape}")
print("\nSample of monthly data:")
print(monthly_data.head())

# Plot examples of interpolation for visual inspection
print("\n=== Plotting Interpolation Examples ===")
# Candidate columns for visualization, in order of preference
good_columns = ['epspxy', 'oiadpy', 'saley', 'earnings_growth', 'revenue_growth']
for col in good_columns:
    if col in monthly_data.columns:
        print(f"Plotting interpolation example for {col}")
        preprocessor1.plot_interpolation_example(column=col, n_samples=2)
        # Only the first candidate column present in the data is plotted.
        break

# Export the processed monthly data as CSV into the Drive folder
preprocessor1.export_data((os.path.join(folder, 'processed_data_compustat.csv')), format='csv')

