### TODO 
- Train a specific BERT before using it
- Add a step that uses an LLM (probably Llama 3, run locally) to add features, labels, etc.

In [None]:
import pandas as pd
import numpy as np
import os
import glob
from helper_functions.topic_modelling.flatten_articles import flatten_articles
from helper_functions.topic_modelling.sentiment_analysis import perform_sentiment_analysis
from helper_functions.topic_modelling.add_counts_columns_parallel import (
    add_synonym_frequency_columns,
    add_category_count_columns
)
from helper_functions.topic_modelling.aggregate_articles import aggregate_articles, FS_CATEGORIES

# ================================================================
# 1️⃣ Load the exploded article dataset
#    Only the "clean_text" column is used directly in this cell.
#    NOTE(review): earlier notes say the frame also carries
#    NER_admin0/1/2 lists and event metadata (ADMIN0/1/2, CS_score,
#    period, ...) -- confirm against the parquet schema.
# ================================================================
input_path = "../data/gdelt/events/scraped_urls/cleaned_filtered_urls.parquet"
exploded_df = pd.read_parquet(input_path)

# ================================================================
# 2️⃣ Ensure clean_text is a string column
# ================================================================
# A bare astype(str) can turn missing values into the literal text
# "nan"/"None", which would then be scored and counted like real article
# content. Cast first, then blank out the rows that were missing.
raw_text = exploded_df["clean_text"]
exploded_df["clean_text"] = raw_text.astype(str).where(raw_text.notna(), "")

# ================================================================
# 3️⃣ Sentiment analysis
# ================================================================
exploded_df = perform_sentiment_analysis(exploded_df, text_col="clean_text")
print("Sentiment analysis done.")

# ================================================================
# 4️⃣ Keyword frequency & category count features
# ================================================================
exploded_df = add_synonym_frequency_columns(exploded_df, text_col="clean_text")
print("Frequency counts done")
exploded_df = add_category_count_columns(exploded_df, text_col="clean_text")
print("Keyword & category counts added.")

# ================================================================
# 5️⃣ Save the enriched frame as a single parquet file
# ================================================================
out_dir = "../data/gdelt/events/5_modelled"
os.makedirs(out_dir, exist_ok=True)  # no error if the folder already exists
out_path = os.path.join(out_dir, "events_exploded_with_counts.parquet")
exploded_df.to_parquet(out_path, index=False)

print("✅ Processing complete.")
# Last expression of the notebook cell: displays a preview of the result.
exploded_df.head()