In [None]:
import glob
import os
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, multilabel_confusion_matrix
# Show every column when a wide frame is displayed (no "..." truncation).
pd.options.display.max_columns = None

# Read the dataset from parquet, then report its row count and column names
# before previewing the first rows.
df = pd.read_parquet(path="../data/gdelt/final_gdelt_dataset/data.parquet")
print(df.shape[0])
print(df.columns)
df.head(n=5)

In [None]:
import pandas as pd
import numpy as np

# --- Helpers ---

def array_to_stats(x):
    """Summarise a list/array/string of numbers as ``(mean, std, max)``.

    Parameters
    ----------
    x : list, numpy.ndarray, str, scalar or None
        Values to summarise. A string is treated as a stringified list/array;
        both whitespace-separated (``"[1.0 2.0]"``) and comma-separated
        (``"[1.0, 2.0]"``) forms are accepted. Any other scalar is parsed
        through its ``str()`` representation.

    Returns
    -------
    tuple
        ``(mean, std, max)`` of the usable values, where ``std`` is
        ``np.std`` with its default ``ddof=0``. ``(nan, nan, nan)`` is
        returned when ``x`` is missing or contains no usable value.
        Missing markers (``None``, NaN, ``"nan"``, ``"none"``) and tokens
        that cannot be converted to ``float`` are ignored.
    """
    missing = (np.nan, np.nan, np.nan)
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return missing

    if isinstance(x, (list, np.ndarray)):
        candidates = list(x)
    else:
        # Brackets, quotes and commas are separators only. Leaving commas in
        # place would make float("1.0,") fail and silently drop that value.
        text = str(x)
        for ch in "[]'\",":
            text = text.replace(ch, " ")
        candidates = text.split()

    vals = []
    for item in candidates:
        if str(item).strip().lower() in ("nan", "none", ""):
            continue
        try:
            number = float(item)
        except (TypeError, ValueError):
            # Best effort: skip anything that is not a number.
            continue
        if not np.isnan(number):
            vals.append(number)

    if not vals:
        return missing
    return (np.mean(vals), np.std(vals), np.max(vals))


def clean_numeric_array(x):
    """Reduce a list/array/string of numbers to its mean.

    Parameters
    ----------
    x : list, numpy.ndarray, str, scalar or None
        Values to average. A string is treated as a stringified list/array,
        either whitespace-separated (``"[1.23 2.34]"``) or comma-separated
        (``"[1.23, 2.34]"``).

    Returns
    -------
    float
        Mean of the non-missing values. ``nan`` is returned when ``x`` is
        missing, empty, or contains something that cannot be converted to
        ``float``.
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return np.nan

    if isinstance(x, (list, np.ndarray)):
        candidates = [v for v in x if str(v).lower() not in ("nan", "none")]
    else:
        # Commas are separators too; otherwise "1.23," fails float() and the
        # whole value is reported as NaN.
        text = str(x)
        for ch in "[]'\",":
            text = text.replace(ch, " ")
        candidates = text.split()

    try:
        vals = [float(v) for v in candidates]
    except (TypeError, ValueError):
        # Same outcome for both input forms: unparsable content -> NaN.
        return np.nan

    # Literal "nan" tokens in a string are ignored, like NaN list elements.
    vals = [v for v in vals if not np.isnan(v)]
    return np.mean(vals) if vals else np.nan


def clean_listlike(x):
    """Convert a label container (list/array/string) into a list of strings.

    Parameters
    ----------
    x : list, numpy.ndarray, str, scalar or None
        Labels to normalise. A string is treated as a stringified list/array:
        when it contains commas the labels are split on commas, otherwise on
        whitespace.

    Returns
    -------
    list of str
        Stripped labels in their original order. ``None``, NaN, the literal
        ``"nan"`` (any case) and empty strings are dropped. A missing or
        empty input gives ``[]``.
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return []

    def _is_missing(value):
        """True for None and for NaN of any float type."""
        if value is None:
            return True
        try:
            return bool(np.isnan(value))
        except (TypeError, ValueError):
            # Strings and other non-numeric values are never "missing".
            return False

    if isinstance(x, (list, np.ndarray)):
        labels = [str(v).strip() for v in x if not _is_missing(v)]
    else:
        # Fallback: parse a stringified list such as "['a', 'b']" or "['a' 'b']".
        text = str(x)
        for ch in "[]'\"":
            text = text.replace(ch, "")
        # A plain whitespace split would leave the commas glued to the labels.
        pieces = text.split(",") if "," in text else text.split()
        labels = [piece.strip() for piece in pieces]

    return [label for label in labels if label and label.lower() != "nan"]


def clean_sentiment(x):
    """Reduce a sentiment array/list/string to the mean of its values.

    Parameters
    ----------
    x : list, numpy.ndarray, str, scalar or None
        Scores to average. A string is treated as a stringified list/array,
        either whitespace-separated (``"[0.1 0.3]"``) or comma-separated
        (``"[0.1, 0.3]"``).

    Returns
    -------
    float
        Mean of the non-missing scores as a Python ``float``. ``nan`` is
        returned when ``x`` is missing, empty, or contains something that
        cannot be converted to a number. ``None``/NaN entries are ignored
        rather than turning the whole result into NaN.
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return np.nan

    if isinstance(x, (list, np.ndarray)):
        try:
            # dtype=float maps None to NaN and raises on non-numeric content.
            scores = np.asarray(x, dtype=float).ravel()
        except (TypeError, ValueError):
            return np.nan
    else:
        # Commas are separators too; otherwise "0.1," fails float().
        text = str(x)
        for ch in "[]'\",":
            text = text.replace(ch, " ")
        try:
            scores = np.array([float(tok) for tok in text.split()], dtype=float)
        except ValueError:
            return np.nan

    scores = scores[~np.isnan(scores)]
    # Checking size first avoids numpy's "mean of empty slice" warning.
    return float(scores.mean()) if scores.size else np.nan


# --- Main preprocessing ---
def preprocess(df):
    """Flatten the raw dataset into a table of scalar feature columns.

    Every step is skipped when the columns it works on are absent:

    1. sentiment score columns are reduced to one number via ``clean_sentiment``;
    2. the ``pred_impact_type_*`` / ``pred_urgency_*`` label lists become one
       ``<col>_<label>`` column per label, holding how often the label occurs
       in the row's list;
    3. ``tone`` / ``tone_abs`` are replaced by ``_mean`` / ``_std`` / ``_max``;
    4. ``*_count_events`` / ``*_count_gkg`` columns get a ``_per_article``
       companion (count divided by ``NumArticles``);
    5. every ``*_per_article`` column is replaced by ``_mean`` / ``_std`` / ``_max``;
    6. ``period`` is made numeric and split into ``year`` (``period // 100``)
       and ``month`` (``period % 100``) -- presumably ``period`` is encoded
       as YYYYMM; TODO confirm;
    7. text / ID columns are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It is not modified; all work happens on a copy.
        Step 2 re-attaches the label columns by index label, so the index
        should be unique.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.
    """
    # Work on a copy: the column assignments below would otherwise write into
    # the caller's frame before ``df`` is first re-bound.
    df = df.copy()

    def _expand_to_stats(frame, col):
        """Replace ``col`` by its ``<col>_mean``, ``<col>_std`` and ``<col>_max``."""
        triples = frame[col].apply(array_to_stats)
        for pos, suffix in enumerate(("mean", "std", "max")):
            frame[f"{col}_{suffix}"] = triples.apply(lambda t, i=pos: t[i])
        return frame.drop(columns=[col])

    # 1. Fix sentiment arrays -> numeric mean
    sentiment_cols = [
        "compound_score_events", "neg_score_events", "neu_score_events", "pos_score_events",
        "compound_score_gkg", "neg_score_gkg", "neu_score_gkg", "pos_score_gkg"
    ]
    for col in sentiment_cols:
        if col in df.columns:
            df[col] = df[col].apply(clean_sentiment)

    # 2. Handle categorical predictions -> one column per label
    for col in ["pred_impact_type_events", "pred_impact_type_gkg",
                "pred_urgency_events", "pred_urgency_gkg"]:
        if col in df.columns:
            df[col] = df[col].apply(clean_listlike)
            # explode() yields one row per label (same index label as the
            # source row); summing per index label folds them back together.
            dummies = df[col].explode().str.get_dummies().groupby(level=0).sum()
            dummies = dummies.add_prefix(f"{col}_")
            df = pd.concat([df.drop(columns=[col]), dummies], axis=1)

    # 3. Tone arrays -> mean/std/max
    for col in ["tone", "tone_abs"]:
        if col in df.columns:
            df = _expand_to_stats(df, col)

    # 4. Normalize counts by NumArticles (create _per_article cols)
    if "NumArticles" in df.columns:
        count_cols = [c for c in df.columns if c.endswith("_count_events") or c.endswith("_count_gkg")]
        for col in count_cols:
            # The small epsilon keeps rows with NumArticles == 0 from dividing by zero.
            df[f"{col}_per_article"] = df[col] / (df["NumArticles"] + 1e-6)

    # 5. *_per_article arrays -> mean/std/max
    #    (this includes the scalar ratios created in step 4)
    per_article_cols = [c for c in df.columns if c.endswith("_per_article")]
    for col in per_article_cols:
        df = _expand_to_stats(df, col)

    # 6. Period -> year/month
    if "period" in df.columns:
        df["period"] = pd.to_numeric(df["period"], errors="coerce")
        # Nullable Int64 so that periods coerced to NaN stay representable.
        df["year"] = (df["period"] // 100).astype("Int64")
        df["month"] = (df["period"] % 100).astype("Int64")

    # 7. Drop unused text/ID fields
    drop_cols = [
        "SQLDATE", "EventCode", "SOURCEURL", "NumMentions", "NumSources", "NumArticles",
        "V2Themes", "DocumentIdentifier", "clean_text"
    ]
    df = df.drop(columns=[c for c in drop_cols if c in df.columns])

    return df

# Preview the first rows; this cell only defines preprocess(), it does not apply it.
df.head(n=5)

In [None]:
# Run the preprocessing pipeline and put 0.0 wherever a value is still missing.
df = preprocess(df).fillna(0.0)

# Keep three decimals on every numeric column.
num_cols = df.select_dtypes(include="number").columns
df[num_cols] = df[num_cols].round(3)
df.head(n=5)

In [None]:
# Output verification: shape of the frame before any CS_score filtering
print(f"df shape after merging: {df.shape}")
df['CS_score'] = df['CS_score'].astype(float)

# Keep only rows whose CS_score is exactly one of 1-5. The .copy() makes the
# result an independent frame, so the assignment below is not a chained
# assignment on a slice (SettingWithCopyWarning).
df = df[df['CS_score'].isin([1.0, 2.0, 3.0, 4.0, 5.0])].copy()

# Output verification: Check shape after filtering CS_score
print(f"df shape after filtering CS_score: {df.shape}")

# Round CS_score up. After the filter above every value is already integral,
# so this only acts as a safeguard.
df['CS_score'] = np.ceil(df['CS_score'])

# Split data into train and test: the most recent period is held out
latest_period = df['period'].max()
train = df[df['period'] != latest_period]
test = df[df['period'] == latest_period]

# Output verification: Check shapes of train and test sets
print(f"train shape: {train.shape}")
print(f"test shape: {test.shape}")

# Columns that identify one region
REGION_KEYS = ['ADMIN0', 'ADMIN1', 'ADMIN2']


def _evaluate_naive_baseline(history, holdout, name, period_slice, use_max=False):
    """Score one naive "reuse a past CS_score" baseline on the held-out period.

    Parameters
    ----------
    history : pandas.DataFrame
        All rows that precede the held-out period.
    holdout : pandas.DataFrame
        Rows of the held-out period; its ``CS_score`` is the ground truth.
    name : str
        Baseline name, used in the printed shape messages.
    period_slice : slice
        Applied to the sorted unique periods of ``history`` to choose which
        periods supply the prediction.
    use_max : bool
        When True, predict the per-region maximum over the chosen periods;
        otherwise use the rows of the chosen periods as they are.

    Returns
    -------
    tuple
        ``(accuracy, weighted precision, weighted recall, weighted F1)``.
    """
    chosen_periods = sorted(history['period'].unique())[period_slice]
    reference = history[history['period'].isin(chosen_periods)]
    print(f"train shape for {name}: {reference.shape}")

    if use_max:
        reference = reference.groupby(REGION_KEYS)['CS_score'].max().reset_index()
    else:
        # 'period' is not carried along: it is not a join key and would only
        # turn the holdout's own column into period_x / period_y.
        reference = reference[REGION_KEYS + ['CS_score']]
    reference = reference.rename(columns={'CS_score': 'predicted'})

    scored = holdout.merge(reference, on=REGION_KEYS, how='left')
    # Regions without a reference row get 0, which is not a valid score and
    # therefore always counts as a wrong prediction.
    scored['predicted'] = scored['predicted'].fillna(0)
    print(f"test shape after {name} merge: {scored.shape}")

    y_true, y_pred = scored['CS_score'], scored['predicted']
    # zero_division=0 returns the same values as the default but without the
    # UndefinedMetricWarning for classes absent from y_true or y_pred.
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average='weighted', zero_division=0),
        recall_score(y_true, y_pred, average='weighted', zero_division=0),
        f1_score(y_true, y_pred, average='weighted', zero_division=0),
    )


# (name, slice into the sorted training periods, aggregate with max?)
baseline_specs = [
    # PPS: score of the last training period
    ('PPS', slice(-1, None), False),
    # SPLY: score of the third-to-last training period -- presumably "same
    # period last year", which only holds with three periods per year; TODO confirm
    ('SPLY', slice(-3, -2), False),
    # Max-2PP: per-region maximum over the last two training periods
    ('Max-2PP', slice(-2, None), True),
]
base_models = [spec[0] for spec in baseline_specs]

baseline_scores = [
    _evaluate_naive_baseline(train, test, model_name, chosen, take_max)
    for model_name, chosen, take_max in baseline_specs
]
accuracies, precisions, recalls, f1s = (list(column) for column in zip(*baseline_scores))

# Compile results
base_summary = pd.DataFrame({
    'Model': base_models,
    'Test Accuracy': accuracies,
    'Test Precision': precisions,
    'Test Recall': recalls,
    'F1': f1s,
})

# Display summary
print(base_summary)

### Actual ML model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_class_weight

def _weighted_scores(y_true, y_pred):
    """Return ``[accuracy, weighted precision, weighted recall, weighted F1]``."""
    # zero_division=0 returns the same values as the default but without the
    # UndefinedMetricWarning for classes absent from y_true or y_pred.
    return [
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average="weighted", zero_division=0),
        recall_score(y_true, y_pred, average="weighted", zero_division=0),
        f1_score(y_true, y_pred, average="weighted", zero_division=0),
    ]


# NOTE: this cell re-binds `df`. Running it a second time recomputes the lags
# on the already-trimmed frame and drops two more rows per region, so re-run
# from the preprocessing cell instead.

# Sort by region + period so shift() walks each region's rows in period order
df = df.sort_values(["ADMIN0", "ADMIN1", "ADMIN2", "period"])

# Create lags safely (these are past values only). The lag is counted in rows
# per region, i.e. the previous available row, whatever its period is.
cs_by_region = df.groupby(["ADMIN0", "ADMIN1", "ADMIN2"])["CS_score"]
df["CS_score_lag1"] = cs_by_region.shift(1)
df["CS_score_lag2"] = cs_by_region.shift(2)

# Drop rows where lag values are NaN (first periods)
df = df.dropna(subset=["CS_score_lag1", "CS_score_lag2"]).reset_index(drop=True)
print(len(df.index))

# Then split train/test by period: the most recent period is held out
max_period = df["period"].max()
train = df[df["period"] != max_period].reset_index(drop=True)
print(len(train.index))

test = df[df["period"] == max_period].reset_index(drop=True)
print(len(test.index))

# Prepare features: everything except the region keys, the period and the target
target = "CS_score"
drop_cols = ["ADMIN0", "ADMIN1", "ADMIN2", "period", target]
X_train = train.drop(columns=[c for c in drop_cols if c in train.columns])
y_train = train[target]

X_test = test.drop(columns=[c for c in drop_cols if c in test.columns])
y_test = test[target]

# Remove results left over from an earlier run of this cell so that the
# summary never holds duplicate model rows.
base_summary = base_summary[
    ~base_summary["Model"].isin(["RandomForest", "CatBoost"])
].reset_index(drop=True)

# --- Random Forest ---
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    random_state=42,
    class_weight="balanced",
    n_jobs=-1
)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

rf_acc, rf_prec, rf_rec, rf_f1 = _weighted_scores(y_test, y_pred_rf)

base_summary.loc[len(base_summary)] = ["RandomForest", rf_acc, rf_prec, rf_rec, rf_f1]

# --- CatBoost ---
# CatBoost gets explicit balanced class weights, mirroring class_weight="balanced" above.
classes = np.unique(y_train)
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))

cat = CatBoostClassifier(
    iterations=500,
    depth=8,
    learning_rate=0.05,
    random_seed=42,
    verbose=False,
    class_weights=class_weights
)

cat.fit(X_train, y_train)

# CatBoost can return multiclass predictions as a column vector of shape
# (n, 1); flatten so y_pred_cat has the same 1-D shape as y_pred_rf.
y_pred_cat = np.asarray(cat.predict(X_test)).ravel()

cat_acc, cat_prec, cat_rec, cat_f1 = _weighted_scores(y_test, y_pred_cat)

base_summary.loc[len(base_summary)] = ["CatBoost", cat_acc, cat_prec, cat_rec, cat_f1]

print(base_summary)


In [None]:
import plotly.express as px
import pandas as pd

# Pair each training column with the importance the fitted Random Forest gave it
importances = rf.feature_importances_
feature_names = X_train.columns

# Tabulate and rank, most important feature first
feat_imp = pd.DataFrame({"feature": feature_names, "importance": importances})
feat_imp = feat_imp.sort_values("importance", ascending=False)

# Number of features shown in the chart (raise it to show more)
top_n = 20
feat_top = feat_imp.iloc[:top_n]

# Horizontal bar chart of the top_n features
fig = px.bar(
    feat_top,
    x="importance",
    y="feature",
    orientation="h",
    title=f"Random Forest - Top {top_n} Feature Importances",
    labels={"importance": "Importance", "feature": "Feature"},
)

fig.update_layout(
    yaxis_autorange="reversed",   # reversed axis puts the first (largest) row at the top
    plot_bgcolor="white",
    xaxis_showgrid=True,
    xaxis_gridcolor="lightgray",
    height=top_n * 40,            # 40px per bar (adjust to taste)
)

fig.show()
