# GDELT Conflict Prediction - Exploratory Analysis

This notebook provides exploratory data analysis for the GDELT conflict prediction project.

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from src.utils.config_loader import load_config

# Plot appearance: seaborn's whitegrid theme plus a wide default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

# Project configuration, addressed relative to the notebook's directory
config = load_config('../configs/config.yaml')

## 1. Load Data

In [None]:
# Choose region
region = 'israel_palestine'  # Change to 'russia_ukraine' or 'india_pakistan'

# Inputs for the selected region: the processed feature table (parquet) and
# the ground-truth labels (CSV). Both paths are relative to the notebook.
processed_file = Path(f'../data/processed/{region}_processed.parquet')
labels_file = Path(f'../data/ground_truth/{region}_labels.csv')

# Track exactly which inputs are absent so the message below is actionable.
missing_files = [str(path) for path in (processed_file, labels_file) if not path.exists()]

if not missing_files:
    df = pd.read_parquet(processed_file)
    labels = pd.read_csv(labels_file, parse_dates=['date'])

    # The labels' date column is parsed to datetime above, but the parquet
    # column may hold plain dates or strings. Merging on keys of different
    # dtypes raises, so normalise both sides to datetime first.
    for frame in (df, labels):
        if 'date' in frame.columns:
            frame['date'] = pd.to_datetime(frame['date'])

    # Left merge keeps every feature row; days absent from the labels file
    # end up with NaN in the label columns.
    data = df.merge(labels, on='date', how='left')

    print(f"Loaded data for {region}")
    print(f"Shape: {data.shape}")
    print(f"Date range: {data['date'].min()} to {data['date'].max()}")
    print(f"\nConflict days: {data['is_conflict'].sum()} ({data['is_conflict'].mean()*100:.2f}%)")

    # Unlabeled rows match neither `is_conflict == 0` nor `is_conflict == 1`,
    # so surface them here instead of letting them vanish from later plots.
    unlabeled_days = int(data['is_conflict'].isna().sum())
    if unlabeled_days:
        print(f"Warning: {unlabeled_days} rows have no label (is_conflict is NaN)")
else:
    print("Data not found. Run the pipeline first.")
    print(f"Missing: {', '.join(missing_files)}")

## 2. Time Series Visualization

In [None]:
# Three stacked panels on a shared date axis: event volume, mean Goldstein
# score and mean tone, each with conflict-labelled days shaded in red.
fig, axes = plt.subplots(3, 1, figsize=(14, 10), sharex=True)

dates = data['date']
conflict_mask = data['is_conflict'] == 1
shade_style = dict(where=conflict_mask, alpha=0.3, color='red')

# Panel 1: event count. The shading runs from zero up to the series maximum,
# and this is the only panel that carries the legend entry.
count_ax = axes[0]
event_counts = data['event_count']
count_ax.plot(dates, event_counts, alpha=0.7)
count_ax.fill_between(dates, 0, event_counts.max(), label='Conflict Period', **shade_style)
count_ax.set_ylabel('Event Count')
count_ax.set_title(f'Event Count Over Time - {region}')
count_ax.legend()
count_ax.grid(True)

# Panels 2 and 3: signed series. The shading spans the observed min..max and
# a dashed line marks zero.
signed_panels = [
    (axes[1], 'goldstein_mean', 'orange', 'Goldstein Scale (Mean)', 'Event Intensity (Goldstein Scale)'),
    (axes[2], 'tone_mean', 'green', 'Tone (Mean)', 'Event Tone'),
]
for ax, column, line_color, y_label, panel_title in signed_panels:
    series = data[column]
    ax.plot(dates, series, alpha=0.7, color=line_color)
    ax.fill_between(dates, series.min(), series.max(), **shade_style)
    ax.axhline(y=0, color='black', linestyle='--', alpha=0.5)
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.grid(True)

# Only the bottom panel labels the shared x axis.
axes[2].set_xlabel('Date')

plt.tight_layout()
plt.show()

## 3. Feature Distribution: Normal vs Conflict

In [None]:
# Overlay per-feature histograms for normal days versus conflict days
features = ['event_count', 'goldstein_mean', 'mentions_total', 'tone_mean']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.ravel()

# Row selectors are computed once and reused for every feature.
# Draw order matters: normal first, conflict on top.
hist_groups = (
    ('Normal', 'blue', data['is_conflict'] == 0),
    ('Conflict', 'red', data['is_conflict'] == 1),
)

for ax, feature in zip(axes, features):
    for group_label, group_color, group_mask in hist_groups:
        ax.hist(data.loc[group_mask, feature], bins=50, alpha=0.6,
                label=group_label, color=group_color)
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.set_title(f'{feature} Distribution')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Correlation Analysis

In [None]:
# Candidate features: numeric columns, minus the label itself and the
# calendar columns, capped at the first 20 to keep the chart readable.
excluded_cols = {'is_conflict', 'year', 'month', 'day_of_week', 'day_of_month'}
numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [col for col in numeric_cols if col not in excluded_cols][:20]

# Pairwise correlation matrix over the selected features plus the label
corr_data = data[numeric_cols + ['is_conflict']].corr()

# Correlation of each feature with the label. The label's self-correlation is
# removed by name rather than by slicing off the first sorted entry: a feature
# that correlates at 1.0 ties with the label and may sort ahead of it, in
# which case a positional slice would drop the wrong row.
conflict_corr = (
    corr_data['is_conflict']
    .drop('is_conflict')
    .sort_values(ascending=False)
)

# Horizontal bar chart, one bar per feature, with a dashed line at zero
bar_positions = range(len(conflict_corr))
plt.figure(figsize=(10, 8))
plt.barh(bar_positions, conflict_corr.values)
plt.yticks(bar_positions, conflict_corr.index)
plt.xlabel('Correlation with Conflict')
plt.title('Feature Correlation with Conflict Labels')
plt.axvline(x=0, color='black', linestyle='--', alpha=0.5)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 5. Event Code Distribution

In [None]:
# Columns holding per-event-code values are identified by their name prefix
event_code_cols = [col for col in data.columns if col.startswith('event_code_')]

if event_code_cols:
    # Column totals for each label group
    normal_events = data.loc[data['is_conflict'] == 0, event_code_cols].sum()
    conflict_events = data.loc[data['is_conflict'] == 1, event_code_cols].sum()

    # The ten codes with the largest combined total, shown in both panels so
    # the two charts list the same codes in the same order
    combined_events = normal_events + conflict_events
    top_events = combined_events.nlargest(10).index

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    panels = (
        (axes[0], normal_events, 'blue', 'Top Event Codes - Normal Periods'),
        (axes[1], conflict_events, 'red', 'Top Event Codes - Conflict Periods'),
    )
    for ax, totals, bar_color, panel_title in panels:
        totals[top_events].plot.barh(ax=ax, color=bar_color)
        ax.set_title(panel_title)
        ax.set_xlabel('Count')

    plt.tight_layout()
    plt.show()

## 6. Summary Statistics

In [None]:
# Descriptive statistics for the headline features, one row per label value
summary_features = ['event_count', 'goldstein_mean', 'mentions_total', 'tone_mean']
stat_names = ['mean', 'std', 'min', 'max']

by_conflict = data.groupby('is_conflict')
summary = by_conflict[summary_features].agg(stat_names)

print("\nSummary Statistics by Conflict Status:")
print(summary)