In [None]:
# Imports
import pandas as pd
import numpy as np
from geopy.distance import great_circle 
from datetime import timedelta, datetime
import os

# Record the library versions in the notebook output for reproducibility.
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

# Disable pandas' chained-assignment warning for the whole session.
pd.set_option("mode.chained_assignment", None)

In [None]:
# Configuration Parameters
# Every tunable value used by the later cells is defined here.

# --- Data filtering ---
# Last calendar day (as text, and parsed to a pandas Timestamp) that the
# later cells compare event dates against.
CUTOFF_DATE_STR = "2020-12-31"
CUTOFF_DATE = pd.to_datetime(CUTOFF_DATE_STR)

# --- Matching criteria ---
# Maximum distance, in kilometers, for a spatial match on the same day.
RADIUS_KM = 50.0

# --- Column names ---
# Column of the Taliban dataset that holds the report text.
TALIBAN_TEXT_COLUMN = "body"

# --- Input / output locations ---
TALIBAN_PROCESSED_DATA_PATH = "data/processed/taliban_extracted_events.csv"
ACLED_RAW_DATA_PATH = "data/raw/afgh_may25.csv"
MATCH_RESULTS_OUTPUT_DIR = "data/results/"
# The output file name embeds the cutoff date and radius used for the run.
MATCH_RESULTS_FILENAME = f'taliban_acled_matches_ExactTime_upto_{CUTOFF_DATE_STR}_R{RADIUS_KM}KM.xlsx'

In [None]:
# Load and Preprocess Taliban Data
#
# Reads the extracted Taliban events CSV, coerces the date and coordinate
# columns, guarantees a 'taliban_event_id' column and the configured text
# column, then keeps only rows dated on or before CUTOFF_DATE that have all
# key fields present. If loading or preprocessing fails for any reason,
# `taliban_df_processed` is left as an EMPTY DataFrame so the filtering steps
# below are skipped instead of crashing on a half-processed frame.

taliban_df_processed = pd.DataFrame()

try:
    taliban_df_processed = pd.read_csv(TALIBAN_PROCESSED_DATA_PATH)
    print(f"[*] Successfully loaded processed Taliban data from: {TALIBAN_PROCESSED_DATA_PATH}")

    # errors='coerce' turns unparseable values into NaT/NaN; such rows are
    # removed by the cutoff filter / dropna further down.
    taliban_df_processed['event_date'] = pd.to_datetime(taliban_df_processed['event_date'], errors='coerce')
    taliban_df_processed['latitude'] = pd.to_numeric(taliban_df_processed['latitude'], errors='coerce')
    taliban_df_processed['longitude'] = pd.to_numeric(taliban_df_processed['longitude'], errors='coerce')

    # Make sure an ID column exists: prefer 'event_id', then 'id', and only
    # as a last resort fall back to the DataFrame index.
    if 'taliban_event_id' not in taliban_df_processed.columns:
        if 'event_id' in taliban_df_processed.columns:
            taliban_df_processed = taliban_df_processed.rename(columns={'event_id': 'taliban_event_id'})
        elif 'id' in taliban_df_processed.columns:
            taliban_df_processed = taliban_df_processed.rename(columns={'id': 'taliban_event_id'})
        else:
            taliban_df_processed['taliban_event_id'] = taliban_df_processed.index
            print("  [!] Warning: Using DataFrame index as Taliban event ID.")

    if TALIBAN_TEXT_COLUMN not in taliban_df_processed.columns:
        print(f"  [!] Warning: Column '{TALIBAN_TEXT_COLUMN}' not found. Adding placeholder.")
        taliban_df_processed[TALIBAN_TEXT_COLUMN] = "No Taliban text available for this entry."

    print(f"  Initial Taliban data shape: {taliban_df_processed.shape}")

except FileNotFoundError:
    print(f"[!!!] ERROR: Processed Taliban data file not found at {TALIBAN_PROCESSED_DATA_PATH}. Matching will be skipped.")
except Exception as e:
    print(f"[!!!] ERROR loading or processing Taliban data: {e}")
    # Discard the partially processed frame: it may lack required columns
    # (e.g. 'latitude'), which would make the dropna() below raise KeyError.
    taliban_df_processed = pd.DataFrame()

initial_rows_taliban_before_cutoff = len(taliban_df_processed)
if not taliban_df_processed.empty and 'event_date' in taliban_df_processed.columns:
    # Compare calendar days rather than full timestamps so events that carry a
    # time-of-day on the cutoff date itself are kept (CUTOFF_DATE is midnight).
    # NaT rows compare False and are removed here as well.
    on_or_before_cutoff = taliban_df_processed['event_date'].dt.normalize() <= CUTOFF_DATE
    taliban_df_processed = taliban_df_processed[on_or_before_cutoff].copy()
    print(f"  Taliban data shape after filtering by CUTOFF_DATE ('{CUTOFF_DATE_STR}'): {taliban_df_processed.shape} (removed {initial_rows_taliban_before_cutoff - len(taliban_df_processed)} rows)")
else:
    print("  Taliban data is empty or 'event_date' column missing; cannot apply CUTOFF_DATE filter.")

initial_rows_taliban_before_dropna = len(taliban_df_processed)
if not taliban_df_processed.empty:
    # Matching needs a date, both coordinates and an ID for every row.
    taliban_df_processed = taliban_df_processed.dropna(
        subset=['event_date', 'latitude', 'longitude', 'taliban_event_id']
    )
    dropped_rows_taliban = initial_rows_taliban_before_dropna - len(taliban_df_processed)
    if dropped_rows_taliban > 0:
        print(f"  Dropped {dropped_rows_taliban} rows from Taliban data due to NaNs in key columns.")
print(f"  Final Taliban data shape for matching: {taliban_df_processed.shape}")

if taliban_df_processed.empty:
    print("\n[!!!] Taliban DataFrame is empty after preprocessing. Matching process will be skipped.")

In [None]:
# Helper Function for Distance Calculation

def calculate_distance_km(coords1, coords2):
    """Return the great-circle distance in kilometers between two points.

    Args:
        coords1: Iterable whose first two items are the latitude and
            longitude of the first point.
        coords2: Same, for the second point.

    Returns:
        The ``.km`` value of ``great_circle`` for the two points, or
        ``float('inf')`` when any component is None/NaN, when a ValueError
        is raised while converting or measuring, or when any other
        exception occurs (that last case is also printed).
    """
    # A missing component anywhere means the distance cannot be computed.
    for point in (coords1, coords2):
        for component in point:
            if component is None or pd.isna(component):
                return float('inf')

    try:
        first_point = (float(coords1[0]), float(coords1[1]))
        second_point = (float(coords2[0]), float(coords2[1]))
        separation = great_circle(first_point, second_point)
        return separation.km
    except ValueError:
        # Treated as "no usable distance" without logging.
        return float('inf')
    except Exception as e:
        print(f"  [Error in distance calculation]: {e} for {coords1} and {coords2}")
        return float('inf')

In [None]:
# Core Matching Logic
#
# For every Taliban event, look at the ACLED events that fall on the same
# calendar day and keep the closest one within RADIUS_KM. Exactly one record
# is appended to `matched_events_records` per Taliban event: either the best
# match, or a placeholder row whose 'match_type' explains why nothing matched.
# The records are then collected into `matches_df`.

print(f"\n[*] Starting geo-temporal matching process for events up to {CUTOFF_DATE_STR}...")
print(f"    Matching criteria: EXACT DAY MATCH and spatial RADIUS_KM = {RADIUS_KM}...")

matched_events_records = []

# ACLED columns copied into each result row (prefixed with 'a_').
ACLED_DETAIL_COLS = [
    'event_type', 'sub_event_type', 'actor1', 'assoc_actor_1',
    'fatalities', 'notes', 'source', 'time_precision', 'geo_precision',
    'admin1', 'admin2', 'location', 'event_date'
]

# Matches closer than this many kilometers are labelled as near-exact.
NEAR_EXACT_DISTANCE_KM = 0.1

# ACLED fields used when a Taliban event has no match. Built once: it is the
# same for every row and is only ever spread (copied) into a new record.
default_acled_info = {f'a_{col}': None for col in ACLED_DETAIL_COLS}
default_acled_info.update({
    'acled_event_id_cnty': None, 'a_event_date': None,
    'a_lat': None, 'a_lon': None,
})

# NOTE(review): `acled_df_processed` is not created in any cell visible here;
# it must be defined by an ACLED loading cell before this one runs, otherwise
# the next line raises NameError on a fresh kernel.
if taliban_df_processed.empty or acled_df_processed.empty:
    print("\n[!] Skipping matching as one or both dataframes are empty after preprocessing.")
else:
    # Group ACLED events by calendar day ONCE instead of re-deriving
    # `.dt.date` over the whole ACLED frame for every Taliban event.
    # Row order inside each group is the original ACLED order; rows with a
    # missing event_date never belong to any group.
    acled_events_by_day = {
        day: day_events
        for day, day_events in acled_df_processed.groupby(
            acled_df_processed['event_date'].dt.date, sort=False
        )
    }

    for _, taliban_row in taliban_df_processed.iterrows():
        taliban_id = taliban_row['taliban_event_id']
        t_date = taliban_row['event_date']
        t_coords = (taliban_row['latitude'], taliban_row['longitude'])
        t_text = taliban_row.get(TALIBAN_TEXT_COLUMN, "N/A")

        base_taliban_info = {
            'taliban_event_id': taliban_id,
            't_event_date': t_date,
            't_lat': t_coords[0],
            't_lon': t_coords[1],
            TALIBAN_TEXT_COLUMN: t_text,
        }

        acled_same_day_candidates = acled_events_by_day.get(t_date.date())

        if acled_same_day_candidates is None:
            matched_events_records.append({
                **base_taliban_info,
                **default_acled_info,
                'time_difference_days': 0,
                'distance_km': None,
                'match_type': 'No ACLED Event on Exact Date'
            })
            continue

        # Track only the closest in-radius candidate. The strict '<' keeps the
        # first candidate on ties, matching a stable sort by distance.
        best_distance = None
        best_acled_row = None
        for _, acled_row in acled_same_day_candidates.iterrows():
            a_coords = (acled_row['latitude'], acled_row['longitude'])
            distance = calculate_distance_km(t_coords, a_coords)

            if distance <= RADIUS_KM and (best_distance is None or distance < best_distance):
                best_distance = distance
                best_acled_row = acled_row

        if best_acled_row is None:
            matched_events_records.append({
                **base_taliban_info,
                **default_acled_info,
                'time_difference_days': 0,
                'distance_km': None,
                'match_type': 'No Spatial Match on Exact Date (ACLED events existed)'
            })
            continue

        acled_specific_info = {f'a_{col}': best_acled_row.get(col) for col in ACLED_DETAIL_COLS}
        acled_specific_info.update({
            'acled_event_id_cnty': best_acled_row['event_id_cnty'],
            'a_event_date': best_acled_row['event_date'],
            'a_lat': best_acled_row['latitude'],
            'a_lon': best_acled_row['longitude'],
        })

        if best_distance < NEAR_EXACT_DISTANCE_KM:
            match_type = "ExactTime_NearExactSpatial_Match"
        else:
            match_type = "ExactTime_SpatialRadius_Match"

        matched_events_records.append({
            **base_taliban_info,
            **acled_specific_info,
            'time_difference_days': 0,
            'distance_km': best_distance,
            'match_type': match_type
        })

matches_df = pd.DataFrame(matched_events_records)

if not matches_df.empty:
    num_taliban_processed = len(taliban_df_processed)
    # A row counts as a real match only when it carries an ACLED event ID.
    num_actual_matches = int(matches_df['acled_event_id_cnty'].notna().sum())
    print(f"\n[*] Matching process complete. Processed {num_taliban_processed} Taliban events.")
    print(f"    Found {num_actual_matches} potential ACLED matches (within {CUTOFF_DATE_STR} and {RADIUS_KM}km radius).")
else:
    print("\n[!] Matching process completed, but no Taliban events were processed or no matches found.")

In [None]:
# Analyze and Save Match Results
#
# Displays samples of the match table, summarises matched and unmatched
# Taliban events, then writes the full table to Excel. If the Excel export
# fails for any reason the table is written as CSV next to it instead.

print("\n--- Analysis of Matches ---")

if matches_df.empty:
    print("\n[!] Matches DataFrame is empty. No results to analyze or save.")
else:
    # Preferred display columns, limited to the ones actually present.
    cols_to_display_sample = [
        col
        for col in (
            'taliban_event_id', 't_event_date', 't_lat', 't_lon', TALIBAN_TEXT_COLUMN,
            'acled_event_id_cnty', 'a_event_date', 'a_lat', 'a_lon',
            'a_fatalities', 'a_notes', 'a_source',
            'distance_km', 'match_type',
        )
        if col in matches_df.columns
    ]

    print("\nSample of all match attempts (first 5 rows with selected columns):")
    display(matches_df[cols_to_display_sample].head())

    # Rows carrying an ACLED event ID are the successful matches.
    has_acled_match = matches_df['acled_event_id_cnty'].notna()

    successful_matches = matches_df[has_acled_match].copy()
    print(f"\nTotal number of successful matches: {len(successful_matches)}")

    if successful_matches.empty:
        print("\nNo successful matches found to analyze further.")
    else:
        print("\nSample of successful matches (first 5 rows with selected columns):")
        display(successful_matches[cols_to_display_sample].head())

        print("\nDistribution of Match Types for successful matches:")
        print(successful_matches['match_type'].value_counts())

        print("\nStatistics for successful matches (distance in km):")
        print(successful_matches[['distance_km']].describe())

    unmatched_taliban_events_df = matches_df[~has_acled_match].copy()
    print(f"\nNumber of Taliban events not matched (within cutoff date, but no ACLED match): {len(unmatched_taliban_events_df)}")
    if unmatched_taliban_events_df.empty:
        print("\nAll Taliban events within the cutoff date were processed and potentially matched.")
    else:
        cols_for_unmatched_display = [
            col
            for col in ('taliban_event_id', 't_event_date', TALIBAN_TEXT_COLUMN, 'match_type')
            if col in unmatched_taliban_events_df.columns
        ]
        print("\nSample of Taliban events not matched (first 5 rows):")
        display(unmatched_taliban_events_df[cols_for_unmatched_display].head())

    os.makedirs(MATCH_RESULTS_OUTPUT_DIR, exist_ok=True)
    matches_output_path = os.path.join(MATCH_RESULTS_OUTPUT_DIR, MATCH_RESULTS_FILENAME)

    try:
        matches_df.to_excel(matches_output_path, index=False, engine='openpyxl')
        print(f"\n[+] Detailed match results saved successfully to '{matches_output_path}'")
    except Exception as e:
        # An ImportError gets the "install openpyxl" hint; any other failure
        # is reported with its message. Both end in the same CSV fallback.
        if isinstance(e, ImportError):
            failure_lines = [
                "\n[!!!] Warning: 'openpyxl' library not found. Cannot save to Excel (.xlsx).",
                "      Install with: 'pip install openpyxl'",
            ]
            csv_note_suffix = ""
        else:
            failure_lines = [f"\n[!!!] Error saving match results: {e}"]
            csv_note_suffix = " due to an error."

        for failure_line in failure_lines:
            print(failure_line)

        csv_fallback_path = matches_output_path.replace('.xlsx', '.csv')
        matches_df.to_csv(csv_fallback_path, index=False)
        print(f"      Results saved as CSV instead to '{csv_fallback_path}'{csv_note_suffix}")

print("\n--- Matching process concluded ---")