In [None]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from helper_functions.gdelt_data_mapping_optimized import load_gadm_data, process_gdelt_data, consolidate_and_merge_fews
gadm_path = "../data/gdam/gadm_410_filtered_v2.gpkg"  # Path to GADM GeoPackage
fews_path = "../data/fews/fews_with_conflicts_admin2.parquet"  # FEWS NET dataset, so far all data is here

# Load the FEWS NET data. Only these five columns are used below, so request
# just them from the parquet reader instead of loading every column and then
# discarding the rest.
fews_df = pd.read_parquet(
    fews_path,
    columns=['ADMIN0', 'ADMIN1', 'ADMIN2', 'period', 'CS_score'],
)

# Load the GADM data. load_gadm_data returns two values; the second one
# replaces fews_df from here on.
# NOTE(review): an earlier comment stated that this call also sets a global
# gadm_gdf used by get_admin_areas inside the helper module - confirm against
# helper_functions.gdelt_data_mapping_optimized, it cannot be verified here.
print("Loading GADM")
gadm_gdf, fews_df = load_gadm_data(gadm_path, fews_df)

In [None]:
# Unique FEWS NET admin-name triples.
fews_df = fews_df[['ADMIN0', 'ADMIN1', 'ADMIN2']].drop_duplicates()
# The matching cell reads this file back with pd.read_csv, so it has to be
# (re)written here; otherwise a fresh run fails or silently uses a stale file.
fews_df.to_csv("fewsnet_admin2_names.csv", index=False)

# Unique GADM admin-name triples under the column names the matching cell uses.
# rename() ignores labels that are absent, so running this cell a second time
# (when the NAME_* columns are already gone) is a no-op instead of a KeyError.
gadm_gdf = gadm_gdf.rename(
    columns={'NAME_0': 'country', 'NAME_1': 'admin1', 'NAME_2': 'admin2'}
)
gadm_gdf = gadm_gdf[['country', 'admin1', 'admin2']].drop_duplicates()
gadm_gdf.to_csv("gadm_admin2_names.csv", index=False)

In [None]:

import unicodedata
import re

def normalize(s):
    """Return a simplified, comparison-friendly form of ``s``.

    The value is converted to ``str``, lower-cased, stripped of accents,
    has the standalone word "al" removed, has every character other than
    ``a-z``, ``0-9`` and space replaced by a space, and finally has runs of
    whitespace collapsed and the ends trimmed.
    """
    text = str(s).lower()
    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII round-trip then drops everything that is not plain ASCII.
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode()

    # Applied in order: drop the article, blank out punctuation, squeeze spaces.
    substitutions = (
        (r"\bal\b", ""),
        (r"[^a-z0-9 ]", " "),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Minimum fuzz.WRatio score (0-100) a FEWS NET candidate needs to be accepted
MIN_MATCH_SCORE = 50

# Column order of the output mapping; also guarantees the columns exist when
# there are no rows at all.
RESULT_COLUMNS = [
    'country', 'admin1', 'admin2',
    'ADMIN0_matched', 'ADMIN1_matched', 'ADMIN2_matched',
]

# Read the admin-name tables from disk
fewsnet_df = pd.read_csv('fewsnet_admin2_names.csv')
gadm_df_names = pd.read_csv('gadm_admin2_names.csv')

# Keep only GADM rows whose country also appears in FEWS NET
gadm_df_names = gadm_df_names[gadm_df_names['country'].isin(fewsnet_df['ADMIN0'].unique())]

# Get unique admin combinations from both datasets
fewsnet_combinations = fewsnet_df[['ADMIN0', 'ADMIN1', 'ADMIN2']].drop_duplicates()
gadm_combinations = gadm_df_names[['country', 'admin1', 'admin2']].drop_duplicates()

print(f"FEWSNET has {len(fewsnet_combinations)} unique admin combinations")
print(f"GADM has {len(gadm_combinations)} unique admin combinations")

# Group FEWSNET rows by country so each GADM row is only compared with
# candidates from its own country
fewsnet_by_country = {}
for _, row in fewsnet_combinations.iterrows():
    fewsnet_by_country.setdefault(row['ADMIN0'], []).append(row)

print(f"FEWSNET has {len(fewsnet_by_country)} unique countries")

# Normalised "ADMIN1 ADMIN2" search strings, computed once per country rather
# than once per GADM row. Each list is index-aligned with the row list in
# fewsnet_by_country[country].
fewsnet_strings_by_country = {
    country: [normalize(f"{row['ADMIN1']} {row['ADMIN2']}") for row in rows]
    for country, rows in fewsnet_by_country.items()
}

# Build one output record per GADM combination
result_data = []

for _, gadm_row in gadm_combinations.iterrows():
    country = gadm_row['country']

    # Default: no FEWSNET counterpart (unknown country or score below threshold)
    matched = {
        'ADMIN0_matched': None,
        'ADMIN1_matched': None,
        'ADMIN2_matched': None,
    }

    fewsnet_strings = fewsnet_strings_by_country.get(country)
    if fewsnet_strings:
        search_string = normalize(f"{gadm_row['admin1']} {gadm_row['admin2']}")

        # best_match is (matched string, score) or None
        best_match = process.extractOne(search_string, fewsnet_strings, scorer=fuzz.WRatio)

        if best_match and best_match[1] >= MIN_MATCH_SCORE:
            # Map the winning string back to its FEWSNET row; on duplicate
            # strings the first occurrence is used.
            best_match_index = fewsnet_strings.index(best_match[0])
            best_match_row = fewsnet_by_country[country][best_match_index]
            matched = {
                'ADMIN0_matched': best_match_row['ADMIN0'],
                'ADMIN1_matched': best_match_row['ADMIN1'],
                'ADMIN2_matched': best_match_row['ADMIN2'],
            }

    result_data.append({
        'country': gadm_row['country'],  # Keep original key
        'admin1': gadm_row['admin1'],    # Keep original key
        'admin2': gadm_row['admin2'],    # Keep original key
        **matched,
    })

# Create the result dataframe
result_df = pd.DataFrame(result_data, columns=RESULT_COLUMNS)

# Save to CSV
result_df.to_csv('gadm_fewsnet_mapping.csv', index=False)

has_match = result_df['ADMIN0_matched'].notna()

print("\nMapping saved to 'gadm_fewsnet_mapping.csv'")
print(f"Total GADM entries: {len(gadm_combinations)}")
print(f"Entries with matches: {int(has_match.sum())}")
print(f"Entries without matches: {int((~has_match).sum())}")

# Show some examples
print("\nExample mappings:")
print(result_df.head(10).to_string(index=False))