In [1]:
import os
import math
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.prepared import prep
from shapely.geometry import Point
from concurrent.futures import ThreadPoolExecutor

from wsi.mapping.iso_name import ISO_NAME
from wsi.mapping.iso_gw import ISO_GW
from wsi.mapping.iso_iso2 import ISO_ISO2
from wsi.utils import raw_data_path, processed_data_path

# Constants
# Earth radius in kilometres; haversine_distance_vector multiplies the angular
# distance by this value to obtain a distance in km.
EARTH_RADIUS_KM = 6371
# Filename template for the population-count ASCII grid tiles. The `{tile}`
# placeholder is filled with the tile number when the tiles are loaded.
FILE_PATTERN = "gpw_v4_population_count_adjusted_to_2015_unwpp_country_totals_rev11_2020_30_sec_{tile}.asc"


In [2]:
import logging

# Route INFO-and-above records to a log file next to the processed outputs.
# The file is opened in append mode, so records from successive runs accumulate.
log_file = processed_data_path("shocks", "proximity_conflict", 'conflict_logs.log')
logging.basicConfig(
    filename=log_file,
    filemode='a',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Notebook-wide logger used by the processing functions below.
logger = logging.getLogger(__name__)


In [3]:

def read_population_count(file_path):
    """Read one ASCII grid tile (``.asc``) of population counts.

    The header is parsed key by key instead of assuming exactly six lines, so
    files that omit the optional ``NODATA_value`` line or that give the origin
    as ``xllcenter``/``yllcenter`` are read correctly. The file is opened once
    and the cell values are read from the same handle.

    Parameters
    ----------
    file_path : str or path-like
        Location of the ``.asc`` file.

    Returns
    -------
    dict or None
        ``None`` (after printing a message) when the file does not exist.
        Otherwise a dict with keys:

        - ``"file"``: the path that was read,
        - ``"data"``: 2-D float array of cell values (row 0 is the top row),
        - ``"geotransform"``: ``(x_left, cellsize, 0, y_top, 0, -cellsize)``,
        - ``"no_data_value"``: the header's nodata value (``-9999.0`` when the
          header does not declare one).

    Raises
    ------
    KeyError
        If a mandatory header entry (``nrows``, ``cellsize`` or an origin) is
        missing.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    header_keys = {
        'ncols', 'nrows', 'xllcorner', 'yllcorner',
        'xllcenter', 'yllcenter', 'cellsize', 'nodata_value',
    }
    metadata = {}
    with open(file_path, 'r') as f:
        while True:
            # Remember where the line starts so the first data row can be
            # handed back to np.loadtxt untouched.
            line_start = f.tell()
            tokens = f.readline().split()
            if len(tokens) != 2 or tokens[0].lower() not in header_keys:
                f.seek(line_start)
                break
            metadata[tokens[0].lower()] = float(tokens[1])
        # ndmin=2 keeps a single-row or single-column grid two-dimensional.
        data = np.loadtxt(f, ndmin=2)

    cellsize = metadata['cellsize']
    # A centre-registered origin is shifted by half a cell to the cell corner.
    if 'xllcorner' in metadata:
        x_left = metadata['xllcorner']
    else:
        x_left = metadata['xllcenter'] - cellsize / 2
    if 'yllcorner' in metadata:
        y_bottom = metadata['yllcorner']
    else:
        y_bottom = metadata['yllcenter'] - cellsize / 2

    gt = (
        x_left,
        cellsize,
        0,
        # The header gives the lower edge; the geotransform wants the top edge.
        y_bottom + metadata['nrows'] * cellsize,
        0,
        -cellsize
    )
    return {
        "file": file_path,
        "data": data,
        "geotransform": gt,
        "no_data_value": metadata.get('nodata_value', -9999.0)
    }


def prepare_pixel_grid(geotransform, shape):
    """Build per-pixel latitude and longitude arrays for a raster.

    Parameters
    ----------
    geotransform : sequence of 6 numbers
        ``(origin_x, pixel_width, _, origin_y, _, pixel_height)``; the two
        rotation terms are ignored.
    shape : tuple
        ``(rows, cols)`` of the raster.

    Returns
    -------
    (lat_grid, lon_grid)
        Two arrays of shape ``(rows, cols)`` produced by ``np.broadcast_to``.
        Each pixel's coordinate is ``origin + index * pixel_size``, i.e. the
        position given by the geotransform origin with no half-pixel offset.
    """
    n_rows, n_cols = shape
    x0, dx = geotransform[0], geotransform[1]
    y0, dy = geotransform[3], geotransform[5]

    # One column of latitudes and one row of longitudes, then broadcast both
    # to the full raster shape.
    lat_column = (y0 + np.arange(n_rows) * dy).reshape(n_rows, 1)
    lon_row = (x0 + np.arange(n_cols) * dx).reshape(1, n_cols)

    full_shape = (n_rows, n_cols)
    return np.broadcast_to(lat_column, full_shape), np.broadcast_to(lon_row, full_shape)

def haversine_distance_vector(lat1, lon1, lat2, lon2):
    """Great-circle distance in km from one point to an array of points.

    Parameters
    ----------
    lat1, lon1 : float
        Reference point in decimal degrees (scalars).
    lat2, lon2 : array-like
        Target coordinates in decimal degrees; any mutually broadcastable
        shapes.

    Returns
    -------
    numpy.ndarray
        Haversine distances in kilometres, scaled by ``EARTH_RADIUS_KM``, with
        the broadcast shape of ``lat2``/``lon2``.
    """
    lat1, lon1 = math.radians(lat1), math.radians(lon1)
    lat2, lon2 = np.radians(lat2), np.radians(lon2)
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) return NaN; clamp it first.
    a = np.clip(a, 0.0, 1.0)
    return EARTH_RADIUS_KM * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

def get_population_in_conflict_area(all_data, conflict_coords, radius_km=50):
    """Sum the population living within ``radius_km`` of any conflict event.

    For every raster tile, the pixels within ``radius_km`` (haversine
    distance) of at least one event are flagged; the flags are OR-ed together
    so a pixel near several events is counted once per tile.

    Only a small window of each tile is examined per event: the window is
    found on the 1-D latitude/longitude axes and the distance test runs on
    that sub-array alone, instead of building full-tile masks for every event.

    Parameters
    ----------
    all_data : iterable of dict
        Tiles as returned by ``read_population_count`` (keys ``"data"``,
        ``"geotransform"``, ``"no_data_value"``).
    conflict_coords : iterable of (lat, lon)
        Event locations in decimal degrees.
    radius_km : float, default 50
        Search radius around each event.

    Returns
    -------
    (total_population, union_grid_points)
        ``total_population`` is the sum of the selected cell values over all
        tiles; ``union_grid_points`` is a list of ``[lat, lon, population]``
        for every selected cell, tile by tile in row-major order.

    Notes
    -----
    The pre-selection window uses 111 km per degree and the event's own
    latitude for the longitude span; longitudes are not wrapped at the
    antimeridian.
    """
    total_population = 0
    union_grid_points = []
    radius_deg_lat = radius_km / 111.0

    for dataset in all_data:
        data = dataset["data"]
        gt = dataset["geotransform"]
        nodata = dataset["no_data_value"]
        if data.size == 0:
            # Nothing to select, and the axis extraction below needs >= 1 cell.
            continue

        lat_grid, lon_grid = prepare_pixel_grid(gt, data.shape)
        # Latitude varies only along rows and longitude only along columns,
        # so one column / one row fully describes each axis.
        lat_axis = lat_grid[:, 0]
        lon_axis = lon_grid[0, :]
        mask = np.zeros(data.shape, dtype=bool)

        for lat, lon in conflict_coords:
            radius_deg_lon = radius_km / (111.0 * math.cos(math.radians(lat)))
            row_idx = np.flatnonzero(
                (lat_axis >= lat - radius_deg_lat) & (lat_axis <= lat + radius_deg_lat)
            )
            if row_idx.size == 0:
                continue
            col_idx = np.flatnonzero(
                (lon_axis >= lon - radius_deg_lon) & (lon_axis <= lon + radius_deg_lon)
            )
            if col_idx.size == 0:
                continue

            # Both axes are linear in the pixel index, so the matching
            # indices form one contiguous run and a slice covers them.
            rows = slice(row_idx[0], row_idx[-1] + 1)
            cols = slice(col_idx[0], col_idx[-1] + 1)
            dists = haversine_distance_vector(lat, lon, lat_grid[rows, cols], lon_grid[rows, cols])
            mask[rows, cols] |= dists <= radius_km

        valid_mask = mask & (data != nodata)
        total_population += data[valid_mask].sum()
        if np.any(valid_mask):
            union_grid_points += np.column_stack(
                (lat_grid[valid_mask], lon_grid[valid_mask], data[valid_mask])
            ).tolist()

    return total_population, union_grid_points

def clip_grid_points_to_country(grid_points, country_polygon):
    """Keep only the grid points that lie inside a country polygon.

    Parameters
    ----------
    grid_points : iterable
        Items indexed as ``[lat, lon, ...]``.
    country_polygon : geometry
        Geometry handed to ``prep`` for repeated containment tests.

    Returns
    -------
    list
        The original items, in order, for which the prepared polygon
        ``contains`` the point. Points are built as ``Point(lon, lat)``,
        i.e. x is longitude and y is latitude.
    """
    is_inside = prep(country_polygon).contains
    kept = []
    for point in grid_points:
        lat, lon = point[0], point[1]
        if is_inside(Point(lon, lat)):
            kept.append(point)
    return kept

def filter_conflicts(df, country_code, year):
    """Select the events of one country in one year.

    The country code is matched as a whole number inside ``country_id``
    rather than as a substring, so code ``"70"`` no longer also selects
    ``700`` or ``770``. Values holding several codes (e.g. ``"70, 700"``)
    and float-formatted ids (e.g. ``"70.0"``) still match on each whole code.

    Parameters
    ----------
    df : pandas.DataFrame
        Event table with ``year`` and ``country_id`` columns.
    country_code : str or int
        Country code to select.
    year : int
        Year to select.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df`` (original index preserved).
    """
    import re

    # Digit boundaries on both sides; the look-behind also rejects a
    # preceding "." so the decimal part of "700.0" is never matched.
    code_pattern = rf"(?<![\d.]){re.escape(str(country_code))}(?!\d)"
    country_ids = df['country_id'].astype(str)
    return df[(df['year'] == year) & country_ids.str.contains(code_pattern, regex=True)]

def get_conflict_coordinates(df):
    """Return the event locations of ``df`` as a list of ``[lat, lon]`` pairs.

    Rows where either ``latitude`` or ``longitude`` is missing are dropped.
    """
    located = df.loc[:, ['latitude', 'longitude']].dropna()
    return located.to_numpy().tolist()

def process_country_code(country_code, years, countries, event_csv, df_pop, all_data):
    """Compute yearly population-in-conflict figures for one country.

    For every year, the country's conflict events are selected, the grid
    cells near them are collected, clipped to the country polygon and summed.
    The result is compared with the national population for that year.

    Two CSV files are written under ``shocks/proximity_conflict``:
    ``conflict_summary_<ISO3>.csv`` (one row per year) and
    ``heatmap_grid_<ISO3>.csv`` (one row per selected grid cell and year).
    Both are written with explicit column headers, so they stay readable
    even when no rows were produced.

    Parameters
    ----------
    country_code : str or int
        Country code as used in ``ISO_GW`` values and in the events'
        ``country_id``; compared as a string.
    years : iterable of int
        Years to process.
    countries : GeoDataFrame
        Country geometries with an ``ISO`` column holding ISO2 codes.
    event_csv : pandas.DataFrame
        Event table (``year``, ``country_id``, ``latitude``, ``longitude``).
    df_pop : pandas.DataFrame
        National population with ``ISO_code``, ``Year``, ``Population``.
    all_data : list of dict
        Population raster tiles.

    Returns
    -------
    str or None
        The ISO3 code when the files were written; ``None`` (with a logged
        warning) when the code cannot be mapped to ISO3/ISO2 or no geometry
        is available.
    """
    # Callers may hand over ints; every comparison below is on strings.
    country_code = str(country_code)
    summary_rows = []
    heatmap_points = []

    iso3 = next((iso for iso, code in ISO_GW.items() if str(code) == country_code), None)
    if not iso3:
        logger.warning(f"ISO3 code not found for country_code: {country_code}")
        return None

    try:
        iso2 = ISO_ISO2[iso3]
    except KeyError:
        # Treated like the other "cannot map this country" cases instead of
        # raising inside the worker thread.
        logger.warning(f"ISO2 code not found for ISO3: {iso3}")
        return None

    country_gdf = countries[countries['ISO'] == iso2]
    if country_gdf.empty:
        logger.warning(f"Country geometry not found for ISO3: {iso3}/ ISO2: {iso2})")
        return None

    polygon = country_gdf.geometry.iloc[0]

    for yr in years:
        conflict_df = filter_conflicts(event_csv, country_code, yr)
        coords = get_conflict_coordinates(conflict_df)

        pop_in_conflict = 0
        if coords:
            # The unclipped total is not used: cells outside the country are
            # removed first, then the population is summed.
            _, union_grid_points = get_population_in_conflict_area(all_data, coords)
            union_grid_points = clip_grid_points_to_country(union_grid_points, polygon)
            pop_in_conflict = sum(pt[2] for pt in union_grid_points)
            # store grid with year tag
            heatmap_points.extend(
                {
                    'year': yr,
                    'latitude': pt[0],
                    'longitude': pt[1],
                    'population': pt[2]
                }
                for pt in union_grid_points
            )

        pop_rows = df_pop[(df_pop['ISO_code'] == iso3) & (df_pop['Year'] == yr)]['Population']
        national_pop = pop_rows.iloc[0] if not pop_rows.empty else None
        if national_pop is not None and national_pop > 0:
            pct = (pop_in_conflict / national_pop) * 100
        else:
            pct = None

        summary_rows.append({
            'gw_code': country_code,
            'iso3': iso3,
            'year': yr,
            'pop_in_conflict': pop_in_conflict,
            'national_pop': national_pop,
            'percent': pct
        })

    # Save individual files
    summary_columns = ['gw_code', 'iso3', 'year', 'pop_in_conflict', 'national_pop', 'percent']
    heatmap_columns = ['year', 'latitude', 'longitude', 'population']
    pd.DataFrame(summary_rows, columns=summary_columns).to_csv(
        processed_data_path("shocks", "proximity_conflict", f"conflict_summary_{iso3}.csv"), index=False
    )
    pd.DataFrame(heatmap_points, columns=heatmap_columns).to_csv(
        processed_data_path("shocks", "proximity_conflict", f"heatmap_grid_{iso3}.csv"), index=False
    )

    return iso3

In [4]:
# Load shared data (outside parallel scope)

## POPULATION DENSITY
# Tiles 1..8; tiles whose file is missing come back as None and are skipped.
tile_paths = [
    raw_data_path("shocks", "gpw-v4", FILE_PATTERN.format(tile=tile))
    for tile in range(1, 9)
]
all_data = [tile_data for tile_data in map(read_population_count, tile_paths) if tile_data]

## SHAPEFILE
# TODO: add a secondary shapefile dataset for countries not available in the first
countries = gpd.read_file(
    raw_data_path("shocks", "country_shapefiles", "World_Countries_Generalized.shp")
).to_crs("EPSG:4326")

## CONFLICT EVENTS
UcdpPrioConflict_csv = pd.read_csv(raw_data_path("shocks", "UcdpPrioConflict_v25_1.csv"))
event_csv = pd.read_csv(raw_data_path("shocks", "GEDEvent_v25_1.csv"))

# Keep only events whose conflict appears in the UCDP/PRIO conflict list.
known_conflict_ids = UcdpPrioConflict_csv['conflict_id'].unique()
event_csv = event_csv[event_csv['conflict_new_id'].isin(known_conflict_ids)]

# Filter events: at least one fatality per event, and more than one death in
# total per dyad, country and year (i.e. exclude small conflicts).
event_csv = event_csv[event_csv['best'] > 0]

# Total deaths per dyad-country-year
group_keys = ['dyad_new_id', 'country_id', 'year']
death_sums = (
    event_csv.groupby(group_keys)['best']
    .sum()
    .reset_index(name='group_best_sum')
)

# Keep only groups where total deaths > 1
valid_groups = death_sums[death_sums['group_best_sum'] > 1]

# Inner-merge on the group keys to keep only events of the retained groups
event_csv = event_csv.merge(valid_groups[group_keys], on=group_keys, how='inner')


  event_csv = pd.read_csv(raw_data_path("shocks", "GEDEvent_v25_1.csv"))


In [5]:
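# NOTE: legacy loader kept for reference only. `df_pop` is now built with
# build_indicator_df("total_population", "total_population_medium") further
# below, which reads the same workbook from raw_data_path("indicators", ...).
# df_pop = pd.read_excel(
#     raw_data_path("shocks", 'WPP2024_GEN_F01_DEMOGRAPHIC_INDICATORS_COMPACT.xlsx'),
#     sheet_name="Estimates", skiprows=16
# )[['ISO3 Alpha-code', 'Year', 'Total Population, as of 1 January (thousands)']]
# df_pop.columns = ['ISO_code', 'Year', 'Population']
# df_pop.dropna(inplace=True)
# df_pop['Year'] = df_pop['Year'].astype(int)
# df_pop['Population'] *= 1000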

In [6]:
## POPULATION

# TODO: this can go into utils and combine/replace son bias, same data source
# Every entry reads the same workbook; entries differ only in the indicator
# column and in the sheet (historical estimates vs. medium-variant projection).
_WPP_WORKBOOK = "WPP2024_GEN_F01_DEMOGRAPHIC_INDICATORS_COMPACT.xlsx"

# config-key prefix -> (source column in the workbook, output column name)
_WPP_INDICATORS = {
    "son_bias": ("Sex Ratio at Birth (males per 100 female births)", "Son Bias"),
    "total_population": ("Total Population, as of 1 January (thousands)", "Population"),
}

# config-key suffix -> sheet name
_WPP_SHEETS = {
    "": "Estimates",
    "_medium": "Medium variant",
}

CONFIG = {
    f"{prefix}{suffix}": {
        "file": _WPP_WORKBOOK,
        "sheet": sheet_name,
        "indicator_col": source_col,
        "output_col": target_col,
    }
    for prefix, (source_col, target_col) in _WPP_INDICATORS.items()
    for suffix, sheet_name in _WPP_SHEETS.items()
}


def load_raw(name: str) -> pd.DataFrame:
    """Read the raw Excel sheet configured under ``CONFIG[name]``.

    The workbook is looked up in the ``indicators`` raw-data folder and the
    first 16 rows of the sheet are skipped.

    Raises
    ------
    KeyError
        If ``name`` is not a key of ``CONFIG``.
    """
    entry = CONFIG[name]
    workbook = raw_data_path("indicators", entry["file"])
    return pd.read_excel(workbook, sheet_name=entry["sheet"], skiprows=16)


def process_indicator_raw(
    df: pd.DataFrame, config_key: str, iso_codes: list[str] | None = None
) -> pd.DataFrame:
    cfg = CONFIG[config_key]
    indicator_col = cfg["indicator_col"]
    output_col = cfg["output_col"]

    cols = ["ISO3 Alpha-code", "Year", indicator_col]
    df = df[cols].dropna(subset=cols)

    df = df.rename(
        columns={
            "ISO3 Alpha-code": "ISO_code",
            indicator_col: output_col,
        }
    )
    df["Year"] = df["Year"].astype(int)
    df[output_col] = pd.to_numeric(df[output_col], errors="coerce")

    if iso_codes is not None:
        df = df[df["ISO_code"].isin(iso_codes)]

    return df.reset_index(drop=True)


def build_indicator_df(
    base_key: str, projection_key: str, projection_years=[2024, 2025], iso_codes: list[str] | None = None
) -> pd.DataFrame:
    df_hist = process_indicator_raw(load_raw(base_key), base_key, iso_codes)
    df_proj = (
        process_indicator_raw(load_raw(projection_key), projection_key, iso_codes)
        .query("Year in @projection_years")
    )

    output_col = CONFIG[base_key]["output_col"]

    combined = pd.concat(
        [df_hist[["ISO_code", "Year", output_col]], df_proj[["ISO_code", "Year", output_col]]],
        ignore_index=True,
    )

    return combined


In [7]:
# For Total Population
# The source column is expressed in thousands; drop incomplete rows and
# rescale to persons.
df_pop = build_indicator_df("total_population", "total_population_medium").dropna()
df_pop = df_pop.assign(
    Year=df_pop['Year'].astype(int),
    Population=df_pop['Population'] * 1000,
)

# Save all lat/long of relevant events
# Invert ISO_GW: {GW_code → ISO3}
GW_ISO = {str(gw_code): iso3_code for iso3_code, gw_code in ISO_GW.items()}
event_csv['ISO3'] = event_csv['country_id'].astype(str).map(GW_ISO)

# NOTE(review): the ISO3 column added above is not part of the exported
# columns — confirm whether it should be written as well.
export_columns = ['year', 'country_id', 'conflict_name', 'dyad_name', 'best', 'latitude', 'longitude']
all_events = event_csv[export_columns].copy()
all_events.to_csv(
    processed_data_path("shocks", "proximity_conflict", "event_level_coords.csv"),
    index=False,
)

In [None]:
# Parallel execution
from concurrent.futures import ThreadPoolExecutor, as_completed

years = list(range(1995, 2025))

# Process every country code known to the GW -> ISO3 mapping.
# For a quick test run, restrict this to a subset, e.g.
# ["811", "840", "850", "900"] (Cambodia, Philippines, Indonesia, Australia).
valid_gw_codes = list(GW_ISO.keys())

completed = []
failed = []

with ThreadPoolExecutor() as executor:
    futures = {
        executor.submit(process_country_code, code, years, countries, event_csv, df_pop, all_data): code
        for code in valid_gw_codes
    }
    for future in as_completed(futures):
        code = futures[future]
        try:
            result = future.result()
        except Exception:
            # One failing country must not abort the remaining ones; the
            # traceback goes to the log and the code is reported below.
            logger.exception(f"Processing failed for country_code: {code}")
            failed.append(code)
            continue
        if result:
            completed.append(result)
            print(f"✅ Saved results for {result}")

if failed:
    print(f"⚠️ Processing failed for country codes (see log): {sorted(failed)}")

✅ Saved results for AUS
