## Water Conflict Dataset



The Water Conflict dataset [1] was copied into the repository and is stored as `data/Water Conflicts.csv`.
This notebook assigns every event from 1980 onwards to a 5-year period, so that it lines up with the emigration data, and saves only the per-period, per-country counts of events whose conflict type includes "Trigger".

In [1]:
import pandas as pd
import numpy as np
import re

# Country spellings found in the chronology mapped to the spellings wanted
# downstream (presumably those of the emigration data -- TODO confirm).
# NOTE: DataFrame.replace with a plain dict only swaps cells whose *entire*
# value equals a key; it does not touch names embedded in a longer string.
replace_map = {
    "Cote D'Ivoire": "Côte d'Ivoire",
    "Democratic Republic of the Congo": "Democratic Republic of Congo",
    "Scotland": "United Kingdom",
    "Syria": "Syrian Arab Republic",
    "Türkiye": "Turkey",
}

# Load the raw chronology and harmonise the spellings in one go.
wc = pd.read_csv("data/Water Conflicts.csv").replace(replace_map)

In [2]:
#some entries contain several countries
def list_split(x):
    """Split a ', '-separated country entry into a list of unique names.

    Every name has surrounding whitespace and double quotes removed, empty
    fragments are dropped, and duplicates are removed while keeping the
    order of first appearance (so the result is deterministic).

    Parameters
    ----------
    x : str, list, tuple or anything else
        A raw cell value. A list/tuple is treated as an already split entry
        and cleaned again, so re-running the cell does not wipe the column.

    Returns
    -------
    list of str, or ''
        The cleaned names; '' when the value is missing (e.g. NaN) or
        contains no usable name.
    """
    if isinstance(x, str):
        parts = x.split(', ')
    elif isinstance(x, (list, tuple)):
        # Already split on a previous run: clean again instead of discarding.
        parts = x
    else:
        # Missing values (NaN) and other non-text cells carry no country.
        return ''

    # Strip on both sides of *every* part, not only of the first element.
    cleaned = (p.strip().strip('"').strip() for p in parts if isinstance(p, str))
    # dict.fromkeys de-duplicates while preserving insertion order.
    names = list(dict.fromkeys(name for name in cleaned if name))
    return names if names else ''
# Turn every Country cell into a list of individual country names.
wc['Country'] = wc['Country'].apply(list_split)
# wc.replace(replace_map) only matched cells that were exactly one of the
# keys, so names inside multi-country entries (e.g. "Syria, Iraq") are still
# un-normalised here. Map each name individually, then drop duplicates that
# the mapping may create (e.g. "Scotland" and "United Kingdom" in one event)
# while keeping the original order.
wc['Country'] = wc['Country'].apply(
    lambda names: list(dict.fromkeys(replace_map.get(n, n) for n in names))
    if isinstance(names, list) else names
)

In [3]:
def extract_start_year(date_str):
    """Return the earliest year mentioned in a free-text date.

    Both plain four-digit years ('1998', '1998-2001') and decades written as
    '1980s' are recognised anywhere in the string; a decade counts as its
    first year ('1980s' -> 1980). The smallest value found is returned.

    Parameters
    ----------
    date_str : str or anything else
        The raw date text.

    Returns
    -------
    int or float
        The earliest year as an int, or np.nan when `date_str` is not a
        string or contains no four-digit year.
    """
    if not isinstance(date_str, str):
        return np.nan
    # One pattern for '1998' and '1980s': four digits, an optional trailing
    # 's', delimited by word boundaries so longer digit runs are ignored.
    years = re.findall(r'\b(\d{4})s?\b', date_str)
    if not years:
        return np.nan
    return min(map(int, years))

# Derive the first year of every event; rows without a parsable year are dropped.
wc['Start_Year'] = wc['Date'].apply(extract_start_year)
wc.dropna(subset=['Start_Year'], inplace=True)
wc['Start_Year'] = wc['Start_Year'].astype(int)

# Only events starting in or after min_year are kept.
min_year = 1980
wc_filtered = wc[wc['Start_Year'] >= min_year].copy()
max_year = wc_filtered['Start_Year'].max()

# Left-closed 5-year bins [1980, 1985), [1985, 1990), ... reaching past max_year.
bins = list(range(min_year, max_year + 6, 5))
# Each bin is labelled with its right edge, except the last hard-coded one,
# which is '2024' instead of '2025'.
# NOTE(review): presumably '2024' is the last year of the emigration data -- confirm.
labels = ['1985', '1990', '1995', '2000', '2005', '2010', '2015', '2020', '2024']

# pd.cut needs exactly one label per bin.
if len(labels) > len(bins) - 1:
    # Data ends before the last hard-coded period: drop the unused labels.
    labels = labels[:len(bins)-1]
else:
    # Data runs past the hard-coded periods: add a "start-end" label for
    # *every* missing bin (a single append is not enough when the data
    # extends more than one period beyond the list).
    while len(labels) < len(bins) - 1:
        bin_start = bins[len(labels)]
        labels.append(f"{bin_start}-{bin_start+4}")

wc_filtered['Year'] = pd.cut(wc_filtered['Start_Year'], bins=bins, labels=labels, right=False, include_lowest=True)

# One row per (event, country) so that multi-country events count for each country.
wc_exploded = wc_filtered.explode('Country')


In [4]:
# Keep only events whose conflict type mentions 'Trigger'. na=False treats a
# missing conflict type as "not a trigger" instead of producing a NaN mask,
# which boolean indexing would reject.
wc_exploded = wc_exploded[wc_exploded['Conflict Type'].str.contains('Trigger', na=False)]
# Count events per period and country. observed=False keeps every period
# category in the output, so combinations without events appear with a count of 0.
summary_triggers = wc_exploded.groupby(['Year', 'Country'], observed=False).size().reset_index(name='Event_Count')
summary_triggers.to_csv('data/Water Conflict - Triggers.csv',index=False)

[1] Pacific Institute (2024) Water Conflict Chronology. Pacific Institute, Oakland, CA. https://www.worldwater.org/water-conflict/.