# Data cleaning

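This notebook performs initial cleaning of the Global Terrorism Database (GTD): it builds an event `date` from the year/month/day columns, keeps events from 1990 onward that have latitude/longitude, retains a subset of columns, and writes the cleaned base CSV to `data/processed/gtd_cleaned_base.csv`. Later sections build analysis panels from that file.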

In [14]:
# Imports and load data (robust paths)
from pathlib import Path
import os
import pandas as pd

def find_repo_root(start=None):
    """Walk upward from ``start`` until a directory that looks like the repo root is found.

    A directory qualifies as soon as it contains any one of ``notebooks``,
    ``.git`` or ``README.md``; the nearest such directory wins.

    Parameters
    ----------
    start : str or pathlib.Path, optional
        Directory to start searching from. Defaults to the current working
        directory, looked up when the function is *called* (not when it is
        defined), so a later ``os.chdir`` is honoured.

    Returns
    -------
    pathlib.Path
        ``start`` itself or its closest ancestor that matches the heuristic.

    Raises
    ------
    FileNotFoundError
        If no directory up to the filesystem root matches.
    """
    markers = ('notebooks', '.git', 'README.md')
    origin = Path.cwd() if start is None else Path(start)
    # Make the path absolute so that a relative start (e.g. Path('.')) can
    # still walk above the current directory.
    origin = origin.resolve()
    # NOTE: any single marker is enough, so a sub-directory that carries its own
    # README.md will be returned before the real repository root is reached.
    for candidate in (origin, *origin.parents):
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    raise FileNotFoundError('Could not find repository root from cwd')

# Anchor every path at the detected repository root so the notebook can be
# started from any directory below it.
ROOT = find_repo_root()
DATA_DIR = ROOT.joinpath('data')
RAW_PATH = DATA_DIR.joinpath('raw', 'globalterrorismdb_0718dist.csv')

if RAW_PATH.exists():
    # The raw export is read as ISO-8859-1 with pandas' Python parser engine.
    gtd = pd.read_csv(RAW_PATH, engine='python', encoding='ISO-8859-1')
else:
    # Stop early with an actionable message instead of a parser traceback.
    raise FileNotFoundError(f'Raw file not found at {RAW_PATH!s}. Check that data/raw contains the CSV or start the kernel from the project root.')

# Quick sanity report: source path, dimensions, column names, dtypes, first rows.
print('loaded from', RAW_PATH)
print('shape:', gtd.shape)
print('columns:', gtd.columns.tolist())
gtd.info()
gtd.head()

loaded from /Users/kanishkraghavendra/Documents/Project/data-minds-causal-analysis/data/raw/globalterrorismdb_0718dist.csv
shape: (181691, 135)
columns: ['eventid', 'iyear', 'imonth', 'iday', 'approxdate', 'extended', 'resolution', 'country', 'country_txt', 'region', 'region_txt', 'provstate', 'city', 'latitude', 'longitude', 'specificity', 'vicinity', 'location', 'summary', 'crit1', 'crit2', 'crit3', 'doubtterr', 'alternative', 'alternative_txt', 'multiple', 'success', 'suicide', 'attacktype1', 'attacktype1_txt', 'attacktype2', 'attacktype2_txt', 'attacktype3', 'attacktype3_txt', 'targtype1', 'targtype1_txt', 'targsubtype1', 'targsubtype1_txt', 'corp1', 'target1', 'natlty1', 'natlty1_txt', 'targtype2', 'targtype2_txt', 'targsubtype2', 'targsubtype2_txt', 'corp2', 'target2', 'natlty2', 'natlty2_txt', 'targtype3', 'targtype3_txt', 'targsubtype3', 'targsubtype3_txt', 'corp3', 'target3', 'natlty3', 'natlty3_txt', 'gname', 'gsubname', 'gname2', 'gsubname2', 'gname3', 'gsubname3', 'motive',

Unnamed: 0,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,...,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
0,197000000001,1970,7,2,,0,,58,Dominican Republic,2,...,,,,,PGIS,0,0,0,0,
1,197000000002,1970,0,0,,0,,130,Mexico,1,...,,,,,PGIS,0,1,1,1,
2,197001000001,1970,1,0,,0,,160,Philippines,5,...,,,,,PGIS,-9,-9,1,1,
3,197001000002,1970,1,0,,0,,78,Greece,8,...,,,,,PGIS,-9,-9,1,1,
4,197001000003,1970,1,0,,0,,101,Japan,4,...,,,,,PGIS,-9,-9,1,1,


In [15]:
# Convert iyear / imonth / iday -> a single datetime `date` column, then apply basic cleaning.
DATE_PARTS = {'iyear': 'year', 'imonth': 'month', 'iday': 'day'}

# This cell rebinds `gtd` to the cleaned frame, so running it a second time would
# no longer see the raw date columns. Fail with an instruction instead of a bare KeyError.
missing_parts = [col for col in DATE_PARTS if col not in gtd.columns]
if missing_parts:
    raise KeyError(
        f'Raw columns {missing_parts} not found in gtd. '
        'Re-run the load cell before re-running this cell.'
    )

# Keep events from 1990 onward. Filtering first avoids building dates for rows
# that are discarded anyway.
gtd_recent = gtd.loc[gtd['iyear'] >= 1990]

# imonth / iday contain 0 where there is no usable calendar value (presumably GTD's
# "unknown" code -- TODO confirm against the codebook). Those are pinned to 1 so a
# date can still be built. The replacement is done on a copy, not on the raw frame.
# NOTE: events with an unknown month therefore land in January and events with an
# unknown day land on the 1st, which matters for any monthly aggregation.
date_parts = (
    gtd_recent[list(DATE_PARTS)]
    .rename(columns=DATE_PARTS)
    .replace({'month': {0: 1}, 'day': {0: 1}})
)

cols = [
    'eventid', 'date', 'country_txt', 'region_txt', 'city',
    'latitude', 'longitude', 'attacktype1_txt', 'targtype1_txt',
    'weaptype1_txt', 'nkill', 'nwound', 'gname', 'summary'
]

# pd.to_datetime assembles dates directly from year/month/day columns (vectorised);
# impossible combinations become NaT because of errors='coerce'.
gtd = (
    gtd_recent
    .assign(date=pd.to_datetime(date_parts, errors='coerce'))
    .loc[:, cols]
    .dropna(subset=['latitude', 'longitude'])  # events without coordinates are dropped
)

gtd.shape

(138255, 14)

In [16]:
# Ensure the output directory exists (create it on a fresh checkout) and save
# the cleaned CSV to data/processed.
out_dir = ROOT / 'data' / 'processed'
# parents/exist_ok make this safe to re-run and let "Restart & Run All" work
# without a manual `mkdir -p`.
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / 'gtd_cleaned_base.csv'
gtd.to_csv(out_path, index=False)
print('Saved cleaned data to', out_path)

Saved cleaned data to /Users/kanishkraghavendra/Documents/Project/data-minds-causal-analysis/data/processed/gtd_cleaned_base.csv


## Outcome 1 — Effectiveness: prepare monthly panel

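Create a monthly panel of attack counts for `airports` (treated) and `govt` (control). Months in which a group has no recorded attacks are filled with zero so that the panel is balanced. Add the difference-in-differences indicators — `treated` (1 for `airports`), `post` (1 for months from September 2001 onward) and their interaction `treated_post` — and save the result to `data/processed/panel_effectiveness.csv`.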

In [17]:
# Prepare monthly panel for Outcome 1 (Effectiveness) using the cleaned file on disk
import os
import numpy as np

# Read cleaned data from data/processed (produced earlier in this notebook)
cleaned_path = ROOT / 'data' / 'processed' / 'gtd_cleaned_base.csv'
print('Loading cleaned data from', cleaned_path)
if not cleaned_path.exists():
    raise FileNotFoundError(f'Cleaned file not found at {cleaned_path!s}. Run the cleaning cells first to create it.')
gtd_clean = pd.read_csv(cleaned_path, parse_dates=['date'])

# Design parameters of the difference-in-differences setup.
TREATED_GROUP = 'airports'
TREATMENT_START = pd.Timestamp('2001-09-01')
# Offset object instead of the string alias 'M', which newer pandas versions
# flag as deprecated; the object form is accepted by old and new versions alike.
MONTH_END = pd.offsets.MonthEnd()

# Map target types to treatment/control groups.
# NOTE(review): 'Government Building/Facility/Diplomatic' does not appear to be a
# targtype1_txt label in the GTD codebook, which lists 'Government (General)' and
# 'Government (Diplomatic)'. With the original key alone the control group would
# match no events and be entirely zero-filled. The original key is kept in case a
# differently labelled export is used -- confirm the intended control definition.
mapping = {
    'Airports & Aircraft': 'airports',
    'Government Building/Facility/Diplomatic': 'govt',
    'Government (General)': 'govt',
    'Government (Diplomatic)': 'govt',
}

# Filter to only the mapped target types
mask = gtd_clean['targtype1_txt'].isin(mapping.keys())
panel_df = gtd_clean.loc[mask].copy()
panel_df['group_name'] = panel_df['targtype1_txt'].map(mapping)

# Ensure date is datetime and set as index for resampling; rows whose date could
# not be built are dropped because they cannot be assigned to a month.
panel_df['date'] = pd.to_datetime(panel_df['date'], errors='coerce')
panel_df = panel_df.dropna(subset=['date'])
panel_df = panel_df.set_index('date')

# Group names in first-seen order, derived from the mapping so the two cannot drift apart.
groups = list(dict.fromkeys(mapping.values()))

# A group without any events would silently become an all-zero series after the
# reindex below, so stop here instead of writing a misleading panel.
empty_groups = [g for g in groups if not (panel_df['group_name'] == g).any()]
if empty_groups:
    available = sorted(gtd_clean['targtype1_txt'].dropna().unique())
    raise ValueError(
        f'No events matched for group(s) {empty_groups}. '
        f'Check the keys of `mapping` against the available targtype1_txt values: {available}'
    )

# Aggregate monthly attack counts by group (count eventid); rows are labelled
# with the month-end date.
monthly = (
    panel_df
    .groupby('group_name')
    .resample(MONTH_END)['eventid']
    .count()
    .reset_index(name='attack_count')
)

# Build a balanced panel (all groups x full monthly range); months without any
# recorded event for a group get attack_count = 0.
months = pd.date_range(start=monthly['date'].min(), end=monthly['date'].max(), freq=MONTH_END)
idx = pd.MultiIndex.from_product([groups, months], names=['group_name', 'date'])
panel = monthly.set_index(['group_name', 'date']).reindex(idx, fill_value=0).reset_index()

# Treatment indicators for DiD. Because rows carry month-end dates, the whole of
# September 2001 (labelled 2001-09-30) counts as post-treatment.
panel['post'] = (panel['date'] >= TREATMENT_START).astype(int)
panel['treated'] = (panel['group_name'] == TREATED_GROUP).astype(int)
panel['treated_post'] = panel['post'] * panel['treated']

# Reorder columns
panel = panel[['group_name', 'date', 'attack_count', 'treated', 'post', 'treated_post']]

# Ensure output directory exists (create it if needed) and save
out_dir = ROOT / 'data' / 'processed'
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / 'panel_effectiveness.csv'
panel.to_csv(out_path, index=False)
print('Saved panel to', out_path)
panel.head()

Loading cleaned data from /Users/kanishkraghavendra/Documents/Project/data-minds-causal-analysis/data/processed/gtd_cleaned_base.csv
Saved panel to /Users/kanishkraghavendra/Documents/Project/data-minds-causal-analysis/data/processed/panel_effectiveness.csv
Saved panel to /Users/kanishkraghavendra/Documents/Project/data-minds-causal-analysis/data/processed/panel_effectiveness.csv


  monthly = panel_df.groupby('group_name').resample('M')['eventid'].count().reset_index(name='attack_count')
  months = pd.date_range(start=monthly['date'].min(), end=monthly['date'].max(), freq='M')


Unnamed: 0,group_name,date,attack_count,treated,post,treated_post
0,airports,1990-01-31,4,1,0,0
1,airports,1990-02-28,2,1,0,0
2,airports,1990-03-31,1,1,0,0
3,airports,1990-04-30,2,1,0,0
4,airports,1990-05-31,2,1,0,0
