In [10]:
import pandas as pd
import numpy as np

In [11]:
# Read the raw security-incidents CSV into the working DataFrame.
raw_csv_path = "../data/raw/security_incidents.csv"
df = pd.read_csv(filepath_or_buffer=raw_csv_path)

In [12]:
# Preview the first five records to sanity-check the load.
df.head(n=5)

Unnamed: 0,Incident ID,Year,Month,Day,Country Code,Country,Region,District,City,UN,...,Attack context,Location,Latitude,Longitude,Motive,Actor type,Actor name,Details,Verified,Source
0,1,1997,1.0,,KH,Cambodia,Banteay Meanchey,,,0,...,Unknown,Unknown,14.070929,103.099916,Unknown,Unknown,Unknown,1 ICRC national staff killed while working in ...,Archived,Archived
1,2,1997,1.0,,RW,Rwanda,Northern,Musanze,Ruhengeri,0,...,Raid,Office/compound,-1.49984,29.63497,Unknown,Unknown,Unknown,"3 INGO international (Spanish) staff killed, 1...",Archived,Archived
2,3,1997,2.0,,TJ,Tajikistan,,,,4,...,Unknown,Unknown,38.628173,70.815654,,Unknown,Unknown,"3 UN national staff, 1 UN international (Niger...",Archived,Archived
3,4,1997,2.0,,SO,Somalia,Lower Juba,Kismayo,Kismayo,0,...,Unknown,Unknown,-0.358216,42.545087,Political,Non-state armed group: Regional,Al-Itihaad al-Islamiya,1 INGO international staff killed by Al ittiha...,Archived,Archived
4,5,1997,2.0,14.0,RW,Rwanda,Kigali,Kigali,Kigali,1,...,Individual attack,Unknown,-1.950851,30.061508,Political,Unknown,Unknown,1 UN national staff shot and killed in Kigali ...,Archived,Archived


In [13]:
# Print the per-column non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4337 entries, 0 to 4336
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Incident ID               4337 non-null   int64  
 1   Year                      4337 non-null   int64  
 2   Month                     4289 non-null   float64
 3   Day                       3957 non-null   float64
 4   Country Code              4304 non-null   object 
 5   Country                   4332 non-null   object 
 6   Region                    3970 non-null   object 
 7   District                  3601 non-null   object 
 8   City                      3349 non-null   object 
 9   UN                        4337 non-null   int64  
 10  INGO                      4337 non-null   int64  
 11  ICRC                      4328 non-null   float64
 12  NRCS and IFRC             4328 non-null   float64
 13  NNGO                      4328 non-null   float64
 14  Other   

In [14]:
# Dimensions of the freshly loaded frame as (rows, columns).
df.shape

(4337, 41)

In [15]:
# Tally the null entries in every column.
missing_counts = df.isna().sum()

# Display only the columns that actually contain nulls, largest count first.
has_missing = missing_counts.gt(0)
missing_counts.loc[has_missing].sort_values(ascending=False)


City             988
District         736
Day              380
Region           367
Month             48
Country Code      33
Latitude          13
Longitude         13
ICRC               9
NRCS and IFRC      9
NNGO               9
Actor name         8
Country            5
Motive             4
Verified           1
dtype: int64

In [16]:
# Handle missing values in one pass and produce an independent, cleaned frame.

# Categorical columns whose gaps are filled with the literal label 'Unknown'.
fill_unknown_cols = ['City', 'District', 'Region', 'Country', 'Country Code', 'Actor name', 'Motive', 'Verified']
# 0/1 organisation-involvement flags; a missing flag is treated as 0.
org_flag_cols = ['ICRC', 'NRCS and IFRC', 'NNGO']

# One placeholder per column; columns not listed here are left untouched.
# NOTE(review): Month=6 and Day=15 are mid-range placeholders, not observed
# values -- downstream date-level analysis cannot tell them apart from real
# dates. Confirm this is intended.
fill_values = {
    **{col: 'Unknown' for col in fill_unknown_cols},
    'Month': 6,
    'Day': 15,
    **{col: 0 for col in org_flag_cols},
}

# Columns that were float only because of their NaNs; cast back to int once filled.
int_cols = ['Month', 'Day', *org_flag_cols]

df = (
    df
    .fillna(value=fill_values)
    .astype({col: int for col in int_cols})
    # Drop rows with missing coordinates.
    .dropna(subset=['Latitude', 'Longitude'])
    # Take an explicit copy so later cells that add or drop columns operate on
    # an independent frame rather than on a row-filtered result (avoids
    # SettingWithCopyWarning on pandas versions without copy-on-write).
    .copy()
)



In [17]:
# Dimensions after the rows without coordinates were dropped.
df.shape

(4324, 41)

In [19]:
# Check the column dtypes after the fills and integer casts in the cleaning cell.
df.dtypes

Incident ID                   int64
Year                          int64
Month                         int64
Day                           int64
Country Code                 object
Country                      object
Region                       object
District                     object
City                         object
UN                            int64
INGO                          int64
ICRC                          int64
NRCS and IFRC                 int64
NNGO                          int64
Other                         int64
Nationals killed              int64
Nationals wounded             int64
Nationals kidnapped           int64
Total nationals               int64
Internationals killed         int64
Internationals wounded        int64
Internationals kidnapped      int64
Total internationals          int64
Total killed                  int64
Total wounded                 int64
Total kidnapped               int64
Total affected                int64
Gender Male                 

In [20]:
# Remove the 'Day' column.
# Rebinding `df` instead of using inplace=True avoids mutating the existing
# frame, and errors='ignore' lets this cell be re-run after the column is
# already gone instead of raising KeyError.
# NOTE(review): 'Day' is imputed in the cleaning cell and then discarded here;
# confirm whether the imputation is still needed.
df = df.drop(columns=['Day'], errors='ignore')


In [21]:
# Dimensions after removing the 'Day' column.
df.shape

(4324, 40)

In [22]:
# Remove the coordinate columns.
# Rebinding `df` instead of using inplace=True avoids mutating the existing
# frame, and errors='ignore' lets this cell be re-run after the columns are
# already gone instead of raising KeyError.
# NOTE(review): the cleaning cell drops every row with a missing coordinate,
# yet the coordinates themselves are discarded here -- confirm that losing
# those rows is intended.
df = df.drop(columns=['Latitude', 'Longitude'], errors='ignore')


In [23]:
# List the distinct 'Means of attack' labels, in order of first appearance.
print(pd.unique(df['Means of attack']))

['Unknown' 'Shooting' 'Kidnapping' 'Kidnap-killing' 'Aerial bombardment'
 'Landmine ' 'Shelling' 'Body-borne IED' 'Bodily assault' 'Roadside IED'
 'Vehicle-born IED' 'Other Explosives' 'Rape/sexual assault '
 'Complex attack']


In [24]:
# List the distinct 'Attack context' labels, in order of first appearance.
print(pd.unique(df['Attack context']))

['Unknown' 'Raid' 'Individual attack' 'Combat/Crossfire' 'Ambush'
 'Detention' 'Mob violence']


In [27]:
# List the distinct 'Motive' labels, in order of first appearance.
print(pd.unique(df['Motive']))

['Unknown' 'Political' 'Incidental' 'Economic' 'Disputed' 'Other']


In [28]:
# List the distinct 'Actor type' labels, in order of first appearance.
# These raw labels are what the actor-group mapping further down keys on.
print(pd.unique(df['Actor type']))

['Unknown' 'Non-state armed group: Regional'
 'Non-state armed group: National' 'Non-state armed group: Unknown'
 'Staff member' 'Non-state armed group: Subnational' 'Unaffiliated'
 'State: unknown' 'Police or paramilitary' 'Host state' 'Aid recipient'
 'Non-state armed group: Global' 'Criminal' 'Foreign or coalition forces'
 'Host State' 'Non-state armed group: Global ']


In [30]:
# List the distinct 'Actor name' values, in order of first appearance.
print(pd.unique(df['Actor name']))

['Unknown' 'Al-Itihaad al-Islamiya'
 'Alliance of Democratic Forces for the Liberation of Congo-Zaire (ADFL)'
 'Not applicable' 'Security guard' 'Clan'
 'Armed Forces Revolutionary Council (AFRC)'
 'Uganda National Rescue Front' 'Taliban' 'Armed tribesmen' 'Interahamwe'
 'Militia' 'National Union for the Total Independence of Angola'
 'Osman Atto Militia' 'Mobile Brigade Corps (Brimod)'
 'Indonesian National Police' 'Sudan Liberation Movement/Army (SLA/AW)'
 'Provincial Organization of Volunteers of Civil Defence of Angola (OPVDCA)'
 'Mai Mai Militia' 'Muse Sudi & Omar Finish militia groups'
 "Lord's Resistance Army (LRA)" 'Al-Qaeda'
 "Sudan People's Liberation Army (SPLA)" 'Israel Defense Forces (IDF)'
 'Ninja militia'
 'Liberians United for Reconciliation and Democracy (LURD)'
 'Al-Ittihad al-Islami' 'Contractor' 'Congolese National Police'
 'Jaysh al-Mahdi' 'Janjaweed' 'Maoist rebel group (name unknown)'
 'Indonesian National Armed Forces' 'Youths'
 'Revolutionary Armed Forces of Co

In [32]:
# Dimensions of the cleaned frame right before it is exported.
df.shape

(4324, 38)

In [33]:
# Write the cleaned frame to disk without the index column.
# NOTE(review): the last cell of the notebook writes to this same path again
# after 'ActorGroup' is added, so this snapshot is overwritten.
df.to_csv(path_or_buf="../data/processed/security_incidents_clean.csv", index=False)

In [34]:
# Collapse the detailed 'Actor type' labels into a handful of broad groups.
# The dict keeps the raw spellings found in the data (including case and
# trailing-space variants) so any other use of `actor_group_map` still works.
actor_group_map = {
    'Non-state armed group: Regional': 'Non-state Armed Group',
    'Non-state armed group: National': 'Non-state Armed Group',
    'Non-state armed group: Unknown': 'Non-state Armed Group',
    'Non-state armed group: Subnational': 'Non-state Armed Group',
    'Non-state armed group: Global': 'Non-state Armed Group',
    'Non-state armed group: Global ': 'Non-state Armed Group',  # trailing space

    'Host state': 'State Actor',
    'Host State': 'State Actor',
    'State: unknown': 'State Actor',
    'Police or paramilitary': 'State Actor',
    'Foreign or coalition forces': 'State Actor',

    'Staff member': 'Individual',
    'Unaffiliated': 'Individual',
    'Aid recipient': 'Individual',

    'Criminal': 'Criminal',
    'Unknown': 'Unknown'
}

# Look values up on a normalised form (trimmed, lower-cased) so that a new
# case or whitespace variant of a known label still resolves to its group
# instead of silently turning into NaN.
actor_group_lookup = {
    raw_label.strip().lower(): group
    for raw_label, group in actor_group_map.items()
}
actor_type_normalised = df['Actor type'].str.strip().str.lower()

# Apply the mapping
df['ActorGroup'] = actor_type_normalised.map(actor_group_lookup)

# Fail loudly if a non-null 'Actor type' has no group, rather than exporting NaN.
unmapped_actor_types = sorted(
    df.loc[df['ActorGroup'].isna(), 'Actor type'].dropna().unique()
)
assert not unmapped_actor_types, (
    f"Unmapped 'Actor type' values: {unmapped_actor_types}"
)


In [35]:
# Persist the final cleaned frame, now including 'ActorGroup', without the index column.
df.to_csv(path_or_buf="../data/processed/security_incidents_clean.csv", index=False)