# UNSC List Cleanup

## Import packages

In [4]:
import pandas as pd
from lxml import etree
import re

## UNSC Individual

In [29]:
df = pd.read_xml("consolidatedLegacyByPRN.xml",xpath=".//INDIVIDUAL")

In [30]:
# Flatten nested columns (if any)
if "NATIONALITY" in df.columns:
    df["NATIONALITY"] = df["NATIONALITY"].apply(lambda x: x.get("VALUE") if isinstance(x, dict) else None)
if "LIST_TYPE" in df.columns:
    df["LIST_TYPE"] = df["LIST_TYPE"].apply(lambda x: x.get("VALUE") if isinstance(x, dict) else None)
if "LAST_DAY_UPDATED" in df.columns:
    df["LAST_DAY_UPDATED"] = df["LAST_DAY_UPDATED"].apply(lambda x: x.get("VALUE") if isinstance(x, dict) else None)

# Nested ADDRESS, DOB, POB columns
if "INDIVIDUAL_ADDRESS" in df.columns:
    df["ADDRESS_COUNTRY"] = df["INDIVIDUAL_ADDRESS"].apply(lambda x: x.get("COUNTRY") if isinstance(x, dict) else None)
    df["ADDRESS_NOTE"] = df["INDIVIDUAL_ADDRESS"].apply(lambda x: x.get("NOTE") if isinstance(x, dict) else None)
    df.drop(columns=["INDIVIDUAL_ADDRESS"], inplace=True)

if "INDIVIDUAL_DATE_OF_BIRTH" in df.columns:
    df["DOB_TYPE"] = df["INDIVIDUAL_DATE_OF_BIRTH"].apply(lambda x: x.get("TYPE_OF_DATE") if isinstance(x, dict) else None)
    df["DOB_YEAR"] = df["INDIVIDUAL_DATE_OF_BIRTH"].apply(lambda x: x.get("YEAR") if isinstance(x, dict) else None)
    df.drop(columns=["INDIVIDUAL_DATE_OF_BIRTH"], inplace=True)

if "INDIVIDUAL_PLACE_OF_BIRTH" in df.columns:
    df["POB_COUNTRY"] = df["INDIVIDUAL_PLACE_OF_BIRTH"].apply(lambda x: x.get("COUNTRY") if isinstance(x, dict) else None)
    df.drop(columns=["INDIVIDUAL_PLACE_OF_BIRTH"], inplace=True)

In [31]:
df.columns

Index(['DATAID', 'VERSIONNUM', 'FIRST_NAME', 'SECOND_NAME', 'UN_LIST_TYPE',
       'REFERENCE_NUMBER', 'LISTED_ON', 'GENDER', 'COMMENTS1', 'NATIONALITY',
       'LIST_TYPE', 'LAST_DAY_UPDATED', 'INDIVIDUAL_ALIAS',
       'INDIVIDUAL_DOCUMENT', 'SORT_KEY', 'SORT_KEY_LAST_MOD', 'DESIGNATION',
       'TITLE', 'THIRD_NAME', 'FOURTH_NAME', 'NAME_ORIGINAL_SCRIPT',
       'ADDRESS_COUNTRY', 'ADDRESS_NOTE', 'DOB_TYPE', 'DOB_YEAR',
       'POB_COUNTRY'],
      dtype='object')

In [28]:
df.count()

DATAID                  684
VERSIONNUM              684
FIRST_NAME              684
SECOND_NAME             675
UN_LIST_TYPE            684
REFERENCE_NUMBER        684
LISTED_ON               684
GENDER                   54
COMMENTS1               603
NATIONALITY               0
LIST_TYPE                 0
LAST_DAY_UPDATED          0
INDIVIDUAL_ALIAS        684
INDIVIDUAL_DOCUMENT     262
SORT_KEY                  0
SORT_KEY_LAST_MOD         0
DESIGNATION             286
TITLE                   158
THIRD_NAME              337
FOURTH_NAME             164
NAME_ORIGINAL_SCRIPT    338
ADDRESS_COUNTRY           0
ADDRESS_NOTE              0
DOB_TYPE                  0
DOB_YEAR                  0
POB_COUNTRY               0
dtype: int64

In [17]:
df.columns

Index(['DATAID', 'VERSIONNUM', 'FIRST_NAME', 'SECOND_NAME', 'UN_LIST_TYPE',
       'REFERENCE_NUMBER', 'LISTED_ON', 'GENDER', 'COMMENTS1', 'NATIONALITY',
       'LIST_TYPE', 'LAST_DAY_UPDATED', 'INDIVIDUAL_ALIAS',
       'INDIVIDUAL_DOCUMENT', 'SORT_KEY', 'SORT_KEY_LAST_MOD', 'DESIGNATION',
       'TITLE', 'THIRD_NAME', 'FOURTH_NAME', 'NAME_ORIGINAL_SCRIPT',
       'ADDRESS_COUNTRY', 'ADDRESS_NOTE', 'DOB_TYPE', 'DOB_YEAR',
       'POB_COUNTRY'],
      dtype='object')

## UNSC Entity

In [23]:
df_ent = pd.read_xml("consolidatedLegacyByPRN.xml",xpath=".//INDIVIDUAL")

In [24]:
df_ent.head(10)

Unnamed: 0,DATAID,VERSIONNUM,FIRST_NAME,SECOND_NAME,UN_LIST_TYPE,REFERENCE_NUMBER,LISTED_ON,GENDER,COMMENTS1,NATIONALITY,...,INDIVIDUAL_DATE_OF_BIRTH,INDIVIDUAL_PLACE_OF_BIRTH,INDIVIDUAL_DOCUMENT,SORT_KEY,SORT_KEY_LAST_MOD,DESIGNATION,TITLE,THIRD_NAME,FOURTH_NAME,NAME_ORIGINAL_SCRIPT
0,6907993,1,ERIC,BADEGE,DRC,CDi.001,2012-12-31,Male,He fled to Rwanda in March 2013 and is still l...,\n,...,\n,\n,,,,,,,,
1,6907994,1,FRANK KAKOLELE,BWAMBALE,DRC,CDi.002,2005-11-01,Male,Left the CNDP in January 2008. As of June 2011...,\n,...,\n,\n,,,,\n,,,,
2,6907995,1,GASTON,IYAMUREMYE,DRC,CDi.003,2010-12-01,Male,INTERPOL-UN Security Council Special Notice we...,\n,...,\n,\n,,,,\n,,,,
3,6907996,1,INNOCENT,KAINA,DRC,CDi.004,2012-11-30,Male,: Became M23 deputy commander after the flight...,,...,\n,\n,,,,\n,,,,
4,6907997,1,JÉRÔME,KAKWAVU BUKANDE,DRC,CDi.005,2005-11-01,Male,Given the rank of General in the FARDC in Dece...,\n,...,\n,\n,,,,,,,,
5,6907998,1,GERMAIN,KATANGA,DRC,CDi.006,2005-11-01,,Appointed General in the FARDC in December 200...,\n,...,\n,\n,,,,,,,,
6,6908023,1,THOMAS,LUBANGA,DRC,CDi.007,2005-11-01,Male,Arrested in Kinshasa in March 2005 for UPC/L i...,\n,...,\n,\n,,,,,,,,
7,6907999,1,SULTANI,MAKENGA,DRC,CDi.008,2012-11-12,Male,A military leader of the Mouvement du 23 Mars ...,\n,...,\n,\n,,,,,,,,
8,6908000,1,KHAWA PANGA,MANDRO,DRC,CDi.009,2005-11-01,Male,Placed in prison in Bunia in April 2005 for sa...,\n,...,\n,\n,,,,,,,,
9,6908001,1,CALLIXTE,MBARUSHIMANA,DRC,CDi.010,2009-03-03,Male,Arrested in Paris on 3 October 2010 under ICC ...,\n,...,\n,\n,,,,\n,,,,
