In [1]:
import pandas as pd
import numpy as np

In [2]:
#Read in data into df from json
df = pd.read_json('../../src/response_iati.json')
df.iloc[385:388]

Unnamed: 0,iati_identifier,title_narrative,activity_date_type,activity_status_code,default_aid_type_code,description_narrative,last_updated_datetime,activity_date_iso_date,recipient_country_code,reporting_org_narrative,...,policy_marker_code,policy_marker_narrative,location_name_narrative,sector_vocabulary,document_link_url,tag_code,tag_narrative,tag_vocabulary,activity_date_narrative,recipient_region_code
385,GB-CHC-202918-LEBA83,[Increased access to water and dignified lives...,"[2, 4]",3,,[Oxfam will equip refugees with key informatio...,2021-09-08T13:09:12Z,"[2019-01-01T00:00:00Z, 2020-02-29T00:00:00Z]",[LB],[Oxfam GB],...,,,,"[99, 99, 99, 99]",,,,,"[start-actual, end-actual]",
386,GB-CHC-202918-RVNA86,[Pro-poor Policy Monitoring and Analysis in Vi...,"[2, 4]",3,,[This is a revision of the ongoing post -WTO p...,2015-09-22T12:12:14Z,"[2010-07-01T00:00:00Z, 2014-06-30T00:00:00Z]",[VN],[Oxfam GB],...,,,,"[99, 99, 99]",,,,,"[start-actual, end-actual]",
387,GB-CHC-202918-LEBA84,[Addressing extreme vulnerabilities among refu...,"[2, 3]",2,,[- Provide 4 months temporary cash assistance ...,2022-06-28T14:45:00Z,"[2018-12-01T00:00:00Z, 2023-03-31T00:00:00Z]",[LB],[Oxfam GB],...,,,,[99],,,,,"[start-actual, end-planned]",


In [3]:
nan_count = df[df['title_narrative'].isna()]
nan_count.head()

Unnamed: 0,iati_identifier,title_narrative,activity_date_type,activity_status_code,default_aid_type_code,description_narrative,last_updated_datetime,activity_date_iso_date,recipient_country_code,reporting_org_narrative,...,policy_marker_code,policy_marker_narrative,location_name_narrative,sector_vocabulary,document_link_url,tag_code,tag_narrative,tag_vocabulary,activity_date_narrative,recipient_region_code


In [4]:
# create new empty df to fill with transformed data

trans_df = pd.DataFrame()

## **Feature Transformation & Engineering****

### IATI ID

In [5]:
trans_df["iati_id"] = df["iati_identifier"].values

trans_df.head(2)

Unnamed: 0,iati_id
0,NL-KVK-32092131-2062
1,NL-KVK-32092131-2063


### Title

In [6]:
################
# Add en title #
################

trans_df["title_en"] = "NaN"

for index, row in df.iterrows():
    lang_list = row['title_narrative_xml_lang']
    title_row = row['title_narrative']

    # nan in pandas is type float
    # check if nan and if yes take first entry in lang
    if isinstance(lang_list, float):
        if isinstance(title_row, float):
            trans_df["title_en"][index] = "NaN"
        else:
            trans_df["title_en"][index] = title_row[0]
    else:
        for j in range(0, len(lang_list)):
            if "en" or "EN" in lang_list:
                if lang_list[j].lower() == "en":
                    title = title_row[j]
                    trans_df["title_en"][index] = title
            else:
                trans_df["title_en"][index] = "NaN"
        
trans_df.head(2)

Unnamed: 0,iati_id,title_en
0,NL-KVK-32092131-2062,UG-2062 Covid response Uganda
1,NL-KVK-32092131-2063,GH-2063 Covid response Ghana


In [7]:
###################
# Add other title #
###################


trans_df["title_other"] = "NaN"

for index, row in df.iterrows():
    lang_list = row['title_narrative_xml_lang']

    # every title which is has no lang attribute is cassified as english and therefore not in other
    if isinstance(lang_list, float):
        trans_df["title_other"][index] = "NaN"
    else:
        for j in range(0, len(lang_list)):
            if lang_list[j].lower() != "en":
                title = row['title_narrative'][j]
                if trans_df["title_other"][index] == "NaN":
                    trans_df["title_other"][index] = title
                else:
                    trans_df["title_other"][index] = f"{trans_df['title_other'][index]}; {title}"

trans_df.head(200)

Unnamed: 0,iati_id,title_en,title_other
0,NL-KVK-32092131-2062,UG-2062 Covid response Uganda,
1,NL-KVK-32092131-2063,GH-2063 Covid response Ghana,
2,NL-KVK-32092131-2064,GH-2064 EDU Active Learning Project,
3,NL-KVK-32092131-2065,UG-2065 EDU Active Learning Project,
4,NL-KVK-32092131-2066,ET-2066 EDU Active Learning Project,
...,...,...,...
195,XM-DAC-41122-Democratic Republic of Congo-0990...,,(C) YE302-04 ZC Intermediate Results (IR) 3.4 ...
196,XM-DAC-41122-Democratic Republic of Congo-0990...,,YY306-02 NAT Intermediate Results (IR) 7.2 PRO...
197,XM-DAC-41122-Democratic Republic of Congo-0990...,,IR 1.8 REPONSE & SORTIE DES URGENCES EAH
198,XM-DAC-41122-Democratic Republic of Congo-0990...,,SURVIE MISE EN OEUVRE DU PLAN Harmonized Appro...


### Organization

In [8]:
trans_df['organization'] = df['reporting_org_narrative'].apply(lambda x: x[0])

print(trans_df.organization.value_counts())
trans_df.head(2)

Oxfam GB                                                          1642
UNICEF                                                             221
Edukans                                                             88
Nederlands Instituut voor Meerpartijendemocratie                    32
NGO PLAN BELGIUM                                                     9
European Commission - Service for Foreign Policy Instruments         3
AECID Spanish Agency for International Development Cooperation       3
AFD                                                                  2
Name: organization, dtype: int64


Unnamed: 0,iati_id,title_en,title_other,organization
0,NL-KVK-32092131-2062,UG-2062 Covid response Uganda,,Edukans
1,NL-KVK-32092131-2063,GH-2063 Covid response Ghana,,Edukans


### Country

In [9]:
trans_df["country"] = df["recipient_country_code"]
trans_df.head(5)

Unnamed: 0,iati_id,title_en,title_other,organization,country
0,NL-KVK-32092131-2062,UG-2062 Covid response Uganda,,Edukans,[UG]
1,NL-KVK-32092131-2063,GH-2063 Covid response Ghana,,Edukans,[GH]
2,NL-KVK-32092131-2064,GH-2064 EDU Active Learning Project,,Edukans,[GH]
3,NL-KVK-32092131-2065,UG-2065 EDU Active Learning Project,,Edukans,[UG]
4,NL-KVK-32092131-2066,ET-2066 EDU Active Learning Project,,Edukans,[ET]


### Region

In [10]:
trans_df['region'] = df['recipient_region_code']
trans_df.head(5)

Unnamed: 0,iati_id,title_en,title_other,organization,country,region
0,NL-KVK-32092131-2062,UG-2062 Covid response Uganda,,Edukans,[UG],
1,NL-KVK-32092131-2063,GH-2063 Covid response Ghana,,Edukans,[GH],
2,NL-KVK-32092131-2064,GH-2064 EDU Active Learning Project,,Edukans,[GH],
3,NL-KVK-32092131-2065,UG-2065 EDU Active Learning Project,,Edukans,[UG],
4,NL-KVK-32092131-2066,ET-2066 EDU Active Learning Project,,Edukans,[ET],


### Location

In [11]:
trans_df['location'] = df['location_name_narrative']
trans_df.head(5)

Unnamed: 0,iati_id,title_en,title_other,organization,country,region,location
0,NL-KVK-32092131-2062,UG-2062 Covid response Uganda,,Edukans,[UG],,
1,NL-KVK-32092131-2063,GH-2063 Covid response Ghana,,Edukans,[GH],,
2,NL-KVK-32092131-2064,GH-2064 EDU Active Learning Project,,Edukans,[GH],,
3,NL-KVK-32092131-2065,UG-2065 EDU Active Learning Project,,Edukans,[UG],,
4,NL-KVK-32092131-2066,ET-2066 EDU Active Learning Project,,Edukans,[ET],,


### Descriptions

In [12]:
#################################
# Description english and other #
#################################

trans_df["description_en"] = "NaN"
trans_df["description_other"] = "NaN"

for index, row in df.iterrows():
    descr_list = row['description_narrative_xml_lang']
    descr_row = row['description_narrative']

    # nan in pandas is type float
    # check if nan and if yes take first entry in descr
    if isinstance(descr_list, float):
        if isinstance(descr_row, float):
            trans_df["description_en"][index] = "NaN"
        else:
            trans_df["description_en"][index] = descr_row[0]
    else:
        for j in range(0, len(descr_list)):
            if descr_list[j].lower() == "en":
                if type(descr_row) == float:
                    descr = "NaN"
                else:
                    descr = descr_row[j]
                if trans_df["description_en"][index] == "NaN":
                    trans_df["description_en"][index] = descr
                else:
                    trans_df["description_en"][index] = f"{trans_df['description_en'][index]}; {descr}"
            else:
                if type(descr_row) == float:
                    descr = "NaN"
                else:
                    descr = descr_row[j]
                if trans_df["description_other"][index] == "NaN":
                    trans_df["description_other"][index] = descr
                else:
                    trans_df["description_other"][index] = f"{trans_df['description_other'][index]}; {descr}"
        
trans_df.head(5)

Unnamed: 0,iati_id,title_en,title_other,organization,country,region,location,description_en,description_other
0,NL-KVK-32092131-2062,UG-2062 Covid response Uganda,,Edukans,[UG],,,Extra budget for covid response,
1,NL-KVK-32092131-2063,GH-2063 Covid response Ghana,,Edukans,[GH],,,Extra budget covid response,
2,NL-KVK-32092131-2064,GH-2064 EDU Active Learning Project,,Edukans,[GH],,,Correctbooks,
3,NL-KVK-32092131-2065,UG-2065 EDU Active Learning Project,,Edukans,[UG],,,The EDU Active Learning Project contains a lea...,
4,NL-KVK-32092131-2066,ET-2066 EDU Active Learning Project,,Edukans,[ET],,,The EDU Active Learning Project contains a lea...,


### Status

In [13]:
# https://iatistandard.org/en/iati-standard/203/codelists/activitystatus/
activity_status = {
    1: "Pipeline/identification",
    2: "Implementation",
    3: "Finalisation",
    4: "Closed",
    5: "Cancelled",
    6: "Suspended"
}

trans_df["status"] = df.activity_status_code
trans_df['status'] = trans_df['status'].replace(activity_status)

trans_df.head(2)
    

Unnamed: 0,iati_id,title_en,title_other,organization,country,region,location,description_en,description_other,status
0,NL-KVK-32092131-2062,UG-2062 Covid response Uganda,,Edukans,[UG],,,Extra budget for covid response,,Closed
1,NL-KVK-32092131-2063,GH-2063 Covid response Ghana,,Edukans,[GH],,,Extra budget covid response,,Closed


### Date

In [14]:
#############
# Date Type #
#############

# One Hot
# 1 -> Yes
# 0 -> No

# Codes:
# 1 Planned start
# 2 Actual start
# 3 Planned end
# 4 Actual end

trans_df["planned_start"] = "NaN"
trans_df["actual_start"] = "NaN"
trans_df["planned_end"] = "NaN"
trans_df["actual_end"] = "NaN"

date_types = {
    1: "planned_start",
    2: "actual_start",
    3: "planned_end",
    4: "actual_end"
}

for index, row in df.iterrows():
    dtype_list = row["activity_date_type"]
    iso_date_list = row["activity_date_iso_date"]

    combined_list = list(zip(dtype_list, iso_date_list))

    # replace nums with column names from date_types
    combined_list = [(date_types[int(t[0])], t[1]) for t in combined_list]

    for i in combined_list:
        trans_df[i[0]] = i[1]

trans_df.head(1800)


Unnamed: 0,iati_id,title_en,title_other,organization,country,region,location,description_en,description_other,status,planned_start,actual_start,planned_end,actual_end
0,NL-KVK-32092131-2062,UG-2062 Covid response Uganda,,Edukans,[UG],,,Extra budget for covid response,,Closed,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z
1,NL-KVK-32092131-2063,GH-2063 Covid response Ghana,,Edukans,[GH],,,Extra budget covid response,,Closed,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z
2,NL-KVK-32092131-2064,GH-2064 EDU Active Learning Project,,Edukans,[GH],,,Correctbooks,,Implementation,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z
3,NL-KVK-32092131-2065,UG-2065 EDU Active Learning Project,,Edukans,[UG],,,The EDU Active Learning Project contains a lea...,,Implementation,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z
4,NL-KVK-32092131-2066,ET-2066 EDU Active Learning Project,,Edukans,[ET],,,The EDU Active Learning Project contains a lea...,,Implementation,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1795,GB-CHC-202918-SASC02,South Asia WASH Capacity Building Project,,Oxfam GB,,[679],,The Regional Aim 3 PIP aims to reduce sufferin...,,Finalisation,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z
1796,GB-CHC-202918-ZIMB60,Cholera Preparedness and Response in Harare Me...,,Oxfam GB,[ZW],,,The project will conduct hygiene awareness thr...,,Implementation,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z
1797,GB-CHC-202918-SASC03,South Asia Logistic & Supply Capacity Building,,Oxfam GB,,[679],,"The regional Aim 3 PIP, aims to reduce suffer...",,Finalisation,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z
1798,GB-CHC-202918-ZIMB61,Gutu drought response project,,Oxfam GB,[ZW],,,The project will meet the immediate food needs...,,Finalisation,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z


### Last Update

In [15]:
trans_df['last_update'] = df['last_updated_datetime']
trans_df.head(2)

Unnamed: 0,iati_id,title_en,title_other,organization,country,region,location,description_en,description_other,status,planned_start,actual_start,planned_end,actual_end,last_update
0,NL-KVK-32092131-2062,UG-2062 Covid response Uganda,,Edukans,[UG],,,Extra budget for covid response,,Closed,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z,2021-08-04T14:53:05.494Z
1,NL-KVK-32092131-2063,GH-2063 Covid response Ghana,,Edukans,[GH],,,Extra budget covid response,,Closed,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z,2021-08-04T14:53:06.898Z


### CRS 5

In [16]:
sector_codes = {
    1: "OECD DAC CRS Purpose Codes (5 digit)",
    2: "OECD DAC CRS Purpose Codes (3 digit)",
    3: "Classification of the Functions of Government (UN)",
    4: "Statistical classification of economic activities in the European Community",
    5: "National Taxonomy for Exempt Entities (USA)",
    6: "AidData",
    7: "SDG Goal",
    8: "SDG Target",
    9: "SDG Indicator",
    10: "Humanitarian Global Clusters (Inter-Agency Standing Committee)",
    11: "North American Industry Classification System (NAICS)",
    12: "UN System Function",
    99: "Reporting Organisation", # The sector reported corresponds to a sector vocabulary maintained by the reporting organisation for this activity
    98: "Reporting Organisation 2" # The sector reported corresponds to a sector vocabulary maintained by the reporting organisation for this activity (if they are referencing more than one)
}

In [17]:
crs5_df = pd.read_csv("../../src/crs5_codes.csv")
crs5_df.head(2)

Unnamed: 0,code,name,description,language,category,category-name,category-description
0,11110,Education policy and administrative management,"Education sector policy, planning and programm...",en,111,"Education, Level Unspecified",The codes in this category are to be used only...
1,11120,Education facilities and training,"Educational buildings, equipment, materials; s...",en,111,"Education, Level Unspecified",The codes in this category are to be used only...


In [18]:
crs3_df = pd.read_csv("../../src/crs3_codes.csv")
crs3_df.head(2)

Unnamed: 0,code,name,description,language,category,category-name,category-description
0,111,"Education, Level Unspecified",The codes in this category are to be used only...,en,,,
1,112,Basic Education,,en,,,


In [40]:
# to process different variants of codes (crs3, crs5 etc.)

def process_codes(combined_list, translation_df, code_index):
    code_text = ""
    codes_nums = ""
    if any(item[0] == code_index for item in combined_list):
        for i in combined_list:
            if i[0] == code_index:
                translation = translation_df.loc[translation_df['code'] == int(i[1]), 'name'].values[0]
                code_text += f"{translation}; "
                codes_nums += f"{i[1]}; "
        return code_text, codes_nums
    else:
        return "NaN", "NaN"
    

In [46]:
###############
# ECTRACT CRS #
###############
trans_df["crs_5_code"] = "NaN"
trans_df["crs_5_name"] = "NaN"

trans_df["crs_3_code"] = "NaN"
trans_df["crs_3_name"] = "NaN"

for index, row in df.iterrows():
    crs_voc_list = row['sector_vocabulary']
    crs_code_list = row['sector_code']

    if type(crs_voc_list) == float:
        pass
    else:
        combined_list = list(zip(crs_voc_list, crs_code_list))

        # CRS 5
        crs5_str, crs5_codes = process_codes(combined_list, crs5_df, "1")
        trans_df["crs_5_code"][index] = crs5_codes
        trans_df["crs_5_name"][index] = crs5_str

        # CRS 3
        crs3_str, crs3_codes = process_codes(combined_list, crs3_df, "2")
        trans_df["crs_3_code"][index] = crs3_codes
        trans_df["crs_3_name"][index] = crs3_str

trans_df.head(100)

Unnamed: 0,iati_id,title_en,title_other,organization,country,region,location,description_en,description_other,status,planned_start,actual_start,planned_end,actual_end,last_update,crs_5_code,crs_5_name,crs_3_code,crs_3_name
0,NL-KVK-32092131-2062,UG-2062 Covid response Uganda,,Edukans,[UG],,,Extra budget for covid response,,Closed,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z,2021-08-04T14:53:05.494Z,,,,
1,NL-KVK-32092131-2063,GH-2063 Covid response Ghana,,Edukans,[GH],,,Extra budget covid response,,Closed,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z,2021-08-04T14:53:06.898Z,,,,
2,NL-KVK-32092131-2064,GH-2064 EDU Active Learning Project,,Edukans,[GH],,,Correctbooks,,Implementation,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z,2021-08-04T14:53:08.198Z,,,,
3,NL-KVK-32092131-2065,UG-2065 EDU Active Learning Project,,Edukans,[UG],,,The EDU Active Learning Project contains a lea...,,Implementation,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z,2021-08-04T14:53:09.580Z,,,,
4,NL-KVK-32092131-2066,ET-2066 EDU Active Learning Project,,Edukans,[ET],,,The EDU Active Learning Project contains a lea...,,Implementation,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z,2021-08-04T14:53:10.959Z,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,XM-DAC-41122-Democratic Republic of Congo-0990...,,OUTPUT 4: EBOLA / EDUCATION - PREVENTION AND R...,UNICEF,[CD],[298],[Democratic Republic of Congo],,"EBOLA / EDUCATION - PREVENTION AND RESPONSE, w...",Pipeline/identification,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z,2023-10-26T00:33:55Z,11110;,Education policy and administrative management;,,
96,XM-DAC-41122-Democratic Republic of Congo-0990...,,OUTPUT 6: EBOLA / RCCE - PREVENTION ANDRESPONSE,UNICEF,[CD],[298],"[Democratic Republic of Congo, Beni]",,"EBOLA / RCCE - PREVENTION ANDRESPONSE, which c...",Pipeline/identification,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z,2023-10-26T00:33:55Z,91010;,Administrative costs (non-sector allocable);,,
97,XM-DAC-41122-Democratic Republic of Congo-0990...,,OUTPUT 8: EBOLA - RAPID RESPONSE INTERVENTIONS,UNICEF,[CD],[298],"[Democratic Republic of Congo, Bunia]",,"EBOLA - RAPID RESPONSE INTERVENTIONS, which co...",Pipeline/identification,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z,2023-10-26T00:33:55Z,72010;,Material relief assistance and services;,,
98,XM-DAC-41122-Democratic Republic of Congo-0990...,,OUTPUT 12: EBOLA - OPERATIONAL EFFECTIVENESS,UNICEF,[CD],[298],[Democratic Republic of Congo],,"EBOLA - OPERATIONAL EFFECTIVENESS, which contr...",Pipeline/identification,2012-02-17T00:00:00Z,2021-04-01T00:00:00Z,2022-03-31T00:00:00Z,2022-03-31T00:00:00Z,2023-10-26T00:33:56Z,91010;,Administrative costs (non-sector allocable);,,
