In [75]:
import pandas as pd
import numpy as np

In [76]:
#Read in data into df from json
df = pd.read_json('../../src/response_iati.json')
df.iloc[385:388]

Unnamed: 0,iati_identifier,title_narrative,activity_date_type,activity_status_code,default_aid_type_code,description_narrative,last_updated_datetime,activity_date_iso_date,recipient_country_code,reporting_org_narrative,...,policy_marker_code,policy_marker_narrative,location_name_narrative,sector_vocabulary,document_link_url,tag_code,tag_narrative,tag_vocabulary,activity_date_narrative,recipient_region_code
385,GB-CHC-202918-LEBA83,[Increased access to water and dignified lives...,"[2, 4]",3,,[Oxfam will equip refugees with key informatio...,2021-09-08T13:09:12Z,"[2019-01-01T00:00:00Z, 2020-02-29T00:00:00Z]",[LB],[Oxfam GB],...,,,,"[99, 99, 99, 99]",,,,,"[start-actual, end-actual]",
386,GB-CHC-202918-RVNA86,[Pro-poor Policy Monitoring and Analysis in Vi...,"[2, 4]",3,,[This is a revision of the ongoing post -WTO p...,2015-09-22T12:12:14Z,"[2010-07-01T00:00:00Z, 2014-06-30T00:00:00Z]",[VN],[Oxfam GB],...,,,,"[99, 99, 99]",,,,,"[start-actual, end-actual]",
387,GB-CHC-202918-LEBA84,[Addressing extreme vulnerabilities among refu...,"[2, 3]",2,,[- Provide 4 months temporary cash assistance ...,2022-06-28T14:45:00Z,"[2018-12-01T00:00:00Z, 2023-03-31T00:00:00Z]",[LB],[Oxfam GB],...,,,,[99],,,,,"[start-actual, end-planned]",


In [77]:
nan_count = df[df['title_narrative'].isna()]
nan_count.head()

Unnamed: 0,iati_identifier,title_narrative,activity_date_type,activity_status_code,default_aid_type_code,description_narrative,last_updated_datetime,activity_date_iso_date,recipient_country_code,reporting_org_narrative,...,policy_marker_code,policy_marker_narrative,location_name_narrative,sector_vocabulary,document_link_url,tag_code,tag_narrative,tag_vocabulary,activity_date_narrative,recipient_region_code


In [78]:
# create new empty df to fill with transformed data

trans_df = pd.DataFrame()

## **Feature Engineering & Transformation**

### IATI ID

In [79]:
trans_df["iati_id"] = df["iati_identifier"].values

trans_df.head(2)

Unnamed: 0,iati_id
0,NL-KVK-32092131-2062
1,NL-KVK-32092131-2063


### Title

In [80]:
################
# Add en title #
################

trans_df["title_en"] = "NaN"

for index, row in df.iterrows():
    lang_list = row['title_narrative_xml_lang']
    title_row = row['title_narrative']

    # nan in pandas is type float
    # check if nan and if yes take first entry in lang
    if isinstance(lang_list, float):
        if isinstance(title_row, float):
            trans_df["title_en"][index] = "NaN"
        else:
            trans_df["title_en"][index] = title_row[0]
    else:
        for j in range(0, len(lang_list)):
            if "en" or "EN" in lang_list:
                if lang_list[j].lower() == "en":
                    title = title_row[j]
                    trans_df["title_en"][index] = title
            else:
                trans_df["title_en"][index] = "NaN"
        
trans_df.head(2)

Unnamed: 0,iati_id,title_en
0,NL-KVK-32092131-2062,UG-2062 Covid response Uganda
1,NL-KVK-32092131-2063,GH-2063 Covid response Ghana


In [81]:
###################
# Add other title #
###################


trans_df["title_other"] = "NaN"

for index, row in df.iterrows():
    lang_list = row['title_narrative_xml_lang']

    # every title which is has no lang attribute is cassified as english and therefore not in other
    if isinstance(lang_list, float):
        trans_df["title_other"][index] = "NaN"
    else:
        for j in range(0, len(lang_list)):
            if lang_list[j].lower() != "en":
                title = row['title_narrative'][j]
                if trans_df["title_other"][index] == "NaN":
                    trans_df["title_other"][index] = title
                else:
                    trans_df["title_other"][index] = f"{trans_df['title_other'][index]}; {title}"

trans_df.head(200)

Unnamed: 0,iati_id,title_en,title_other
0,NL-KVK-32092131-2062,UG-2062 Covid response Uganda,
1,NL-KVK-32092131-2063,GH-2063 Covid response Ghana,
2,NL-KVK-32092131-2064,GH-2064 EDU Active Learning Project,
3,NL-KVK-32092131-2065,UG-2065 EDU Active Learning Project,
4,NL-KVK-32092131-2066,ET-2066 EDU Active Learning Project,
...,...,...,...
195,XM-DAC-41122-Democratic Republic of Congo-0990...,,(C) YE302-04 ZC Intermediate Results (IR) 3.4 ...
196,XM-DAC-41122-Democratic Republic of Congo-0990...,,YY306-02 NAT Intermediate Results (IR) 7.2 PRO...
197,XM-DAC-41122-Democratic Republic of Congo-0990...,,IR 1.8 REPONSE & SORTIE DES URGENCES EAH
198,XM-DAC-41122-Democratic Republic of Congo-0990...,,SURVIE MISE EN OEUVRE DU PLAN Harmonized Appro...


### Organization

In [82]:
trans_df['organization'] = df['reporting_org_narrative'].apply(lambda x: x[0])

print(trans_df.organization.value_counts())
trans_df.head(2)

Oxfam GB                                                          1642
UNICEF                                                             221
Edukans                                                             88
Nederlands Instituut voor Meerpartijendemocratie                    32
NGO PLAN BELGIUM                                                     9
European Commission - Service for Foreign Policy Instruments         3
AECID Spanish Agency for International Development Cooperation       3
AFD                                                                  2
Name: organization, dtype: int64


Unnamed: 0,iati_id,title_en,title_other,organization
0,NL-KVK-32092131-2062,UG-2062 Covid response Uganda,,Edukans
1,NL-KVK-32092131-2063,GH-2063 Covid response Ghana,,Edukans


### Country

In [84]:
trans_df["country"] = df["recipient_country_code"]
trans_df.head(5)

Unnamed: 0,iati_id,title_en,title_other,organization,country
0,NL-KVK-32092131-2062,UG-2062 Covid response Uganda,,Edukans,[UG]
1,NL-KVK-32092131-2063,GH-2063 Covid response Ghana,,Edukans,[GH]
2,NL-KVK-32092131-2064,GH-2064 EDU Active Learning Project,,Edukans,[GH]
3,NL-KVK-32092131-2065,UG-2065 EDU Active Learning Project,,Edukans,[UG]
4,NL-KVK-32092131-2066,ET-2066 EDU Active Learning Project,,Edukans,[ET]


### Region

In [85]:
trans_df['region'] = df['recipient_region_code']
trans_df.head(5)

Unnamed: 0,iati_id,title_en,title_other,organization,country,region
0,NL-KVK-32092131-2062,UG-2062 Covid response Uganda,,Edukans,[UG],
1,NL-KVK-32092131-2063,GH-2063 Covid response Ghana,,Edukans,[GH],
2,NL-KVK-32092131-2064,GH-2064 EDU Active Learning Project,,Edukans,[GH],
3,NL-KVK-32092131-2065,UG-2065 EDU Active Learning Project,,Edukans,[UG],
4,NL-KVK-32092131-2066,ET-2066 EDU Active Learning Project,,Edukans,[ET],


### Location

In [86]:
trans_df['location'] = df['location_name_narrative']
trans_df.head(5)

Unnamed: 0,iati_id,title_en,title_other,organization,country,region,location
0,NL-KVK-32092131-2062,UG-2062 Covid response Uganda,,Edukans,[UG],,
1,NL-KVK-32092131-2063,GH-2063 Covid response Ghana,,Edukans,[GH],,
2,NL-KVK-32092131-2064,GH-2064 EDU Active Learning Project,,Edukans,[GH],,
3,NL-KVK-32092131-2065,UG-2065 EDU Active Learning Project,,Edukans,[UG],,
4,NL-KVK-32092131-2066,ET-2066 EDU Active Learning Project,,Edukans,[ET],,


### Descriptions

In [110]:
#################################
# Description english and other #
#################################

df.description_narrative_xml_lang.value_counts()

trans_df["description_en"] = "NaN"
trans_df["description_other"] = "NaN"

for index, row in df.iterrows():
    lang_list = row['description_narrative_xml_lang']
    descr_row = row['description_narrative']

    # nan in pandas is type float
    # check if nan and if yes take first entry in lang
    if isinstance(lang_list, float):
        if isinstance(descr_row, float):
            trans_df["description_en"][index] = "NaN"
        else:
            trans_df["description_en"][index] = descr_row[0]
    else:
        for j in range(0, len(lang_list)):
            if lang_list[j].lower() == "en":
                if type(descr_row) == float:
                    descr = "NaN"
                else:
                    descr = descr_row[j]
                if trans_df["description_en"][index] == "NaN":
                    trans_df["description_en"][index] = descr
                else:
                    trans_df["description_en"][index] = f"{trans_df['description_en'][index]}; {descr}"
            else:
                if type(descr_row) == float:
                    descr = "NaN"
                else:
                    descr = descr_row[j]
                if trans_df["description_other"][index] == "NaN":
                    trans_df["description_other"][index] = descr
                else:
                    trans_df["description_other"][index] = f"{trans_df['description_other'][index]}; {descr}"
        
trans_df.head(5)

Unnamed: 0,iati_id,title_en,title_other,organization,country,region,location,description_en,description_other
0,NL-KVK-32092131-2062,Extra budget for covid response,,Edukans,[UG],,,Extra budget for covid response,
1,NL-KVK-32092131-2063,Extra budget covid response,,Edukans,[GH],,,Extra budget covid response,
2,NL-KVK-32092131-2064,Correctbooks,,Edukans,[GH],,,Correctbooks,
3,NL-KVK-32092131-2065,The EDU Active Learning Project contains a lea...,,Edukans,[UG],,,The EDU Active Learning Project contains a lea...,
4,NL-KVK-32092131-2066,The EDU Active Learning Project contains a lea...,,Edukans,[ET],,,The EDU Active Learning Project contains a lea...,


In [None]:
######################
# Add en discription #
######################