In [1]:
#Set up
import seaborn as sns
import pandas as pd
from datetime import datetime
import ast
import os
import noteql
import matplotlib.pyplot as plt
from time import sleep
from IPython.core.display import HTML

# Organisation-Identifier Networking

> Organisations publishing IATI data declare participating organisations, which have various roles in their activities. By using Organisation-Identifiers, participating organisations can be uniquely identified, allowing us to create a network.

> This application visualises the network of shared organisation references in IATI data, based on the participating-org/@ref element. You can create your own visualisation based on type of reporting organisation, specific countries, or using a list of specific organisations. 

Example network:

<hr>

In [2]:
# function to add flexibility to change width and viz_id and also error handling

def show_flourish(viz_id="17570004", width="28%"):
    """Build the embed for a Flourish chord visualisation.

    Parameters
    ----------
    viz_id : str
        Flourish visualisation id, placed in the embed's ``data-src``.
    width : str
        Value for the embed's ``data-width`` attribute.

    Returns
    -------
    The IPython ``HTML`` object for the embed, or ``None`` if building it
    raised (the error is printed).
    """
    embed_template = (
        '<div class="flourish-embed flourish-chord" '
        'data-src="visualisation/{viz_id}" data-width="{width}">'
        '<script src="https://public.flourish.studio/resources/embed.js"></script></div>'
    )
    try:
        return HTML(embed_template.format(viz_id=viz_id, width=width))
    except Exception as e:
        print(f"Failed to load visualization: {e}")
        return None


show_flourish()  # Uses defaults (17570004, 28%)


## Running the application:

1. Select the type of visualisation you want to create from the dropdown menu: A. Reporting organisation type, B. Country, or C. A list of reporting organisations.

In [3]:
# Visualisation mode. Later cells branch on the values
# 'Organisations by Type', 'Country' and 'Specific Organisations'.
vis_type = 'Organisations by Type'

2A. If you are interested in specific organisation types, select them using the dropdown menu. Selecting a large number of organisation types may cause this application to run slowly or fail.

In [4]:
# Reporting-organisation type codes to query; read when vis_type is 'Organisations by Type'.
rep_org_type = ['10']

**Organisation Types:**

| **Code** | **Name** |
|---------|---------------------------------|
| 10 | Government |
| 11 | Local Government |
| 15 | Other Public Sector |
| 21 | International NGO |
| 22 | National NGO |
| 23 | Regional NGO |
| 24 | Partner Country based NGO | 
| 30 | Public Private Partnership |	
| 40 | Multilateral |
| 60 | Foundation |
| 70 | Private Sector |
| 71 | Private Sector in Provider Country |
| 72 | Private Sector in Aid Recipient Country |
| 73 | Private Sector in Third Country |
| 80 | Academic, Training and Research | 	
| 90 | Other |


Get the list of countries for the country selector dropdown.

In [5]:
# Download the IATI Registry publisher list and read it back from the local file "csv".
try:
    !wget https://iatiregistry.org/publisher/download/csv
    registry = pd.read_csv("csv")
    # Distinct HQ countries/regions in sorted order.
    countries = sorted(registry['HQ Country or Region'].unique())
except Exception as e:
    # NOTE(review): after a failure `registry` is left undefined, yet later cells read it.
    print(f"Error downloading or processing registry: {e}")
finally:
    !rm -rf "csv"  # Clean up regardless of success/failure

2B.  If you are interested in reporting organisations based in a specific country or countries, select them using the dropdown menu. You can find out where an organisation is based using the registry. Selecting a large number of countries may cause this application to run slowly or fail.

In [6]:
# HQ country/region names to select; matched against the registry's 'HQ Country or Region' column.
country_query = ['(No country assigned)']

2C.  If you are interested in a list of specific reporting organisations, enter the Organisation Identifiers of the IATI Publishers you want to select into the text box, copying this formatting exactly: 'AU-5', '44000'.  You can find these identifiers using the registry.

In [7]:
# Raw text of quoted, comma-separated organisation identifiers; parsed later by process_org_ids.
pub_org_id = ''

Reporting organisation identifiers:

In [8]:
# Reference table: each publisher with its organisation identifier and HQ country/region.
registry[['Publisher', 'IATI Organisation Identifier','HQ Country or Region']]

3. Click the "Run" button.

In [9]:
# Report which selection this run was based on.
if vis_type == 'Organisations by Type':
    print("Last run on organisation types:", *rep_org_type)
elif vis_type == 'Country':
    print("Last run on organisations with headquarters in:", *country_query)
elif vis_type == 'Specific Organisations':
    # pub_org_id is a single string: unpacking it with * would print it one
    # character at a time, so it is passed whole.
    print("Last run on reporting organisations:", pub_org_id)

# Download data

## Setup

Using https://iati-tables.opendata.coop/. The following cell sets up the basic libraries needed for analysis and makes a connection to the database. 

In [10]:
def setup_database_connection(timeout=1000):
    """Restart the local PostgreSQL service and open a noteql session.

    Parameters
    ----------
    timeout : int
        Passed to the session as ``connect_args['connect_timeout']``.

    Returns
    -------
    The ``noteql.Session`` created against the datasette URL below, or
    ``None`` if any step raised (the error is printed).
    """
    try:
        sns.set_context('notebook')

        print("Initializing database connection...")

        # Restarting PostgreSQL safely
        # NOTE(review): a failing `!` shell command presumably does not raise a
        # Python exception, so this inner except may never fire — confirm.
        try:
            !sudo service postgresql restart
            sleep(2)  # Giving PostgreSQL time to restart
            print("PostgreSQL restarted successfully")
        except Exception as e:
            print(f"Warning: PostgreSQL restart failed: {e}")

        # Creating new session
        session = noteql.Session(
            datasette_url='https://datasette.codeforiati.org/iati.json',
            connect_args={'connect_timeout': timeout}
        )

        print("Database connection established successfully")
        return session

    except Exception as e:
        print(f"Error setting up database connection: {e}")
        return None

In [11]:
# None when the connection could not be set up.
session = setup_database_connection()

## IATI-tables Queries

Downloading all participating organisations reported by selected reporting org type, countries, or ids.

Prepare country filter - based on ISO code as much as possible.

In [12]:
def get_country_iso_codes(country_query, csv_path="Country.csv"):
    """Look up the codes of the requested countries in a country codelist CSV.

    Parameters
    ----------
    country_query : list of str
        Country names matched against the CSV's ``name`` column.
    csv_path : str
        Location of the codelist CSV; it must have ``name`` and ``code`` columns.

    Returns
    -------
    pandas.Series of the matching ``code`` values, or ``None`` when nothing
    matches, the file is missing, or any other error occurs (a message is
    printed in each case).
    """
    try:
        # Reading country codes
        print(f"Loading country codes from {csv_path}...")
        # pandas' default NA parsing would turn the literal code "NA" (Namibia)
        # into NaN, so only genuinely empty cells are treated as missing.
        iso_codes = pd.read_csv(csv_path, keep_default_na=False, na_values=[''])

        # Filtering based on country query
        filtered_codes = iso_codes[iso_codes['name'].isin(country_query)]

        if filtered_codes.empty:
            print(f"Warning: No ISO codes found for countries: {', '.join(country_query)}")
            return None

        print(f"Found ISO codes for {len(filtered_codes)} countries")
        return filtered_codes['code']

    except FileNotFoundError:
        print(f"Error: Country code file '{csv_path}' not found")
        return None
    except Exception as e:
        print(f"Error processing country codes: {e}")
        return None

In [13]:
# Codes for the countries in country_query, read from the default "Country.csv".
iso_codes = get_country_iso_codes(country_query)

In [14]:
# Show the selected codes (None when nothing matched or the CSV could not be read).
iso_codes

In [15]:
def process_country_organisations(registry, country_query):
    """Select the registry rows for the given countries and split their identifiers.

    Parameters
    ----------
    registry : pandas.DataFrame
        Must contain 'HQ Country or Region' and 'IATI Organisation Identifier'.
    country_query : list of str
        Values to keep from the 'HQ Country or Region' column.

    Returns
    -------
    The matching rows with two extra columns (labelled 0 and 1) holding the
    identifier text before and after its first '-', or ``None`` when no row
    matches or an error occurs (a message is printed either way).
    """
    try:
        wanted = ', '.join(country_query)
        print(f"Filtering organisations for countries: {wanted}")

        in_selected_country = registry['HQ Country or Region'].isin(country_query)
        selected = registry[in_selected_country]
        if selected.empty:
            print("No organisations found for specified countries")
            return None

        # Split once only, so everything after the first '-' stays together.
        identifier_parts = selected['IATI Organisation Identifier'].str.split('-', n=1, expand=True)
        combined = pd.concat([selected, identifier_parts], axis=1)

        print(f"Found {len(combined)} organisations")
        return combined

    except Exception as e:
        print(f"Error processing organisations: {e}")
        return None

In [16]:
# Registry rows for the selected countries; None when nothing matched or processing failed.
country_orgs = process_country_organisations(registry, country_query)

In [17]:
# Show the selected registry rows together with the split identifier columns.
country_orgs

Format input text

In [18]:
def process_org_ids(pub_org_id):
    """Parse the organisation identifiers typed into the text box.

    Parameters
    ----------
    pub_org_id : str
        Quoted identifiers separated by commas, e.g. ``"'AU-5', '44000'"``.
        A single quoted identifier (``"'AU-5'"``) is accepted as well.

    Returns
    -------
    list of str with surrounding whitespace stripped from each identifier,
    or ``None`` when the text cannot be parsed (a message is printed).
    """
    try:
        org_list = ast.literal_eval(pub_org_id)

        # One quoted identifier evaluates to a bare str; iterating that would
        # yield single characters, so wrap it in a list first.
        if isinstance(org_list, str):
            org_list = [org_list]

        cleaned_orgs = [org.strip() for org in org_list]

        print(f"Processed {len(cleaned_orgs)} organisation IDs")
        return cleaned_orgs

    except ValueError as ve:
        print(f"Invalid format for organisation IDs: {ve}")
        return None
    except SyntaxError:
        # Raised for empty or otherwise unparsable text.
        print("Invalid syntax in organisation IDs string")
        return None
    except Exception as e:
        print(f"Error processing organisation IDs: {e}")
        return None

In [19]:
# Parsed identifier list; None when pub_org_id could not be parsed.
pub_orgs = process_org_ids(pub_org_id)

In [20]:
# Show the parsed identifier list.
pub_orgs

In [21]:
outputs = {}
if vis_type in ['Organisations by Type']:
   for i in rep_org_type:
       df = %nql SELECT DISTINCT a.prefix, a.reportingorg_ref, b.reportingorg_type, b.reportingorg_typename, a.ref, a.type, a.typename, a.narrative FROM participatingorg as a LEFT JOIN activity as b on (b._link = a._link_activity) WHERE (b.reportingorg_type = {{i}}) ORDER BY a.narrative
       outputs[i] = df
elif vis_type in ['Country']:
   for i in country_orgs['IATI Organisation Identifier']:
       df = %nql SELECT DISTINCT a.prefix, a.reportingorg_ref, a.ref, a.type, a.typename, a.narrative FROM participatingorg as a WHERE (a.reportingorg_ref = {{i}})
       outputs[i] = df
elif vis_type in ['Specific Organisations']:
   for i in pub_org_id:
       df = %nql SELECT DISTINCT a.prefix, a.reportingorg_ref, a.ref, a.type, a.typename, a.narrative FROM participatingorg as a WHERE (a.reportingorg_ref = {{i}})
       outputs[i] = df

part_orgs = pd.concat(outputs.values(), ignore_index=True)
part_orgs

In [22]:
# Remove any previous export, then write the query results to part_orgs.xlsx (sheet "part_orgs").
!rm -rf part_orgs.xlsx
part_orgs.to_excel("part_orgs.xlsx",sheet_name="part_orgs", index=False)

# Clean data

## Setup

In [23]:
# NOTE(review): this first session2 is rebound at the end of the cell before any
# visible use — confirm whether the call is still needed.
session2 = noteql.local_db_session()
# Restart postgres to make sure any existing connections get dropped
!sudo service postgresql restart
# We can't get the noteql session to do these, because it wraps sql in a transaction, which isn't allowed for DROP/CREATE
!sudo -u postgres psql -c "DROP DATABASE IF EXISTS pa"
!sudo -u postgres psql -c "CREATE DATABASE pa"
# Session on the freshly created "pa" database.
session2 = noteql.Session("postgresql+psycopg2://root@/pa", "public")

## Load data

In [24]:
# sheet_name=None: the next cell treats the result as a mapping of sheet name -> data
# (it calls .keys() and indexes by key).
# NOTE(review): this rebinds part_orgs, which held the concatenated query results above.
part_orgs = pd.read_excel('part_orgs.xlsx',sheet_name=None)

In [25]:
# Load each sheet through session2 under a table name derived from the sheet
# name (spaces removed, lower-cased), dropping any existing table first.
for key in part_orgs.keys():
    table_name = key.replace(' ','')
    table_name = table_name.lower()
    print(table_name)
    
    %nql DROP TABLE IF EXISTS {{table_name | ident}}; SELECT ''
    
    # Convert to a DataFrame and render
    df = pd.DataFrame.from_dict(part_orgs[key])
    session2.load_dataframe(df, table_name, index=False)



In [26]:
# Download the two IATI codelists, read them plus the local generic XM-DAC list,
# and load all of them (and the registry) through session2.
!wget https://iatistandard.org/reference_downloads/203/codelists/downloads/clv3/csv/en/OrganisationRegistrationAgency.csv
!wget https://iatistandard.org/reference_downloads/203/codelists/downloads/clv3/csv/en/OrganisationIdentifier.csv
reg_agency = pd.read_csv("OrganisationRegistrationAgency.csv")
old_orgid = pd.read_csv("OrganisationIdentifier.csv")
# This file is read from the working directory, not downloaded.
refs = pd.read_csv("Generic_XM-DACs_Jul24.csv")
session2.load_dataframe(registry, "registry", index=False)
session2.load_dataframe(reg_agency, "reg_agency", index=False)
session2.load_dataframe(old_orgid, "old_orgid", index=False)
# NOTE(review): refs is loaded here as read, i.e. before clean_organisation_data runs on it.
session2.load_dataframe(refs,"refs",index=False)
# Only the two downloaded files are removed.
!rm -rf "OrganisationRegistrationAgency.csv" "OrganisationIdentifier.csv"

- Remove obvious issues such as blank references 

- Remove all/part of the data from reporting orgs with known issues which mean a participating organisation could not be identified. 

- Trim leading/trailing whitespace from narratives.

- Remove generic XM-DAC codes.

Other issues will be caught by the filter data section, which checks for valid organisation-ids with a format of {RegistrationAgency}-{RegistrationNumber}.

In [27]:
def clean_organisation_data(refs_df):
    """Prepare the generic XM-DAC reference list for use as an exclusion filter.

    Rows with a missing ``code`` are dropped, ``code`` is converted to text,
    an ``XMDAC`` column holding ``"XM-DAC-" + code`` is added, and every column
    whose name contains "narrative" is converted to text and stripped of
    surrounding whitespace. The frame passed in is left unmodified.

    Parameters
    ----------
    refs_df : pandas.DataFrame
        Reference list with at least a ``code`` column.

    Returns
    -------
    The cleaned DataFrame, or ``None`` if cleaning raised (the error is printed).
    """
    try:
        print("Starting data cleaning process...")

        # Drop blank codes BEFORE converting to text: astype(str) turns NaN into
        # the string 'nan', after which dropna() can no longer find it.
        # .copy() keeps the caller's frame untouched and avoids assigning into a view.
        cleaned = refs_df.dropna(subset=['code']).copy()

        codes = cleaned['code']
        # A numeric column that contained blanks is read as float (41122.0);
        # restore whole numbers so the text form is '41122', not '41122.0'.
        if pd.api.types.is_float_dtype(codes) and (codes % 1 == 0).all():
            codes = codes.astype('int64')
        cleaned['code'] = codes.astype(str)
        cleaned['XMDAC'] = "XM-DAC-" + cleaned['code']

        # Trimming whitespace from narratives if they exist
        narrative_columns = [col for col in cleaned.columns if 'narrative' in col.lower()]
        for col in narrative_columns:
            cleaned[col] = cleaned[col].astype(str).str.strip()

        print(f"Cleaned {len(cleaned)} organization references")
        return cleaned

    except Exception as e:
        print(f"Error during data cleaning: {e}")
        return None

In [28]:
# Read the generic XM-DAC list again so cleaning starts from the file contents.
refs = pd.read_csv("Generic_XM-DACs_Jul24.csv")

In [29]:
# Adds the 'XMDAC' column used by the SQL filter below; None when cleaning failed.
refs = clean_organisation_data(refs)

In [30]:
# Show the cleaned reference list.
refs

In [31]:
%%nql SHOW CREATE cleaned_part_orgs

SELECT DISTINCT
    -- Cleaned copy of part_orgs with unusable narratives and references removed
    prefix,
    reportingorg_ref,
    ref,
    type,
    typename,
    -- reg_agency is the first two hyphen-separated parts of ref, or empty when the second part is empty
    CASE
        WHEN
            SPLIT_PART(ref,'-',2) = '' THEN ''
            ELSE CONCAT(SPLIT_PART(ref,'-',1),'-',SPLIT_PART(ref,'-',2))
    END as "reg_agency",
    -- The g flag strips every newline; without it PostgreSQL replaces only the first match
    TRIM(REGEXP_REPLACE(narrative, E'\n', '', 'g')) as "narrative"
FROM part_orgs
-- Known blank or redacted narratives
WHERE NOT narrative in ('',' ','EN: ','Odefinierat','EN: IP not published','Not applicable',
                        'Confidential',
                        'USAID redacted this field in accordance with the exceptions outlined in the Foreign Aid Transparency and Accountability Act of 2016.')
    AND NOT POSITION('Private donors' IN narrative) > 0
    AND NOT POSITION('Övrigt' IN narrative) > 0
    AND NOT POSITION('Övriga' IN narrative) > 0
    -- For the unicef prefix only, drop refs containing the XM-DAC-41122-VN- pattern
    AND CASE
        WHEN prefix='unicef' THEN NOT POSITION('XM-DAC-41122-VN-' IN ref) > 0 
        ELSE true
    END
    AND NOT ref in (' ','Not registered in IATI','0')
    -- Generic XM-DAC codes, with and without the XM-DAC- prefix
    AND NOT ref in {{ refs['code'] | inclause }}
    AND NOT ref in {{ refs['XMDAC'] | inclause }}

## Add other data sources

- Join to registry IDs, registration agency, and old IATI codelist. 

- Add name of reporting-org from registry. If the reporting org reference does not match the registry, it will be removed. 

In [32]:
%%nql SHOW CREATE joined_part_orgs 
SELECT 
    -- e is the registry row matched on the reporting organisation reference
    e."Publisher" as "reportingorg_name",
    a.*,
    -- b is the registry row matched on the participating organisation reference
    b."Publisher" as "registry_narrative",
    c.name as "reg_agency_narrative",
    d.name as "v1_orgid"
FROM cleaned_part_orgs as a
    LEFT JOIN registry as b ON a.ref = b."IATI Organisation Identifier"
    LEFT JOIN reg_agency as c ON a.reg_agency = c.code
    LEFT JOIN old_orgid as d ON a.ref = d.code
    LEFT JOIN registry as e ON a.reportingorg_ref = e."IATI Organisation Identifier"
-- NOTE(review) this compares against the text None; if an unmatched join yields NULL
-- the comparison is NULL and the row is dropped as well -- confirm which case applies
WHERE NOT e."Publisher" = 'None'
ORDER BY a.prefix

## Filter data

Check that the participating organisation references:

- Are a valid reporting org ID from the registry OR

- Have a valid prefix from org-id UNLESS

- They are a v1 org id

In [33]:
%%nql SHOW CREATE filtered_orgs output=DF
SELECT
    *
FROM joined_part_orgs 
-- Keep a row when at least one of the three lookups differs from the text None
-- NOTE(review) confirm whether unmatched lookups arrive as NULL or as the text None
WHERE NOT reg_agency_narrative = 'None'
    OR NOT registry_narrative = 'None'
    OR NOT v1_orgid = 'None'
ORDER BY prefix

In [34]:
# Remove any previous export, then write the filtered data to output.xlsx (sheet "output").
!rm -rf output.xlsx
output.to_excel("output.xlsx",sheet_name="output", index=False)

# Build the Deepnote download link; os.environ[...] raises KeyError when
# DEEPNOTE_PROJECT_ID is not set.
output_file_url = f"https://deepnote.com/publish/{os.environ['DEEPNOTE_PROJECT_ID']}/file?path=output.xlsx"
print(output_file_url)

# Summarise

<hr>

## Chord counts - to download

Code source: https://stackoverflow.com/questions/64689296/counting-shared-elements-within-groups-with-condition?rq=3 

Chord output for use with bokeh:

In [35]:
# Count, for every ordered pair of reporting organisations, how many
# participating-organisation refs both of them use.
df = output[["reportingorg_name","ref"]].drop_duplicates()
# The self-merge on 'ref' pairs up every two organisations that share a ref
# (including each organisation with itself).
result = df.merge(df, on='ref').groupby(['reportingorg_name_x','reportingorg_name_y']).count().reset_index()
result.columns = ['source', 'target', 'value']
# Drop the self-pairs.
mask = result.source!=result.target
# drop takes a boolean; the string 'True' only worked because any non-empty string is truthy.
chord_df = result[mask].reset_index(drop=True)
chord_df = chord_df.sort_values(by="value", ascending=False)
chord_df

## Networking Visualisation

Use Bokeh to create a chord diagram

In [36]:
import holoviews as hv
import numpy as np
from holoviews import opts, dim
from bokeh.plotting import figure, output_file, save, show
from IPython.display import IFrame
from IPython.core.display import display, HTML
import tempfile

Workaround to use Bokeh in Deepnote

In [37]:
def bokeh_deepnote_show(plot):
    """Display a Bokeh plot inline by saving it to a temporary HTML file.

    The plot is written with Bokeh's ``output_file``/``save`` and the resulting
    HTML is read back and passed to ``display(HTML(...))``.

    Parameters
    ----------
    plot
        The Bokeh object handed to ``save``.
    """
    # Only the unique name is needed: the placeholder file is closed (and so
    # removed) explicitly here, then Bokeh writes to that path itself.
    with tempfile.NamedTemporaryFile(suffix='.html') as placeholder:
        tmp_output_filename = placeholder.name
    output_file(tmp_output_filename)
    save(plot)
    # Close the handle deterministically and read as UTF-8 rather than relying
    # on the locale's default encoding.
    with open(tmp_output_filename, "r", encoding="utf-8") as f:
        display(HTML(f.read()))


In [38]:
#set first variable to true for first run through
# Checked (together with `chord`) by the cells below to decide whether to run.
# NOTE(review): no visible cell sets this back to False.
first = True

Adjust the minimum number of shared participating organisation references by using the slider. Use the "visualise" button to refresh the diagram.

In [39]:
# Minimum 'value' a link needs to be drawn; passed to create_chord_visualisation.
# NOTE(review): this name shadows the built-in filter().
filter = 0

# Checked together with `first`; later rebound to the result of create_chord_visualisation.
chord = False

In [41]:
if first or chord:
    # source < target keeps one row per unordered pair and drops self-pairs.
    mask = np.logical_and(result.source<result.target, result.value>0)
    # drop takes a boolean; the string 'True' only worked because any non-empty string is truthy.
    bokeh_df = result[mask].reset_index(drop=True)
    bokeh_df = bokeh_df.sort_values(by="source")

bokeh_df

In [42]:
def create_chord_visualisation(bokeh_df, filter_value):
    """Draw a chord diagram of shared participating-organisation references.

    Parameters
    ----------
    bokeh_df : pandas.DataFrame
        Links with ``source``, ``target`` and ``value`` columns.
    filter_value : number
        Links whose ``value`` is below this threshold are left out.

    Returns
    -------
    The holoviews ``Chord`` that was displayed, or ``None`` if anything
    raised (the error is printed).
    """
    try:
        hv.extension('bokeh')

        # Keep only the links at or above the threshold.
        links = bokeh_df.loc[bokeh_df['value'] >= filter_value]

        # Every organisation found at either end of a link becomes a labelled node.
        sources = links["source"].unique().tolist()
        targets = links["target"].unique().tolist()
        node_names = list(set(sources + targets))
        nodes = hv.Dataset(pd.DataFrame(node_names, columns=["Organisation"]))

        diagram = hv.Chord((links, nodes))

        display_options = dict(
            cmap='Category20',
            edge_cmap='Category20',
            labels='Organisation',
            node_color=hv.dim('Organisation').str(),
            title='Shared Participating Organisation References',
            width=900,
            height=900,
        )
        diagram.opts(**display_options)

        # Rendered through the Deepnote workaround rather than shown directly.
        bokeh_deepnote_show(hv.render(diagram))
        print(f"Visualising {len(links)} connections")
        return diagram

    except Exception as e:
        print(f"Error creating visualisation: {e}")
        return None

In [43]:
# Rebinds `chord` to the diagram returned by create_chord_visualisation (None on failure).
if first or chord:
    chord = create_chord_visualisation(bokeh_df, filter)

# Output

In [44]:
# Only the removal of the previous export is conditional; the write below runs every time.
if first or chord:
    !rm -rf chord_df.xlsx
chord_df.to_excel("chord_df.xlsx",sheet_name="chord_df", index=False)

# Build the Deepnote download link; os.environ[...] raises KeyError when
# DEEPNOTE_PROJECT_ID is not set.
chord_file_url = f"https://deepnote.com/publish/{os.environ['DEEPNOTE_PROJECT_ID']}/file?path=chord_df.xlsx"
print(chord_file_url)

<hr>

# Further visualisation

To create and edit your own diagram, you may wish to use the data visualisation tool "Flourish". 

You can download the chord data produced by this application here: Download.

Example flourish chord diagram:

In [45]:
from IPython.core.display import HTML
# Reuse the helper defined near the top of the notebook instead of repeating
# the embed markup; only the width differs from its defaults.
show_flourish(width="29%")

<hr>

## How is the data cleaned?

Organisation identifiers with known issues are removed - these include redacted or blank narratives, and internal or generic references, as they cannot be used for the networking of IATI data. This is not an exhaustive list and is subject to change.  

 Blank/redacted narratives

Blank/internal/generic references

| **Organisation** | **Redaction message/Issue**     |
|---------|---------------------------------|
| iom     | Confidential                    |
| sida | Övriga/Övrigt ... (Other) |
| unicef  | EN: IP not published            |
| unhcr   | Private donors in...             |
|usaid | USAID redacted this field in accordance with the exceptions outlined in the Foreign Aid Transparency and Accountability Act of 2016.|

| **Organisation** | **Redaction message/Issue**     |
|------------------|---------------------------------|
| unicef            | Internal ID eg XM-DAC-41122-VN-242363 |
| Multiple orgs | Generic XM-DAC- codes used |

TO DO - compare DAC codes to https://github.com/pwyf/aid-transparency-tracker/blob/main/beta/excluded_xm_dac.py

Participating organisation identifiers must have a valid format of {RegistrationAgency}-{RegistrationNumber}, or match a known organisation-identifier as recorded in the registry.

You can download the cleaned data here: Download

<hr>

## Data Snapshots

Number of reporting organisations referencing each participating organisation identifier:

In [46]:
%%nql SHOW CREATE ref_count ref_count=DF

SELECT 
    -- One row per participating organisation identifier with the number of
    -- distinct reporting organisation references that use it
    ref as "Participating Organisation Identifier",
    COUNT(DISTINCT(reportingorg_ref)) as "Count" 
FROM filtered_orgs 
GROUP BY ref
ORDER BY "Count" DESC

In [47]:
def create_styled_visualization(data, x_column, y_column, title, xlabel, ylabel, n=25):
    """Draw a horizontal bar chart of the first ``n`` rows of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table, expected to be sorted so the rows to show come first.
    x_column : str
        Column holding the bar lengths.
    y_column : str
        Column holding the bar labels.
    title, xlabel, ylabel : str
        Chart title and axis labels.
    n : int
        Number of leading rows to plot.

    Returns
    -------
    None. Any error is printed instead of raised.
    """
    try:
        # The 'seaborn' style was renamed 'seaborn-v0_8' in newer Matplotlib and
        # the old name was later removed; use whichever exists, else keep the
        # current style rather than failing.
        for style_name in ('seaborn-v0_8', 'seaborn'):
            if style_name in plt.style.available:
                plt.style.use(style_name)
                break

        fig, ax = plt.subplots(figsize=(14, 10))

        # head(n) selects by position, so it does not depend on the index labels
        # starting at 0 the way .loc[0:n-1] does.
        top_rows = data.head(n)
        bars = ax.barh(y_column, x_column,
                       data=top_rows,
                       color='#008080',  # teal colour
                       alpha=0.8)
        # barh draws the first row at the bottom; flip the axis so the
        # first (highest-ranked) row is at the top.
        ax.invert_yaxis()

        # Adding value labels on bars
        for bar in bars:
            width = bar.get_width()
            ax.text(width, bar.get_y() + bar.get_height()/2,
                    f'{int(width):,}',
                    ha='left', va='center', fontsize=10)

        # Customising appearance
        ax.set_title(title, pad=20, fontsize=14, fontweight='bold')
        ax.set_xlabel(xlabel, fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)

        # Removing top and right spines
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        # Adding grid
        ax.grid(axis='x', linestyle='--', alpha=0.7)

        # Adjusting layout
        plt.tight_layout()
        plt.show()

    except Exception as e:
        print(f"Error creating visualization: {e}")

Most commonly referenced participating organisation identifiers - Top 25

In [48]:
# reference counts
create_styled_visualization(
    data=ref_count,
    x_column="Count",
    y_column="Participating Organisation Identifier",
    title="Top 25 Most Referenced Organizations",
    xlabel="Number of References",
    ylabel="Organization",
)


Count of unique valid participating organisation identifiers referenced by each reporting organisation:

In [49]:
%%nql SHOW CREATE reporg_count reporg_count=DF

SELECT
    -- One row per reporting organisation name with the number of distinct
    -- participating organisation refs it uses
    reportingorg_name as "Reporting Organisation",
    COUNT(DISTINCT(ref)) as "Count of Participating Organisation Identifiers"
FROM filtered_orgs 
GROUP BY reportingorg_name
ORDER BY "Count of Participating Organisation Identifiers" DESC

Reporting organisations with most valid participating organisation identifiers - Top 25

In [50]:
# reporting org counts
create_styled_visualization(
    data=reporg_count,
    x_column="Count of Participating Organisation Identifiers",
    y_column="Reporting Organisation",
    title="Top 25 Reporting Organizations by Reference Count",
    xlabel="Number of References",
    ylabel="Organization",
)

### Footnote

The code behind this application is hidden to improve usability. You are welcome to view this code at: Notebook.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=240fcfd1-8557-41e6-8271-b13ecef554c3' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>