# INSPER Data Set SDG Sankey

This creates some Sankey diagrams from the data in the INSPER data set.

It is inspired by the diagrams at https://golab.bsg.ox.ac.uk/knowledge-bank/indigo/prototype-social-outcomes-and-sdgs/ (for the code, see https://github.com/INDIGO-Initiative/jupyter-notebooks/tree/main/sustainable-development-goals-sankey).

## Upgrade and install needed libraries

You will need to restart the runtime after running this cell so that the upgraded libraries are picked up.

In [None]:
!pip install --upgrade plotly
!pip install kaleido

## Get the Data and provide the data to this workbook

Download the INSPER data as a CSV file. Upload it to this runtime by clicking the files icon on the left bar, then the upload icon. Make sure the file is called `data.csv`.


## Load data into memory

In [None]:
import csv

all_projects = []
with open('data.csv', newline='') as source_file:
    # DictReader yields one dict per data row, keyed by the CSV header row
    all_projects.extend(csv.DictReader(source_file))


## Filter Data

If you want to work with only a set of the data, filter it here. Some examples are commented out in the code.


In [None]:
# Every project is kept by default. To work with a subset, replace the
# `if True` condition below with your own test, for example:
#     if int(project['ID']) < 10
projects = [
    project
    for project in all_projects
    if True
]

# Some Code Constants and Libraries

We'll just define these for use later


In [None]:
import plotly.graph_objects as go
from collections import defaultdict

# Column headings of the CSV that sankey_viz reads from each project row.
COLUMN_PROJECT_ID = 'ID'
COLUMN_PROJECT_NAME = 'Name'
# A project can list up to four SDGs, one per column; empty cells are skipped.
COLUMN_SDG_1 = 'SDG 1'
COLUMN_SDG_2 = 'SDG 2'
COLUMN_SDG_3 = 'SDG 3'
COLUMN_SDG_4 = 'SDG 4'
# Used as the link weight when sankey_viz is called with weight_by_population=True.
COLUMN_TARGET_POPULATION_NUMBER = 'Target Population Number'


# Display details for each Sustainable Development Goal, keyed by the goal
# number as a string (the values found in the SDG columns are used as keys).
#   name        - label shown on the SDG node
#   node_colour - colour of the SDG node
#   link_colour - the same colour at 0.5 alpha, used for links into that node
SDG_GOAL_MAP = {
    '1': {
        'name': '1. No poverty',
        'node_colour': '#e5243b',
        'link_colour': 'rgba(229, 36, 59, 0.5)',
    },
    '2': {
        'name': '2. Zero hunger',
        'node_colour': '#DDA63A',
        'link_colour': 'rgba(221, 166, 58, 0.5)',
    },
    '3': {
        'name': '3. Good health and wellbeing',
        'node_colour': '#4C9F38',
        'link_colour': 'rgba(76, 159, 56, 0.5)',
    },
    '4': {
        'name': '4. Quality education',
        'node_colour': '#C5192D',
        'link_colour': 'rgba(197, 25, 45, 0.5)',
    },
    '5': {
        'name': '5. Gender equality',
        'node_colour': '#FF3A21',
        'link_colour': 'rgba(255, 58, 33, 0.5)',
    },
    '6': {
        'name': '6. Clean water and sanitation',
        'node_colour': '#26BDE2',
        'link_colour': 'rgba(38, 189, 226, 0.5)',
    },
    '7': {
        'name': '7. Affordable and clean energy',
        'node_colour': '#FCC30B',
        'link_colour': 'rgba(252, 195, 11, 0.5)',
    },
    '8': {
        'name': '8. Work and economic growth',
        'node_colour': '#A21942',
        'link_colour': 'rgba(162, 25, 66, 0.5)',
    },
    '9': {
        'name': '9. Industry, innovation and infrastructure',
        'node_colour': '#FD6925',
        'link_colour': 'rgba(253, 105, 37, 0.5)',
    },
    '10': {
        'name': '10. Reduced inequalities',
        'node_colour': '#DD1367',
        'link_colour': 'rgba(221, 19, 103, 0.5)',
    },
    '11': {
        'name': '11. Sustainable cities and communities',
        'node_colour': '#FD9D24',
        'link_colour': 'rgba(253, 157, 36, 0.5)',
    },
    '12': {
        'name': '12. Responsible consumption and production',
        'node_colour': '#BF8B2E',
        'link_colour': 'rgba(191, 139, 46, 0.5)',
    },
    '13': {
        'name': '13. Climate action',
        'node_colour': '#3F7E44',
        'link_colour': 'rgba(63, 126, 68, 0.5)',
    },
    '14': {
        'name': '14. Life below water',
        'node_colour': '#0A97D9',
        'link_colour': 'rgba(10, 151, 217, 0.5)',
    },
    '15': {
        'name': '15. Life on land',
        'node_colour': '#56C02B',
        'link_colour': 'rgba(86, 192, 43, 0.5)',
    },
    '16': {
        'name': '16. Peace, justice and strong institutions',
        'node_colour': '#00689D',
        'link_colour': 'rgba(0, 104, 157, 0.5)',
    },
    '17': {
        'name': '17. Partnerships for the goals',
        'node_colour': '#19486A',
        'link_colour': 'rgba(25, 72, 106, 0.5)',
    },
}

def sankey_viz(projects, sankey_height=400, filename='out.png', weight_by_population=False):
    """Draw a project-to-SDG Sankey diagram, display it and save it as an image.

    Every project that lists at least one SDG becomes a source node, every SDG
    that is referenced becomes a target node, and each (project, SDG) pair
    becomes one link coloured after the SDG.

    Args:
        projects: Iterable of dict rows (e.g. from ``csv.DictReader``) holding
            the ID, name, SDG 1-4 and target population columns.
        sankey_height: Height of the figure in pixels.
        filename: Path the rendered figure is written to.
        weight_by_population: When true, a link's value is the project's
            target population number (1 if that cell is empty); otherwise
            every link has the value 1.

    Raises:
        KeyError: If a row lacks an expected column, or lists an SDG that is
            not a key of ``SDG_GOAL_MAP``.
        ValueError: If a non-empty target population is not an integer.
    """
    sdg_columns = (COLUMN_SDG_1, COLUMN_SDG_2, COLUMN_SDG_3, COLUMN_SDG_4)

    # Collect the project nodes, the SDG nodes and the links in a single pass.
    projects_id_to_name = {}
    sdg_nodes = set()
    links = defaultdict(dict)
    for project in projects:
        for column in sdg_columns:
            # A cell may be None (short CSV row) or padded with spaces;
            # both mean "no SDG recorded in this column".
            sdg_value = (project[column] or '').strip()
            if not sdg_value:
                continue

            project_id = project[COLUMN_PROJECT_ID]
            projects_id_to_name[project_id] = project[COLUMN_PROJECT_NAME]
            sdg_nodes.add(sdg_value)

            if weight_by_population:
                raw_population = (project[COLUMN_TARGET_POPULATION_NUMBER] or '').strip()
                weight = int(raw_population) if raw_population else 1
            else:
                weight = 1
            # Links are keyed by SDG, so a project that lists the same SDG
            # twice still gets a single link and is not double-counted.
            links[project_id][sdg_value] = weight

    # Fail early, naming the offending values, rather than part-way through
    # building the node lists.
    unknown_sdgs = sdg_nodes - SDG_GOAL_MAP.keys()
    if unknown_sdgs:
        raise KeyError("Unrecognised SDG value(s) in data: " + ", ".join(sorted(unknown_sdgs)))

    # Turn into data suitable for sankey: project nodes first, then SDG nodes.
    node_labels = []
    node_colours = []
    project_nodes_idx = {}
    for project_id, project_name in projects_id_to_name.items():
        project_nodes_idx[project_id] = len(node_labels)
        node_labels.append("Project {}: {}".format(project_id, project_name))
        node_colours.append("black")

    sdg_nodes_idx = {}
    # Walk SDG_GOAL_MAP instead of the set so the SDG nodes are always added
    # in goal order; set iteration order can differ from one session to the next.
    for sdg, details in SDG_GOAL_MAP.items():
        if sdg not in sdg_nodes:
            continue
        sdg_nodes_idx[sdg] = len(node_labels)
        node_labels.append(details['name'])
        node_colours.append(details['node_colour'])

    links_for_sankey = [
        (project_id, sdg, weight)
        for project_id, sdg_weights in links.items()
        for sdg, weight in sdg_weights.items()
    ]

    sankey_node_data = dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=node_labels,
        color=node_colours,
    )

    sankey_link_data = dict(
        source=[project_nodes_idx[project_id] for project_id, _, _ in links_for_sankey],
        target=[sdg_nodes_idx[sdg] for _, sdg, _ in links_for_sankey],
        value=[weight for _, _, weight in links_for_sankey],
        color=[SDG_GOAL_MAP[sdg]['link_colour'] for _, sdg, _ in links_for_sankey],
    )

    # make Sankey
    fig = go.Figure(
        data=go.Sankey(
            node=sankey_node_data,
            link=sankey_link_data,
            arrangement='perpendicular',
        ),
    )

    fig.update_layout(
        height=sankey_height,
    )

    # Deliberately returns nothing: returning the figure from the last line of
    # a notebook cell would make the notebook render it a second time.
    fig.show()
    fig.write_image(filename)





# A Sankey that just shows links





In [None]:


# Unweighted diagram: weight_by_population is left at its default, so every
# project-SDG link has the same value.
sankey_viz(projects, sankey_height=4000, filename='sankey.png')


# A Sankey that shows links weighted by Target Population Number

Many projects do not have a Target Population Number; for those projects a value of 1 is assumed.


In [None]:
# Weighted diagram: each link's value is the project's target population number.
sankey_viz(projects, sankey_height=4000, filename='sankey-population.png', weight_by_population=True)
