In [33]:
"""
functions to create network graph on maps with plotly depending on a dataframe.
3 options: World, Europe, or Continent
"""
import os
from typing import Tuple, Any

import pandas as pd
import plotly.graph_objects as go
from itertools import combinations
from collections import Counter

# LOADING STUFF
from numpy import nan, NaN

# Reference tables read from the project's data folder (first sheet of each workbook).
df_country = pd.read_excel("../data/countries_full.xls", sheet_name=0)
df_country_eu = pd.read_excel("../data/countries_eu.xlsx", sheet_name=0)

# Only the name, region and coordinates are needed from the country/region table.
df_country_w_continent = pd.read_excel(
    "../data/countries_w_continent.xlsx", sheet_name=0
)[['Country', 'Region', 'latitude', 'longitude']]
df_continent = pd.read_excel("../data/continents.xls", sheet_name=0)

# Lookup table: country name -> value of its Region column
dic_country_continent = dict(
    zip(df_country_w_continent["Country"], df_country_w_continent["Region"])
)
# Names listed in the Country column of the EU workbook
list_eu_countries = df_country_eu["Country"].tolist()

In [34]:
df_country_w_continent

Unnamed: 0,Country,Region,latitude,longitude
0,Afghanistan,South Asia,33.939110,67.709953
1,Albania,Europe & Central Asia,41.153332,20.168331
2,Algeria,Middle East & North Africa,28.033886,1.659626
3,American Samoa,East Asia & Pacific,-14.270972,-170.132217
4,Andorra,Europe & Central Asia,42.546245,1.601554
...,...,...,...,...
222,Venezuela,Latin America & Caribbean,6.423750,-66.589730
223,Vietnam,East Asia & Pacific,14.058324,108.277199
224,Yemen,Middle East & North Africa,15.552727,48.516388
225,Zambia,Sub-Saharan Africa,-13.133897,27.849332


In [35]:
def unique_actors(dataframe, map_filter='World') -> pd.DataFrame:
    """
    Collect the unique actors named in the CN column of a dataframe.

    Each CN cell is expected to be a string of names separated by ", "
    (e.g. 'France, United States'). Cells that are not strings (NaN, None)
    and empty names are ignored.

    Args:
        dataframe (pd.DataFrame): input dataframe with a CN column
        map_filter (str): 'World' keeps every name, 'Europe' keeps only the
            names found in list_eu_countries, 'Continent' replaces each name
            by its value in dic_country_continent (names missing from that
            mapping are printed once and skipped).

    Returns:
        pd.DataFrame: a single alphabetically sorted column, named
        "Continent" when map_filter is 'Continent' and "Country" otherwise.
        When no actor is found (including for an unrecognised map_filter) a
        message is printed and an empty frame with that column is returned.
    """
    actors = set()
    unmapped = set()
    # Set membership is cheaper than scanning the list for every name.
    eu_countries = set(list_eu_countries) if map_filter == 'Europe' else None

    for cell in dataframe["CN"]:
        # Missing values are floats/None, not strings: nothing to split.
        if not isinstance(cell, str):
            continue
        names = [name for name in cell.split(", ") if name]

        if map_filter == 'World':
            actors.update(names)
        elif map_filter == 'Europe':
            actors.update(name for name in names if name in eu_countries)
        elif map_filter == 'Continent':
            # Resolve name by name so one unknown country does not discard
            # the continents of the other countries of the same row.
            for name in names:
                if name in dic_country_continent:
                    actors.add(dic_country_continent[name])
                else:
                    unmapped.add(name)

    # Report countries that have no continent mapping (once each).
    for name in sorted(unmapped):
        print(name)

    column = "Continent" if map_filter == "Continent" else "Country"
    if not actors:
        print("return list is empty")
        return pd.DataFrame(columns=[column])

    return pd.DataFrame(sorted(actors), columns=[column])


def give_lat_long(country: str, map_filter="World") -> tuple:
    """
    Look up the coordinates of an actor.

    Args:
        country (str): name of the country, or of the continent when
            map_filter is "Continent"
        map_filter (str): "Continent" searches the Continent column of
            df_continent; any other value searches the Country column of
            df_country_w_continent

    Returns:
        tuple: (latitude, longitude) of the first matching row

    Raises:
        IndexError: if no row matches the given name
    """
    if map_filter == "Continent":
        table, key = df_continent, "Continent"
    else:
        table, key = df_country_w_continent, "Country"

    # A boolean mask instead of a string-built query(): names containing
    # quotes or backslashes cannot break the expression.
    match = table.loc[table[key] == country, ['latitude', 'longitude']]
    if match.empty:
        raise IndexError("no coordinates found for %r in column %r" % (country, key))
    return tuple(match.values[0])

In [36]:
def actors_mention(dataframe: pd.DataFrame, map_filter="World",
                   max_num_actors="max") -> pd.DataFrame:
    """
    Count how often each actor is mentioned in the CN column and attach its
    coordinates.

    Args:
        dataframe (pd.DataFrame): dataframe with a CN column of ", "-separated
            country names
        map_filter (str): World, Europe or Continent
        max_num_actors: number of most-mentioned actors to keep; "max" (or any
            value that cannot be converted to int) keeps all of them

    Returns:
        pd.DataFrame: one row per actor, sorted by decreasing total_mention.
        For World/Europe the columns are Country, total_mention, the remaining
        columns of df_country_w_continent (Region, latitude, longitude) and
        percentage (= 100 * mention / sum(mentions), rounded to 2 decimals).
        For Continent the key column is Continent and the extra columns come
        from df_continent. An empty dataframe is returned when no actor is
        found.
    """
    # Unique countries/continents, alphabetically sorted
    df_mentions = unique_actors(dataframe, map_filter=map_filter)
    if df_mentions is None or df_mentions.empty:
        return pd.DataFrame(data=None)

    try:
        max_num_actors = int(max_num_actors)
    except (TypeError, ValueError):
        # "max", None or any non-numeric value: keep every actor
        max_num_actors = len(df_mentions)

    by_continent = map_filter == "Continent"

    # Total number of mentions per actor
    total_mention = Counter()
    for cell in dataframe["CN"]:
        if not isinstance(cell, str):
            continue  # missing CN value
        names = [name for name in cell.split(", ") if name]
        if by_continent:
            # Countries are converted to their continent; countries without a
            # mapping are skipped instead of raising KeyError.
            total_mention.update(
                dic_country_continent[name] for name in names
                if name in dic_country_continent
            )
        else:
            total_mention.update(names)

    # Each mode merges on its own key column only (merging on "Country" in
    # Continent mode fails because that column does not exist there).
    if by_continent:
        key, coordinates = "Continent", df_continent
    else:
        key, coordinates = "Country", df_country_w_continent

    df_mentions['total_mention'] = df_mentions[key].apply(lambda actor: total_mention[actor])
    df_mentions = df_mentions.merge(coordinates, on=key)

    max_total = df_mentions['total_mention'].sum()
    df_mentions['percentage'] = df_mentions['total_mention'].apply(
        lambda count: round(100 * float(count) / max_total, 2))
    return df_mentions.nlargest(max_num_actors, columns=['total_mention']).reset_index(drop=True)


In [37]:
def actors_edge(dataframe: pd.DataFrame, dataframe_mentions: pd.DataFrame, map_filter="World"):
    """
    Count collaborations between pairs of actors appearing in the same CN cell.

    Args:
        dataframe: dataframe with a CN column of ", "-separated country names
        dataframe_mentions: dataframe whose Country column (Continent column
            when map_filter is "Continent") lists the actors to keep
        map_filter: World, Europe or Continent

    Returns:
        tuple: (df_edge, max_total_edge). df_edge has the columns edge (number
        of collaborations between the two actors), c1, c2 (the two actors, in
        alphabetical order), latitude_c1, longitude_c1, latitude_c2,
        longitude_c2 (their coordinates). max_total_edge is the largest edge
        value, or 0 when there is no edge. For an unrecognised map_filter an
        empty dataframe and 0 are returned.
    """
    if map_filter in ("World", "Europe"):
        actor_column = "Country"
    elif map_filter == "Continent":
        actor_column = "Continent"
    else:
        return pd.DataFrame(data=None), 0

    # An empty mentions frame may carry no columns at all.
    if actor_column in dataframe_mentions.columns:
        kept_actors = set(dataframe_mentions[actor_column])
    else:
        kept_actors = set()

    total_edge = Counter()
    for cell in dataframe["CN"]:
        if not isinstance(cell, str):
            continue  # missing CN value
        names = cell.split(", ")
        # Rows listing more than 10 entries are not counted as collaborations
        if len(names) > 10:
            continue
        names = [name for name in names if name]
        if map_filter == "Continent":
            # Countries without a continent mapping are skipped
            names = [dic_country_continent[name] for name in names
                     if name in dic_country_continent]
        # Sorting first makes every pair come out in alphabetical order
        for pair in combinations(sorted(names), 2):
            if pair[0] != pair[1] and pair[0] in kept_actors and pair[1] in kept_actors:
                total_edge[pair] += 1

    # Explicit columns so that the frame is well-formed even without edges
    df_edge = pd.DataFrame(
        [(count, c1, c2) for (c1, c2), count in total_edge.items()],
        columns=["edge", "c1", "c2"],
    )

    # One coordinate lookup per distinct actor instead of four per row
    lookup_filter = "Continent" if map_filter == "Continent" else "World"
    coordinates = {
        actor: give_lat_long(actor, map_filter=lookup_filter)
        for actor in set(df_edge["c1"]) | set(df_edge["c2"])
    }
    for side in ("c1", "c2"):
        df_edge["latitude_" + side] = df_edge[side].map(lambda actor: coordinates[actor][0])
        df_edge["longitude_" + side] = df_edge[side].map(lambda actor: coordinates[actor][1])

    max_total_edge = df_edge["edge"].max() if not df_edge.empty else 0
    return df_edge, max_total_edge




In [75]:
import networkx as nx


def draw_network_map(dataframe, map_filter="World", save=False,
                     folder="/media/kevin-main/My Passport/SDG/img/network_map/",
                     name="all_pubs",
                     max_num_actors="max",
                     layout_seed=None):
    """
    this function draws the graph with countries mentions and collaborations
    :param dataframe: dataframe with a CN column of ", "-separated country names
    :param map_filter: World, Europe or Continent as a string
    :param save: if True, the figure is also written to <folder>/<name>.jpeg
    :param folder: folder where to save file (the default is a machine-specific
        path; pass your own when saving)
    :param name: name of the file, without extension
    :param max_num_actors: number of countries which should be displayed
    :param layout_seed: seed forwarded to the spring layout; set it to get the
        same node positions on every call
    :return: a plotly graph
    :raises ValueError: if no actor is found for the given filter
    """
    df_mentions = actors_mention(
        dataframe=dataframe, map_filter=map_filter, max_num_actors=max_num_actors
    )
    if df_mentions.empty:
        raise ValueError("no actor to draw for map_filter=%r" % map_filter)

    df_edges, _ = actors_edge(dataframe=dataframe, dataframe_mentions=df_mentions,
                              map_filter=map_filter)
    if map_filter == "Continent":
        df_mentions = df_mentions.rename(columns={"Continent": "Country"})

    # Graph: one node per actor, one weighted edge per collaborating pair
    g = nx.Graph()
    node_name = [str(actor) for actor in df_mentions["Country"]]
    g.add_nodes_from(df_mentions["Country"])
    nodesize = df_mentions["total_mention"].tolist()
    maxi_node = max(nodesize)
    # Marker sizes are scaled so that the most mentioned actor gets size 100
    node_size = [100 * mentions / maxi_node for mentions in nodesize]

    if not df_edges.empty:
        for c1, c2, count in zip(df_edges["c1"], df_edges["c2"], df_edges["edge"]):
            g.add_edge(c1, c2, weight=count)

    # Node positions must be computed here: nothing else defines them for g
    pos = nx.spring_layout(g, k=1, iterations=1000, seed=layout_seed)

    # Edges = lines, with a width proportional to the collaboration count
    weights = [data["weight"] for _, _, data in g.edges(data=True)]
    maxi = max(weights) if weights else 0
    traces = []
    for u, v, data in g.edges(data=True):
        (x0, y0), (x1, y1) = pos[u], pos[v]
        traces.append(go.Scatter(
            x=[x0, x1, None], y=[y0, y1, None],
            mode='lines',
            line=dict(width=4 * data["weight"] / maxi),
        ))

    # Nodes, listed in the same order as node_size / nodesize
    actors = list(df_mentions["Country"])
    node_trace = go.Scatter(
        x=[pos[actor][0] for actor in actors],
        y=[pos[actor][1] for actor in actors],
        mode='markers+text',
        hovertext=nodesize,
        text=node_name,
        marker=dict(
            showscale=True,
            colorscale='YlGnBu',
            reversescale=True,
            # Colour encodes the number of neighbours of each node
            color=[len(g[actor]) for actor in actors],
            size=node_size,
            colorbar=dict(
                thickness=15,
                title=dict(text='Node Connections', side='right'),
                xanchor='left',
            ),
            line_width=2))
    traces.append(node_trace)

    fig = go.Figure(data=traces,
                    layout=go.Layout(
                        title=dict(text='<br>Network graph showing actors collaborations',
                                   font=dict(size=16)),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                    )
    fig.update_layout(template="ggplot2")
    fig.update_traces(textposition='top center',
                      textfont=dict(family='sans-serif', size=15, color='#000'))

    if save:
        fig.write_image(os.path.join(folder, name + ".jpeg"))
    return fig

In [76]:
# Load the publications table used by the cells below.
# NOTE(review): unpickling executes arbitrary code from the file; only load pickles from a trusted source.
df_inter = pd.read_pickle("../data/dataframes/SDG/all_sdg_fixed_dst.pkl")
df_inter

Unnamed: 0,PT,AU,TI,SO,DE,AB,C1,EM,TC,PY,...,IOT,computing_infrastructure,blockchain,robotics,additive_manufacturing,Society,Economy,Environment,EU,DST
0,J,"Pauw, WP; Castro, P; Pickering, J; Bhasin, S",Conditional nationally determined contribution...,CLIMATE POLICY,Nationally determined contributions; UNFCCC; c...,The Paris Agreement's success depends on parti...,"[Pauw, W. P.] Frankfurt Sch Finance & Manageme...",p.pauw@fs.de,22,2020,...,False,False,False,False,False,True,True,False,True,False
1,C,"Morozova, IM; Litvinova, TN; Przhedetskaya, NV...",The Problems of Financing of Entrepreneurship ...,IMPACT OF INFORMATION ON MODERN HUMANS,Problems of financing; Entrepreneurship infras...,The purpose of the research is to determine th...,"[Morozova, Irina M.] Volgograd State Tech Univ...",morozovaira@list.ru; litvinova1358@yandex.ru; ...,0,2018,...,False,False,False,False,False,False,True,False,False,False
2,J,"Hourcade, JC; Dasgupta, D; Ghersi, F",Accelerating the speed and scale of climate fi...,CLIMATE POLICY,Climate finance; public guarantees; de-risking...,"In this paper, we examine how to trigger a wav...","[Hourcade, Jean-Charles] CNRS, CIRED, EHESS, N...",ghersi@centre-cired.fr,1,2021,...,False,False,False,False,False,True,True,False,True,False
3,J,"Larionova, M; Safonkina, E",The First Five Decades of Cooperation for Deve...,VESTNIK MEZHDUNARODNYKH ORGANIZATSII-INTERNATI...,cooperation for development; United Nations; G...,The sustainable development goals (SDGs) adopt...,"[Larionova, M.; Safonkina, E.] Natl Econ & Pub...",larionova-mv@ranepa.ru; safonkina-ea@ranepa.ru,3,2018,...,False,False,False,False,False,True,True,False,False,False
4,J,"Kim, DH; Kim, DH; Lee, DH; Park, S; Kim, SI",Centralization of the Global REDD plus Financi...,FORESTS,REDD plus; Global Governance Network; global R...,With the institutionalization of reducing emis...,"[Kim, Do-hun; Kim, Dong-hwan; Park, Sunjoo; Ki...",seongil@snu.ac.kr,8,2019,...,False,False,False,False,False,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1200662,J,"Zhang, H; Smith, M","EXPOSURE TO GLOBAL MARKETS, INTERNAL LABOUR MA...",CANADIAN JOURNAL OF SOCIOLOGY-CAHIERS CANADIEN...,globalization; exporting; foreign ownership; c...,In this paper we address two bodies of sociolo...,"[Smith, Michael] McGill Univ, Montreal, PQ H3A...",heather.zhang@mail.mcgill.ca; michael.smith@mc...,0,2010,...,False,False,False,False,False,True,False,False,False,False
1200663,C,"Zhang, HF; Wu, YZ",WHY THE LAND OWNERSHIP CANNOT PRODUCE HIGH INC...,PROCEEDINGS OF THE SECOND INTERNATIONAL POSTGR...,Land Property; property Income; land Reform,"In China, the urban land belongs to the state ...","[Zhang Huifang; Wu Yuzhe] Zhejiang Univ, Dept ...",,11,2010,...,False,False,False,False,False,True,False,False,False,False
1200664,C,"Zhang, J; Chen, S",Does Technology Disparities Affect the Regiona...,PROCEEDINGS OF SHANGHAI CONFERENCE ON MANAGEME...,technology disparities; income difference; eco...,"In this paper, we examine China's regional gro...","[Zhang Jian; Chen Song] Tongji Univ, Sch Econ ...",,1,2010,...,False,False,False,False,False,True,False,False,False,False
1200665,J,"Zhou, YJ; Dominici, F; Louis, TA",Racial disparities in risks of mortality in a ...,JOURNAL OF THE ROYAL STATISTICAL SOCIETY SERIE...,Hierarchical model; Markov chain Monte Carlo m...,Racial disparities in risks of mortality adjus...,"[Zhou, Yijie] Merck Res Labs, Rahway, NJ 07065...",yijie_zhou@merck.com,1,2010,...,False,False,False,False,False,True,False,False,False,False


In [173]:
# Mention counts per country (default World filter), keeping at most the 20 most mentioned actors
a = actors_mention(df_inter, max_num_actors=20)
a

Unnamed: 0,Country,total_mention,Region,latitude,longitude,percentage
0,United States,909183,North America,37.09024,-95.712891,16.31
1,China,908048,East Asia & Pacific,35.86166,104.195397,16.29
2,United Kingdom,279394,Europe & Central Asia,55.378051,-3.435973,5.01
3,Italy,228257,Europe & Central Asia,41.87194,12.56738,4.1
4,Australia,217674,East Asia & Pacific,-25.274398,133.775136,3.91
5,India,197658,South Asia,20.593684,78.96288,3.55
6,Germany,195309,Europe & Central Asia,51.165691,10.451526,3.5
7,Spain,192200,Europe & Central Asia,40.463667,-3.74922,3.45
8,Canada,169566,North America,56.130366,-106.346771,3.04
9,Brazil,167254,Latin America & Caribbean,-14.235004,-51.92528,3.0


In [174]:
# Collaboration edges restricted to the actors kept in `a`;
# actors_edge returns (edge dataframe, largest edge count), so b[0] is the dataframe
b = actors_edge(df_inter, a)
b[0]

Unnamed: 0,edge,c1,c2,latitude_c1,longitude_c1,latitude_c2,longitude_c2
0,6359,Australia,Germany,-25.274398,133.775136,51.165691,10.451526
1,4758,Australia,India,-25.274398,133.775136,20.593684,78.962880
2,4191,Australia,Netherlands,-25.274398,133.775136,52.132633,5.291266
3,2726,Australia,Sweden,-25.274398,133.775136,60.128161,18.643501
4,2569,Germany,India,51.165691,10.451526,20.593684,78.962880
...,...,...,...,...,...,...,...
185,73,Brazil,Turkey,-14.235004,-51.925280,38.963745,35.243322
186,106,Malaysia,Portugal,4.210484,101.975766,39.399872,-8.224454
187,420,Malaysia,Netherlands,4.210484,101.975766,52.132633,5.291266
188,201,Japan,Turkey,36.204824,138.252924,38.963745,35.243322


In [282]:
# Collaboration graph: one node per actor of `a`, one weighted edge per pair of b[0]
g = nx.Graph()
g.add_nodes_from(a["Country"])

# Raw mention counts, then marker sizes scaled so the largest node gets 100
nodesize_raw_numbers = a["total_mention"].tolist()
maxi_node = max(nodesize_raw_numbers)
node_size = [100 * mentions / maxi_node for mentions in nodesize_raw_numbers]

# Edge weight = number of collaborations between the two actors
g.add_weighted_edges_from(
    zip(b[0]["c1"], b[0]["c2"], b[0]["edge"].tolist())
)

In [283]:
# Spring layout is randomly initialised: a fixed seed makes the node positions
# (and therefore the exported figure) reproducible across runs.
pos = nx.spring_layout(g, k=1, iterations=1000, seed=42)
edges = g.edges()
w = [g[u][v]['weight'] for u, v in edges]
# Largest edge weight, used to scale the line widths (0 when there is no edge)
maxi = max(w) if len(w) > 0 else 0

In [284]:
# Keep each node's layout coordinates on the graph itself, as a plain [x, y] list
for node, xy in pos.items():
    g.nodes[node]['pos'] = list(xy)

In [285]:
# One black line trace per edge; the width is proportional to the edge weight
# (the heaviest edge is drawn with width 10)
edge_trace = [
    go.Scatter(
        x=[pos[u][0], pos[v][0], None],
        y=[pos[u][1], pos[v][1], None],
        line=dict(width=10 * (g[u][v]["weight"]) / maxi, color="black"),
        mode='lines',
    )
    for u, v in g.edges()
]


In [286]:
 # Nodes logic
# Coordinates and labels of the nodes, in graph order
node_name = [str(node) for node in g.nodes()]
node_x = [pos[node][0] for node in g.nodes()]
node_y = [pos[node][1] for node in g.nodes()]

# The trace is built once, after the coordinates are collected (it does not
# need to be rebuilt for every node).
node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    hovertext=node_size,
    text=node_name,
    textposition="bottom center",
    marker=dict(
        showscale=True,
        # colorscale options
        # 'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
        # 'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
        # 'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
        colorscale='YlGnBu',
        reversescale=True,
        color=[],
        size=node_size,
        colorbar=dict(
            thickness=15,
            # title.text / title.side replace the deprecated title= / titleside=
            title=dict(text='Number of publications', side='right'),
            xanchor='left',
        ),
        line_width=2))

In [287]:
# Number of neighbours of every node, and the matching hover labels
node_adjacencies = [len(neighbours) for _, neighbours in g.adjacency()]
node_text = ['# of connections: ' + str(degree) for degree in node_adjacencies]

# Marker colour encodes the raw mention count of each node
node_trace.marker.color = nodesize_raw_numbers

In [288]:
# The node trace is drawn last, on top of the edges. A new list is built so
# that re-running this cell does not append node_trace to edge_trace again.
fig = go.Figure(data=edge_trace + [node_trace],
                layout=go.Layout(
                    # title.text / title.font replace the deprecated titlefont_size
                    title=dict(text='<br>Network graph showing actors collaborations',
                               font=dict(size=16)),
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20, l=5, r=5, t=40),
                    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                )
fig.update_traces(textposition='top center', textfont=dict(family='sans-serif', size=15, color='#000'))
fig.update_layout(template='ggplot2')
fig.show()

In [290]:
# Export the figure as a 1900x800 JPEG.
# NOTE(review): plotly static export presumably needs an image engine (e.g. kaleido) installed — confirm in this environment.
fig.write_image("../img/Commission/network.jpeg", width=1900, height=800)