In [1]:
"""SCRIPT FOR DEEP DIVE: NETWORK ANALYSIS AT THE LOCAL LEVEL FOR SELECTED TECHNOLOGY HYDROGEN"""

'SCRIPT FOR DEEP DIVE: NETWORK ANALYSIS AT THE LOCAL LEVEL FOR SELECTED TECHNOLOGY'

In [1]:
# Imports
import pandas as pd
import numpy as np


In [None]:
# Import the dataframes with the partnerships and the organizations
long_df_berlin = pd.read_csv("long_df_berlin.csv")
long_df_houston = pd.read_csv("long_df_houston.csv")
df_orgs_houston = pd.read_csv("df_orgs_houston.csv")
df_orgs_berlin = pd.read_csv("df_orgs_berlin.csv")


In [38]:
import networkx as nx
from ipysigma import Sigma

list_orgtypes = ["All", "Hydrogen services", "Service sector", "Mining industry", "Hydrogen industry", "Utilities", "Oil and gas firms", "Other industry", "Research organizations", "Banks", "Venture Capital", "Other Finance", "Governmental organizations", "Incubators/Accelerators",  "Other"]

def create_network_plots_with_edges(df_edges, df_organizations, city):
    # make post_date a datetime
    df_edges['post_date'] = pd.to_datetime(df_edges['post_date'])

    columns = df_edges.columns[6:]

    columns_= list(columns)
    columns_.append('partners')

    df_edges["partners"] = df_edges.apply(lambda x: '-'.join(sorted([x['source'], x['target']])), axis=1)

    """Split the datasets into pre IRA and post IRA"""
    # df with datatime before 1.07.2022
    df_pre_ira = df_edges[df_edges['post_date'] < datetime.datetime(2022, 7, 1)]
    # drop duplicate columns
    df_pre_ira = df_pre_ira.drop_duplicates(subset=columns_)
    df_pre_ira = df_pre_ira.drop(columns=['partners'])

    df_pos_ira = df_edges[df_edges['post_date'] >= datetime.datetime(2022, 7, 1)]
    df_pos_ira = df_pos_ira.drop_duplicates(subset=columns_)
    df_pos_ira = df_pos_ira.drop(columns=['partners'])




    for period, df_copy in enumerate([df_pre_ira,df_pos_ira]):
        list_number_nodes = []
        list_number_edges = []
        org_rows = []

        G = nx.from_pandas_edgelist(df_copy, source="source", target="target")

        # Compute the node degree for node size
        node_degrees = dict(G.degree())

        # Map firm types from df2 to the nodes
        firm_type_dict = pd.Series(df_organizations['orgtype'].values, index=df_organizations['Linkedin_name']).to_dict()
        label_dict = pd.Series(df_organizations['name'].values, index=df_organizations['Linkedin_name']).to_dict()

        # Set node degree and firm type as node attributes
        nx.set_node_attributes(G, node_degrees, 'degree')
        nx.set_node_attributes(G, firm_type_dict, 'firm_type')
        nx.set_node_attributes(G, label_dict, 'label')

        # Group nodes by orgtype
        orgtype_groups = {}
        for node, data in G.nodes(data=True):
            orgtype = data.get('orgtype', 'Other')
            orgtype_groups.setdefault(orgtype, []).append(node)

        # Define a color mapping for firm types
        color_mapping = {
            'Service sector': 'lightblue',
            'Hydrogen services': 'lightgreen',
            'Mining industry': '#756bb1',
            'Hydrogen industry': 'green',
            'Research organizations': 'orange',
            'Utilities': '#104862',
            'Oil and gas firms': 'brown',
            'Other Finance': 'pink',
            'Banks': 'purple',
            'Venture Capital': 'yellow',
            'Governmental organizations': 'red',
            'Incubators/Accelerators': '#008080',
            'Other industry': 'blue'
        }


        # Adjust node attributes for gravity and repulsion
        for node in G.nodes:
            degree = G.nodes[node]['degree']
            G.nodes[node]['centralGravity'] = 0.7 if degree > 10 else 0.2  # Stronger gravity for higher-degree nodes
            G.nodes[node]['nodeStrength'] = -10 if degree < 5 else -30  # Less repulsion for low-degree nodes

        # Adjust edge attributes for stronger attraction
        for edge in G.edges:
            # Set edge strength based on the degree of connected nodes
            source_degree = G.nodes[edge[0]]['degree']
            target_degree = G.nodes[edge[1]]['degree']
            G.edges[edge]['edgeStrength'] = 0.5 + min(source_degree, target_degree) * 0.05

        layout_settings = {
            "gravity": 0.1,  # Keeps the overall graph cohesive
            "scalingRatio": 2.5,  # Ensures enough space between clusters
            "nodeStrength": "nodeStrength",  # Repulsion strength depends on node degree
            "edgeStrength": "edgeStrength",  # Stronger attraction for connected nodes
            "centralGravity": "centralGravity",  # Higher gravity for hubs
            "outboundAttraction": True,  # Prevents low-degree nodes from drifting
            "linLogMode": True,  # Uses logarithmic distance scaling for better clustering
            "strongGravityMode": True,  # Reinforces gravity to pull nodes closer to their hubs
            "adjustSizes": True,  # Prevents overlap by adjusting distances
        }

               # Visualize the graph using Sigma
        Sigma.write_html(
            G,
            './network_period_'+str(period)+'_'+ city+'.html',
            fullscreen=True,
            node_metrics=['louvain'],
            node_color='firm_type',
            node_size_range=(1, 25),
            max_categorical_colors=10,
            default_edge_type='curve',
            node_border_color_from='node',
            default_node_label_size=14,  # Node label font size
            node_size='degree',
            node_color_palette=color_mapping,
            default_edge_size=0.08,
            node_label="label",
            label_density=0,  # Use the 'label' attribute for node labels
            layout_settings=layout_settings
        )

        for orgtype in list_orgtypes:
            if orgtype=="All":
                # number of nodes
                list_number_nodes.append(len(G.nodes))
                # number of edges
                list_number_edges.append(len(G.edges))
                org_rows.append(orgtype)
            else:
                try: list_number_nodes.append(len(df_organizations[df_organizations['orgtype']==orgtype]))
                except: list_number_nodes.append(0)
                # number of edges where orgtype = orgtype
                try:list_number_edges.append(df_copy[(df_copy['source'].isin(orgtype_groups[orgtype]))&(df_copy['target'].isin(orgtype_groups[orgtype]))].shape[0])
                except: list_number_edges.append(0)

                org_rows.append(orgtype)

    # create dataframe with the descriptives
    df_descriptives = pd.DataFrame({"orgtype":org_rows, "number_nodes":list_number_nodes, "number_edges":list_number_edges})

    # save as csv for period
    df_descriptives.to_csv("descriptives_"+city+"_"+str(period)+".csv", index=False)

    # Node-Level Metrics
    degree_centrality = nx.degree_centrality(G)

    # Mapping the dictionary to the DataFrame
    df_organizations['degree_centrality'] = df_organizations['Linkedin_name'].map(degree_centrality)

    # Save the DataFrame
    df_organizations.to_csv("df_organizations_"+city+"_"+str(period)+".csv", index=False)

In [39]:
# Create and store network plots with edges for both cities
create_network_plots_with_edges(long_df_houston, df_orgs_houston, "Houston")
create_network_plots_with_edges(long_df_berlin, df_orgs_berlin, "Berlin")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_edges['post_date'] = pd.to_datetime(df_edges['post_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_edges["partners"] = df_edges.apply(lambda x: '-'.join(sorted([x['source'], x['target']])), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_edges['post_date'] = pd.to_datetime(df

In [35]:
""" Compute the distribution of partnerships across different collaboration types (e.g., R&D collaborations, joint ventures, etc.) """

columns = list(long_df_houston.columns)
columns_partnerships = columns[-23:]

In [None]:
def drop_duplicate_partnerships(df):
    columns = df.columns[6:]

    columns_= list(columns)
    columns_.append('partners')

    # sort partners
    df["partners"] = df.apply(lambda x: '-'.join(sorted([x['source'], x['target']])), axis=1)

    # drop duplicate columns
    df = df.drop_duplicates(subset=columns_)
    df = df.drop(columns=['partners'])
    return df

In [36]:
import datetime

"""For Berlin"""
# make post_date a datetime
long_df_berlin['post_date'] = pd.to_datetime(long_df_berlin['post_date'])

columns = long_df_berlin.columns[6:]

columns_= list(columns)
columns_.append('partners')

long_df_berlin["partners"] = long_df_berlin.apply(lambda x: '-'.join(sorted([x['source'], x['target']])), axis=1)

"""Split the datasets into pre IRA and post IRA"""
# df with datatime before 1.07.2022
long_df_berlin_pre_ira = long_df_berlin[long_df_berlin['post_date'] < datetime.datetime(2022, 7, 1)]
# drop duplicate columns
long_df_berlin_pre_ira = long_df_berlin_pre_ira.drop_duplicates(subset=columns_)
long_df_berlin_pre_ira = long_df_berlin_pre_ira.drop(columns=['partners'])

long_df_berlin_pos_ira = long_df_berlin[long_df_berlin['post_date'] >= datetime.datetime(2022, 7, 1)]
long_df_berlin_pos_ira = long_df_berlin_pos_ira.drop_duplicates(subset=columns_)
long_df_berlin_pos_ira = long_df_berlin_pos_ira.drop(columns=['partners'])

print(long_df_berlin_pos_ira.shape[0])

long_df_berlin_pre_ira = long_df_berlin_pre_ira[columns_partnerships].sum()
long_df_berlin_pre_ira["period"] = "0"
print(long_df_berlin_pre_ira)
long_df_berlin_pre_ira.to_csv("collabs_berlin_pre_ira.csv")
long_df_berlin_pos_ira = long_df_berlin_pos_ira[columns_partnerships].sum()
long_df_berlin_pos_ira["period"] = "1"
print(long_df_berlin_pos_ira)
long_df_berlin_pos_ira.to_csv("collabs_berlin_post_ira.csv")

1038
r_and_d_collaborations                     79.0
demonstrations_and_pilots                  53.0
commercialisation_and_product_launches      4.0
production_and_manufacturing                3.0
offtake_agreements_and_futures              0.0
adoption_and_deployments                  117.0
operations_and_maintenance                  3.0
grants                                     70.0
equity_investments                         17.0
loans                                       0.0
other_unspecified_finance                  11.0
spin_offs                                   3.0
mergers_and_acquisitions                    2.0
joint_ventures                             10.0
incubators_and_accelerator_programs         3.0
certification_and_approvals                 2.0
training                                    3.0
core_technology                           240.0
infrastructure                             48.0
software_and_digital_platforms             10.0
raw_materials                      

In [41]:
"""For Houston"""

# make post_date a datetime
long_df_houston['post_date'] = pd.to_datetime(long_df_houston['post_date'])

long_df_houston_pre_ira = long_df_houston[long_df_houston['post_date'] < datetime.datetime(2022, 7, 1)]
long_df_houston_pos_ira = long_df_houston[long_df_houston['post_date'] >= datetime.datetime(2022, 7, 1)]

# drop duplicate columns
long_df_houston_pre_ira = drop_duplicate_partnerships(long_df_houston_pre_ira)
long_df_houston_pos_ira = drop_duplicate_partnerships(long_df_houston_pos_ira)
print(long_df_houston_pre_ira.shape[0])

long_df_houston_pre_ira = long_df_houston_pre_ira[columns_partnerships].sum()
long_df_houston_pre_ira["period"] = "0"
print(long_df_houston_pre_ira)
long_df_houston_pre_ira.to_csv("collabs_houston_pre_ira.csv")

long_df_houston_pos_ira = long_df_houston_pos_ira[columns_partnerships].sum()
long_df_houston_pos_ira["period"] = "1"
print(long_df_houston_pos_ira)
long_df_houston_pos_ira.to_csv("collabs_houston_post_ira.csv")

335
r_and_d_collaborations                     81.0
demonstrations_and_pilots                  33.0
commercialisation_and_product_launches     32.0
production_and_manufacturing               15.0
offtake_agreements_and_futures              2.0
adoption_and_deployments                   93.0
operations_and_maintenance                  1.0
grants                                     30.0
equity_investments                         32.0
loans                                       2.0
other_unspecified_finance                  23.0
spin_offs                                   0.0
mergers_and_acquisitions                    6.0
joint_ventures                             16.0
incubators_and_accelerator_programs        38.0
certification_and_approvals                 7.0
training                                    1.0
core_technology                           306.0
infrastructure                             30.0
software_and_digital_platforms              9.0
raw_materials                       

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["partners"] = df.apply(lambda x: '-'.join(sorted([x['source'], x['target']])), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["partners"] = df.apply(lambda x: '-'.join(sorted([x['source'], x['target']])), axis=1)


In [26]:
df_bars = pd.concat([long_df_berlin_pre_ira,long_df_berlin_pos_ira,long_df_houston_pre_ira,long_df_houston_pos_ira],axis=1)
print(df_bars)
df_bars.to_csv("houston_berlin_collaboration_bars.csv")

In [8]:
""" Check the share of "outside" collaborations by comparing within city partnerships vs. outside city partnerships"""

# import all partnership
long_df= pd.read_csv("../Data/innovation_network_prepost_2020-01-01.csv")
long_df = long_df[long_df.Hydrogen == 1]

# make datetime
long_df['post_date'] = pd.to_datetime(long_df['post_date'])
long_df_pos_ira = long_df[long_df['post_date'] >= datetime.datetime(2022, 7, 1)]
long_df_pos_ira = drop_duplicate_partnerships(long_df_pos_ira)
long_df_pre_ira = long_df[long_df['post_date'] < datetime.datetime(2022, 7, 1)]
long_df_pre_ira = drop_duplicate_partnerships(long_df_pre_ira)

         source               target              post_id   
0  saudi-aramco                sabic  6960967815109443584  \
1            bp              equinor  7068135555187757056   
2       de-nora       john-cockerill  7012057244582166528   
3       de-nora  thyssenkrupp-nucera  7012057244582166528   
4       de-nora               topsoe  7012057244582166528   

        collaboration_id            post_date  Biomass  Biofuels  Biogas   
0  6960967815109443584_0  2022-08-04 14:40:55        0         0       0  \
1  7068135555187757056_0  2023-05-27 08:07:15        0         0       0   
2  7012057244582166528_0  2022-12-23 14:12:04        0         0       0   
3  7012057244582166528_0  2022-12-23 14:12:04        0         0       0   
4  7012057244582166528_0  2022-12-23 14:12:04        0         0       0   

   Wind  Offshore_Wind  ...  joint_ventures   
0     0              0  ...               0  \
1     0              0  ...               0   
2     0              0  ...        

In [12]:
# for all firms headquartered in Houston / Berlin
# houston

firms_houston = list(df_orgs_houston[(df_orgs_houston.headquarter==1)&(df_orgs_houston.sector.str.contains("Hydrogen", na=False))&(df_orgs_houston.firm_type.str.contains("manu|projec|serv", na=False, case=False))].Linkedin_name)

number_links = long_df_houston_pos_ira[(long_df_houston_pos_ira.source.isin(firms_houston))|(long_df_houston_pos_ira.target.isin(firms_houston))].shape[0]
number_links_total = long_df_pos_ira[(long_df_pos_ira.source.isin(firms_houston))|(long_df_pos_ira.target.isin(firms_houston))].shape[0]
print("Houston", number_links, number_links_total, number_links/number_links_total)

# berlin
firms_berlin = list(df_orgs_berlin[(df_orgs_berlin.headquarter==1)&(df_orgs_berlin.sector.str.contains("Hydrogen", na=False))&(df_orgs_berlin.firm_type.str.contains("manu|projec|serv", na=False, case=False))].Linkedin_name)

number_links = long_df_berlin_pos_ira[(long_df_berlin_pos_ira.source.isin(firms_berlin))|(long_df_berlin_pos_ira.target.isin(firms_berlin))].shape[0]
number_links_total = long_df_pos_ira[(long_df_pos_ira.source.isin(firms_berlin))|(long_df_pos_ira.target.isin(firms_berlin))].shape[0]
print("Berlin", number_links, number_links_total, number_links/number_links_total)

Houston 111 781 0.14212548015364918
Berlin 388 1770 0.2192090395480226


In [13]:
# for pre ir
# houston
number_links = long_df_houston_pre_ira[(long_df_houston_pre_ira.source.isin(firms_houston))|(long_df_houston_pre_ira.target.isin(firms_houston))].shape[0]
number_links_total = long_df_pre_ira[(long_df_pre_ira.source.isin(firms_houston))|(long_df_pre_ira.target.isin(firms_houston))].shape[0]
print("Houston", number_links, number_links_total, number_links/number_links_total)

# berlin
number_links = long_df_berlin_pre_ira[(long_df_berlin_pre_ira.source.isin(firms_berlin))|(long_df_berlin_pre_ira.target.isin(firms_berlin))].shape[0]
number_links_total = long_df_pre_ira[(long_df_pre_ira.source.isin(firms_berlin))|(long_df_pre_ira.target.isin(firms_berlin))].shape[0]
print("Berlin", number_links, number_links_total, number_links/number_links_total)

Houston 83 506 0.16403162055335968
Berlin 113 585 0.19316239316239317
