In [1]:
import pandas as pd
import networkx as nx
import numpy as np

In [2]:
# Load the supply-chain relationship table; start_date and end_date are parsed as datetimes.
df_SC = pd.read_csv('SC_summary.csv', parse_dates=['start_date', 'end_date'])
# Replace missing dates (NaT) with a far-future sentinel: 800 months after 1970-01 is 2036-09.
# NOTE(review): the replacement applies to every NaT in the frame, not only to end_date — confirm intended.
df_SC.replace({pd.NaT: np.datetime64(800, 'M')}, inplace=True)

# 14 snapshot months, three months apart: 592 months after 1970-01 is 2019-05, the last one is 2022-08.
dates = [np.datetime64(592 + 3*i, 'M') for i in range(14)]

# Distinct (SIC code, description) pairs from the customer side and from the supplier side,
# renamed to the common column names "SIC" / "SIC_desc" and sorted by code.
df_customer_cats = df_SC[["customer_SIC", "SIC_desc"]].drop_duplicates().sort_values(by='customer_SIC').rename(columns={"customer_SIC": "SIC", "SIC_desc": "SIC_desc"})
df_supplier_cats = df_SC[["supplier_SIC", "SIC_desc_s"]].drop_duplicates().sort_values(by='supplier_SIC').rename(columns={"supplier_SIC": "SIC", "SIC_desc_s": "SIC_desc"})

# Union of both sides, de-duplicated on the full (SIC, SIC_desc) pair and sorted by SIC code.
all_sic_codes = pd.concat([df_customer_cats, df_supplier_cats]).drop_duplicates().sort_values(by='SIC')

In [5]:
def get_relevant_connections_for_date(df_SC, date):
    """Return the relationships that are active on ``date``.

    A row is kept when ``start_date <= date <= end_date`` (both bounds
    inclusive). The result carries a fresh 0-based index.
    """
    already_started = df_SC['start_date'] <= date
    not_yet_ended = df_SC['end_date'] >= date
    active = df_SC[already_started & not_yet_ended]
    return active.reset_index().drop(columns=['index'])

def convert_df_to_graph(df):
    """Build an undirected multigraph of firms from a relationship table.

    Every row of ``df`` must hold exactly ten values in this order: start
    date, end date, customer ID, customer name, customer SIC, customer SIC
    description, supplier ID, supplier name, supplier SIC, supplier SIC
    description. Each row adds (or updates) one node per firm and adds one
    supplier-customer edge whose ``edge_id`` attribute is the row's index label.
    """
    graph = nx.MultiGraph()
    for edge_id, record in df.iterrows():
        # Positional unpacking: the dates are not stored on the graph.
        (
            _start_date, _end_date,
            customer_id, customer_name, customer_sic, customer_sic_desc,
            supplier_id, supplier_name, supplier_sic, supplier_sic_desc,
        ) = record

        graph.add_node(customer_id, name=customer_name, SIC=customer_sic, desc=customer_sic_desc)
        graph.add_node(supplier_id, name=supplier_name, SIC=supplier_sic, desc=supplier_sic_desc)

        # The edge describes the goods flowing from supplier to customer.
        edge_attributes = {
            'customer': customer_id,
            'customer_name': customer_name,
            'supplier': supplier_id,
            'supplier_name': supplier_name,
            'goods': supplier_sic,
            'descr': supplier_sic_desc,
            'edge_id': edge_id,
        }
        graph.add_edge(supplier_id, customer_id, **edge_attributes)
    return graph

# TODO: add network statistics (centrality, closeness, etc.) -- see the networkx documentation.


def get_suppliers(df, firmID):
    """Return the rows in which ``firmID`` is the customer, plus its sector code.

    Returns
    -------
    tuple
        ``(rows, sic)``. ``rows`` is the subset of ``df`` whose ``customer_ID``
        equals ``firmID``. ``sic`` is ``None`` when there are no such rows;
        otherwise it is the integer median of ``customer_SIC`` if that value
        actually occurs in the rows, else the integer maximum.
    """
    rows = df.loc[df['customer_ID'] == firmID]
    if rows.empty:
        return rows, None

    sic_values = rows['customer_SIC']
    median_sic = int(sic_values.median())
    # A median that falls between two codes is not a real sector code.
    sector = median_sic if median_sic in sic_values.values else int(sic_values.max())
    return rows, sector

def get_clients(df, firmID):
    """Return the rows in which ``firmID`` is the supplier, plus its sector code.

    Parameters
    ----------
    df : pandas.DataFrame
        Relationship table with at least ``supplier_ID`` and ``supplier_SIC``.
    firmID
        Identifier compared against the ``supplier_ID`` column.

    Returns
    -------
    tuple
        ``(clients, sic)``. ``clients`` is the subset of ``df`` whose
        ``supplier_ID`` equals ``firmID``. ``sic`` is ``None`` when there are
        no such rows; otherwise it is the integer median of ``supplier_SIC``
        if that value actually occurs in the rows, else the integer maximum.
    """
    clients = df[df['supplier_ID'] == firmID]
    if clients.empty:
        return clients, None

    sector_supplier_SIC = int(clients['supplier_SIC'].median())
    # A median that falls between two codes is not a real sector code; fall
    # back to the largest code that is present. The fallback must update the
    # same variable that is returned.
    if sector_supplier_SIC not in clients['supplier_SIC'].values:
        sector_supplier_SIC = int(clients['supplier_SIC'].max())
    return clients, sector_supplier_SIC


def score_dualsourcing(df, G):
    """Count, per customer sector and supplier sector, how often firms dual-source.

    For every node of ``G`` that appears as a customer in ``df``, the firm's
    supplier rows are grouped by ``supplier_SIC``. A supplier sector with more
    than one row counts as dual-sourced for that firm, otherwise as not
    dual-sourced.

    Parameters
    ----------
    df : pandas.DataFrame
        Relationship table for one snapshot (needs ``customer_ID``,
        ``customer_SIC`` and ``supplier_SIC``).
    G
        Graph whose ``nodes`` are firm identifiers matching ``customer_ID``.

    Returns
    -------
    tuple
        ``(sectors_dualsourced, sectors_not_dualsourced, share)``. Both tables
        are square DataFrames labelled by the SIC codes of the module-level
        ``all_sic_codes``; rows are the customer firm's sector, columns the
        supplier sector, and each cell counts firms. ``share`` is the fraction
        of firms with suppliers that dual-source at least one sector, or NaN
        when no firm has any supplier.

    Raises
    ------
    KeyError
        If a sector code found in ``df`` is missing from ``all_sic_codes``.
    """
    # Labels must be unique: the same SIC code can appear with several
    # descriptions in all_sic_codes, which would otherwise duplicate rows/columns.
    list_sic_codes = list(all_sic_codes['SIC'].drop_duplicates().values)
    n = len(list_sic_codes)
    sectors_dualsourced = pd.DataFrame(np.zeros((n, n)), columns=list_sic_codes, index=list_sic_codes)
    sectors_not_dualsourced = pd.DataFrame(np.zeros((n, n)), columns=list_sic_codes, index=list_sic_codes)

    total_dual_sourcers = 0
    total_firms = 0
    for node in G.nodes:
        supplier_of_goods, SIC_code = get_suppliers(df, node)
        if supplier_of_goods.empty:
            # Firm never appears as a customer in this snapshot.
            continue
        total_firms += 1

        # NOTE(review): this counts rows per supplier sector, so repeated rows for
        # the same supplier also count as dual sourcing — confirm that is intended.
        suppliers_per_sector = supplier_of_goods['supplier_SIC'].value_counts()
        is_dualsourced = suppliers_per_sector > 1

        if is_dualsourced.any():
            total_dual_sourcers += 1

        sectors_that_were_dualsourced = list(suppliers_per_sector[is_dualsourced].index)
        sectors_that_were_not_dualsourced = list(suppliers_per_sector[~is_dualsourced].index)

        # Single .loc[row, column] access: chained indexing (frame[col][row] += 1)
        # writes to a temporary under pandas Copy-on-Write and loses the update.
        for supplier_sector_SIC in sectors_that_were_dualsourced:
            sectors_dualsourced.loc[SIC_code, supplier_sector_SIC] += 1

        for supplier_sector_SIC in sectors_that_were_not_dualsourced:
            sectors_not_dualsourced.loc[SIC_code, supplier_sector_SIC] += 1

    # Avoid ZeroDivisionError for an empty snapshot.
    if total_firms == 0:
        return sectors_dualsourced, sectors_not_dualsourced, float('nan')
    return sectors_dualsourced, sectors_not_dualsourced, total_dual_sourcers / total_firms


# Per-snapshot results; each list gets one entry per element of `dates`, in the same order.
list_G = []
list_sectors_dualsourced = []
list_sectors_not_dualsourced = []
list_does_any_dualsourcing = []
for date in dates:
    print(date)
    # Relationships that are active on this snapshot date.
    df = get_relevant_connections_for_date(df_SC, date)
    #df = df.head(25)
    G = convert_df_to_graph(df)
    # Side effect: writes one GraphML file per snapshot (relative path); edge ids are taken from the 'edge_id' attribute.
    nx.write_graphml(G, f'001_full_network_{date}.graphml', prettyprint=True, edge_id_from_attribute='edge_id')
    # Two sector-by-sector count tables plus the share of firms that dual-source at all.
    sectors_dualsourced, sectors_not_dualsourced, pct_any_dualsource = score_dualsourcing(df, G)

    list_G.append(G)
    list_sectors_dualsourced.append(sectors_dualsourced)
    list_sectors_not_dualsourced.append(sectors_not_dualsourced)
    list_does_any_dualsourcing.append(pct_any_dualsource)

2019-05
2019-08
2019-11
2020-02
2020-05
2020-08
2020-11
2021-02
2021-05
2021-08
2021-11
2022-02
2022-05
2022-08


In [134]:
# Total of the dual-sourced count table for each snapshot (zip keeps both result lists aligned).
abs_amnt_dual_sourcing = [
    dual.sum().sum()
    for dual, _single in zip(list_sectors_dualsourced, list_sectors_not_dualsourced)
]
# Dual-sourced total as a fraction of the combined (dual + not dual) total for each snapshot.
list_pct_dualsourcing = [
    dual.sum().sum() / (dual.sum().sum() + single.sum().sum())
    for dual, single in zip(list_sectors_dualsourced, list_sectors_not_dualsourced)
]

In [145]:
# Node and edge counts of every snapshot graph, in the order of `list_G`.
size_of_network = [graph.number_of_nodes() for graph in list_G]
size_of_network_edges = [graph.number_of_edges() for graph in list_G]

In [136]:
import plotly.express as px

# Share of dual-sourced supplier sectors over the snapshot dates.
px.line(
    x=dates,
    y=list_pct_dualsourcing,
    title='Firmwise view on suppliers: Percentage of supplier sectors with two or more suppliers from the same sector',
)

In [137]:
# Absolute number of dual-sourced sector entries over the snapshot dates.
px.line(
    x=dates,
    y=abs_amnt_dual_sourcing,
    title='Firmwise view on suppliers: Absolute amount of dual-sourced sectors',
)

In [138]:
# Fraction of firms (with suppliers) that dual-source at least one sector, per snapshot.
px.line(
    x=dates,
    y=list_does_any_dualsourcing,
    title='Firms that employ dual source practices',
)

In [144]:
# Number of firms (graph nodes) per snapshot.
px.line(
    x=dates,
    y=size_of_network,
    title='Size of the Factset Supply Chain network',
    labels=dict(x="Date", y="Companies in the supply chain"),
)

In [146]:
# Number of supplier-customer relations (graph edges) per snapshot.
px.line(
    x=dates,
    y=size_of_network_edges,
    title='Size of the Factset Supply Chain network',
    labels=dict(x="Date", y="Relations in the supply chain"),
)

In [150]:
# Number of supplier-customer relations (graph edges) per snapshot.
# NOTE(review): this cell repeats the preceding edge-count plot verbatim; one of the two can probably be removed.
px.line(x=dates, y=size_of_network_edges,
        labels={
            "x": "Date",
            "y": "Relations in the supply chain",
        },
        title='Size of the Factset Supply Chain network')

In [152]:
# Node count, edge count and density of every snapshot graph, in the order of `list_G`.
size_of_network = [graph.number_of_nodes() for graph in list_G]
size_of_network_edges = [graph.number_of_edges() for graph in list_G]
density_of_network = list(map(nx.density, list_G))

In [153]:
# Graph density per snapshot. The title now names the plotted quantity;
# it previously carried the copy-pasted "Size of ..." title of the size plots.
px.line(x=dates, y=density_of_network,
        labels={
            "x": "Date",
            "y": "Density of the supply chain",
        },
        title='Density of the Factset Supply Chain network')

In [6]:
from cdlib import algorithms

# Run cdlib's Leiden community detection on the first snapshot graph (list_G[0], i.e. the first entry of `dates`).
# NOTE(review): no random seed is set here; presumably the partition can differ between runs — confirm.
coms = algorithms.leiden(list_G[0])

In [11]:
# Number of communities found in the first snapshot.
len(coms.communities)

261

In [8]:
from cdlib import TemporalClustering
# One Leiden clustering per snapshot graph, registered under the snapshot's position in `list_G`.
tc = TemporalClustering()
for index, snapshot_graph in enumerate(list_G):
    snapshot_clustering = algorithms.leiden(snapshot_graph)
    tc.add_clustering(snapshot_clustering, index)

In [20]:
# Print the number of communities of every registered clustering.
for clustering in tc.clusterings:
    communities = tc.clusterings[clustering].communities
    print(len(communities))

262
269
266
256
271
270
247
251
262
241
250
268
261
265
