In [1]:
"""
Build the concordance from the product space and the SITC codes to the technology space with the IPC
codes. This will build the tech space so that I can place the inventors on this space.
The steps are as follows:
    1) Define a dictionary "sitc_proximity_dict" which returns the proximity between two SITC classifications
    in the original product space
    2) Find all possible combinations of IPC classifications of length two which can form edges in the
    new technology network
    3) For each potential edge, find the expected value of that edge in the original product space using
    the probability that IPC code x = SITC code y and the proximity of (x, y) in the original product space.
    4) The proximity in the original product space is given by the adjacency matrix G so the proximity
    between x and y equals G_{x, y}

        For example take the following example taken directly from the concordance used,

        IPC code    SITC code   Probability
        "A01B",     "721.1",    .938634
        "A01B",     "721.9",    .061366
        "A62C",     "598.9",    1

        The edge between (A01B, A62C) in the new technology space is then:

        Pr(A01B = 721.1) x Pr(A62C = 598.9) x G_{721.1, 598.9}
                                   +
        Pr(A01B = 721.9) x Pr(A62C = 598.9) x G_{721.9, 598.9}
"""

'\nBuild the concordance from the product space and the SITC codes to the technology space with the IPC\ncodes. This will build the tech space so that I can place the inventors on this space.\nThe steps are as follows:\n    1) Define a dictionary "sitc_proximity_dict" which returns the proximity between two SITC classifications\n    in the original product space\n    2) Find all possible combinations of IPC classifications of length two which can form edges in the\n    new technology network\n    3) For each potential edge, find the expected value of that edge in the original product space using\n    the probability that IPC code x = SITC code y and the proximity of (x, y) in the original product space.\n    4) The proximity in the original product space is given by the adjacency matrix G so the proximity\n    between x and y equals G_{x, y}\n\n        For example take the following example taken directly from the concordance used,\n\n        IPC code    SITC code   Probability\n      

In [2]:
import os
import pandas as pd
from itertools import combinations
import numpy as np
import networkx as nx
import pickle

In [3]:
# Root folder holding the project data. Override it by setting the TFM_DATA_DIR
# environment variable; the default is the original author's local folder.
DATA_DIR = os.environ.get("TFM_DATA_DIR", "D:\\IDEA Masters\\TFM Data")

# The following cells read their CSV files by bare file name, so they rely on
# the working directory being the data folder.
os.chdir(DATA_DIR)

In [4]:
# Load the inventor table; later cells use its "section_class_sub" column as the
# set of technology-network nodes.
# NOTE(review): the truncated warning printed below this cell is presumably
# pandas' DtypeWarning about mixed-type columns -- confirm, and if so consider
# passing dtype= (or low_memory=False) here.
inventor = pd.read_csv("Inventor Data.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
# Load the product-space proximities; later cells read the "commoditycode_1",
# "commoditycode_2" and "proximity" columns.
sitc_data = pd.read_csv("sitc2_proximities.csv")

In [6]:
# The concordance lives in a sub-folder of the project data folder. The root can
# be overridden with the TFM_DATA_DIR environment variable; the default is the
# original author's local folder.
CONCORDANCE_DIR = os.path.join(
    os.environ.get("TFM_DATA_DIR", "D:\\IDEA Masters\\TFM Data"),
    "IPC to Trade codes",
    "SITC_Rev2",
)

# NOTE: the working directory stays here for the rest of the notebook, so the
# GML files written later with bare file names end up in this folder.
os.chdir(CONCORDANCE_DIR)
ipc_subclass = pd.read_csv("ipc4_to_sitc_rev2_4.txt")

In [11]:
"Set all links g_ii = 1 since we are measuring proximity not distance!"

# Walk the rows by position and pin the proximity of every self-pair to 1.
# The index name `i` is kept because the next cell reads it after the loop.
for i in range(len(sitc_data)):
    row = sitc_data.loc[i]
    if row["commoditycode_1"] == row["commoditycode_2"]:
        sitc_data.at[i, "proximity"] = 1

In [9]:
# Spot check left over from debugging: `i` is whatever value the loop in the
# previous cell finished on, so this only inspects that single row.
sitc_data.loc[i]["commoditycode_1"] == sitc_data.loc[i]["commoditycode_2"]

True

In [13]:
"""
Clean the codes so that they match across SITC to Zolas
"""
# Scale the concordance's SITC codes by 10 and store them as 64-bit integers.
# The column is overwritten, so running this cell twice scales the codes twice.
scaled_codes = ipc_subclass["sitc_rev2_4"].mul(10)
ipc_subclass["sitc_rev2_4"] = scaled_codes.astype(np.int64)

In [14]:
# Report how many distinct codes each table holds, one count per line.
for table, column in (
    (sitc_data, "commoditycode_1"),
    (ipc_subclass, "ipc4"),
    (ipc_subclass, "sitc_rev2_4"),
    (inventor, "section_class_sub"),
):
    distinct_codes = table[column].unique()
    print(len(distinct_codes))

771
634
721
634


In [15]:
# 1) Proximity lookup for the original product space:
#    (SITC code, SITC code) -> proximity.
sitc_proximity_dict = dict(zip(zip(sitc_data["commoditycode_1"], sitc_data["commoditycode_2"]),
                               sitc_data["proximity"]))

# 2) Every unordered pair of IPC subclasses is a candidate edge of the
#    technology network.
edges = list(combinations(sorted(inventor["section_class_sub"].unique()), 2))

# Group the concordance once instead of filtering the whole table twice per
# edge. Each IPC subclass maps to its (SITC code, probability weight) rows,
# kept in their original row order so the sums are accumulated as before.
ipc_matches = {
    ipc_code: list(zip(rows["sitc_rev2_4"].to_numpy(), rows["probability_weight"].to_numpy()))
    for ipc_code, rows in ipc_subclass.groupby("ipc4", sort=False)
}

# 3) Expected product-space proximity of each edge:
#    sum over all SITC matches (s, t) of
#    Pr(source = s) * Pr(target = t) * G_{s, t}.
edges_dict = {}
for edge in edges:

    source, target = edge

    # IPC subclasses absent from the concordance simply have no matches.
    source_matches = ipc_matches.get(source, ())
    target_matches = ipc_matches.get(target, ())

    edge_sum = 0

    for source_sitc_code, pr_source in source_matches:
        for target_sitc_code, pr_target in target_matches:

            sitc_pair = (source_sitc_code, target_sitc_code)

            # SITC pairs missing from the proximity table contribute nothing.
            if sitc_pair in sitc_proximity_dict:
                product_space_proximity = sitc_proximity_dict[sitc_pair]
                edge_sum = edge_sum + pr_source * pr_target * product_space_proximity

    # Assigned once per edge, outside the SITC loops, so that an IPC subclass
    # with no concordance rows gets 0 instead of being left as None (which
    # would make the later round() call fail).
    edges_dict[edge] = edge_sum

In [None]:
"""
Check the number of nodes... have we lost any areas?
"""

In [16]:
# Compare the average proximity of the product space with the average weight
# of the derived technology-space edges.
mean_sitc_proximity = np.mean(sitc_data["proximity"])
mean_edge_weight = np.mean(list(edges_dict.values()))
(mean_sitc_proximity, mean_edge_weight)

(0.18248070594201943, 0.2110893354215286)

In [17]:
def check_symmetric(a, rtol=1e-05, atol=1e-08):
    """
    Check whether an adjacency matrix is symmetric within a tolerance.

    Parameters
    ----------
    a : array_like
        Candidate matrix; anything ``np.asarray`` accepts (ndarray, nested
        lists, ...).
    rtol, atol : float
        Relative and absolute tolerances forwarded to ``np.allclose``.

    Returns
    -------
    bool
        True when ``a`` is a square 2-D array that equals its transpose within
        the tolerances; False otherwise, including for input that is not
        2-D or not square.
    """
    a = np.asarray(a)

    # Without this guard a (1, n) array broadcasts against its (n, 1) transpose
    # and can be reported as symmetric, while other non-square shapes raise.
    if a.ndim != 2 or a.shape[0] != a.shape[1]:
        return False

    return np.allclose(a, a.T, rtol=rtol, atol=atol)

In [18]:
# Keep three decimals per edge weight (updated in place, same keys).
edges_dict.update({pair: round(weight, 3) for pair, weight in edges_dict.items()})

# Undirected graph: one node per IPC subclass seen in the inventor data and one
# weighted edge per entry of edges_dict.
graph = nx.Graph()
graph.add_nodes_from(inventor["section_class_sub"].unique())
graph.add_weighted_edges_from(
    (source, target, weight) for (source, target), weight in edges_dict.items()
)

nx.write_gml(graph, "Technology Network from PS.gml")

In [19]:
"""
Clean the product space so as to present a visual representation. Keep only the top 5
connections for each node; this results in a now-directed network. This is only for visual purposes.
"""

# Read the network back from the GML file written in the previous cell.
graph = nx.read_gml("Technology Network from PS.gml")

In [25]:
# Dense adjacency matrix of the graph, with rows and columns ordered by sorted node label.
adj = nx.to_numpy_array(graph, nodelist=sorted(graph.nodes()))

In [21]:
# Row-normalise the adjacency matrix so that every row with at least one
# connection sums to 1.
row_sums = adj.sum(axis=1, keepdims=True)

# A node whose weights are all zero has a row sum of 0; dividing would fill its
# row with NaN (0 / 0), so such rows are left as zeros instead.
adj = np.divide(adj, row_sums, out=np.zeros_like(adj), where=row_sums != 0)

In [22]:
# Number of strongest connections to keep for each node.
TOP_K = 5

n = len(graph.nodes)

# Guard for graphs with fewer than TOP_K nodes, where argpartition would raise
# because the requested position falls outside the row.
k = min(TOP_K, n)

adj_normalised = np.zeros((n, n))
for i in range(n):

    a = adj[i, :]
    # Column indices of the k largest entries of this row (in no particular order).
    ind = np.argpartition(a, -k)[-k:]
    # Copy only those entries; everything else in the row stays 0.
    adj_normalised[i, ind] = a[ind]

adj_normalised

array([[0.        , 0.00535571, 0.00337293, ..., 0.        , 0.        ,
        0.        ],
       [0.00534925, 0.        , 0.00336333, ..., 0.        , 0.        ,
        0.        ],
       [0.00346458, 0.00345889, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [23]:
# Keeping each node's own top connections makes the matrix asymmetric, so build
# a directed graph: an undirected one would merge (i, j) and (j, i) into a
# single edge and keep only one of the two weights.
graph_normalised = nx.from_numpy_array(adj_normalised, create_using=nx.DiGraph)

# from_numpy_array labels the nodes 0..n-1 in matrix order; map them back to the
# sorted node labels that were used to build the adjacency matrix.
node_labels = dict(enumerate(sorted(graph.nodes())))
nx.relabel_nodes(graph_normalised, node_labels, copy=False)

nx.write_gml(graph_normalised, "Technology Network from PS (top 5).gml")

In [24]:
# Average entry of the row-normalised adjacency matrix.
adj.mean()



0.0015772870662460572