In [1]:
import os
import pandas as pd
from collections import Counter, OrderedDict
import networkx as nx
import nx_altair as nxa
import altair as alt
from networkx.drawing.nx_agraph import graphviz_layout
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import spacy 

# NOTE(review): this only binds a notebook-level name called `low_memory`.
# It does not configure pandas; `low_memory` affects parsing only when it is
# passed as a keyword argument to `pd.read_csv`.
low_memory=False

In [2]:
# Load the citation table from the working directory.
# Patent identifiers are read as strings so they are not parsed as numbers.
filepath = os.path.join(os.getcwd(), 'patents_with_title.csv')
data = pd.read_csv(
    filepath,
    sep=";",
    dtype={'patent_number': str, 'cited_patent_number': str},
    # Parse the file in one pass so every column's dtype is inferred from all
    # rows. The recorded output of this cell shows a warning on load --
    # presumably pandas' mixed-dtype warning; confirm it disappears.
    low_memory=False,
)


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
# Clean the citation table in one chain:
#   1. keep one row per (patent_number, cited_patent_number) pair,
#   2. drop rows that have no cited patent,
#   3. replace every remaining missing value with the string "0".
data = (
    data
    .drop_duplicates(subset=["patent_number", "cited_patent_number"])
    .dropna(subset=["cited_patent_number"])
    .fillna("0")
)

# Notebook-level counter, reset to zero whenever this cell runs.
count = 0

# Two-column frame (citing patent, cited patent) used later as an edge list.
cited_patents_df = pd.DataFrame(data, columns=["patent_number", "cited_patent_number"])

In [4]:

# Mark every (citing, cited) row with 1 so the pivot yields a 0/1 matrix:
# rows are citing patents, columns are cited patents.
data["count"] = 1
citation_matrix = data.pivot(
    index="patent_number",
    columns="cited_patent_number",
    values="count",
).fillna(0)

# Pairwise cosine similarity between the citation rows of all citing patents,
# wrapped in a frame labelled by patent number on both axes.
citation_similarity = cosine_similarity(citation_matrix)
sim_df = pd.DataFrame(
    citation_similarity,
    index=citation_matrix.index,
    columns=citation_matrix.index,
)
# Writing the matrix to disk is currently disabled:
# sim_df.to_csv("citation_similarity.csv", sep=";")


In [5]:
# Load the similarity matrix from disk; the index holds patent numbers and is
# normalised to strings so it can be compared with `cited_patents_df`.
citation_similarity = pd.read_csv("citation_similarity.csv", index_col="patent_number", dtype={'patent_number': str}, sep=";")
citation_similarity.index = citation_similarity.index.astype(str)

# Patents that appear as a citing patent in the cleaned citation table.
cited_patent_list = set(cited_patents_df["patent_number"])

# Restrict the matrix to those patents on BOTH axes, using one ordered list.
# A set must not be used as an indexer: pandas >= 2.0 rejects it, and its
# iteration order is arbitrary, so rows and columns could end up misaligned.
patents_in_scope = [patent for patent in citation_similarity.index if patent in cited_patent_list]
citation_similarity = citation_similarity.loc[patents_in_scope, patents_in_scope]

# Similarities at or below the threshold are zeroed so they produce no edge.
threshold = 0.6

# `mask` returns a new frame, so `citation_similarity` keeps the unfiltered
# values. Missing values are left untouched because `NaN <= threshold` is False.
citation_similarity_filtered = citation_similarity.mask(citation_similarity <= threshold, 0)

# Directed graph with one node per patent; each non-zero cell becomes an edge.
citsim_graph = nx.from_pandas_adjacency(citation_similarity_filtered, create_using=nx.DiGraph)

In [16]:
# Patents singled out for the organization colouring and title comparison.
patents_to_analyze = ['10360191', '10447770', '10657526', '10505949', '10255108', '10608825', '10764325', '10382388', '10783128', '10790963']
patents_to_analyze_df = data[data['patent_number'].isin(patents_to_analyze)].drop_duplicates(subset='patent_number')

# Rows are read positionally from `itertuples()`: field 0 is the index, field k
# is the k-th column of `data`.
# NOTE(review): field 2 is presumably `patent_number` (it is matched against
# `patents_to_analyze`), field 4 the title, and fields 5 / 9 two organization
# columns -- confirm against the CSV header.
#
# The loop variable is named `row` rather than `tuple`: a module-level
# `for tuple in ...` rebinds the builtin `tuple` for the rest of the notebook.

# Organization label for the analysed patents only: field 9 when it is set,
# otherwise field 5. "0" is the placeholder written by the earlier `fillna`.
organization_dict = {
    row[2]: (row[5] if row[9] == "0" else row[9])
    for row in data.itertuples()
    if row[2] in patents_to_analyze
}
nx.set_node_attributes(citsim_graph, organization_dict, "organization")

# Expose each node's own identifier as an attribute (used in the tooltip).
patent_id_dict = {row[2]: row[2] for row in data.itertuples()}
nx.set_node_attributes(citsim_graph, patent_id_dict, "patent_id")

# Degree centrality rescaled (x1000, +3) and stored as "centrality", which the
# chart uses as node size.
degree = nx.degree_centrality(citsim_graph)
degree_dict = {key: value * 1000 + 3 for key, value in degree.items()}
nx.set_node_attributes(citsim_graph, degree_dict, "centrality")

# Patent title per node (used in the tooltip).
title_dict = {row[2]: row[4] for row in data.itertuples()}
nx.set_node_attributes(citsim_graph, title_dict, "patent_title")

In [7]:
# Pairwise title similarity for the patents under analysis.
nlp = spacy.load("en_core_web_lg")

# Parse every title exactly once. Re-running `nlp(...)` inside the nested loop
# would parse each title again for every pair, with identical results.
# Field 4 of each row tuple is the text handed to spaCy (presumably the title).
title_docs = [nlp(row[4]) for row in patents_to_analyze_df.itertuples()]

# Row r, column c holds the spaCy similarity between title r and title c.
patent_title_sim_matrix = [
    [doc_a.similarity(doc_b) for doc_b in title_docs]
    for doc_a in title_docs
]
patent_title_sim_matrix_df = pd.DataFrame(patent_title_sim_matrix, index=patents_to_analyze_df['patent_number'], columns=patents_to_analyze_df['patent_number'])

In [8]:
# Colour the edges between analysed patents by title similarity.
# Each entry is (inclusive lower bound, colour), checked from highest to
# lowest; anything below the last bound gets the fallback colour.
similarity_palette = [
    (0.9, '#09cf02'),
    (0.8, '#009687'),
    (0.7, '#d2e004'),
    (0.6, '#e0b004'),
]
fallback_colour = '#e03f04'

edge_color_dict = {}
patent_labels = patent_title_sim_matrix_df.index
n_patents = len(patent_title_sim_matrix_df)

# Visit each unordered pair once (upper triangle of the similarity matrix).
for i in range(n_patents - 1):
    for j in range(i + 1, n_patents):
        first, second = patent_labels[i], patent_labels[j]
        # Only pairs connected first -> second in the graph are coloured.
        if (first, second) not in citsim_graph.edges:
            continue
        similarity = patent_title_sim_matrix_df.iloc[i, j]
        colour = next(
            (shade for lower_bound, shade in similarity_palette if similarity >= lower_bound),
            fallback_colour,
        )
        # Register the colour for both directions of the pair.
        edge_color_dict[(first, second)] = colour
        edge_color_dict[(second, first)] = colour

nx.set_edge_attributes(citsim_graph, edge_color_dict, 'edge_colors')

In [18]:
# Draw the citation-similarity graph as an interactive Altair chart.
alt.data_transformers.disable_max_rows()

# Visual encodings refer to the node / edge attributes set in earlier cells.
citsim_draw_options = dict(
    node_color='organization',
    edge_color='edge_colors',
    node_size='centrality',
    node_tooltip=["organization", "patent_id", "patent_title"],
    width=3,
)
chart = (
    nxa.draw_networkx(citsim_graph, **citsim_draw_options)
    .properties(width=800, height=800)
    .interactive()
)
chart



In [17]:
# Build an undirected graph with one edge per (citing patent, cited patent)
# row of the cleaned edge list.
cited_patents_graph = nx.from_pandas_edgelist(
    cited_patents_df,
    source="patent_number",
    target="cited_patent_number",
)

# Force-directed node positions. The seed fixes the random initial placement
# so the layout is identical on every run.
# NOTE(review): `pos` is not passed to any drawing call visible in this
# notebook -- confirm whether it is still needed.
pos = nx.spring_layout(cited_patents_graph, seed=42)

In [18]:
# Degree centrality of every node in the citation graph, rescaled (x1000, +3)
# and stored as the "centrality" node attribute used for node size.
degree = nx.degree_centrality(cited_patents_graph)
degree_dict = dict(
    (node, score * 1000 + 3)
    for node, score in degree.items()
)
nx.set_node_attributes(cited_patents_graph, degree_dict, "centrality")

In [19]:
# Organization label per patent, read positionally from `itertuples()`
# (field 0 is the index, field k is the k-th column of `data`).
# Field 8 is used unless field 7 equals "0" (the placeholder written by the
# earlier `fillna`), in which case field 4 is used.
#
# Built in a single pass: a preliminary dict filled from field 5 would have
# every entry overwritten by this rule, so it is omitted. The loop variable is
# `row` rather than `tuple` so the builtin `tuple` is not rebound.
#
# NOTE(review): the citation-similarity cell derives the same attribute from
# fields 9 / 5 / 9 and treats field 4 as the patent title. The indices here
# (7 / 4 / 8) may be stale -- confirm against the CSV header.
organization_dict = {
    row[2]: (row[4] if row[7] == "0" else row[8])
    for row in data.itertuples()
}

nx.set_node_attributes(cited_patents_graph, organization_dict, "organization")

In [20]:
# import unknown patents
filepath = os.path.join(os.getcwd(), 'unknown_patents.csv')
unknown_patents = pd.read_csv(filepath, sep=";", dtype={'patent_number': str, 'cited_patent_number': str})

# Map every patent identifier (field 2 of each row tuple) to itself so it can
# be shown in the tooltip. The identifiers from `unknown_patents` are merged
# into the mapping built from `data`; rebinding the name instead would throw
# away the entries from `data`.
patent_id_dict = {row[2]: row[2] for row in data.itertuples()}
patent_id_dict.update({row[2]: row[2] for row in unknown_patents.itertuples()})

nx.set_node_attributes(cited_patents_graph, patent_id_dict, "patent_id")

In [21]:
# Colour label per patent: 'blue' for patents listed in `data`, 'red' for
# patents listed in `unknown_patents`. 'red' is applied last, so it wins when
# an identifier occurs in both tables.
patent_color_dict = {row[2]: 'blue' for row in data.itertuples()}
patent_color_dict.update({row[2]: 'red' for row in unknown_patents.itertuples()})

# Attach the colour mapping (not the id mapping) as the "node_color" attribute.
nx.set_node_attributes(cited_patents_graph, patent_color_dict, "node_color")

In [59]:
# Draw the citation graph as an interactive Altair chart.
alt.data_transformers.disable_max_rows()

# Node colour and size come from the "node_color" and "centrality" attributes.
citation_draw_options = dict(
    node_color='node_color',
    node_size='centrality',
    cmap='viridis',
    edge_color='black',
    node_tooltip=["organization", "patent_id"],
)
chart = (
    nxa.draw_networkx(cited_patents_graph, **citation_draw_options)
    .properties(width=600, height=600)
    .interactive()
)
chart

NameError: name 'cited_patents_graph' is not defined

In [None]:
# Temporarily lift pandas' row and column display limits and render the whole
# frame; further option/value pairs can be appended to the tuple.
unlimited_display = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*unlimited_display):
    display(data)


In [None]:
# Keep only the rows whose field-16 value also occurs as a field-2 value
# somewhere in `data`. `itertuples()` field k is column position k - 1, hence
# the positional lookups below.
# NOTE(review): field 16 presumably holds a related patent id and field 2 the
# patent number -- confirm against the CSV header.
#
# This replaces a nested row-by-row scan that
#   * cleared its "exists" flag on ANY non-matching row instead of only when
#     no row matched, and
#   * tried to drop rows with `data.drop[...]`, which subscripts the method
#     and raises TypeError.
known_patent_ids = data.iloc[:, 1]
has_known_reference = data.iloc[:, 15].isin(known_patent_ids)

# Count the rows whose reference was found, following the earlier
# commented-out draft of this cell that incremented `count` on a match.
count += int(has_known_reference.sum())

data = data[has_known_reference]