In this notebook i will embedd patents and search for them this way

In [1]:
# Imports 

# Own Packages
from Masterarbeit_utils.model_utils_agg import get_tokenizer, load_and_modify_model, load_pretrained_Tokenizer

# Site Packages
import pandas as pd
import numpy as np
import pickle as pk
import torch
import os 
import sys
import psutil
from collections import Counter
import itertools
# Dimension reduction algorithms
#from cuml.manifold import TSNE
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from scipy.spatial import distance
from scipy.fft import fft, fftfreq
# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import export_png
from bokeh.palettes import Viridis256, Category20
from bokeh.transform import linear_cmap, factor_cmap
from bokeh.colors import RGB

# Huggingface
from transformers import AutoTokenizer, OPTForCausalLM
from tokenizers.processors import TemplateProcessing
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from torch.utils.data import Dataset

2023-09-27 10:12:01.402939: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-27 10:12:01.421141: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
###########################################################
# Loading the Henkel Patents
###########################################################

# Directories in which data important for the notebook is stored
dump_dir = 'PK_DUMP'
data_dir = 'data'

# Loading the dataframes with the patents deemed most important for electrically debondable adhesives from Henkel
henkel_patents = pd.read_csv(f'{data_dir}/Henkel_patente_patstat_docdb_families_abstract.csv', delimiter=',').reset_index(drop=True)
henkel_orbit = pd.read_csv(f'{data_dir}/Henkel_Orbit_Suche_Patstat_Export.csv', delimiter=',')

# Filtering the Samples which contain F-Terms
henkel_filtered = henkel_patents[henkel_patents['fterms'].notna()]
henkel_filtered = henkel_filtered.reset_index(drop=True)

orbit_filtered = henkel_orbit[henkel_orbit['fterms'].notna()]
orbit_filtered = orbit_filtered.reset_index(drop=True)

print(f"There are {len(henkel_patents['doc_db_family_id'].unique())} unique patents in the Henkel dataset, only {len(henkel_filtered['doc_db_family_id'].unique())} of them contain F-Terms.")

################################################################
# Loading the Model
################################################################

model_name = 'gal_125_new_1'
checkpoint = int(2*86515)
# If True normalization is applied to the embeddings
norm = True

# The folder at which the model will be saved. This folder has to be created for your system 
model_folder = f'data/models/{model_name}'
os.makedirs(model_folder, exist_ok=True)

# Folder in which the tokenizer will be saved
tokenizer_folder = f'data/tokenizers/{model_name}'
os.makedirs(tokenizer_folder, exist_ok=True)

# Folder at which all pickle files are stored. This folder is fixed for this project and should not be changed
dump_dir = r'PK_DUMP'

# This folder will be created and filled with txt.files for each sample after you run the Pytorch Dataset Notebook
dataset_folder = f'data/dataset_samples_new_with_id'

# Model parameters 
'''
mini	125 M
base	1.3 B
standard	6.7 B
large	30 B
huge	120 B'''

# selecting the correct model 
if model_name.split('_')[1] == '125':
    base_model_name = 'mini'
elif model_name.split('_')[1] == '1300':
    base_model_name = 'base'


# All new Torch-objects will be by default in this dtype
# if default_type = float16 fp16 must be False
default_dtype = torch.float32
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_default_dtype(default_dtype)

# Default device on which the model will be loaded
default_device = 'cpu'

# Number of GPUs the model will be parallelised to 
num_gpus = 1
# If you change 'default_device' to 'cpu', make sure to set num_gpus to zero.
if default_device == 'cpu':
    num_gpus = 0

tensor_parallel = False


device_map=None
max_memory = {}
if num_gpus > 0:
    # based on https://github.com/huggingface/accelerate/blob/5315290b55ea9babd95a281a27c51d87b89d7c85/src/accelerate/utils/modeling.py#L274
    for i in range(num_gpus):
        _ = torch.tensor([0], device=i)
    for i in range(num_gpus):
        max_memory[i] = torch.cuda.mem_get_info(i)[0]
    device_map = "auto"
max_memory["cpu"] = psutil.virtual_memory().available

model = OPTForCausalLM.from_pretrained(f'{model_folder}/checkpoint-{checkpoint}', torch_dtype=default_dtype, low_cpu_mem_usage=True,
                                               device_map=device_map, max_memory=max_memory)

###########################
# Loading the Tokenizer
###########################
tokenizer = AutoTokenizer.from_pretrained(tokenizer_folder)
n_f_terms = len(tokenizer) - tokenizer.vocab_size
print('Loaded Tokenizer from serialized instance!')    
print(f'There are {n_f_terms:,} different F-Terms in the whole Dataset!')


###########################
# Loading Descriptions
###########################
with open(f'{dump_dir}/agg_themes_descriptions_new.pk', 'rb') as f:
    theme_dict = pk.load(f)
with open(f'{dump_dir}/agg_viewpoints_descriptions_new.pk', 'rb') as f:
    viewpoint_dict = pk.load(f)
with open(f'{dump_dir}/agg_numbers_descriptions_new.pk', 'rb') as f:
    number_dict = pk.load(f)
with open(f'{dump_dir}/agg_full_descriptions_new.pk', 'rb') as f:
    full_descriptions_dict = pk.load(f)


###########################
# Extracting the Patent-Embeddings 
###########################
class JapPatDatasetEmb(Dataset):
    """Dataset containing Japanese patents and their F-Term classification"""
    def __init__(self, data_folder, tokenizer):
        """
        data_folder: path to folder containing the text samples
        tokenizer: tokenizer instance with added additional Tokens for F-Terms
        """
        super(Dataset).__init__()
        self.data_folder = data_folder
        self.l = len(os.listdir(data_folder))
        self.tokenizer = tokenizer
        
    def __len__(self):
        return self.l
    
    def __getitem__(self, idx):
        try:
            with open(f'{self.data_folder}/{idx}.txt', 'r', encoding='utf-8') as f:
                item = f.read()
        except FileNotFoundError:
            raise FileNotFoundError
        
        # Tokenizing the item 

        abstract = item.split('<START F-TERMS>')[0]
        id = abstract.split('/')[0]
        
        item = abstract

        
        output = self.tokenizer(item)  
        output.pop('token_type_ids')
        return output, id

# loading the dataset to embedd the patents
train_dataset = JapPatDatasetEmb(f'{dataset_folder}/train', tokenizer)
validation_dataset = JapPatDatasetEmb(f'{dataset_folder}/validation', tokenizer)

for b in train_dataset:
    print(b)
    break

def generate_embedding(abstract):
    out = model(input_ids= torch.tensor([[i]]), attention_mask = torch.tensor([[1]]), output_hidden_states=True)          
    out = out.hidden_states
    for i, k in enumerate(out):
        context_less_emb[i].append(k.to('cpu').detach().numpy())

There are 34 unique patents in the Henkel dataset, only 15 of them contain F-Terms.
Loaded Tokenizer from serialized instance!
There are 195,617 different F-Terms in the whole Dataset!
({'input_ids': [0, 43, 46, 40, 39, 47, 41, 45, 46, 39, 37, 2294, 4144, 19971, 14574, 452, 1863, 2380, 1845, 962, 841, 343, 281, 35308, 395, 299, 8652, 4409, 301, 286, 1092, 1789, 301, 281, 4198, 685, 345, 17601, 312, 7821, 14574, 658, 281, 35308, 395, 299, 8652, 4409, 36, 73, 31629, 48, 351, 4198, 3170, 3838, 36044, 4198, 345, 17601, 312, 7821, 14574, 658, 281, 35308, 395, 299, 8652, 4409, 36, 381, 4198, 3170, 3838, 655, 281, 6960, 2675, 388, 9019, 281, 8652, 3838, 345, 25078, 281, 14574, 922, 452, 1693, 286, 35308, 395, 299, 8652, 4409, 301, 286, 1092, 1789, 312, 281, 8652, 1078, 2675, 388, 6063, 400, 281, 8652, 3838, 377, 286, 1092, 1789, 388, 343, 838, 452, 286, 8652, 3838, 25078, 286, 14574, 922, 321, 550, 34748, 286, 14574, 922, 36, 29551, 9631, 2472, 412, 49847, 4574, 48, 1482, 243, 44, 50001], 'at

In [4]:
henkel_filtered

Unnamed: 0,doc_db_family_id,appln_id,appln_auth,appln_kind,appln_filing_date,appln_nr_original,granted,pat_publn_id,publn_auth,publn_nr,...,appln_id-2,publn_date,publn_lg,publn_first_grant,publn_claims,appln_id-3,fterms,appln_id-4,appln_abstract_lg,appln_abstract
0,15124483,31945235,JP,A,1999-05-14,13427599,Y,292125794,JP,2000319599,...,31945235,2000-11-21,,N,0,31945235.0,"4J040/JB10,4J040/JA09,4J040/PA42,4J040/MB03,4J...",31945235,en,PROBLEM TO BE SOLVED: To provide a process for...
1,15124483,31945235,JP,A,1999-05-14,13427599,Y,292125795,JP,3848490,...,31945235,2006-11-22,,Y,0,31945235.0,"4J040/JB10,4J040/JA09,4J040/PA42,4J040/MB03,4J...",31945235,en,PROBLEM TO BE SOLVED: To provide a process for...
2,18155212,36149433,JP,A,1996-11-19,8323484,N,296888242,JP,H10147752,...,36149433,1998-06-02,,N,0,36149433.0,"4J040/EC001,4J040/EB111,4J040/EB031,4J040/MA02...",36149433,en,PROBLEM TO BE SOLVED: To provide a method wher...
3,23387243,37823570,JP,A,2002-08-19,2002238134,N,291974626,JP,2003129030,...,37823570,2003-05-08,,N,0,37823570.0,"4J038/DJ021,4J040/PA42,4J040/NA19,4J040/KA42,4...",37823570,en,"PROBLEM TO BE SOLVED: To obtain a composition,..."
4,36203625,407371493,JP,A,2013-02-08,2013023112,N,407239310,JP,2013100541,...,407371493,2013-05-23,,N,0,407371493.0,"4F073/GA11,4F073/GA07,4F073/BB11,4F073/BA19,4F...",407371493,en,PROBLEM TO BE SOLVED: To provide polymeric mat...
5,40471151,57394459,JP,A,2007-09-26,2007249578,Y,317831107,JP,4404926,...,57394459,2010-01-27,,Y,0,57394459.0,"2C056/KC30,2C056/KC10,2C056/KC09,2C056/EA26,2C...",57394459,en,<P>PROBLEM TO BE SOLVED: To provide an ink car...
6,40471151,57394459,JP,A,2007-09-26,2007249578,Y,276757036,JP,2009078460,...,57394459,2009-04-16,,N,0,57394459.0,"2C056/KC30,2C056/KC10,2C056/KC09,2C056/EA26,2C...",57394459,en,<P>PROBLEM TO BE SOLVED: To provide an ink car...
7,40780346,57704169,JP,A,2007-11-06,2007288611,Y,365151335,JP,4971106,...,57704169,2012-07-11,,Y,0,57704169.0,"2C002/MM04,2C002/MM02,2C002/LL01,2C002/KK02,2C...",57704169,en,<P>PROBLEM TO BE SOLVED: To provide a separati...
8,40780346,57704169,JP,A,2007-11-06,2007288611,Y,276457839,JP,2009112513,...,57704169,2009-05-28,,N,0,57704169.0,"2C002/MM04,2C002/MM02,2C002/LL01,2C002/KK02,2C...",57704169,en,<P>PROBLEM TO BE SOLVED: To provide a separati...
9,42010241,275325130,JP,A,2008-07-31,2008198057,Y,410777738,JP,5296446,...,275325130,2013-09-25,,Y,0,275325130.0,"4F100/AA02A,4F100/AB10,4F100/AK25A,4F100/AL01A...",275325130,en,<P>PROBLEM TO BE SOLVED: To provide an electri...


# Plotting the Henkel F-Terms

In [68]:
###########################################
# Create a histogram of the Henkel F-Terms
###########################################

# Extracting the frequencies and the fterms from the counters
hist = list(counter_henkel.values())
fterms = list(counter_henkel.keys())


# Sorting the frequencies and the fterms 
idx = np.argsort(hist)[::-1]
hist = [hist[i] for i in idx]
fterms = [fterms[i] for i in idx]
edges = np.arange(len(hist)+1)

# Extracting the theme viewpoint and number definitions

themes = []
for fterm in fterms:
    try:
        themes.append(theme_dict[fterm[:5]]) 
    except KeyError:
        themes.append('Not Found')
        
vps = []
for fterm in fterms:
    try: 
        vps.append(viewpoint_dict[fterm[:8]])
    except KeyError:
        vps.append('Not Found')
        
numbers = []
for fterm in fterms:
    try:
        numbers.append(number_dict[fterm[:10]]) 
    except KeyError:
        numbers.append('Not Found')

# Create a Bokeh figure
output_notebook()  # Display Bokeh plots in Jupyter Notebook
p = figure(title="Histogram Henkel F-Terms", y_axis_label="Frequency", width=1000, height=1000)

# Create a ColumnDataSource for the histogram bars
source = ColumnDataSource(data={"top": hist, "left": edges[:-1], "right": edges[1:], "theme": themes})

# Create a color mapping for themes
unique_themes = list(set(themes))
color_mapping = factor_cmap("theme", palette=Category20[len(unique_themes)], factors=unique_themes)

# Create VBar glyph for the histogram bars with the color mapping
p.vbar(x="left", top="top", bottom=0, width=1, source=source, fill_color=color_mapping, legend_field="theme")

# Create a new column in the ColumnDataSource for text annotations
text_annotations = [str(count) for count in hist]

# Add annotations to each bin at the correct position
source.add(text_annotations, "text")
source.add(fterms, 'F-Term')
source.add(themes, 'Theme')
source.add(vps, 'Vp.')
source.add(numbers, 'Number')

# Create a HoverTool to display annotations on hover
hover = HoverTool()
hover.tooltips = [("Count", "@text"),("F-Term", "@{F-Term}"), ("Theme", "@Theme"), ("Viewpoint", "@{Vp.}"), ("Number", "@Number")]
p.add_tools(hover)

# Remove x-axis ticks and description
p.xaxis.visible = False

# Show the plot
show(p)


In [4]:
# Plotting the Henkel Embeddings and the Orbit Embeddings in TSNE Plots 
orbit_emb = []
o_ft = []
for fterm in counter_orbit.keys(): 
    try:
        orbit_emb.append(out_emb_dict[fterm])
        o_ft.append(fterm)
    except KeyError:
        pass

orbit_emb = np.array(orbit_emb)

henkel_emb = []
h_ft = []
for fterm in counter_henkel.keys():
    try:
        henkel_emb.append(out_emb_dict[fterm])
        h_ft.append(fterm)
    except KeyError:
        pass

henkel_emb = np.array(henkel_emb)

all_emb = np.concatenate([orbit_emb, henkel_emb], 0)
all_ft = [*o_ft, *h_ft]

tsne = TSNE(n_components=2, verbose=0, random_state=69)
rep = tsne.fit_transform(all_emb)

datasource_henkel = ColumnDataSource(
        data=dict(
            x = rep[len(o_ft):,0],
            y = rep[len(o_ft):,1],
            fterms = h_ft,
            themes = [theme_dict[fterm[:5]] for fterm in h_ft],
            viewpoints = [viewpoint_dict[fterm[:8]] for fterm in h_ft],
            numbers = [number_dict[fterm[:10]] for fterm in h_ft]))

datasource_orbit = ColumnDataSource(
        data=dict(
            x = rep[:len(o_ft), 0],
            y = rep[:len(o_ft), 1],
            fterms = o_ft,
            themes = [theme_dict[fterm[:5]] for fterm in o_ft],
            viewpoints = [viewpoint_dict[fterm[:8]] for fterm in o_ft],
            numbers = [number_dict[fterm[:10]] for fterm in o_ft]))


hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>F-Term:</b> @fterms<br><b>Theme:</b> @themes<br><b>Viewpoint:</b> @viewpoints<br><b>Number:</b> @numbers</div>', mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Henkel and Orbit Embeddings')
    
plot_tsne.circle('x', 'y', size=10, fill_color=RGB(250, 50, 100), alpha=1, line_width=0, source=datasource_henkel, name="Henkel Embeddings")
plot_tsne.square('x', 'y', size=7, fill_color=RGB(50, 75, 250), alpha=0.2, line_width=0, source=datasource_orbit, name="Orbit Embeddings")

show(plot_tsne)

In [None]:
print(f'''
There are 7 main clusters of Henkel F-Terms, which correspont to 7 main Themes:
    "Containers, tranfer, fixing, positioning etc. of wavers ,etc."
    "Adhesives or adhesive processes"
    "Laminated bodies #2"
    "Treatments of macromolecular shaped articles"
    "Paints or removers"
    "ink jet, e.g. ink supply, others"
    "golf clubs"''')

# Simply Searching for Similar F-Term Embeddings

In [None]:
"Searching for similar F-Terms without creating difference Vectors"

# n = number of similar and novel F-Terms (not jet found by previous similarity search), That are saved for each sample. 
n = 50
hits = []
cos = torch.nn.CosineSimilarity(dim=1)

for i, q_fterm in enumerate(counter_henkel.keys()):
    print(i, q_fterm, end='\r')
    q_emb = out_emb_dict[q_fterm]
    search_embs = []
    search_fterms = []
    for fterm, emb in out_emb_dict.items():
        # Filtering all previously found f
        if fterm in hits:
            continue
        if fterm in counter_henkel.keys():
            continue
        search_embs.append(emb)
        search_fterms.append(fterm)

    search_emb = torch.tensor(np.array(search_embs))
    q_emb = torch.tensor(np.array([q_emb for _ in search_emb]))

    simis = cos(search_emb, q_emb)
    idx = np.argsort(simis)
    for i in idx[-n:]:
        hits.append(search_fterms[i])

In [None]:
# Plotting the embeddigns hits vs the orbit search results

hits_emb = torch.tensor(np.array([out_emb_dict[fterm] for fterm in hits]))

orbit_emb = []
o_ft = []
for fterm in counter_orbit.keys(): 
    try:
        orbit_emb.append(out_emb_dict[fterm])
        o_ft.append(fterm)
    except KeyError:
        pass

orbit_emb = np.array(orbit_emb)

henkel_emb = []
h_ft = []
for fterm in counter_henkel.keys():
    try:
        henkel_emb.append(out_emb_dict[fterm])
        h_ft.append(fterm)
    except KeyError:
        pass

henkel_emb = np.array(henkel_emb)

all_emb = np.concatenate([orbit_emb, henkel_emb, hits_emb], 0)
all_ft = [*o_ft, *h_ft, *hits]

tsne = TSNE(n_components=2, verbose=0, random_state=69)
rep = tsne.fit_transform(all_emb)

datasource_henkel = ColumnDataSource(
        data=dict(
            x = rep[len(o_ft):len(o_ft) + len(h_ft),0],
            y = rep[len(o_ft):len(o_ft) + len(h_ft),1],
            fterms = h_ft,
            themes = [theme_dict[fterm[:5]] for fterm in h_ft],
            viewpoints = [viewpoint_dict[fterm[:8]] for fterm in h_ft],
            numbers = [number_dict[fterm[:10]] for fterm in h_ft]))

datasource_orbit = ColumnDataSource(
        data=dict(
            x = rep[:len(o_ft), 0],
            y = rep[:len(o_ft), 1],
            fterms = o_ft,
            themes = [theme_dict[fterm[:5]] for fterm in o_ft],
            viewpoints = [viewpoint_dict[fterm[:8]] for fterm in o_ft],
            numbers = [number_dict[fterm[:10]] for fterm in o_ft]))

datasource_emb_search = ColumnDataSource(
        data=dict(
            x = rep[len(o_ft) + len(h_ft):, 0],
            y = rep[len(o_ft) + len(h_ft):, 1],
            fterms = hits,
            themes = [theme_dict[fterm[:5]] for fterm in hits],
            viewpoints = [viewpoint_dict[fterm[:8]] for fterm in hits],
            numbers = [number_dict[fterm[:10]] for fterm in hits]))


hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>F-Term:</b> @fterms<br><b>Theme:</b> @themes<br><b>Viewpoint:</b> @viewpoints<br><b>Number:</b> @numbers</div>', mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Henkel and Orbit Embeddings')
    
plot_tsne.circle('x', 'y', size=10, fill_color=RGB(250, 50, 75), alpha=1, line_width=0, source=datasource_henkel, name="Henkel Embeddings")
plot_tsne.square('x', 'y', size=7, fill_color=RGB(50, 75, 250), alpha=0.2, line_width=0, source=datasource_orbit, name="Orbit Embeddings")
plot_tsne.triangle('x', 'y', size=7, fill_color=RGB(75, 250, 50), alpha=0.2, line_width=0, source=datasource_emb_search, name="Cos Similar Embeddings")

show(plot_tsne)

# In Viewpoint Differences

In [57]:
def create_fterm_dict():
    """
    Creates a hirachical dict with all F-Terms ordered by Theme -> Viewpoint > F-Terms
    """
    f_term_dict = {}
    for f_term in f_term_tokens:
        theme = f_term.split('/')[0]
        vp = f_term[:8]
        # Creating a dict entry for the theme
        try: 
            _ = f_term_dict[theme]
        except KeyError:
            f_term_dict[theme] = {}
    
        # Creating a dict entry for the viewpoint
    
        try:
            # The first dict call will def. work the second may work if the vp-dict entry 
            # was already made. If it works the theme is appended to the viewpoint dict
            f_term_dict[theme][vp].append(f_term)
        except KeyError:
            f_term_dict[theme][vp] = []

    return f_term_dict
    

def create_all_diffs():
    """
    Creates all possible in viewpoint combinations and returns them a s a 
    """
    all_diffs = {}
    emb = out_emb_dict
    # Calculating the needed combinations
    f_term_dict = create_fterm_dict()
    for i, (theme, t_dict) in enumerate(f_term_dict.items()):
        # print(i, theme, len(t_dict), end='\r')
        all_diffs[theme] = {}
        for viewpoint, fterms in t_dict.items():
            all_diffs[theme][viewpoint] = {}
            combinations = itertools.combinations(fterms, 2)
            for fterm1, fterm2 in combinations:
                diff = emb[fterm2[:10]] - emb[fterm1[:10]]
                diff = normalize(np.array([diff]))
                all_diffs[theme][viewpoint][(fterm1, fterm2)] = diff

    return all_diffs
    

def create_diffs_tensor(block_theme, all_diffs):
    """
    Creates a tensor with all diffs, which do not contain the block theme.
    Additionaly also returns a list with all comination descriptions
    """
    # Filtering out the unwanted theme
    diffs = {theme: t_dict for theme, t_dict in all_diffs.items() if theme != block_theme}
    out_diffs = []
    out_desc = []
    for _, t_dict in diffs.items():
        for _, vp_dict in t_dict.items():
            for comb, diff in vp_dict.items():
                out_desc.append(comb)
                out_diffs.append(diff)

    out_diffs = np.array(out_diffs)
    out_diffs = out_diffs.squeeze(1)
    return out_diffs, out_desc


def search_cos(query_vec, all_vecs, all_desc, n):
    """
    Computes the cosine similarities between all_vecs and the query_vec and returns the descriptions of the n best matches.
    
    """
    cos = torch.nn.CosineSimilarity(dim=1)
    # Creating an array of query vectors, with the same number of vectors as the all_vecs array.
    query = np.concatenate([query_vec for _ in all_vecs], 0)
    simis = cos(torch.tensor(all_vecs, requires_grad=False), torch.tensor(query, requires_grad=False))
    # Creating the indices of the top n similarities
    idx = np.argsort(simis).numpy()[::-1][:n]
    matches = [all_desc[i] for i in idx]
    simis = [simis[i] for i in idx]
    return matches
    

In [15]:
# Hirachical dict which contains all in viewpoint differences
all_diffs = create_all_diffs()

In [58]:
########################################################
# Searching with henkel in viewpoint differences
########################################################

# maximum number of new patents added to the results per query diff
n = 50
# Hirachical dict containing all Henkel f-terms
henkel_dict = {}

for fterm in counter_henkel.keys():
    theme = fterm[:5]
    try:
        _ = henkel_dict[theme]
    except KeyError:
        henkel_dict[theme] = {}

    vp = fterm[:8]
    try: 
        henkel_dict[theme][vp].append(fterm)
    except KeyError:
        henkel_dict[theme][vp] = [fterm]

# List of all henkel f-term in viewpoint combinations 
henkel_invp = []
henkel_invp_diff = []

for t_dict in henkel_dict.values():
    for fterms in t_dict.values():
        combinations = list(itertools.combinations(fterms, 2))
        henkel_invp.extend(combinations)
        for fterm1, fterm2 in combinations:
            diff = normalize(np.array([out_emb_dict[fterm2]])-np.array([out_emb_dict[fterm1]]))
            henkel_invp_diff.append(diff)


found_f_terms = []
# list which stores the query diffs, by which the found_f_terms were found
found_by = []
for i, (query_diff, desc) in enumerate(zip(henkel_invp_diff, henkel_invp)):
    print(i, len(found_f_terms), len(found_by), end='\r')
    f_terms_added = 0
    theme = desc[0][:5]
    search_diffs, search_descs = create_diffs_tensor(theme, all_diffs)
    results = search_cos(query_diff, search_diffs, search_descs, n*10) # Getting more results to account for multiple finings of one f-term
    while f_terms_added < n:
        try:
            fterm1, fterm2 = results.pop(0)
            if fterm1 not in found_f_terms:
                found_f_terms.append(fterm1)
                found_by.append(desc)
                f_terms_added += 1
    
            if fterm2 not in found_f_terms:
                found_f_terms.append(fterm2)
                found_by.append(desc)
                f_terms_added += 1
        except IndexError:
            results = search_cos(query_diff, search_diffs, search_descs, n*1000)

    

144 7257 7257

In [59]:
###############################################
# Plotting the in viewpoint search results
###############################################

hits_emb = torch.tensor(np.array([out_emb_dict[fterm[:10]] for fterm in found_f_terms]))
# To compare to a random sample of F-Terms
#random_fterms = np.random.choice(f_term_tokens, len(found_f_terms), replace=False)
#hits_emb = torch.tensor(np.array([out_emb_dict[fterm[:10]] for fterm in random_fterms]))

orbit_emb = []
o_ft = []
for fterm in counter_orbit.keys(): 
    try:
        orbit_emb.append(out_emb_dict[fterm])
        o_ft.append(fterm)
    except KeyError:
        pass

orbit_emb = np.array(orbit_emb)

henkel_emb = []
h_ft = []
for fterm in counter_henkel.keys():
    try:
        henkel_emb.append(out_emb_dict[fterm])
        h_ft.append(fterm)
    except KeyError:
        pass

henkel_emb = np.array(henkel_emb)

all_emb = np.concatenate([orbit_emb, henkel_emb, hits_emb], 0)

tsne = TSNE(n_components=2, verbose=0, random_state=69)
rep = tsne.fit_transform(all_emb)

datasource_henkel = ColumnDataSource(
        data=dict(
            x = rep[len(o_ft):len(o_ft) + len(h_ft),0],
            y = rep[len(o_ft):len(o_ft) + len(h_ft),1],
            fterms = h_ft,
            themes = [theme_dict[fterm[:5]] for fterm in h_ft],
            viewpoints = [viewpoint_dict[fterm[:8]] for fterm in h_ft],
            numbers = [number_dict[fterm[:10]] for fterm in h_ft]))

datasource_orbit = ColumnDataSource(
        data=dict(
            x = rep[:len(o_ft), 0],
            y = rep[:len(o_ft), 1],
            fterms = o_ft,
            themes = [theme_dict[fterm[:5]] for fterm in o_ft],
            viewpoints = [viewpoint_dict[fterm[:8]] for fterm in o_ft],
            numbers = [number_dict[fterm[:10]] for fterm in o_ft]))

datasource_emb_search = ColumnDataSource(
        data=dict(
            x = rep[len(o_ft) + len(h_ft):, 0],
            y = rep[len(o_ft) + len(h_ft):, 1],
            fterms = found_f_terms,
            themes = [theme_dict[fterm[:5]] for fterm in found_f_terms],
            viewpoints = [viewpoint_dict[fterm[:8]] for fterm in found_f_terms],
            numbers = [number_dict[fterm[:10]] for fterm in found_f_terms], 
            found_by = [number_dict[comb[0][:10]] +'---' + number_dict[comb[1][:10]] for comb in found_by]))


hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>F-Term:</b> @fterms<br><b>Theme:</b> @themes<br><b>Viewpoint:</b> @viewpoints<br><b>Number:</b> @numbers<br><b>Query:</b> @found_by</div>', mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Henkel and Orbit Embeddings')
    
plot_tsne.circle('x', 'y', size=10, fill_color=RGB(250, 50, 75), alpha=1, line_width=0, source=datasource_henkel, name="Henkel Embeddings")
plot_tsne.square('x', 'y', size=7, fill_color=RGB(50, 75, 250), alpha=0.2, line_width=0, source=datasource_orbit, name="Orbit Embeddings")
plot_tsne.triangle('x', 'y', size=7, fill_color=RGB(75, 250, 50), alpha=0.2, line_width=0, source=datasource_emb_search, name="Cos Similar Embeddings")

show(plot_tsne)
    

# Search in Theme Combinations

In [60]:
def create_fterm_dict():
    """
    Creates a hirachical dict with all F-Terms ordered by Theme -> Viewpoint > F-Terms
    """
    f_term_dict = {}
    for f_term in f_term_tokens:
        theme = f_term.split('/')[0]
        vp = f_term[:8]
        # Creating a dict entry for the theme
        try: 
            _ = f_term_dict[theme]
        except KeyError:
            f_term_dict[theme] = {}
    
        # Creating a dict entry for the viewpoint
    
        try:
            # The first dict call will def. work the second may work if the vp-dict entry 
            # was already made. If it works the theme is appended to the viewpoint dict
            f_term_dict[theme][vp].append(f_term)
        except KeyError:
            f_term_dict[theme][vp] = []

    return f_term_dict


def extract_theme_fterms(t_dict):
    fterms = []
    for vp_list in t_dict.values():
        fterms.extend(vp_list)
    return fterms        


def create_all_diffs(emb=out_emb_dict):
    """
    Creates all possible in viewpoint combinations and returns them a s a 
    """
    all_diffs = {}
    # Calculating the needed combinations
    f_term_dict = create_fterm_dict()
    for i, (theme, t_dict) in enumerate(f_term_dict.items()):
        print(i, theme, end='\r')
        all_diffs[theme] = {}
        fterms = extract_theme_fterms(t_dict)
        combinations = itertools.combinations(fterms, 2)
        for fterm1, fterm2 in combinations:
            viewpoint = fterm1[:8]
            diff = emb[fterm2[:10]] - emb[fterm1[:10]]
            diff = normalize(np.array([diff]))
            try:
                all_diffs[theme][viewpoint][(fterm1, fterm2)] = diff
            except KeyError:
                all_diffs[theme][viewpoint] = {}
                all_diffs[theme][viewpoint][(fterm1, fterm2)] = diff
    return all_diffs
    

def create_diffs_tensor(block_theme, all_diffs):
    """
    Creates a tensor with all diffs, which do not contain the block theme.
    Additionaly also returns a list with all comination descriptions
    """
    # Filtering out the unwanted theme
    diffs = {theme: t_dict for theme, t_dict in all_diffs.items() if theme != block_theme}
    out_diffs = []
    out_desc = []
    for i, (theme, t_dict) in enumerate(diffs.items()):
        #print(i, theme, end='\r')
        for vp_dict in t_dict.values():
            for comb, diff in vp_dict.items():
                out_desc.append(comb)
                out_diffs.append(diff)

    out_diffs = np.array(out_diffs)
    out_diffs = out_diffs.squeeze(1)
    return out_diffs, out_desc


def search_cos(query_vec, all_vecs, all_desc, n):
    """
    Computes the cosine similarities between all_vecs and the query_vec and returns the descriptions of the n best matches.
    
    """
    cos = torch.nn.CosineSimilarity(dim=1)
    # Creating an array of query vectors, with the same number of vectors as the all_vecs array.
    query = np.concatenate([query_vec for _ in all_vecs], 0)
    simis = cos(torch.tensor(all_vecs, requires_grad=False), torch.tensor(query, requires_grad=False))
    # Creating the indices of the top n similarities
    idx = np.argsort(simis).numpy()[::-1][:n]
    matches = [all_desc[i] for i in idx]
    simis = [simis[i] for i in idx]
    return matches

In [None]:
all_diffs = create_all_diffs()

In [82]:
###############################################################
# Searching for promising in theme Henkel Combinations
###############################################################
n = 100

search_combination = ['4F100/JL11', '4F100/JG01']  # Führt zu Guss/Thermoforming F-Terms (adhesiveness, conductivity being properties or funcitons)
search_combination = ['4J040/JB09', '4J040/PA21'] # Wilde Mischung (pressure sensitive adhesive or adhesive types, use of adhesive characterised by specific shapess of functions)
search_combination = ['4J004/CC02', '4J004/CA07'] # (foil like, inorganic materials)

query_diff = normalize(np.array([out_emb_dict[search_combination[1]] - out_emb_dict[search_combination[0]]]))

theme = search_combination[0][:5]
search_diffs, search_descs = create_diffs_tensor(theme, all_diffs)
results = search_cos(query_diff, search_diffs, search_descs, n)     # Getting more results to account for multiple finings of one f-term

###############################################
# Plotting the in theme search results 
###############################################

# Extracting uniqe F-terms from the results
found_f_terms = []
[found_f_terms.extend(comb) for comb in results]
found_f_terms = list(set(found_f_terms))

# Embedding the found F-Terms
hits_emb = torch.tensor(np.array([out_emb_dict[fterm[:10]] for fterm in found_f_terms]))

orbit_emb = []
o_ft = []
for fterm in counter_orbit.keys(): 
    try:
        orbit_emb.append(out_emb_dict[fterm])
        o_ft.append(fterm)
    except KeyError:
        pass

orbit_emb = np.array(orbit_emb)

henkel_emb = []
h_ft = []
for fterm in search_combination:
    try:
        henkel_emb.append(out_emb_dict[fterm])
        h_ft.append(fterm)
    except KeyError:
        pass

henkel_emb = np.array(henkel_emb)

all_emb = np.concatenate([orbit_emb, henkel_emb, hits_emb], 0)

tsne = TSNE(n_components=2, verbose=0, random_state=69)
rep = tsne.fit_transform(all_emb)

datasource_henkel = ColumnDataSource(
        data=dict(
            x = rep[len(o_ft):len(o_ft) + len(h_ft),0],
            y = rep[len(o_ft):len(o_ft) + len(h_ft),1],
            fterms = h_ft,
            themes = [theme_dict[fterm[:5]] for fterm in h_ft],
            viewpoints = [viewpoint_dict[fterm[:8]] for fterm in h_ft],
            numbers = [number_dict[fterm[:10]] for fterm in h_ft]))

datasource_orbit = ColumnDataSource(
        data=dict(
            x = rep[:len(o_ft), 0],
            y = rep[:len(o_ft), 1],
            fterms = o_ft,
            themes = [theme_dict[fterm[:5]] for fterm in o_ft],
            viewpoints = [viewpoint_dict[fterm[:8]] for fterm in o_ft],
            numbers = [number_dict[fterm[:10]] for fterm in o_ft]))

datasource_emb_search = ColumnDataSource(
        data=dict(
            x = rep[len(o_ft) + len(h_ft):, 0],
            y = rep[len(o_ft) + len(h_ft):, 1],
            fterms = found_f_terms,
            themes = [theme_dict[fterm[:5]] for fterm in found_f_terms],
            viewpoints = [viewpoint_dict[fterm[:8]] for fterm in found_f_terms],
            numbers = [number_dict[fterm[:10]] for fterm in found_f_terms], 
            found_by = [number_dict[comb[0][:10]] +'---' + number_dict[comb[1][:10]] for comb in found_by]))


hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>F-Term:</b> @fterms<br><b>Theme:</b> @themes<br><b>Viewpoint:</b> @viewpoints<br><b>Number:</b> @numbers<br><b>Query:</b> @found_by</div>', mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Henkel and Orbit Embeddings')
    
plot_tsne.circle('x', 'y', size=10, fill_color=RGB(250, 50, 75), alpha=1, line_width=0, source=datasource_henkel, name="Henkel Embeddings")
plot_tsne.square('x', 'y', size=7, fill_color=RGB(50, 75, 250), alpha=0.2, line_width=0, source=datasource_orbit, name="Orbit Embeddings")
plot_tsne.triangle('x', 'y', size=7, fill_color=RGB(75, 250, 50), alpha=1, line_width=0, source=datasource_emb_search, name="Cos Similar Embeddings")

show(plot_tsne)
    


2259 3B034



In [83]:
for fterm1, fterm2 in results:
    theme = theme_dict[fterm1[:5]]
    vp1 = viewpoint_dict[fterm1[:8]]
    vp2 = viewpoint_dict[fterm2[:8]]
    n1 = number_dict[fterm1[:10]]
    n2 = number_dict[fterm2[:10]]

    print(f'''    
Theme: {theme}
vp1: {vp1}     vp2: {vp2}
n1: {n1}       n2:{n2}
''')

    
Theme: ｄｒａｍ
vp1: purpose/means/methods     vp2: purpose/means/methods
n1: ・high-speed memory technology → high-speed memory jj       n2:other

    
Theme: polyesters or polycarbonates
vp1: dicarboxylic acid containing halogen     vp2: dicarboxylic acid containing halogen
n1: . r dicarboxylic acids       n2:. condensed ar dicarboxylic acids

    
Theme: inks, pencil-leads, or crayons
vp1: additive ; purpose or function     vp2: additive ; purpose or function
n1: . solvents       n2:. fungicidal or bactericidal agents

    
Theme: measurement of position, velocity, etc. using sound waves and ultrasonic waves
vp1: representation     vp2: representation
n1: ・multiple indicators       n2:picture-in-picture

    
Theme: fuel cell (system)
vp1: nan     vp2: nan
n1: nan       n2:nan

    
Theme: radiation measurements
vp1: shape, detail structure, etc. of measuring equipment     vp2: shape, detail structure, etc. of measuring equipment
n1: ... 2d array       n2:・・・・cylindrically arranged


In [25]:
len(all_diffs)

2261

In [40]:
henkel_dict

{'4J040': {'4J040/JB': ['4J040/JB10', '4J040/JB02', '4J040/JB09'],
  '4J040/JA': ['4J040/JA06', '4J040/JA01'],
  '4J040/PA': ['4J040/PA41', '4J040/PA21'],
  '4J040/MB': ['4J040/MB01', '4J040/MB08'],
  '4J040/MA': ['4J040/MA01', '4J040/MA07'],
  '4J040/EB': ['4J040/EB09', '4J040/EB02'],
  '4J040/EK': ['4J040/EK03'],
  '4J040/KA': ['4J040/KA09'],
  '4J040/EH': ['4J040/EH02'],
  '4J040/DF': ['4J040/DF01', '4J040/DF02'],
  '4J040/HB': ['4J040/HB42', '4J040/HB22'],
  '4J040/HC': ['4J040/HC01', '4J040/HC20', '4J040/HC10', '4J040/HC16'],
  '4J040/HD': ['4J040/HD38', '4J040/HD01', '4J040/HD21', '4J040/HD41'],
  '4J040/LA': ['4J040/LA09', '4J040/LA01', '4J040/LA06'],
  '4J040/GA': ['4J040/GA04'],
  '4J040/EE': ['4J040/EE01']},
 '4J038': {'4J038/DJ': ['4J038/DJ02'],
  '4J038/KA': ['4J038/KA01'],
  '4J038/DA': ['4J038/DA11', '4J038/DA03']},
 '4F073': {'4F073/GA': ['4F073/GA11', '4F073/GA07'],
  '4F073/BB': ['4F073/BB11'],
  '4F073/BA': ['4F073/BA19', '4F073/BA02'],
  '4F073/AA': ['4F073/AA32']},
