In this notebook I will analyze the patents about electrically debondable adhesives form Henkel.

In [1]:
# Imports 

# Own Packages
from Masterarbeit_utils.model_utils_agg import get_tokenizer, load_and_modify_model, load_pretrained_Tokenizer

# Site Packages
import pandas as pd
import numpy as np
import pickle as pk
import torch
import os 
import sys
import psutil
from collections import Counter
import itertools
# Dimension reduction algorithms
#from cuml.manifold import TSNE
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from scipy.spatial import distance
from scipy.fft import fft, fftfreq
# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import export_png
from bokeh.palettes import Viridis256, Category20
from bokeh.transform import linear_cmap, factor_cmap
from bokeh.colors import RGB

# Huggingface
from transformers import AutoTokenizer, OPTForCausalLM
from tokenizers.processors import TemplateProcessing
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from torch.utils.data import Dataset

output_notebook()

2023-10-09 11:07:31.827619: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-09 11:07:31.846995: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
###########################################################
# Loading the Henkel Patents
###########################################################

# Directories in which data important for the notebook is stored
dump_dir = 'PK_DUMP'
data_dir = 'data'

# Loading the dataframes with the patents deemed most important for electrically debondable adhesives from Henkel
henkel_patents = pd.read_csv(f'{data_dir}/Henkel_patente_patstat_docdb_families_abstract.csv', delimiter=',').reset_index(drop=True)
henkel_orbit = pd.read_csv(f'{data_dir}/Henkel_Orbit_Suche_Patstat_Export.csv', delimiter=',')

# Filtering the Samples which contain F-Terms
henkel_filtered = henkel_patents[henkel_patents['fterms'].notna()]
henkel_filtered = henkel_filtered.reset_index(drop=True)

orbit_filtered = henkel_orbit[henkel_orbit['fterms'].notna()]
orbit_filtered = orbit_filtered.reset_index(drop=True)

print(f"There are {len(henkel_patents['doc_db_family_id'].unique())} unique patents in the Henkel dataset, only {len(henkel_filtered['doc_db_family_id'].unique())} of them contain F-Terms.")

# extracting all f-terms form the datasets
fterms_henkel = [fterm[:10] for fterms in henkel_filtered['fterms'] for fterm in fterms.split(',')]
fterms_orbit = [fterm[:10] for fterms in orbit_filtered['fterms'] for fterm in fterms.split(',')]

# Aggreagting the F-Terms
with open(f'{dump_dir}/aggregation_dict_new.pk', 'rb') as f:
    aggregation_dict = pk.load(f)

def aggregate(f_term):
    try:
        return aggregation_dict[f_term]
    except KeyError:
        pass

fterms_henkel_agg = [aggregate(fterm) for fterm in fterms_henkel if aggregate(fterm) is not None]
fterms_orbit_agg = [aggregate(fterm) for fterm in fterms_orbit if aggregate(fterm) is not None]

# Counting the occurrences of the henkel and orbit fterms
counter_henkel = Counter(fterms_henkel_agg)
counter_orbit = Counter(fterms_orbit_agg)

# Structuring the henkel F-Terms
henkel_dict = {}
for fterm in counter_henkel.keys():
    theme = fterm[:5]
    try:
        _ = henkel_dict[theme]
    except KeyError:
        henkel_dict[theme] = {}

    vp = fterm[:8]
    try: 
        henkel_dict[theme][vp].append(fterm)
    except KeyError:
        henkel_dict[theme][vp] = [fterm]

# Extracting the themes form the F-Terms

henkel_themes = list(set([fterm[:5] for fterm in fterms_henkel_agg]))
orbit_themes = list(set([fterm[:5] for fterm in fterms_orbit_agg]))
################################################################
# Loading the Model
################################################################

model_name = 'gal_125_new_1'
checkpoint = int(2*86515)
# If True normalization is applied to the embeddings
norm = True
context_less = False

# The folder at which the model will be saved. This folder has to be created for your system 
model_folder = f'data/models/{model_name}'
os.makedirs(model_folder, exist_ok=True)


# Folder in which the tokenizer will be saved
tokenizer_folder = f'data/tokenizers/{model_name}'
os.makedirs(tokenizer_folder, exist_ok=True)

# Folder at which all pickle files are stored. This folder is fixed for this project and should not be changed
dump_dir = r'PK_DUMP'

# Model parameters 
'''
mini	125 M
base	1.3 B
standard	6.7 B
large	30 B
huge	120 B'''
if model_name.split('_')[1] == '125':
    base_model_name = 'mini'
elif model_name.split('_')[1] == '1300':
    base_model_name = 'base'

# All new Torch-objects will be by default in this dtype
# if default_type = float16 fp16 must be False
default_dtype = torch.float32
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_default_dtype(default_dtype)

# Default device on which the model will be loaded
default_device = 'cpu'

# Number of GPUs the model will be parallelised to 
num_gpus = 1
# If you change 'default_device' to 'cpu', make sure to set num_gpus to zero.
if default_device == 'cpu':
    num_gpus = 0

tensor_parallel = False


###########################
# Loading the Model
###########################
device_map=None
max_memory = {}
if num_gpus > 0:
    # based on https://github.com/huggingface/accelerate/blob/5315290b55ea9babd95a281a27c51d87b89d7c85/src/accelerate/utils/modeling.py#L274
    for i in range(num_gpus):
        _ = torch.tensor([0], device=i)
    for i in range(num_gpus):
        max_memory[i] = torch.cuda.mem_get_info(i)[0]
    device_map = "auto"
max_memory["cpu"] = psutil.virtual_memory().available

model = OPTForCausalLM.from_pretrained(f'{model_folder}/checkpoint-{checkpoint}', torch_dtype=default_dtype, low_cpu_mem_usage=True,
                                               device_map=device_map, max_memory=max_memory)

###########################
# Loading the Tokenizer
###########################
tokenizer = AutoTokenizer.from_pretrained(tokenizer_folder)
n_f_terms = len(tokenizer) - tokenizer.vocab_size
print('Loaded Tokenizer from serialized instance!')    
print(f'There are {n_f_terms:,} different F-Terms in the whole Dataset!')


###########################
# Loading Descriptions
###########################
with open(f'{dump_dir}/agg_themes_descriptions_new.pk', 'rb') as f:
    theme_dict = pk.load(f)
with open(f'{dump_dir}/agg_viewpoints_descriptions_new.pk', 'rb') as f:
    viewpoint_dict = pk.load(f)
with open(f'{dump_dir}/agg_numbers_descriptions_new.pk', 'rb') as f:
    number_dict = pk.load(f)
with open(f'{dump_dir}/agg_full_descriptions_new.pk', 'rb') as f:
    full_descriptions_dict = pk.load(f)


###########################
# Extracting the Embeddings
###########################

# Extracting the classification Head weights
inp_emb = model.get_input_embeddings()


#Embeddings if the model is not a sequence classification model
out_emb = model.get_output_embeddings()
out_emb = next(out_emb.parameters()).to('cpu').detach().numpy()[2:]
inp_emb = inp_emb(torch.arange(len(tokenizer))).to('cpu').detach().numpy()[50002:]

if context_less:
    # Extracting context less embeddings
    if not os.path.isfile(f'{model_folder}/context_less_emb{checkpoint}.pk'):
        print('Calculating context less embeddings!')
        context_less_emb = [[] for _ in range(len([1 for _ in model.parameters()]))]
        for i in range(len(tokenizer)):
            print(i, end='\r')
            out = model(input_ids= torch.tensor([[i]]), attention_mask = torch.tensor([[1]]), output_hidden_states=True)
                
            out = out.hidden_states
            for i, k in enumerate(out):
                context_less_emb[i].append(k.to('cpu').detach().numpy())
        with open(f'{model_folder}/context_less_emb{checkpoint}.pk', 'wb') as f:
            pk.dump(context_less_emb, f)
    else:
        print('Loading context less embeddings from disk')
        with open(f'{model_folder}/context_less_emb{checkpoint}.pk', 'rb') as f:
            context_less_emb = pk.load(f)
        
    # Combining context less embeddings of a layer to a single tensor
    for i, layer in enumerate(context_less_emb):
        layer = [e[0] for e in layer]
        layer = np.concatenate(layer, 0)
        context_less_emb[i] = layer

# Normalizing the embeddings 
def normalize(tensor):
    if norm:
        return torch.nn.functional.normalize(torch.tensor(tensor), p=2).numpy()
    else:
        return tensor

out_emb = normalize(out_emb)
inp_emb = normalize(inp_emb)

if context_less:
    context_less_emb = [normalize(layer) for layer in context_less_emb]

# Extracting the matching F_terms for the weights and creating lists with the defintions
tokens = [tokenizer.decode(i) for i in range(len(tokenizer))]
f_term_tokens = tokens[50002:]

# Creating  a dict with f-Terms and their embedding vectors:
out_emb_dict = {token[:-1]: vec for token, vec in zip(f_term_tokens, out_emb)}
ft_emb_dict = {key: np.abs(fft(value)) for key, value in out_emb_dict.items()}
inp_emb_dict = {token[:-1]: vec for token, vec in zip(f_term_tokens, inp_emb)}
    
# Creating Context Less Embedding Dicts
if context_less:
    context_less_dicts = []
    for layer in context_less_emb:
        context_less_dicts.append({token[:-1]: vec for token, vec in zip(tokens, layer)})

# Extracting the emb_dim
for e in out_emb_dict.values():
    break
emb_dim = e.shape[-1]
print('Embedding Dimension: ', emb_dim)

There are 34 unique patents in the Henkel dataset, only 15 of them contain F-Terms.
Loaded Tokenizer from serialized instance!
There are 195,617 different F-Terms in the whole Dataset!
Embedding Dimension:  768


# Search in Theme Combinations

In [3]:
def create_fterm_dict():
    """
    Creates a hirachical dict with all F-Terms ordered by Theme -> Viewpoint > F-Terms
    """
    f_term_dict = {}
    for f_term in f_term_tokens:
        theme = f_term.split('/')[0]
        vp = f_term[:8]
        # Creating a dict entry for the theme
        try: 
            _ = f_term_dict[theme]
        except KeyError:
            f_term_dict[theme] = {}
    
        # Creating a dict entry for the viewpoint
    
        try:
            # The first dict call will def. work the second may work if the vp-dict entry 
            # was already made. If it works the theme is appended to the viewpoint dict
            f_term_dict[theme][vp].append(f_term)
        except KeyError:
            f_term_dict[theme][vp] = []

    return f_term_dict


def extract_theme_fterms(t_dict):
    fterms = []
    for vp_list in t_dict.values():
        fterms.extend(vp_list)
    return fterms        


def create_all_diffs(emb=out_emb_dict):
    """
    Creates all possible in viewpoint combinations and returns them a s a 
    """
    all_diffs = {}
    # Calculating the needed combinations
    f_term_dict = create_fterm_dict()
    for i, (theme, t_dict) in enumerate(f_term_dict.items()):
        print(i, theme, end='\r')
        all_diffs[theme] = {}
        fterms = extract_theme_fterms(t_dict)
        combinations = itertools.combinations(fterms, 2)
        for fterm1, fterm2 in combinations:
            viewpoint = fterm1[:8]
            diff = emb[fterm2[:10]] - emb[fterm1[:10]]
            diff = normalize(np.array([diff]))
            try:
                all_diffs[theme][viewpoint][(fterm1, fterm2)] = diff
            except KeyError:
                all_diffs[theme][viewpoint] = {}
                all_diffs[theme][viewpoint][(fterm1, fterm2)] = diff
    return all_diffs
    

def create_diffs_tensor(block_theme, all_diffs):
    """
    Creates a tensor with all diffs, which do not contain the block theme.
    Additionaly also returns a list with all comination descriptions
    """
    # Filtering out the unwanted theme
    diffs = {theme: t_dict for theme, t_dict in all_diffs.items() if theme != block_theme}
    out_diffs = []
    out_desc = []
    for i, (theme, t_dict) in enumerate(diffs.items()):
        #print(i, theme, end='\r')
        for vp_dict in t_dict.values():
            for comb, diff in vp_dict.items():
                out_desc.append(comb)
                out_diffs.append(diff)

    out_diffs = np.array(out_diffs)
    out_diffs = out_diffs.squeeze(1)
    return out_diffs, out_desc


def search_cos(query_vec, vecs):
    """
    Computes the cosine similarities between all_vecs and the query_vec 
    """
    cos = torch.nn.CosineSimilarity(dim=1)
    # Creating an array of query vectors, with the same number of vectors as the all_vecs array.
    query = np.concatenate([query_vec for _ in vecs], 0)
    vecs = torch.tensor(vecs, requires_grad=False)
    query = torch.tensor(query, requires_grad=False)
    simis = cos(vecs, query)
    return simis
    

def search_in_all(fterm1, fterm2, all_diffs, step=100):
    """
    This function computes the most similar combinations in steps of 'step' themes at a time
    """
    theme = fterm1[:5]
    query = out_emb_dict[fterm2[:10]] - out_emb_dict[fterm1[:10]]
    query = normalize(np.array([query]))
    iterations = -(len(all_diffs)//-step)    # Ceiling devision
    simis = []                               # Stores all computed cosine similarities
    descs = []                               # Stores all combination descriptions (fterm1, fterm2)
    
    for i in range(iterations):
        diffs_chunk = dict([d for d in all_diffs.items()][i*step: (i+1)*step])
        search_diffs, search_descs = create_diffs_tensor(theme, diffs_chunk)
        simis_chunk = search_cos(query, search_diffs)
        simis.extend(simis_chunk)
        descs.extend(search_descs)

    # Sorting for highest similarity
    idx = np.argsort(simis)[::-1]
    simis = [simis[i].item() for i in idx]
    descs = [descs[i] for i in idx]
    return simis,  descs,  idx


def generate_combinations():
    # Extracting the most important Themes
    theme_occurrences = {}
    for fterm, occ in counter_henkel.items():
        theme = fterm[:5]
        try:
            theme_occurrences[theme] += occ
        except KeyError:
            theme_occurrences[theme] = occ
    
    main_themes = ['4J040', '4F100', '4J004']
    
    # Extracting the most important F-Terms
    main_fterms = list(set([fterm for fterm in fterms_henkel_agg if fterm[:5] in main_themes]))
    
    # Creating all in Theme Combinations
    combinations = []
    for theme in main_themes:
        fterms = [fterm for fterm in main_fterms if fterm[:5] == theme]
        # Permutations instead of combinations because we want both directions
        combs = list(itertools.permutations(fterms, 2))
        combinations.append(combs)
    return combinations
        

In [4]:
if os.path.isfile(f'{model_folder}/all_theme_diffs.pk'):
    with open(f'{model_folder}/all_theme_diffs.pk', 'rb') as f:
        all_diffs = pk.load(f)

else:
    all_diffs = create_all_diffs()
    with open(f'{model_folder}/all_theme_diffs.pk', 'wb') as f:
        pk.dump(all_diffs, f)

In [5]:
#####################################################################
# Generating in Theme combinations of the three most frequent Henkel Themes
#####################################################################


combinations = generate_combinations()



In [None]:
# Searching with all combinations
# !!! This takes more than 24 hours, all results above the threshold similarity will be saved.
threshold = 0.15
os.makedirs(f'{model_folder}/comb_sims', exist_ok=True)
for c in combinations:
    for i, (fterm1, fterm2) in enumerate(c):
        print(i, fterm1, fterm2)
        simis, desc, idx = search_in_all(fterm1, fterm2, all_diffs, step=50)
        print('calculated')
        desc = [d for d, s in zip(desc, simis) if s > threshold]
        simis = [s for s in  simis if s > threshold]
        if len(simis) == 0: continue
        print(len(desc), len(simis))
        with open(f'{model_folder}/comb_sims/{fterm1.replace("/", "")}_{fterm2.replace("/","")}_simis.pk', 'wb') as f:
            pk.dump(simis, f)
        with open(f'{model_folder}/comb_sims/{fterm1.replace("/", "")}_{fterm2.replace("/","")}_desc.pk', 'wb') as f:
            pk.dump(desc, f)


0 4J040/HC01 4J040/JB02


In [None]:
#########################################################################
# Searching for the combinations with the highest resulting Similarities
#########################################################################

def generate_search_results(thresh = 0.25):
    top_search_res = {}
    for c in generate_combinations():
        theme = c[0][0][:5]
        top_search_res[theme] = {}
        for i, (fterm1, fterm2) in enumerate(c):
            print(i, end='\r')
            try: 
                with open(f'{model_folder}/comb_sims/{fterm1.replace("/", "")}_{fterm2.replace("/","")}_simis.pk', 'rb') as f:
                    simis = pk.load(f)
                with open(f'{model_folder}/comb_sims/{fterm1.replace("/", "")}_{fterm2.replace("/","")}_desc.pk', 'rb') as f:
                    desc = pk.load(f)
            except FileNotFoundError:
                continue
            
            idx = np.argsort(simis)[::-1]
            desc = [desc[i] for i in idx if simis[i] > thresh]
            simis = [simis[i] for i in idx if simis[i] > thresh]
            
            if len(desc) > 0:
                top_search_res[theme][(fterm1, fterm2)] = [desc, simis] 
    return top_search_res


#####################################
# Searching for viable F-Term Chains
#####################################
def add_crossconnections(chain, combs):
    '''
    For a chain consisting of fterms it adds all possible connections between the fterms to it, which have search results above the threshhold.
    combs = all fterm connections that have results above the threshold
    '''
    chain_elements = [c for el in chain for c in el]
    cross_links = itertools.combinations(chain_elements, 2)
    for cross_link in cross_links:
        if cross_link in combs:
            chain.append(cross_link)
    return chain


def create_possible_connections(comb, all_combs):
    """
    Basically the same as add_crossconnections, but for a set of F-Terms and not a chain of connections
    """
    cross_links = itertools.combinations(comb, 2)
    valid_connections = []
    for cross_link in cross_links:
        if cross_link in all_combs:
            valid_connections.append(cross_link)
    return valid_connections


def find_chains(combs, n=3):
    elements = []
    [elements.extend(comb) for comb in combs]
    elements = list(set(elements))
    chain_items = itertools.combinations(elements, n)
    chains = []
    for items in chain_items:
        chain = []
        for i in range(len(items) - 1):
            chain_element = items[i: i+2]
            if chain_element in combs:
                chain.append(chain_element)
        # Only appends chain if it is valid
        if len(chain) == n-1:
            chains.append(chain)

    # Adding cross links to the chains if there are viable cross links
    for chain in chains:
        chain = add_crossconnections(chain, combs)
    return chains

    
def find_overlap(chain, top_search_res, n=5, chain_n=3):
    """
    n indicates the indices of the f_term string which will be used to measure the search result overlap 
    (5 = theme, 8 = viewpoint and 10 = whole fterm)
    """
    theme = chain[0][0][:5]
    # dict that contains the search resuts for each chain element
    combs = {element: top_search_res[theme][element] for element in chain}
    # all fterms found in any search
    unique_values = list(set([fterm[:n] for desc, simis in combs.values() for pair in desc for fterm in pair]))
    found_by = {u_v:[] for u_v in unique_values}
    score = {u_v: [] for u_v in unique_values}
    for (q1, q2), (desc, simis) in combs.items(): #q1, q2 = query f-terms
        for (ft1, ft2), simi in zip(desc, simis): #ft1, ft2 = Fterm found by search with q1, q2
            found_by[ft1[:n]].extend([q1, q2])
            found_by[ft2[:n]].extend([q1, q2])
            score[ft1[:n]].extend([simi, simi])
            score[ft2[:n]].extend([simi, simi])

    # Now the score will be manipulated, so that the score = maximum threshold at which the u_v can be found
    score_idx = {u_v: np.argsort(s)[::-1] for u_v, s in score.items()}
    found = []
    new_score = []
    for u_v, idx in score_idx.items():
        queries = found_by[u_v]
        scores_for_u_v = score[u_v]
        queries = [queries[i] for i in idx]
        scores_for_u_v = [scores_for_u_v[i] for i in idx]
        if len(set(queries)) == chain_n: # The theme/viewpoint/fterm was found in atleast one search containing each of the unique chain fterms
            i = 0
            while len(set(queries[:i])) < chain_n:
                i += 1
                
            found.append(u_v)
            new_score.append( scores_for_u_v[i])    # Maximal threshold
    
    # sorting
    score = new_score
    idx = np.argsort([s for s in score])[::-1]
    found = [found[i] for i in idx]
    score = [score[i] for i in idx]
    return found, score

In [None]:
# Promising Search F-Terms

search_fterms = {
    '4J040': ['4J040/KA09', '4J040/PA41', '4J040/MA01', '4J040/JB09', '4J040/MB01', '4J040/PA21', '4J040/JB10', '4J040/LA09', '4J040/LA06', '4J040/MB08', '4J040/MA07', '4J040/JB02'],
    '4F100': ['4F100/BA10', '4F100/JG01', '4F100/JL11', '4F100/AB01', '4F100/DB01', '4F100/JK01', '4F100/BA03', '4F100/BA02', '4F100/GB41', '4F100/BA07', '4F100/BA04', '4F100/EH11', '4F100/EJ61', '4F100/BA05', '4F100/JG05', '4F100/EJ16', '4F100/CA21', '4F100/BA06', '4F100/AB33', '4F100/DC30']
}


search_fterms_clustered = {
'4J040': {
    'adhesive':   ['4J040/PA21', '4J040/PA41', '4J040/JB09',  '4J040/JB02'],
    'conductive': ['4J040/JB10', '4J040/LA09'],
    'others':     ['4J040/KA09' , '4J040/MA01', '4J040/MB01', '4J040/LA06', '4J040/MB08', '4J040/MA07']
                 
},

'4F100': {
    'adhesive':   ['4F100/JL11', '4F100/JK01',],
    'conductive': ['4F100/JG01', '4F100/GB41', '4F100/EJ61', '4F100/JG05', '4F100/CA21'], 
    'others':     ['4F100/BA10', '4F100/AB01', '4F100/DB01', '4F100/BA03', '4F100/BA02', '4F100/BA07', '4F100/BA04', '4F100/EH11',  '4F100/BA05',  '4F100/EJ16', '4F100/BA06', '4F100/AB33', '4F100/DC30']
    } 
}

search_fields = {
    'adhesion electric structure': [
                                    '4J040/PA41', # Treatment after adhesion
                                    '4J040/JB09', # Pressure sensitive adhesive type
                                    '4J040/PA21', 
                                    '4J040/JB10',
                                    '4J040/LA09', 
                                    '4J040/LA06'
                                   ,'4J040/JB02'
                                   ], 
    'function material adhesion': [
                                   '4J040/KA09', 
                                   #'4J040/JB09', 
                                   '4J040/PA41', 
                                   '4J040/MA01', 
                                   '4J040/JB10'
                                  ], 
    'conductivity adhesion metal rigidity': ['4F100/JG01', '4F100/AB01', '4F100/JL11', '4F100/JK01']

    }



In [None]:
##############################################################################
# Searching with combinations where some F-Terms must be in a certain category
##############################################################################


def search_clustered_fterms(query_theme, threshold, comb_n=3, overlap=5):
    
    results = []
    results_scores = []

    search_results = generate_search_results(threshold)
    fields = search_fterms_clustered[query_theme]
    try:
        theme_search_res = search_results[query_theme]
    except KeyError:
        raise KeyError (f'No results for theme {query_theme}')
    fterm_fields = [fterms for fterms in fields.values()]
    
    while len(fterm_fields) < comb_n:  # Adding the F-Terms from the category others multiple times until the wanted number of combinations is reached
        fterm_fields.append(fields['others'])

    combinations = list(itertools.product(*fterm_fields))
    # Dropping F-Terms which have duplicates due to the multiple adding of the 'others' field
    combinations = [c for c in combinations if len(set(c)) == comb_n]
    combinations = [create_possible_connections(c, theme_search_res) for c in combinations]
    combinations = [c for c in combinations if len(set([x for x in c])) == comb_n]
    for c in combinations:
        overlaps, scores = find_overlap(c, search_results, overlap_n, comb_n)  
        results.extend(overlaps)
        results_scores.extend(scores)
        
    # Creating a unique set of results and scores
    unique_results = []
    unique_scores = []     # if a Theme/ Viewpoint of Fterm iis found multiple times the result with the highes average score is choosen
    for result, score in zip(results, results_scores):
        if result not in unique_results:

            unique_results.append(result)
            unique_scores.append(score)
        if result in unique_results:
            i = unique_results.index(result)
            if score > unique_scores[i]:
                unique_scores[i] = score

    # Sorting by score
    idx = np.argsort([score for score in unique_scores])[::-1]
    unique_scores = [unique_scores[i] for i in idx]
    unique_results = [unique_results[i] for i in idx]

    return unique_results, unique_scores
            

In [None]:
# Für nächste Woche! Anzeigen für welche Qery F-Terms die Themes gefunden werden

thresh_4J040 = 0.15
thresh_4F100 = 0.28

ncomb = 4
n_overlap = 5 # Theme overlaps

themes_4J040, scores_4J040 = search_clustered_fterms('4J040', thresh_4J040, ncomb, n_overlap)
themes_4F100, scores_4F100 = search_clustered_fterms('4F100', thresh_4F100, ncomb, n_overlap)


In [None]:
0.21367013454437256
compositions of macromolecular compounds
In Orbit

0.21141356229782104
forming of porous articles
In Orbit

0.20656512677669525
container, transfer, fixing, positioning, etc. of wafers, etc.
In Orbit

0.2052198350429535
credit cards or the like
In Orbit

0.20516209304332733
moulding techniques not otherwise provided for, e.g. moulding plastics; combinations of mouldings (no alteration)
In Orbit

0.19889184832572937
polymerisation methods in general
In Orbit

0.19878213107585907
manuscript preparation and masking in photoengraving
In Orbit

0.19873034954071045
polyesters or polycarbonates
In Orbit

0.1978987157344818
laminated bodies (2)
In Orbit

0.1977296620607376
processes specially adapted for manufacturing cables
In Orbit

0.19709782302379608
protection, testing and repair of underground structures and foundations
NEW!

0.19605137407779694
large containers
In Orbit

0.1957583725452423
air conditioning control equipment
In Orbit

0.1957554817199707
electromechanical clocks
In Orbit

0.19573277235031128
brushes
NEW!

0.19486621022224426
treatments of macromolecular shaped articles
In Orbit

0.193396657705307
building environments
NEW!

0.19339494407176971
thermal printer structures
In Orbit

0.19308800995349884
processing and handling of plastics and other materials for molding in general
In Orbit

0.19230255484580994
mating device and connection to printed circuit
In Orbit

0.19095058739185333
large containers
In Orbit

In [None]:

for theme, score in zip(themes_4J040, scores_4J040):
    print(score)
    print(theme_dict[theme])
    print('In Orbit' if theme in orbit_themes else 'NEW!')
    print('')

for theme, score in zip(themes_4F100, scores_4F100):
    print(score)
    print(theme_dict[theme])
    print('In Orbit' if theme in orbit_themes else 'NEW!')
    print('')
        

In [None]:
print('_'*20)
print('not in orbit')
print('_'*20)
for theme in not_in_orbit:
    print(theme, theme_dict[theme[:5]])
    try: 
        print(viewpoint_dict[theme[:8]])
    except Exception:
        print('vp not found')
        pass
    try: 
        print(number_dict[theme[:10]])
    except Exception:
        pass
    print('')

print('_'*20)
print('in orbit')
print('_'*20)

for theme in in_orbit:
    print(theme, theme_dict[theme[:5]])
    try: 
        print(viewpoint_dict[theme[:8]])
    except Exception:
        print('vp not found')
        pass
    try: 
        print(number_dict[theme[:10]])
    except Exception:
        pass
    print('')

In [None]:
top_search_res = generate_search_results(0.15)
df = {}
found_fterms = []
found_by = []
all_found = True

for field, query_fterms in search_fields.items():
    theme = query_fterms[0][:5]
    chain = list(itertools.combinations(query_fterms, 2))
    chain = [c for c in chain if c in top_search_res[theme].keys()]
    if len(chain) == 0:
        print('No chain for', field)
        all_found = False
        continue
        
    themes, scores = find_overlap(chain, top_search_res, n=5, chain_n=len(query_fterms))
    chain_elements = [x for a in chain for x in a]
    chain = field + ':   '
    for fterm in list(set(chain_elements)):
        label = number_dict[fterm]
        chain += f'{fterm}:'
        chain += label
        chain += '| '

    in_orbit = ['Yes' if theme in orbit_themes else 'No' for theme in themes]
    themes = [f'({theme}):{theme_dict[theme]}| Score: {score}' for theme, score in zip(themes, scores)]
    #themes = [f'({theme}):{full_descriptions_dict[theme]}| Score: {score}' for theme, score in zip(themes, scores)]
    if len(themes) == 0:
        print('No themes found',  field)
        all_found = False
        continue
    found_fterms.extend(themes)
    found_by.extend([chain for _ in themes])
    df[chain] = pd.Series(themes)
    df[f'In Orbit {field}'] = pd.Series(in_orbit)

all_found = True
if all_found:
    df = pd.DataFrame(df)          
    df = df.reset_index()
    df.to_excel('Field Search Themes 0_15 comp orbit.xlsx')

In [None]:
len(orbit_themes)

In [None]:
# generating the overlaps for all chains
thresh = 0.3
search = 5
search_theme = '4F100'
top_search_res = generate_search_results(thresh)
df = {}
found_fterms = []
found_by = []
n=4
for searches in search_fterms.values():
    theme = searches[0][:5]
    if not theme == search_theme:
        continue
    chains = list(itertools.combinations(searches, n))
    chains = [list(itertools.combinations(chain, 2)) for chain in chains]
    chains = [[e for e in chain if e in top_search_res[theme].keys()] for chain in chains]
    chains = [chain for chain in chains if len(set([e for e in chain])) == n]
    print('Len Chains', len(chains))
    for chain in chains:
        themes, scores = find_overlap(chain, top_search_res, search)
        chain_elements = [x for a in chain for x in a]
        chain = 'QUERY'
        for fterm in list(set(chain_elements)):
            label = number_dict[fterm]
            chain += f'{fterm}:'
            chain += label
            chain += '|'
    
        #themes = [f'({theme}):{theme_dict[theme]}| Score: {score}' for theme, score in zip(themes, scores)]
        #themes = [f'({theme}):{full_descriptions_dict[theme]}| Score: {score}' for theme, score in zip(themes, scores)]
        if len(themes) == 0:
            continue
        found_fterms.extend(themes)
        found_by.extend([chain for _ in themes])
        df[chain] = pd.Series(themes)

df = pd.DataFrame(df)          
df = df.reset_index()

#res_1 = [f[1:11] for f in found_fterms]
res_1 = found_fterms
res_1_found_by = found_by

# generating the overlaps for all chains
thresh = 0.2
search = 5
search_theme = '4J040'
top_search_res = generate_search_results(thresh)
df = {}
found_fterms = []
found_by = []
n=4
for searches in search_fterms.values():
    theme = searches[0][:5]
    if not theme == search_theme:
        continue
    chains = list(itertools.combinations(searches, n))
    chains = [list(itertools.combinations(chain, 2)) for chain in chains]
    chains = [[e for e in chain if e in top_search_res[theme].keys()] for chain in chains]
    chains = [chain for chain in chains if len(set([e for e in chain])) == n]
    print('Len Chains', len(chains))
    for chain in chains:
        themes, scores = find_overlap(chain, top_search_res, search)
        chain_elements = [x for a in chain for x in a]
        chain = 'QUERY'
        for fterm in list(set(chain_elements)):
            label = number_dict[fterm]
            chain += f'{fterm}:'
            chain += label
            chain += '|'
    
        #themes = [f'({theme}):{theme_dict[theme]}| Score: {score}' for theme, score in zip(themes, scores)]
        #themes = [f'({theme}):{full_descriptions_dict[theme]}| Score: {score}' for theme, score in zip(themes, scores)]
        if len(themes) == 0:
            continue
        found_fterms.extend(themes)
        found_by.extend([chain for _ in themes])
        df[chain] = pd.Series(themes)

#res_2 = [f[1:11] for f in found_fterms]
res_2 = found_fterms
res_2_found_by = found_by

In [None]:
res_all = [*res_1, *res_2]
len(orbit_themes), len(set(res_all))

In [None]:
# Filtering the found themes by their appearence in the orbit search

new_4F100_themes = list(set([res for res in res_1 if res[:5] not in orbit_themes]))
new_4J040_themes = list(set([res for res in res_2 if res[:5] not in orbit_themes]))

search_res = 'Found in search with 4F100 chains of length 4\n--------------------------------------------------------------------------\n'

for res in list(set(res_1)):
    
    line = f'In Orbit: {"TRUE" if res[:5] in orbit_themes else "FALSE"}; {res}: {theme_dict[res[:5]]} \n'
    search_res += line

search_res += 'Found in Search with 4J040 chains of length 4\n--------------------------------------------------------------------------\n'
    
for res in list(set(res_2)):
    
    line = f'In Orbit: {"TRUE" if res[:5] in orbit_themes else "FALSE"}; {res}: {theme_dict[res[:5]]} \n'
    search_res += line

with open('all_chains_len4_search.txt', 'w') as f:
    f.writelines(search_res)
    

In [None]:

# Embedding the query F-Terms
query_1_emb = torch.tensor(np.array([out_emb_dict[fterm[:10]] for fterm in search_fterms['4J040']]))
query_2_emb = torch.tensor(np.array([out_emb_dict[fterm[:10]] for fterm in search_fterms['4F100']]))

# Embedding the found F-Terms
found_1_emb = torch.tensor(np.array([out_emb_dict[fterm[:10]] for fterm in res_1_fterms]))
found_2_emb = torch.tensor(np.array([out_emb_dict[fterm[:10]] for fterm in res_2_fterms]))


# Embedding the orbit fterms
orbit_emb = []
o_ft = []
for fterm in counter_orbit.keys(): 
    try:
        orbit_emb.append(out_emb_dict[fterm])
        o_ft.append(fterm)
    except KeyError:
        pass

orbit_emb = np.array(orbit_emb)

all_emb = np.concatenate([query_1_emb, query_2_emb, found_1_emb, found_2_emb, orbit_emb], 0)

# Calculating the TSNE Representation
tsne = TSNE(n_components=2, verbose=0, random_state=69)
#rep = tsne.fit_transform(all_emb)


datasource_query1 = ColumnDataSource(
        data=dict(
            x = rep[0:query_1_emb.shape[0],0],
            y = rep[0:query_1_emb.shape[0],1],
            fterms = search_fterms['4J040'],
            themes = [theme_dict[fterm[:5]] for fterm in search_fterms['4J040']],
            viewpoints = [viewpoint_dict[fterm[:8]] for fterm in search_fterms['4J040']],
            numbers = [number_dict[fterm[:10]] for fterm in search_fterms['4J040']]))

datasource_query2 = ColumnDataSource(
        data=dict(
            x = rep[query_1_emb.shape[0]:query_1_emb.shape[0] + query_2_emb.shape[0],0],
            y = rep[query_1_emb.shape[0]:query_1_emb.shape[0] + query_2_emb.shape[0],1],
            fterms = search_fterms['4F100'],
            themes = [theme_dict[fterm[:5]] for fterm in search_fterms['4F100']],
            viewpoints = [viewpoint_dict[fterm[:8]] for fterm in search_fterms['4F100']],
            numbers = [number_dict[fterm[:10]] for fterm in search_fterms['4F100']]))

datasource_found1 = ColumnDataSource(
        data=dict(
            x = rep[query_1_emb.shape[0] + query_2_emb.shape[0]: query_1_emb.shape[0] + query_2_emb.shape[0] + found_1_emb.shape[0],0],
            y = rep[query_1_emb.shape[0] + query_2_emb.shape[0]: query_1_emb.shape[0] + query_2_emb.shape[0] + found_1_emb.shape[0],1],
            fterms = res_1_fterms,
            themes = [theme_dict[fterm[:5]] for fterm in res_1_fterms],
            viewpoints = [viewpoint_dict[fterm[:8]] for fterm in res_1_fterms],
            numbers = [number_dict[fterm[:10]] for fterm in res_1_fterms]
        ))

datasource_found2 = ColumnDataSource(
        data=dict(
            x = rep[query_1_emb.shape[0] + query_2_emb.shape[0] + found_1_emb.shape[0]: query_1_emb.shape[0] + query_2_emb.shape[0] + found_1_emb.shape[0] + found_2_emb.shape[0],0],
            y = rep[query_1_emb.shape[0] + query_2_emb.shape[0] + found_1_emb.shape[0]: query_1_emb.shape[0] + query_2_emb.shape[0] + found_1_emb.shape[0] + found_2_emb.shape[0],1],
            fterms = res_2_fterms,
            themes = [theme_dict[fterm[:5]] for fterm in res_2_fterms],
            viewpoints = [viewpoint_dict[fterm[:8]] for fterm in res_2_fterms],
            numbers = [number_dict[fterm[:10]] for fterm in res_2_fterms]
        ))

datasource_orbit = ColumnDataSource(
        data=dict(
            x = rep[ query_1_emb.shape[0] + query_2_emb.shape[0] + found_1_emb.shape[0] + found_2_emb.shape[0]:, 0],
            y = rep[ query_1_emb.shape[0] + query_2_emb.shape[0] + found_1_emb.shape[0] + found_2_emb.shape[0]:, 1],
            fterms = o_ft,
            themes = [theme_dict[fterm[:5]] for fterm in o_ft],
            viewpoints = [viewpoint_dict[fterm[:8]] for fterm in o_ft],
            numbers = [number_dict[fterm[:10]] for fterm in o_ft]))



hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>F-Term:</b> @fterms<br><b>Theme:</b> @themes<br><b>Viewpoint:</b> @viewpoints<br><b>Number:</b> @numbers<br><b>Query:</b> @found_by</div>', mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='TSNE Chain Search')
    

plot_tsne.triangle('x', 'y', size=7, fill_color=RGB(200, 30, 30), alpha=0.5, line_width=0, source=datasource_found1, name="Found F-Terms 1")
plot_tsne.triangle('x', 'y', size=7, fill_color=RGB(0, 50, 250), alpha=0.5, line_width=0, source=datasource_found2, name="Found F-Terms 2")
plot_tsne.square('x', 'y', size=7, fill_color=RGB(50, 250, 50), alpha=0.5, line_width=0, source=datasource_orbit, name="Orbit F-Terms")
plot_tsne.circle('x', 'y', size=10, fill_color=RGB(200, 150, 50), alpha=1, line_width=0, source=datasource_query1, name="Query F-Terms 1")
plot_tsne.circle('x', 'y', size=10, fill_color=RGB(200, 50, 150), alpha=1, line_width=0, source=datasource_query2, name="Query F-Terms 2")

show(plot_tsne)

In [None]:
# Show top n search results for evaluation
n = 5
for theme, combs in top_search_res.items():
    for (q1, q2), (desc, simis) in combs.items():
        print('')
        print('QUERY:')
        print( full_descriptions_dict[q1], '|||', full_descriptions_dict[q2])
        print('______________________')
        
        for (ft1, ft2), s in zip(desc[:n], simis[:n]):
            print(s, full_descriptions_dict[ft1[:10]], '|||', full_descriptions_dict[ft2[:10]])
            print('')

In [None]:
###############################################################
# Searching for promising Henkel Combinations  (In Theme)
###############################################################


search_combination = ['4F100/JL11', '4F100/JG01']  # (adhesiveness, conductivity being properties or funcitons)
#search_combination = ['4J040/JB09', '4J040/PA21']  # (pressure sensitive adhesive or adhesive types, use of adhesive characterised by specific shapess of functions)
#search_combination = ['4J004/CC02', '4J004/CA07']  # (foil like, inorganic materials)

simis, descs, idx = search_in_all(*search_combination, all_diffs, step=500)
results = descs[:100]


In [None]:
print('(adhesiveness, conductivity being properties or funcitons)')
for fterm1, fterm2 in results:
    theme = theme_dict[fterm1[:5]]
    vp1 = viewpoint_dict[fterm1[:8]]
    vp2 = viewpoint_dict[fterm2[:8]]
    n1 = number_dict[fterm1[:10]]
    n2 = number_dict[fterm2[:10]]

    print(f'''    
Theme: {theme}
vp1: {vp1}     vp2: {vp2}
n1: {n1}       n2:{n2}
''')

In [None]:
# Extracting uniqe F-terms from the results
found_f_terms = []
[found_f_terms.extend(comb) for comb in results]
found_f_terms = list(set(found_f_terms))

# Embedding the found F-Terms
hits_emb = torch.tensor(np.array([out_emb_dict[fterm[:10]] for fterm in found_f_terms]))

orbit_emb = []
o_ft = []
for fterm in counter_orbit.keys(): 
    try:
        orbit_emb.append(out_emb_dict[fterm])
        o_ft.append(fterm)
    except KeyError:
        pass

orbit_emb = np.array(orbit_emb)

henkel_emb = []
h_ft = []
for fterm in search_combination:
    try:
        henkel_emb.append(out_emb_dict[fterm])
        h_ft.append(fterm)
    except KeyError:
        pass

henkel_emb = np.array(henkel_emb)

all_emb = np.concatenate([orbit_emb, henkel_emb, hits_emb], 0)

tsne = TSNE(n_components=2, verbose=0, random_state=69)
rep = tsne.fit_transform(all_emb)

datasource_henkel = ColumnDataSource(
        data=dict(
            x = rep[len(o_ft):len(o_ft) + len(h_ft),0],
            y = rep[len(o_ft):len(o_ft) + len(h_ft),1],
            fterms = h_ft,
            themes = [theme_dict[fterm[:5]] for fterm in h_ft],
            viewpoints = [viewpoint_dict[fterm[:8]] for fterm in h_ft],
            numbers = [number_dict[fterm[:10]] for fterm in h_ft]))

datasource_orbit = ColumnDataSource(
        data=dict(
            x = rep[:len(o_ft), 0],
            y = rep[:len(o_ft), 1],
            fterms = o_ft,
            themes = [theme_dict[fterm[:5]] for fterm in o_ft],
            viewpoints = [viewpoint_dict[fterm[:8]] for fterm in o_ft],
            numbers = [number_dict[fterm[:10]] for fterm in o_ft]))

datasource_emb_search = ColumnDataSource(
        data=dict(
            x = rep[len(o_ft) + len(h_ft):, 0],
            y = rep[len(o_ft) + len(h_ft):, 1],
            fterms = found_f_terms,
            themes = [theme_dict[fterm[:5]] for fterm in found_f_terms],
            viewpoints = [viewpoint_dict[fterm[:8]] for fterm in found_f_terms],
            numbers = [number_dict[fterm[:10]] for fterm in found_f_terms]))


hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>F-Term:</b> @fterms<br><b>Theme:</b> @themes<br><b>Viewpoint:</b> @viewpoints<br><b>Number:</b> @numbers<br><b>Query:</b> @found_by</div>', mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Henkel and Orbit Embeddings')
    
plot_tsne.circle('x', 'y', size=10, fill_color=RGB(250, 50, 75), alpha=1, line_width=0, source=datasource_henkel, name="Henkel Embeddings")
plot_tsne.square('x', 'y', size=7, fill_color=RGB(50, 75, 250), alpha=0.2, line_width=0, source=datasource_orbit, name="Orbit Embeddings")
plot_tsne.triangle('x', 'y', size=7, fill_color=RGB(75, 250, 50), alpha=1, line_width=0, source=datasource_emb_search, name="Cos Similar Embeddings")

show(plot_tsne)