In [None]:
# Own Packages
from Masterarbeit_utils.model_utils_agg import get_tokenizer, load_and_modify_model, load_pretrained_Tokenizer


# Site-Packages
import itertools
import difflib
import dask.dataframe as dd
import torch
import psutil
import os
import sys
import pickle as pk
import pandas as pd
import numpy as np
import bokeh
import time
import random
import os
from collections import Counter
from matplotlib import pyplot as plt
from cProfile import Profile
from pstats import SortKey, Stats

# Dimension reduction algorithms
#from cuml.manifold import TSNE
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from scipy.spatial import distance
from scipy.fft import fft, fftfreq
# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import export_png

from transformers import AutoTokenizer, OPTForCausalLM
from tokenizers.processors import TemplateProcessing
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from torch.utils.data import Dataset

# IPython magic: render matplotlib figures inside the notebook output cells.
%matplotlib inline
# Send Bokeh plots to the notebook output as well.
output_notebook()

In [None]:
"""
The Paths to important folders have to be changed for your system.
"""

# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
# Experiment name; also used as the sub-folder name for model and tokenizer.
model_name = 'gal_125_agg_aug_1'
# Training step of the checkpoint that is loaded further below.
checkpoint = 100_000
# When True, `normalize` L2-normalises the embeddings.
norm = True
# Selects the sequence-classification code path instead of the causal-LM one.
seq_class = False
# Enables the extraction of context-less embeddings.
context_less = False

if seq_class:
    # The sequence-classification helpers replace the aggregation helpers
    # that were imported at the top of the notebook.
    from Masterarbeit_utils.model_utils_seq_class import get_tokenizer, load_and_modify_model, load_pretrained_Tokenizer
    from transformers import OPTForSequenceClassification

# Filled with one txt file per sample by the Pytorch Dataset notebook.
dataset_folder = 'data/agg_dataset_samples'

# Model and tokenizer folders of this experiment (created when missing).
model_folder = 'data/models/' + model_name
tokenizer_folder = 'data/tokenizers/' + model_name
for _folder in (model_folder, tokenizer_folder):
    os.makedirs(_folder, exist_ok=True)

# Folder with all pickle files. Fixed for this project, do not change.
dump_dir = 'PK_DUMP'

# Available base model sizes:
#   mini      125 M
#   base      1.3 B
#   standard  6.7 B
#   large     30 B
#   huge      120 B
base_model_name = 'mini'

# Default dtype for all newly created torch objects.
# If default_dtype is float16, fp16 must be False.
default_dtype = torch.float32
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_default_dtype(default_dtype)

# Device the model is loaded onto by default.
default_device = 'cuda:0'

# Number of GPUs the model is spread over; a CPU-only run uses none.
num_gpus = 0 if default_device == 'cpu' else 1

tensor_parallel = False


###########################
# Loading the Model
###########################
# Placement arguments for `from_pretrained`, based on
# https://github.com/huggingface/accelerate/blob/5315290b55ea9babd95a281a27c51d87b89d7c85/src/accelerate/utils/modeling.py#L274
device_map = "auto" if num_gpus > 0 else None

# Allocate a tiny tensor on every GPU first, then query the free memory of
# each device (first element of `mem_get_info`).
for gpu_idx in range(num_gpus):
    _ = torch.tensor([0], device=gpu_idx)
max_memory = {gpu_idx: torch.cuda.mem_get_info(gpu_idx)[0] for gpu_idx in range(num_gpus)}
max_memory["cpu"] = psutil.virtual_memory().available

# The model class depends on the experiment type; the checkpoint path and
# all loading arguments are identical for both.
checkpoint_path = f'{model_folder}/checkpoint-{checkpoint}'
model_class = OPTForSequenceClassification if seq_class else OPTForCausalLM
model = model_class.from_pretrained(
    checkpoint_path,
    torch_dtype=default_dtype,
    low_cpu_mem_usage=True,
    device_map=device_map,
    max_memory=max_memory,
)

###########################
# Loading the Tokenizer
###########################
tokenizer = AutoTokenizer.from_pretrained(tokenizer_folder)
# Difference between the full tokenizer length and its base vocabulary size.
n_f_terms = len(tokenizer) - tokenizer.vocab_size
print('Loaded Tokenizer from serialized instance!')    
print(f'There are {n_f_terms} different F-Terms in the whole Dataset!')


###########################
# Loading Descriptions
###########################
def _load_description_dict(file_name):
    """Unpickle and return the object stored in `dump_dir`/`file_name`."""
    with open(f'{dump_dir}/{file_name}', 'rb') as f:
        return pk.load(f)


theme_dict = _load_description_dict('agg_themes_descriptions.pk')
viewpoint_dict = _load_description_dict('agg_viewpoints_descriptions.pk')
number_dict = _load_description_dict('agg_numbers_descriptions.pk')
full_descriptions_dict = _load_description_dict('agg_full_descriptions.pk')


###########################
# Extracting the Embeddings
###########################

# Input embedding layer of the loaded model.
inp_emb = model.get_input_embeddings()

if not seq_class:
    # Embeddings if the model is not a sequence classification model.
    # NOTE(review): the output matrix is cut at row 2 while the input matrix
    # is cut at row 50002 - presumably the output head only holds two special
    # tokens in front of the F-terms; confirm against the model modification.
    out_emb = model.get_output_embeddings()
    out_emb = next(out_emb.parameters()).to('cpu').detach().numpy()[2:]
    inp_emb = inp_emb(torch.arange(len(tokenizer))).to('cpu').detach().numpy()[50002:]

    if context_less:
        # Context-less embeddings: hidden states of every layer when the
        # model sees a single token. They are cached on disk per checkpoint.
        cache_path = f'{model_folder}/context_less_emb{checkpoint}.pk'
        if not os.path.isfile(cache_path):
            print('Calculating context less embeddings!')
            # One list per returned hidden state; sized from the first model
            # output instead of a hard-coded layer count.
            context_less_emb = []
            # Pure inference: no autograd graph is needed for these passes.
            with torch.no_grad():
                for token_id in range(len(tokenizer)):
                    print(token_id, end='\r')
                    out = model(input_ids=torch.tensor([[token_id]]),
                                attention_mask=torch.tensor([[1]]),
                                output_hidden_states=True)
                    hidden_states = out.hidden_states
                    if not context_less_emb:
                        context_less_emb = [[] for _ in hidden_states]
                    # A separate index name keeps the token loop variable intact.
                    for layer_idx, hidden in enumerate(hidden_states):
                        context_less_emb[layer_idx].append(hidden.to('cpu').detach().numpy())
            with open(cache_path, 'wb') as f:
                pk.dump(context_less_emb, f)
        else:
            print('Loading context less embeddings from disk')
            # NOTE: pickle executes code on load; only use cache files that
            # were written by this notebook.
            with open(cache_path, 'rb') as f:
                context_less_emb = pk.load(f)

        # Combining the context less embeddings of a layer to a single array:
        # take the first (only) batch entry of every token and stack them.
        for layer_idx, layer in enumerate(context_less_emb):
            context_less_emb[layer_idx] = np.concatenate([e[0] for e in layer], 0)

else:
    # Embeddings if the model is a sequence classifier.
    inp_emb = inp_emb(torch.arange(50000)).to('cpu').detach().numpy()
    # The converted array must be assigned; previously the result of the
    # conversion was discarded and `out_emb` stayed a torch parameter.
    out_emb = model.score.weight.to('cpu').detach().numpy()
    

## Normalizing the embeddings 
def normalize(tensor):
    """
    L2-normalise `tensor` when the module-level flag `norm` is truthy.

    tensor = array-like that `torch.tensor` can convert; it needs at least
             two dimensions because the normalisation runs along dim 1
             (the default of `torch.nn.functional.normalize`)

    returns the normalised values as a numpy array, or `tensor` itself
    (unchanged, same object) when `norm` is falsy.
    """
    if not norm:
        return tensor
    as_torch = torch.tensor(tensor)
    unit_rows = torch.nn.functional.normalize(as_torch, p=2)
    return unit_rows.numpy()

# Normalise the output and input embeddings (no-op when `norm` is False).
out_emb, inp_emb = normalize(out_emb), normalize(inp_emb)
if context_less and not seq_class:
    context_less_emb = list(map(normalize, context_less_emb))

# Decode every token id once; everything from index 50002 on is treated as
# an F-term token.
tokens = [tokenizer.decode(token_id) for token_id in range(len(tokenizer))]
f_term_tokens = tokens[50002:]

# Dict keys are the decoded tokens without their last character.
f_term_keys = [token[:-1] for token in f_term_tokens]

# F-term -> output embedding vector, and F-term -> magnitude spectrum of it.
out_emb_dict = dict(zip(f_term_keys, out_emb))
ft_emb_dict = {f_term: np.abs(fft(vec)) for f_term, vec in out_emb_dict.items()}

# Token -> input embedding vector; the sequence classifier uses the first
# 50000 tokens, the causal LM the F-term tokens.
if seq_class:
    inp_emb_dict = dict(zip((token[:-1] for token in tokens[:50000]), inp_emb))
else:
    inp_emb_dict = dict(zip(f_term_keys, inp_emb))

# One token -> vector dict per layer of the context-less embeddings.
if context_less and not seq_class:
    all_token_keys = [token[:-1] for token in tokens]
    context_less_dicts = [dict(zip(all_token_keys, layer)) for layer in context_less_emb]
    
####################################################
# Detecting F-Term Triangles with Supposed Similarities
####################################################
df = pd.read_csv("data/f-terms.csv", index_col=0)

# Helper columns: full viewpoint id and full F-term id.
df["vp"] = df.theme + "/" + df.viewpoint
df["fterm"] = df.theme + "/" + df.number

# Keep only the rows whose F-term has an output embedding. A vectorised mask
# replaces the row-wise `iterrows` filter and keeps all columns even when no
# row matches.
df2 = df[df.fterm.isin(set(out_emb_dict))].copy()

# The F-term labels are searched for these keywords. Every entry carries the
# ". " prefix that is stripped again with `[2:]` below; "Welding" used to
# lack it, which turned the search keyword (and the dict key) into "lding".
materials_list = [". Metal", ". Wood", ". Polymer", ". Glass", ". Wool", ". Cutting", ". Bleaching", ". Adhes", ". Heat insulat", ". Heat radiation", ". Cooling", ". Insulat", ". Coating", ". Rubber", ". Rota", ". Compress", ". Bolt", ". Welding"]

# material -> [themes with a matching label, F-terms with a matching label]
materials_f_terms = {}
for material in materials_list:
    keyword = material.lower()[2:]
    # Case-insensitive search, evaluated once per material.
    hits = df2[df2.label.str.contains(keyword, na=False, case=False)]
    materials_f_terms[material] = [hits.theme.unique(), hits.fterm.values]

# F-term sets per material, built once instead of inside the triple loop.
fterm_sets = {material: set(entry[1]) for material, entry in materials_f_terms.items()}

# For every ordered triple of distinct materials, collect per shared theme
# the F-terms that mention exactly one of the three materials.
material_combinations = {}
for mat1, mat2, mat3 in itertools.permutations(materials_list, 3):
    # Dropping the F-terms whose label mentions more than one of the materials.
    unique_fterms_mat1 = list(fterm_sets[mat1] - fterm_sets[mat2] - fterm_sets[mat3])
    unique_fterms_mat2 = list(fterm_sets[mat2] - fterm_sets[mat1] - fterm_sets[mat3])
    unique_fterms_mat3 = list(fterm_sets[mat3] - fterm_sets[mat1] - fterm_sets[mat2])

    # The theme is taken from the first five characters of an F-term.
    theme_mat1 = {x[:5] for x in unique_fterms_mat1}
    theme_mat2 = {x[:5] for x in unique_fterms_mat2}
    theme_mat3 = {x[:5] for x in unique_fterms_mat3}
    shared_theme = [x for x in theme_mat1 if x in theme_mat2 and x in theme_mat3]

    # One entry per shared theme: [theme, F-terms mat1, F-terms mat2, F-terms mat3]
    fterm_pairs = [
        [theme,
         [fterm for fterm in unique_fterms_mat1 if fterm.startswith(theme)],
         [fterm for fterm in unique_fterms_mat2 if fterm.startswith(theme)],
         [fterm for fterm in unique_fterms_mat3 if fterm.startswith(theme)]]
        for theme in shared_theme
    ]

    material_combinations[mat1[2:] + "_" + mat2[2:] + "_" + mat3[2:]] = fterm_pairs

# Dropping empty material combinations. Each value is a list with one entry
# per shared theme, so "non-empty" is the right test; the previous
# `len(value) == 4` kept only combinations with exactly four shared themes.
material_combinations = {key: value for key, value in material_combinations.items() if value}


# New Approach: In viewpoint search 

In [None]:
# Creating a hierarchical dict: theme -> viewpoint -> list of F-term tokens
f_term_dict = {}
for f_term in f_term_tokens:
    # Theme is the part before the slash, the viewpoint key the first eight
    # characters of the token.
    theme = f_term.split('/')[0]
    vp = f_term[:8]
    # `setdefault` creates the missing levels and still appends the current
    # token. The previous try/except created an empty list on the first hit
    # and thereby dropped the first F-term of every viewpoint.
    f_term_dict.setdefault(theme, {}).setdefault(vp, []).append(f_term)

In [None]:
###############################################################
# Using string similarity to find semantically matching viewpoints or F-terms

def sort_string_list(string_list, chunk_size=100000000, greedy_steps=10):
    '''
    Greedily orders strings so that similar strings end up next to each other.

    string_list  = iterable of strings which should be sorted by their string
                   similarity (difflib.SequenceMatcher ratio)
    chunk_size   = number of best matches to the first string that form the
                   initial candidate pool
    greedy_steps = number of additional candidates taken from the pool per
                   iteration (one candidate is always taken)

    The first string is the starting point. In every iteration the candidate
    pool is ranked against the most recently added string, the best
    `greedy_steps + 1` candidates are appended to the result and the pool is
    refilled with the same number of strings from the remaining ones.

    returns a list with all strings of the input in the new order; an empty
    input gives an empty list.
    '''
    remaining = list(string_list)
    total = len(remaining)
    if not remaining:
        # Nothing to sort; previously this raised an IndexError.
        return []

    # We need at least one step to collect the best match
    greedy_steps += 1

    def _by_similarity(candidates, reference):
        # Candidates ordered from most to least similar to `reference`.
        scores = [difflib.SequenceMatcher(None, cand, reference).ratio() for cand in candidates]
        order = np.argsort(scores)[::-1]
        return [candidates[i] for i in order]

    current_string = remaining.pop(0)
    sorted_list = [current_string]

    # The initial ranking against the first string decides which strings
    # enter the candidate pool.
    remaining = _by_similarity(remaining, current_string)
    similar_strings, remaining = remaining[:chunk_size], remaining[chunk_size:]

    while len(sorted_list) < total:
        # Progress output; the blank line clears the previous status text.
        print(' '*1000, end='\r')
        print(len(sorted_list), len(remaining), current_string, end='\r')

        similar_strings = _by_similarity(similar_strings, current_string)

        # The best matches go to the output; the last one added becomes the
        # search string of the next iteration.
        taken = similar_strings[:greedy_steps]
        del similar_strings[:greedy_steps]
        if taken:
            sorted_list.extend(taken)
            current_string = taken[-1]

        # Replenishing the candidate pool from the remaining strings.
        similar_strings.extend(remaining[:greedy_steps])
        del remaining[:greedy_steps]

    return sorted_list
        

# All viewpoint keys over all themes, and their descriptions.
all_vp = list(itertools.chain.from_iterable(f_term_dict.values()))
all_vp_desc = list(map(viewpoint_dict.__getitem__, all_vp))
#sorted_vp = sort_string_list(all_vp_desc)

# F-term keys and descriptions, taken from the description dict.
all_f_terms, all_f_term_desc = number_dict.keys(), number_dict.values()

# Note: this holds the F-term *descriptions* ordered by string similarity.
sorted_f_terms = sort_string_list(all_f_term_desc)

# Create all Possible Viewpoint triangles (without using string similarity)

In [None]:
# Creating a hierarchical dict: theme -> viewpoint -> list of F-term tokens
f_term_dict = {}
for f_term in f_term_tokens:
    # Theme is the part before the slash, the viewpoint key the first eight
    # characters of the token.
    theme = f_term.split('/')[0]
    vp = f_term[:8]
    # `setdefault` creates the missing levels and still appends the current
    # token. The previous try/except created an empty list on the first hit
    # and thereby dropped the first F-term of every viewpoint.
    f_term_dict.setdefault(theme, {}).setdefault(vp, []).append(f_term)

# Searching for groups of identical viewpoint descriptions in different themes.
# 3 gives triangles; larger groups work with the same algorithm.
edges = 3
# Minimum number of themes in which a viewpoint triangle must appear.
min_themes = 6

all_combs = []
for t_dict in f_term_dict.values():
    descriptions = [viewpoint_dict[vp] for vp in t_dict]
    # Unordered description groups of this theme: a frozenset removes the
    # permutations, groups with repeated descriptions are skipped, and the
    # set makes every group count once per theme.
    theme_groups = set(
        [frozenset(group)
         for group in itertools.combinations(descriptions, edges)
         if len(set(group)) == edges]
    )
    all_combs.extend(theme_groups)

# Number of themes per description group, restricted to the frequent ones.
all_vp_tri_desc = {
    group: n_themes
    for group, n_themes in Counter(all_combs).items()
    if n_themes >= min_themes
}



In [None]:
# For every description triangle, list the matching viewpoint keys per theme.
all_vp_tri = {}

for triangle in all_vp_tri_desc:
    vps_per_theme = []
    for t_dict in f_term_dict.values():
        matching_vps = [vp for vp in t_dict if viewpoint_dict[vp] in triangle]
        # Only themes with exactly three matching viewpoints are kept.
        if len(matching_vps) == 3:
            vps_per_theme.append(matching_vps)
    all_vp_tri[triangle] = vps_per_theme
        

In [None]:
# Generating all target differences

# Hierarchical dict in the following order: triangle -> theme -> [diffs_a, diffs_b]
target_diffs = {}
emb = out_emb_dict
print(len(all_vp_tri))


def _target_difference(start, end):
    # Difference of the two F-term embeddings as a (1, dim) array, passed
    # through `normalize`. The embedding keys are the tokens without their
    # last character.
    return normalize(np.array([emb[end[:-1]] - emb[start[:-1]]]))


for tri_idx, (tri, theme_vp_lists) in enumerate(all_vp_tri.items()):
    print(tri_idx, end='\r')
    diffs_per_theme = {}
    for vp_tri in theme_vp_lists:
        # One F-term list per viewpoint of the triangle.
        f_term_lists = [f_term_dict[vp[:5]][vp] for vp in vp_tri]
        # Every selection of one F-term per viewpoint, without duplicates and
        # without selections that repeat an F-term.
        unique_triples = list(set([frozenset(comb)
                                   for comb in itertools.product(*f_term_lists)
                                   if len(frozenset(comb)) == 3]))
        # Fix one iteration order per triple; the first F-term is the common
        # start of both differences.
        ordered_triples = [tuple(comb) for comb in unique_triples]

        diffs_a = {(first, second): _target_difference(first, second)
                   for first, second, _ in ordered_triples}
        diffs_b = {(first, third): _target_difference(first, third)
                   for first, _, third in ordered_triples}

        diffs_per_theme[vp_tri[0][:5]] = [diffs_a, diffs_b]
    target_diffs[tri] = diffs_per_theme
 

In [None]:
# Creating all differences between F-terms of the same theme.
# Maps (f_term_a, f_term_b) -> normalised difference emb[b] - emb[a].

all_diffs = {}
for i, (theme, t_dict) in enumerate(f_term_dict.items()):
    print(i, theme, end='\r')
    # All F-terms of the theme in viewpoint order.
    theme_f_terms = list(itertools.chain.from_iterable(t_dict.values()))

    for f_term_a, f_term_b in itertools.combinations(theme_f_terms, 2):
        # Skip pairs from viewpoints with the same description. This covers
        # pairs within one viewpoint and pairs from different viewpoints that
        # share a description text.
        if viewpoint_dict[f_term_a[:8]] == viewpoint_dict[f_term_b[:8]]:
            continue
        # Writing into the dict directly avoids copying the whole accumulated
        # dict for every theme (as `all_diffs | diffs` did).
        all_diffs[(f_term_a, f_term_b)] = normalize(
            np.array([emb[f_term_b[:-1]] - emb[f_term_a[:-1]]]))
   

In [None]:
def create_subset(exclude_theme):
    """
    Return the entries of the module-level `all_diffs` outside one theme.

    exclude_theme = theme id; entries whose first F-term starts with these
                    five characters are left out

    returns a new dict with the remaining (pair -> difference) entries.
    """
    kept = {}
    for comb, diff in all_diffs.items():
        if comb[0][:5] == exclude_theme:
            continue
        kept[comb] = diff
    return kept

# Take the first theme key of f_term_dict: the loop stops after its first
# iteration. With an empty dict `t` keeps whatever value it had before.
for t in f_term_dict.keys():
    break

print(t)

# Notebook display: number of all differences vs. the number left after
# excluding theme `t`.
len(all_diffs), len(create_subset(t))

In [None]:
# Search
# Upper bound on the number of (similarity, pair) entries kept per query.
# With this value every candidate is kept.
n_search = 100_000_000_000
cos = torch.nn.CosineSimilarity(dim=1)


def _collect_similarities(query_vec, candidate_diffs, similarity, n_keep):
    """
    Score every candidate difference against one query difference.

    query_vec       = query difference vector (array-like, shape (1, dim))
    candidate_diffs = dict pair -> difference vector
    similarity      = callable returning a one-element tensor for two vectors
    n_keep          = maximum number of entries that are kept; once it is
                      reached a new entry replaces the current lowest one if
                      it scores higher

    returns (similarities, pairs) as two parallel lists in encounter order.
    """
    # The query tensor is identical for all candidates, so build it once.
    query = torch.tensor(query_vec)
    best_simis, best_combs = [], []
    for i, (comb, diff) in enumerate(candidate_diffs.items()):
        if i % 1000 == 0:
            print('iteration', f'{i:,}', end='\r')
        simi = similarity(query, torch.tensor(diff)).item()

        if len(best_simis) < n_keep:
            best_simis.append(simi)
            best_combs.append(comb)
        else:
            lowest_idx = np.argmin(best_simis)
            if best_simis[lowest_idx] < simi:
                best_simis[lowest_idx] = simi
                best_combs[lowest_idx] = comb
    return best_simis, best_combs


# Only the first triangle and its first theme are processed (see the two
# `break` statements at the end of the loops).
for tri, themes_dict in target_diffs.items():
    print(f'''
    {list(tri)[0]}
    
    {list(tri)[1]} 
    
    {list(tri)[2]}    ''')
    for theme, (diffs_a, diffs_b) in themes_dict.items():
        # Candidates: all differences outside the theme of the query.
        search_diffs = create_subset(theme)
        # Target differences of the same triangle in the other themes.
        potential_hits_a = {key: value for t, (d_a, d_b) in themes_dict.items() for key, value in d_a.items() if t != theme}
        potential_hits_b = {key: value for t, (d_a, d_b) in themes_dict.items() for key, value in d_b.items() if t != theme}

        # The first difference of each side serves as the query vector.
        qv_a = next(iter(diffs_a.values()))
        best_simis_a, best_combs_a = _collect_similarities(qv_a, search_diffs, cos, n_search)

        qv_b = next(iter(diffs_b.values()))
        best_simis_b, best_combs_b = _collect_similarities(qv_b, search_diffs, cos, n_search)

        # Keep the n most similar candidates per side, best first.
        idx_a = np.argsort(best_simis_a)[::-1]
        idx_b = np.argsort(best_simis_b)[::-1]
        n = 10_000
        best_simis_a = [best_simis_a[i] for i in idx_a][:n]
        best_simis_b = [best_simis_b[i] for i in idx_b][:n]
        best_combs_a = [best_combs_a[i] for i in idx_a][:n]
        best_combs_b = [best_combs_b[i] for i in idx_b][:n]

        break
    break

In [None]:
# Notebook display: number of candidates kept for side a of the search.
len(best_simis_a)

In [None]:
# Order the search results by ascending similarity.
idx_a = np.argsort(best_simis_a)
idx_b = np.argsort(best_simis_b)

simis_a_sorted = [best_simis_a[i] for i in idx_a]
simis_b_sorted = [best_simis_b[i] for i in idx_b]

comb_a = [best_combs_a[i] for i in idx_a]
comb_b = [best_combs_b[i] for i in idx_b]

# Theme (first five characters) and viewpoint (first eight characters) of the
# first F-term of every result pair.
themes_a = [c[0][:5] for c in comb_a]
themes_b = [c[0][:5] for c in comb_b]

vps_a = [c[0][:8] for c in comb_a]
vps_b = [c[0][:8] for c in comb_b]

# Use the theme dict of the first triangle in target_diffs.
for tri, themes_dict in target_diffs.items():
    break
print(themes_dict.keys())

# Index the side-b results by the viewpoint of their first F-term, so that
# the matching below does not rescan the whole list for every side-a result.
combs_b_by_start_vp = {}
for c2 in comb_b:
    combs_b_by_start_vp.setdefault(c2[0][:8], []).append(c2)

# A side-a and a side-b result belong together when they start in the same
# viewpoint. For every such match the viewpoint sets of both pairs are
# recorded. The previous version overwrote its comparison variable inside the
# inner loop and therefore recorded only the first match per side-a result.
found_tri = []
for c in comb_a:
    start_vp = c[0][:8]
    for c2 in combs_b_by_start_vp.get(start_vp, ()):
        found_tri.extend([frozenset(f_term[:8] for f_term in c),
                          frozenset(f_term[:8] for f_term in c2)])

# Viewpoint sets of all target pairs of the triangle, over all of its themes.
targets = []
for theme, (diffs_a, diffs_b) in themes_dict.items():
    for target_a, target_b in zip(diffs_a.keys(), diffs_b.keys()):
        targets.extend([frozenset(f_term[:8] for f_term in target_a),
                        frozenset(f_term[:8] for f_term in target_b)])

found_tri = list(set(found_tri))
target_tri = list(set(targets))

print(len(targets), len(found_tri))
# Set lookup instead of scanning the target list for every found entry.
target_lookup = set(target_tri)
correct_tri = [found for found in found_tri if found in target_lookup]
print(correct_tri)
print(f'Apriori probability = {100*len(targets)/len(search_diffs)}%')
if found_tri:
    print(f'Aposteriori probability = {100*len(correct_tri)/len(found_tri)}')
else:
    # Nothing was found, so the ratio is undefined.
    print('Aposteriori probability = n/a (no candidates found)')

In [None]:
# Create a subset with all themes that contain a 'purpose' viewpoint, a 'use'
# viewpoint and a 'structure' viewpoint (judged by string similarity of the
# viewpoint descriptions).
min_simi = 0.58
subset = {}
for theme, t_dict in f_term_dict.items():
    vp_keys = list(t_dict.keys())
    vp_descs = [viewpoint_dict[vp] for vp in vp_keys]

    # Best matching viewpoint (index, similarity) for each keyword.
    best_matches = []
    for keyword in ('purpose', 'use', 'structure'):
        ratios = [difflib.SequenceMatcher(None, desc, keyword).ratio() for desc in vp_descs]
        best_idx = np.argmax(ratios)
        best_matches.append((best_idx, ratios[best_idx]))

    # The three keywords must be matched by three different viewpoints ...
    if len({idx for idx, _ in best_matches}) < 3:
        continue
    # ... and each match must be more similar than the threshold.
    if not all(score > min_simi for _, score in best_matches):
        continue

    subset[theme] = {vp_keys[idx]: t_dict[vp_keys[idx]] for idx, _ in best_matches}
    


In [None]:
# Print the selected viewpoint descriptions of every theme in the subset.
for theme, theme_vps in subset.items():
    print(theme, ':')
    for description in map(viewpoint_dict.__getitem__, theme_vps):
        print(description)
    print(' ')

# Triangles in Themes

In [None]:
# Embedding lookup used for the differences in this cell.
emb_dict = out_emb_dict
# Generating F-Term triangles from the material combinations and dropping in-viewpoint comparisons:
# every entry of a material combination is [theme, f_terms_1, f_terms_2, f_terms_3]; one F-term is
# taken from each of the three lists and triples that share a viewpoint (first eight characters)
# are skipped.
# NOTE(review): the dict key repeats for every theme entry of a material combination, so only the
# triples of the last theme entry remain per key - confirm that this is intended.
f_term_combinations = {key: [(f1, f2, f3) for f1 in values[1] for f2 in values[2] for f3 in values[3] if len(set([f1[:8], f2[:8], f3[:8]])) == 3] for key, theme_values in material_combinations.items() for values in theme_values}
# Dropping combinations with fewer than three triples; the key becomes the tuple of material names.
f_term_combinations = {tuple(key.split('_')): value for key, value in f_term_combinations.items() if len(value) >=3}
# Keeps every second entry by position.
# NOTE(review): presumably meant to remove permuted duplicates of the same materials - confirm.
f_term_combinations = {key: value for i, (key, value) in enumerate(f_term_combinations.items()) if i%2==0} 

# Generating differences: per triple (f1, f2, f3) the key is (f1, f2, f1, f3) and the value holds
# the two difference vectors f2 - f1 and f3 - f1, each passed through `normalize`.
# The F-terms are used as embedding keys unchanged (`[:]`), i.e. without stripping a last
# character as the cells working on tokenizer tokens do.
f_term_combinations = {key: {(combination[0], combination[1], combination[0], combination[2]): [normalize(np.array([emb_dict[combination[1][:]] - emb_dict[combination[0][:]]]))[0], normalize(np.array([emb_dict[combination[2][:]] - emb_dict[combination[0][:]]]))[0]] for combination in combinations} for key, combinations in f_term_combinations.items()}

# Notebook display: the material triples that are left.
f_term_combinations.keys()

In [None]:
##############################################
# Calculating all in theme combinations

# Hierarchical dict: theme -> viewpoint -> list of F-term tokens
theme_f_term_dict = {}
for f_term in f_term_tokens:
    # Theme is the part before the slash, the viewpoint key the first eight
    # characters of the token.
    theme = f_term.split('/')[0]
    vp = f_term[:8]
    # `setdefault` creates the missing levels and still appends the current
    # token. The previous try/except created an empty list on the first hit
    # and thereby dropped the first F-term of every viewpoint.
    theme_f_term_dict.setdefault(theme, {}).setdefault(vp, []).append(f_term)

# Counting the ordered pairs of F-terms that share a theme but come from
# different viewpoints.
n_combs = 0
for vps in theme_f_term_dict.values():
    # A separate name keeps the global `n_f_terms` (number of F-terms in the
    # tokenizer) from being overwritten here.
    n_theme_f_terms = sum(len(f_terms) for f_terms in vps.values())
    for f_terms in vps.values():
        # Every F-term of this viewpoint pairs with every F-term of the
        # other viewpoints of the theme.
        n_combs += (n_theme_f_terms - len(f_terms)) * len(f_terms)
print(f'There are {n_combs:,} in theme combinations')

In [None]:
# Generating the differences between all F-terms of a theme that belong to
# different viewpoints. `all_diffs` holds the vectors, `all_desc` the matching
# [f_term1, f_term2] pairs in the same order.
all_diffs = []
all_desc = []
emb_dict = out_emb_dict

for theme_idx, (theme, t_dict) in enumerate(theme_f_term_dict.items()):
    print(theme_idx, 'Generating Differences for Theme: ', theme, end='\r')
    for vp, f_terms in t_dict.items():
        # F-terms of all other viewpoints of the theme, in viewpoint order.
        out_vp_f_terms = list(itertools.chain.from_iterable(
            other_terms for other_vp, other_terms in t_dict.items() if other_vp != vp))
        # Every F-term of the current viewpoint against every outside F-term.
        for f_term1, f_term2 in itertools.product(f_terms, out_vp_f_terms):
            raw_diff = emb_dict[f_term2[:-1]] - emb_dict[f_term1[:-1]]
            # `normalize` gets a (1, dim) array; row 0 is the vector itself.
            all_diffs.append(normalize(np.array([raw_diff]))[0])
            all_desc.append([f_term1, f_term2])

all_diffs = np.array(all_diffs)
print(all_diffs.shape)

In [None]:
def drop_theme(f_term_pairs, theme):
    '''
    Removes the F-term pairs whose first F-term belongs to `theme`.

    f_term_pairs = sequence of pairs; only the first five characters of the
                   first element of each pair are compared
    theme        = theme id to remove

    returns (kept_pairs, keep_mask): the remaining pairs and a list of
    booleans with one entry per input pair (True = pair was kept).
    '''
    kept_pairs = []
    keep_mask = []
    for pair in f_term_pairs:
        outside_theme = pair[0][:5] != theme
        keep_mask.append(outside_theme)
        if outside_theme:
            kept_pairs.append(pair)
    return kept_pairs, keep_mask


def find_pairs(f_term_pairs, key_a, key_b, exact=False):
    '''
    Finds the F-term pairs whose descriptions match two keywords.

    f_term_pairs = sequence of pairs of F-term tokens; the descriptions are
                   looked up in the module-level `number_dict` with the last
                   character of each token removed
    key_a        = keyword for the description of the first F-term
    key_b        = keyword for the description of the second F-term
    exact        = if True a description matches when it starts with the
                   keyword (case-insensitive); otherwise the keyword must be
                   contained in it (case-sensitive) and pairs are rejected
                   when key_b appears in the first or key_a in the second
                   description

    returns a numpy array with the indices of the matching pairs.
    '''
    f_term_desc = [(number_dict[pair[0][:-1]], number_dict[pair[1][:-1]]) for pair in f_term_pairs]
    if exact:
        # Lower-case the keywords once. The previous expression passed the
        # unbound method `key_a.lower` and called `.lower()` on a bool, so
        # this branch always raised.
        key_a_low, key_b_low = key_a.lower(), key_b.lower()
        matches = [desc_a.lower().startswith(key_a_low) and desc_b.lower().startswith(key_b_low)
                   for desc_a, desc_b in f_term_desc]
    else:
        matches = [(key_a in desc_a and key_b in desc_b)
                   # Rejecting pairs in which a keyword also shows up on the
                   # opposite side.
                   and not (key_b in desc_a or key_a in desc_b)
                   for desc_a, desc_b in f_term_desc]
    # Positions of the True entries; also well-defined for an empty input.
    return np.flatnonzero(np.array(matches, dtype=bool))

In [None]:
# Overview of the test triangles: the material triple, followed by the theme
# (first five characters of the first F-term) of each of its F-term groups.
for test_triangle, test_targets in f_term_combinations.items():
    print(test_triangle)
    for f_terms in test_targets:
        print(f_terms[0][:5])

In [None]:
combination = [". metal", ". adhes"]

# All in-theme differences whose two descriptions match the keyword pair.
comb_matches_idx = find_pairs(all_desc, *combination)
comb_desc, comb_diffs = [all_desc[i] for i in comb_matches_idx], all_diffs[comb_matches_idx]

total_hits = 0
total_searched = 0
total_priori_s = 0
total_priori_hits = 0
# Number of valid (out of theme) neighbours evaluated per query.
n = 100
cos = torch.nn.CosineSimilarity(dim=1)

# The candidate matrix is the same for every query, so convert it once.
all_diffs_tensor = torch.as_tensor(all_diffs)

for query_vec, desc in zip(comb_diffs, comb_desc):
    query_vec = torch.tensor(query_vec).unsqueeze(0)
    # Cosine similarity of the query to every difference in one call.
    similarities = cos(all_diffs_tensor, query_vec)
    # Indices of all differences, most similar first.
    ranking = torch.argsort(similarities).numpy()[::-1]

    guesses = []
    # In-theme pairs met on the way; they are needed to correct the number of
    # reachable targets below.
    theme_guesses = []
    chunk_start = 0
    # Walk down the ranking in chunks of n and drop the in-theme pairs until n
    # valid guesses are collected or the ranking is used up. The previous loop
    # never advanced, so it re-added the first chunk again and again and did
    # not end when that chunk held no valid pair.
    while len(guesses) < n and chunk_start < len(ranking):
        chunk = ranking[chunk_start:chunk_start + n]
        chunk_start += n
        chunk_desc = [all_desc[j] for j in chunk]
        # keep_mask is True for pairs outside the theme of the query.
        chunk_clean, keep_mask = drop_theme(chunk_desc, desc[0][:5])
        theme_guesses.extend(pair for pair, keep in zip(chunk_desc, keep_mask) if not keep)
        guesses.extend(chunk_clean)

    guesses = guesses[:n]
    hits = len(find_pairs(guesses, *combination))
    theme_hits = len(find_pairs(theme_guesses, *combination))

    total_hits += hits
    total_searched += n
    total_priori_s += len(all_desc) - len(theme_guesses)
    total_priori_hits += len(comb_matches_idx) - theme_hits

    print(f' Current Hits: {hits} Current Targets: {len(comb_matches_idx) - theme_hits}; Overall: a priori: {total_priori_hits*100/total_priori_s:.5f}%, a posteriori: {total_hits*100/total_searched:.5f}%; Query Description {number_dict[desc[0][:-1]]}     {number_dict[desc[1][:-1]]}', end='\r')
    