In [3]:
# Own Packages
from Masterarbeit_utils.model_utils_agg import get_tokenizer, load_and_modify_model, load_pretrained_Tokenizer


# Site-Packages
import dask.dataframe as dd
import torch
import psutil
import os
import sys
import pickle as pk
import pandas as pd
import numpy as np
import bokeh
import time
import random
from matplotlib import pyplot as plt
from cProfile import Profile
from pstats import SortKey, Stats

# Dimension reduction algorithms
#from cuml.manifold import TSNE
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from scipy.spatial import distance
# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import export_png

from transformers import AutoTokenizer, OPTForCausalLM
from tokenizers.processors import TemplateProcessing
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from torch.utils.data import Dataset

%matplotlib inline
output_notebook()

In [6]:
"""
This cell has to be run twice due to an unknown bug
"""

# Name of this experiment
model_name = 'gal_1300_agg_aug_1'
checkpoint = 83438
# If True normalization is applied to the embeddings
norm = True
seq_class = False
context_less = False

if seq_class:
    # Importing code for sequence classification
    from Masterarbeit_utils.model_utils_seq_class import get_tokenizer, load_and_modify_model, load_pretrained_Tokenizer
    from transformers import OPTForSequenceClassification

# This folder will be created and filled with txt.files for each sample after you run the Pytorch Dataset Notebook
dataset_folder = f'data/agg_dataset_samples'

# The folder at which the model will be saved. This folder has to be created for your system 
model_folder = f'data/models/{model_name}'
os.makedirs(model_folder, exist_ok=True)


# Folder in which the tokenizer will be saved
tokenizer_folder = f'data/tokenizers/{model_name}'
os.makedirs(tokenizer_folder, exist_ok=True)

# Folder at which all pickle files are stored. This folder is fixed for this project and should not be changed
dump_dir = r'PK_DUMP'

# Model parameters 
'''
mini	125 M
base	1.3 B
standard	6.7 B
large	30 B
huge	120 B'''
base_model_name = 'mini'

# All new Torch-objects will be by default in this dtype
# if default_type = float16 fp16 must be False
default_dtype = torch.float32
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_default_dtype(default_dtype)

# Default device on which the model will be loaded
default_device = 'cpu'

# Number of GPUs the model will be parallelised to 
num_gpus = 1
# If you change 'default_device' to 'cpu', make sure to set num_gpus to zero.
if default_device == 'cpu':
    num_gpus = 0

tensor_parallel = False


###########################
# Loading the Model
###########################
device_map=None
max_memory = {}
if num_gpus > 0:
    # based on https://github.com/huggingface/accelerate/blob/5315290b55ea9babd95a281a27c51d87b89d7c85/src/accelerate/utils/modeling.py#L274
    for i in range(num_gpus):
        _ = torch.tensor([0], device=i)
    for i in range(num_gpus):
        max_memory[i] = torch.cuda.mem_get_info(i)[0]
    device_map = "auto"
max_memory["cpu"] = psutil.virtual_memory().available

if seq_class:
    model = OPTForSequenceClassification.from_pretrained(f'{model_folder}/checkpoint-{checkpoint}', torch_dtype=default_dtype, low_cpu_mem_usage=True,
                                               device_map=device_map, max_memory=max_memory)
else:
    model = OPTForCausalLM.from_pretrained(f'{model_folder}/checkpoint-{checkpoint}', torch_dtype=default_dtype, low_cpu_mem_usage=True,
                                               device_map=device_map, max_memory=max_memory)

###########################
# Loading the Tokenizer
###########################
tokenizer = AutoTokenizer.from_pretrained(tokenizer_folder)
n_f_terms = len(tokenizer) - tokenizer.vocab_size
print('Loaded Tokenizer from serialized instance!')    
print(f'There are {n_f_terms} different F-Terms in the whole Dataset!')


###########################
# Loading Descriptions
###########################
with open(f'{dump_dir}/agg_themes_descriptions.pk', 'rb') as f:
    theme_dict = pk.load(f)
with open(f'{dump_dir}/agg_viewpoints_descriptions.pk', 'rb') as f:
    viewpoint_dict = pk.load(f)
with open(f'{dump_dir}/agg_numbers_descriptions.pk', 'rb') as f:
    number_dict = pk.load(f)
with open(f'{dump_dir}/agg_full_descriptions.pk', 'rb') as f:
    full_descriptions_dict = pk.load(f)


###########################
# Extracting the Embeddings
###########################

# Extracting the classification Head weights
inp_emb = model.get_input_embeddings()

if not seq_class:
    #Embeddings if the model is not a sequence classification model
    out_emb = model.get_output_embeddings()
    out_emb = next(out_emb.parameters()).to('cpu').detach().numpy()[2:]
    inp_emb = inp_emb(torch.arange(len(tokenizer))).to('cpu').detach().numpy()[50002:]

    if context_less:
        # Extracting context less embeddings
        if not os.path.isfile(f'{model_folder}/context_less_emb.pk'):
            print('Calculating context less embeddings!')
            context_less_emb = [[] for _ in range(13)]
            for i in range(len(tokenizer)):
                print(i, end='\r')
                out = model(input_ids= torch.tensor([[i]]), attention_mask = torch.tensor([[1]]), output_hidden_states=True)
                
                out = out.hidden_states
                for i, k in enumerate(out):
                    print(i)
                    context_less_emb[i].append(k.to('cpu').detach().numpy())
            with open(f'{model_folder}/context_less_emb.pk', 'wb') as f:
                pk.dump(context_less_emb, f)
        else:
            print('Loading context less embeddings from disk')
            with open(f'{model_folder}/context_less_emb.pk', 'rb') as f:
                context_less_emb = pk.load(f)
        
        # Combining context less embeddings of a layer to a single tensor
        for i, layer in enumerate(context_less_emb):
            layer = [e[0] for e in layer]
            layer = np.concatenate(layer, 0)
            context_less_emb[i] = layer

else: 
    # embeddings if the model is a Sequence Classifier
    inp_emb = inp_emb(torch.arange(50000)).to('cpu').detach().numpy()
    out_emb = model.score.weight
    out_emb.to('cpu').detach().numpy()
    

## Normalizing the embeddings 
def normalize(tensor):
    if norm:
        return torch.nn.functional.normalize(torch.tensor(tensor), p=2).numpy()
    else:
        return tensor

out_emb = normalize(out_emb)
inp_emb = normalize(inp_emb)
if not seq_class:
    context_less_emb = [normalize(layer) for layer in context_less_emb]

# Extracting the matching F_terms for the weights and creating lists with the defintions
tokens = [tokenizer.decode(i) for i in range(len(tokenizer))]
f_term_tokens = tokens[50002:]

# Creating  a dict with f-Terms and their embedding vectors:

out_emb_dict = {token[:-1]: vec for token, vec in zip(f_term_tokens, out_emb)}
if seq_class:
    inp_emb_dict = {token[:-1]: vec for token, vec in zip(tokens[:50000], inp_emb)}
else: 
    inp_emb_dict = {token[:-1]: vec for token, vec in zip(f_term_tokens, inp_emb)}
    
if context_less:
    # Creating Context Less Embedding Dicts
    if not seq_class:
        context_less_dicts = []
        for layer in context_less_emb:
            context_less_dicts.append({token[:-1]: vec for token, vec in zip(tokens, layer)})


####################################################
# Detecting F-Term Pairs with Supposed Similarities
####################################################
df=pd.read_csv("data/f-terms.csv", index_col=0)

#subset with "material" in the viewpoint description
df["vp"]=df.theme+"/"+df.viewpoint
df["fterm"]=df.theme+"/"+df.number
df2=df.copy()

#f-term descriptions are searched for the following materials
materials_list=[". Metal", ". Wood", ". Polymer", ". Cutting", ". Bleaching", ". Adhes"]
materials_f_terms={}
for material in materials_list:
    materials_f_terms[material]=[df2[df2.label.str.startswith(material, na=False)].theme.unique(), df2[df2.label.str.startswith(material, na=False)].fterm.values]

#pairs of two materials with fterms in same viewpoints are created
material_combinations={}
for mat1 in materials_list:
    for mat2 in materials_list:
        
        if mat1==mat2: continue
        if mat2[2:]+"_"+mat1[2:] in materials_f_terms: continue
        
        #filtering shared f-terms with both materials in the label description
        
        unique_fterms_mat1 = list(set(materials_f_terms[mat1][1]) - set(materials_f_terms[mat2][1]))
        unique_fterms_mat2 = list(set(materials_f_terms[mat2][1]) - set(materials_f_terms[mat1][1]))


        vp_mat1= set([x[:5]for x in unique_fterms_mat1])
        vp_mat2= set([x[:5]for x in unique_fterms_mat2])
        shared_vp=[x for x in vp_mat1 if x in vp_mat2]
        fterm_pairs=[]
        for vp in shared_vp:
            fterm_pairs.append([vp,
                                [fterm for fterm in unique_fterms_mat1 if fterm.startswith(vp)],
                                [fterm for fterm in unique_fterms_mat2 if fterm.startswith(vp)]])
            
            
        material_combinations[mat1[2:]+"_"+mat2[2:]]=fterm_pairs

material_combinations.keys()

Loaded Tokenizer from serialized instance!
There are 193383 different F-Terms in the whole Dataset!


  return torch.nn.functional.normalize(torch.tensor(tensor), p=2).numpy()


dict_keys(['Metal_Wood', 'Metal_Polymer', 'Metal_Cutting', 'Metal_Bleaching', 'Metal_Adhes', 'Wood_Metal', 'Wood_Polymer', 'Wood_Cutting', 'Wood_Bleaching', 'Wood_Adhes', 'Polymer_Metal', 'Polymer_Wood', 'Polymer_Cutting', 'Polymer_Bleaching', 'Polymer_Adhes', 'Cutting_Metal', 'Cutting_Wood', 'Cutting_Polymer', 'Cutting_Bleaching', 'Cutting_Adhes', 'Bleaching_Metal', 'Bleaching_Wood', 'Bleaching_Polymer', 'Bleaching_Cutting', 'Bleaching_Adhes', 'Adhes_Metal', 'Adhes_Wood', 'Adhes_Polymer', 'Adhes_Cutting', 'Adhes_Bleaching'])

# Searching for "Triangle" Combinations 

In [7]:
####################################################################################
# Calculating the needed combinations
theme_f_term_dict = {}
for f_term in f_term_tokens:
    theme = f_term.split('/')[0]
    vp = f_term[:8]
    # Creating a dict entry for the theme
    try: 
        _ = theme_f_term_dict[theme]
    except KeyError:
        theme_f_term_dict[theme] = {}

    # Creating a dict entry for the viewpoint

    try:
        # The first dict call will def. work the second may work if the vp-dict entry 
        # was already made. If it works the theme is appended to the viewpoint dict
        theme_f_term_dict[theme][vp].append(f_term)
    except KeyError:
        theme_f_term_dict[theme][vp] = []

# All possible triangular "Material" Combinations
materials_list=[". Metal", ". Wood", ". Polymer", ". Cutting", ". Bleaching", ". Adhes"]
tri_combs = [[f1, f2, f3] for i1, f1 in enumerate(materials_list[:], start=1) for i2, f2 in enumerate(materials_list[i1:], start=1) for f3 in materials_list[i2+i1:]]

#####################################################################################
# Searching for the combinations in the theme_f_term_dict
all_combs = {}
for i, (theme, theme_vp_dict) in enumerate(theme_f_term_dict.items()):
    print(i, 'Generating combinations for Theme: ', theme, end='\r')
    
    all_f_terms = []
    [all_f_terms.extend(f_terms) for f_terms in theme_vp_dict.values()]
    # generating the idices of the f-terms in all_f_terms which start with a "material" from the materials_list
    single_hits = {material: [f_term for f_term in all_f_terms if number_dict[f_term[:-1]].lower().startswith(material.lower())] for material in materials_list}

    # Extracting the material hits for each combination
    combinations = {comb[0]+comb[1]+comb[2]: [single_hits[comb[0]], single_hits[comb[1]], single_hits[comb[2]]] for comb in tri_combs}
    # Generating the F-Term combinations
    combinations = {key: [(f1, f2, f3) for f1 in f_terms[0] for f2 in f_terms[1] for f3 in f_terms[2]] for key, f_terms in combinations.items()} 
    # Dropping empty combinations
    combinations = {key: [ft_comb for ft_comb in value if len(set([f_term[:8] for f_term in ft_comb])) == 3] for key, value in combinations.items()}
    # Ignoring Themes with no combinations
    combinations = {key: value for key, value in combinations.items() if len(value) != 0}
    #  Dropping Combinations within the same viewpoint
    if len(combinations) == 0:
        continue

    all_combs = all_combs | combinations

# Dropping combinations with just one sample 
all_combs = {key: value for key, value in all_combs.items() if len(value) > 1}
all_combs

2253 Generating combinations for Theme:  2C518

{'. Metal. Wood. Adhes': [('4E093/QA04,', '4E093/JA01,', '4E093/GC01,'),
  ('4E093/QA04,', '4E093/HA01,', '4E093/GC01,')],
 '. Metal. Polymer. Adhes': [('4J001/DC09,', '4J001/EB72,', '4J001/JB45,'),
  ('4J001/DC09,', '4J001/EB72,', '4J001/JA18,'),
  ('4J001/DC09,', '4J001/EB71,', '4J001/JB45,'),
  ('4J001/DC09,', '4J001/EB71,', '4J001/JA18,'),
  ('4J001/DC09,', '4J001/EC81,', '4J001/JB45,'),
  ('4J001/DC09,', '4J001/EC81,', '4J001/JA18,'),
  ('4J001/DC09,', '4J001/GB07,', '4J001/JB45,'),
  ('4J001/DC09,', '4J001/GB07,', '4J001/JA18,'),
  ('4J001/DC09,', '4J001/HA03,', '4J001/JB45,'),
  ('4J001/DC09,', '4J001/HA03,', '4J001/JA18,'),
  ('4J001/DC09,', '4J001/EA41,', '4J001/JB45,'),
  ('4J001/DC09,', '4J001/EA41,', '4J001/JA18,'),
  ('4J001/DC09,', '4J001/ED61,', '4J001/JB45,'),
  ('4J001/DC09,', '4J001/ED61,', '4J001/JA18,')],
 '. Metal. Wood. Cutting': [('2B150/DH01,', '2B150/CA31,', '2B150/BE02,'),
  ('2B150/ED09,', '2B150/CA31,', '2B150/BE02,')],
 '. Metal. Cutting. Bleaching': [('4L0

In [None]:
a = ['4E093', '2E093', '4E093']
set(a)
len(set(a))

In [8]:
# Generating the differences
emb_dict = out_emb_dict

for i, (theme, theme_vp_dict) in enumerate(theme_f_term_dict.items()):
    print(i, 'Generating combinations for Theme: ', theme, end='\r')
    combinations = []
    for vp, f_terms in theme_vp_dict.items():
        keys = [key for key in theme_vp_dict.keys() if key != vp]
        # This one liner basically concatenates all f_term lists except the f_term list of the current viewpoint
        out_vp_f_terms = [f_term for f_term_list in [theme_vp_dict[key] for key in keys] for f_term in f_term_list]    
        
    

2253 Generating combinations for Theme:  2C518

In [None]:
for theme, combinations in theme_f_term_dict.items():
    try:
        print(theme_dict[theme], len(combinations))
    except KeyError:
        print('KeyError', theme_dict.keys())
    
        continue
    

# Plotting the Material Combinations

In [None]:
# Generating vectors for all material combinations
emb_dict = out_emb_dict

combination_desc = []
vector_diffs = []
themes = []
theme_desc = []
numbers = []
desc = []
color_ints = []

for i, l  in enumerate(material_combinations.items()):
    key, item = l
    for theme_comb in item:
        theme, mat_1, mat_2 = theme_comb
        for f_term_1 in mat_1:
            for f_term_2 in mat_2:
                try: 
                    vec_1 = emb_dict[f_term_1]
                    vec_2 = emb_dict[f_term_2]
                    theme_desc.append(theme_dict[theme])
                    desc.append(number_dict[f_term_2] + ' - ' + number_dict[f_term_1])
                except KeyError:
                    continue
                    
                diff = vec_2 - vec_1
                combination_desc.append(key)
                vector_diffs.append(diff)
                themes.append(theme)
                numbers.append(f_term_2 + ' - ' + f_term_1)
                color_ints.append(i)
        
len(combination_desc), len(theme_desc), len(desc)

In [None]:
vector_diffs = np.stack(vector_diffs, 0)

tsne = TSNE(n_components=2, verbose=0, random_state=69) 
tsne_rep = tsne.fit_transform(vector_diffs)

In [None]:
bokeh_palette = bokeh.palettes.viridis(30)
color_palette = bokeh_palette

colors = [color_palette[c%30] for c in color_ints]

datasource_diff = ColumnDataSource(
        data=dict(
            x = tsne_rep[:,0],
            y = tsne_rep[:,1],
            combination =  combination_desc,
            themes=themes,
            theme_desc = theme_desc,
            numbers = numbers,
            desc = desc,
            colors = colors
        )
    )

hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>Combination: </b>  @combination<br><b>Theme:</b> @themes<br><b>Theme Description:</b> @theme_desc<br><b>Numbers:</b> @numbers<br><b>Description:</b> @desc</div>', mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Material Combintation Differences')
    
plot_tsne.circle('x', 'y', size=8, fill_color='colors', 
                     alpha=0.7, line_width=0, source=datasource_diff, name="Material Combination Differences")

show(plot_tsne)

# Generating in Theme Differences

In [None]:
emb = out_emb_dict
#layer_n = 13
#emb = context_less_dicts[layer_n]
# Dictionary that contains all differences and additional information for each material combination
differences = {}
# Huge dict, that contains the difference vectors as keys and 
diffs_dict = {}

for i, combination in enumerate(material_combinations.items()):
    key, item = combination
    print(i, key, end='\r')
    # Checking if the reverse combination was already calculated
    current_material_keys = []
    for current_key in differences.keys():
      
        current_material_keys.append(current_key.split('_'))
    combination_keys = key.split('_')
    combination_keys.reverse()
    if combination_keys in current_material_keys:
        continue

    # Calculating the differences and storing them in a list of dict
    combinations_list = []
    for theme_comb in item:
        theme, mat_1, mat_2 = theme_comb
        for f_term_1 in mat_1:
            for f_term_2 in mat_2:
                try: 
                    vec_1 = emb[f_term_1]
                    vec_2 = emb[f_term_2]
                    diff = vec_2 - vec_1
                    diff = normalize(np.array([diff]))[0]                   
                    sample_dict = {}
                    sample_dict['Vector'] = diff
                    sample_dict['Theme'] = theme
                    sample_dict['F-Term 1'] = f_term_1
                    sample_dict['F-Term 2'] = f_term_2
                    diffs_dict[sys.intern(str(np.sum(diff)))] = [theme, f_term_1, f_term_2]
                    combinations_list.append(sample_dict)
                except KeyError:
                    continue
    # Now the differences are computed and stored in a list of dicts
            
    differences[key] = combinations_list      

In [None]:
# Pairing the vectors to compare

max_comb = 100000000
sqr_max_comb = int(max_comb**0.5)
in_combination_comparisons = {}

for i, [keys, samples] in enumerate(differences.items()):
    # Adding the comparison of the material combination with itself to the comparisons
    in_combination_comparisons[keys] = {'Vectors A': [], 
                                        'Vectors B': [],
                                        'Themes': [],
                                        'F-Terms A': [],
                                        'F-Terms B': []}
    print(i, keys, len(samples), end = '\r')
    samples_2 = list(samples)
    random.shuffle(samples_2)
    s1 = 0 
    while len(samples_2) > 0:
        s1 += 1
        s2 = 0
        sample_a = samples_2.pop(0)
        random.shuffle(samples_2)
        for sample_b in samples_2:

            if s2 == sqr_max_comb:
                break
            # Ignoring combinations with matching Viewpoints
            if sample_a['Theme'] == sample_b['Theme']:
                continue
            # Debugging remove later 
            ##################################
            themea = sample_a['Theme']
            themeb = sample_b['Theme']
            theme_a = themea.split('/')[0]
            theme_b = themeb.split('/')[0]
            if theme_a == theme_b:
                continue
            ###################################
            in_combination_comparisons[keys]['Vectors A'].append(torch.tensor(np.array([sample_a['Vector']])))
            in_combination_comparisons[keys]['Vectors B'].append(torch.tensor(np.array([sample_b['Vector']])))
            in_combination_comparisons[keys]['Themes'].append([sample_a['Theme'], sample_a['Theme']])
            in_combination_comparisons[keys]['F-Terms A'].append([sample_a['F-Term 1'], sample_a['F-Term 2']])
            in_combination_comparisons[keys]['F-Terms B'].append([sample_b['F-Term 1'], sample_b['F-Term 2']])
            s2 += 1
        if s1 == sqr_max_comb:
            break
 

In [None]:
# Calculating the Cosine Similarities within all combinations
cos = torch.nn.CosineSimilarity(dim =1)
bad_keys = []
min_comb = 10

for i, (key, combination_dict) in enumerate(in_combination_comparisons.items()):
    print(' '*1000, end='\r')
    print(i, key, end = '\r')
    try:
        similarities = cos(torch.cat(combination_dict['Vectors A'], 0), torch.cat(combination_dict['Vectors B'], 0))
        # Dropping combinations with low sample counts
        if len(similarities) < min_comb:
            print('Low number of samples in:', key, len(similarities), end='\r')
            bad_keys.append(key)
            continue
        combination_dict['Cosine Similarities'] = np.array(similarities)
        # Creating a sorted index of the similariteis to compare the best ones with other combinations
        combination_dict['Sort idx'] = np.argsort(np.array(similarities))[::-1]
        
        
    except RuntimeError:
        print(' '*1000, end='\r')
        print('Empty combination dictionary found, dropping  it!', key, end='\r')
        bad_keys.append(key)
        continue
    

for key in bad_keys:
    in_combination_comparisons.pop(key)

In [None]:
# Extracting the best vectors 
for i, (key, combination_dict) in enumerate(in_combination_comparisons.items()):
    top_100 = np.array(combination_dict['Cosine Similarities'])
    idx = combination_dict['Sort idx'][:100]
    top_100 = top_100[idx]
    # Top 100 vectors a
    top_vectors = np.concatenate(combination_dict['Vectors A'], 0)[idx]
    top_vectors = np.concatenate([np.concatenate(combination_dict['Vectors B'], 0)[idx], top_vectors])
    # dropping duplicates
    top_vectors = np.unique(top_vectors, axis = 0)    
    combination_dict['Best Vectors']  = top_vectors
    print(' '*1000, end='\r')
    print(i, key, len(top_vectors), end='\r')  


In [None]:
# Computing the cosine similarities for out of combination comparisons

for i, (key, combination_dict) in enumerate(in_combination_comparisons.items()):
    print(' '*1000, end='\r')
    print(i, key, end=' \r')
    
    # Extracting all unique vectors from other material combinations, which do not share a material with the material 
    # Combination to be checked
    all_vectors = []
    material_1, material_2 = key.split('_')
    for key2, samples in differences.items():
        
        materials2 = key2.split('_')
        # Ignoring material combinations with matching materials 
        if material_1 in materials2 or material_2 in materials2:
            continue
        # Ignoring material combinations which do not appear in the in_combination_comparisons keys
        if not key2 in [k for k in in_combination_comparisons.keys()]:
            continue
        
        for sample_dict in samples:
            all_vectors.append(sample_dict['Vector'])

    if len(all_vectors) == 0: continue
    
    all_vectors = np.stack(all_vectors, 0)
    all_vectors = np.unique(all_vectors, axis=0)
    
    # vectors_a = vectors from top_vectors, vectors_b = vectors from other material combinations to compare the top_vectors with.
    vectors_a , vectors_b = [], []
    top_vectors = combination_dict['Best Vectors'][:50]

    for top_vector in top_vectors:
        vp, _ ,_ = diffs_dict[str(np.sum(top_vector))]
        for vector in all_vectors:
            vp2, _, _ = diffs_dict[str(np.sum(vector))]
            # Skipping vectors with the same viewpoints
            if vp2 == vp:
                continue
            ##### Debugging remove later
            theme_a = vp.split('/')[0]
            theme_b = vp2.split('/')[0]
            if theme_a == theme_b:
                continue
            #############################
            vectors_a.append(top_vector)
            vectors_b.append(vector)
    
    vectors_a = np.stack(vectors_a, 0)
    vectors_b = np.stack(vectors_b, 0)
   
    similarities = cos(torch.tensor(vectors_a), torch.tensor(vectors_b)).numpy()
    combination_dict['Out of Comb Simis'] = similarities
    # Just to generate the Matrices of the top combinations
    combination_dict['Out of Comb Vec A'] = vectors_a
    combination_dict['Out of Comb Vec B'] = vectors_b


In [None]:
layer_n = f'In Theme Combinations 10 min comb'
# Plotting all out of comb similarities vs all in comb similarities (combined)
all_in_comb = []
all_out_comb = []
for comb_dict in in_combination_comparisons.values():
    all_in_comb.extend(comb_dict['Cosine Similarities'])
    all_out_comb.extend(comb_dict['Out of Comb Simis'])

all_in_comb = np.array(all_in_comb)
all_out_comb = np.array(all_out_comb)
mean_in = np.mean(all_in_comb)
mean_out = np.mean(all_out_comb)

fig, axs = plt.subplots(nrows=1, ncols=1, figsize=[10, 10])


axs.hist(all_in_comb, bins=500, alpha=0.75)
axs.hist(all_out_comb, bins=500, alpha=0.75)
axs.axvline(mean_in , color='red', linestyle='dashed', linewidth=1)
axs.axvline(mean_out, color='magenta', linestyle='dashed', linewidth=1)
axs.text(mean_in + 0.01, axs.get_ylim()[1] * 0.9, f'Mean: {mean_in:.6f}', color='grey')
axs.text(mean_in + 0.01, axs.get_ylim()[1] * 0.925, f'Out of Comb Mean: {mean_out:.6f}', color='grey')
axs.set_title(f'Layer {layer_n} Embedding 1 one 1 Similarities Best Vector Elements {model_name} {checkpoint}')
plt.show()
fig.savefig(f'{model_folder}/{layer_n}  {model_name} {checkpoint}')

In [None]:
# Plotting the out of comb similarities

fig, axs = plt.subplots(nrows = len(in_combination_comparisons), ncols=1, figsize=[10, 200])
for i, (key, combination_dict) in enumerate(in_combination_comparisons.items()):
    print(i, end='\r')
    simis = np.array(combination_dict['Cosine Similarities'])
    out_simis = np.array(combination_dict['Out of Comb Simis'])
    # Pruning the out of comb similarities to the same length as the similarities
    out_simis = out_simis
    
    mean_top_100 = np.mean(np.sort(simis)[-100:])
    out_mean_top_100 = np.mean(np.sort(out_simis)[-100:])
    
    axs[i].hist(simis, bins=50, alpha=.75)
    axs[i].hist(out_simis[:len(simis)], bins=50, alpha=.75)
    axs[i].set_title(key)
    mean =  np.mean(simis)
    mean_out = np.mean(out_simis)
  
    axs[i].axvline(mean, color='red', linestyle='dashed', linewidth=1)
    axs[i].axvline(mean_out, color='magenta', linestyle='dashed', linewidth=1)
    axs[i].text(mean + 0.01, axs[i].get_ylim()[1] * 0.9, f'Mean: {mean:.6f}', color='grey')
    axs[i].text(mean + 0.01, axs[i].get_ylim()[1] * 0.925, f'Out of Comb Mean: {mean_out:.6f}', 
                color='green' if mean > mean_out else 'red')
    axs[i].text(mean + 0.01, axs[i].get_ylim()[1] * 0.875, f'Number of Comparisons: {len(simis)}', color='grey')
    axs[i].text(mean + 0.01, axs[i].get_ylim()[1] * 0.85, f'Mean of Top 100: {mean_top_100:.6f}', color='grey')
    axs[i].text(mean + 0.01, axs[i].get_ylim()[1] * 0.825, f'Mean of Top 100 Out of Comb: {out_mean_top_100:.6f}', 
                color='green' if mean_top_100 > out_mean_top_100 else 'red')
    

plt.show()


# Searching in all In Theme Combinations, Disregarding In Viewpoint Combinations

In [10]:
# Calculating the needed combinations
theme_f_term_dict = {}
for f_term in f_term_tokens:
    theme = f_term.split('/')[0]
    vp = f_term[:8]
    # Creating a dict entry for the theme
    try: 
        _ = theme_f_term_dict[theme]
    except KeyError:
        theme_f_term_dict[theme] = {}

    # Creating a dict entry for the viewpoint

    try:
        # The first dict call will def. work the second may work if the vp-dict entry 
        # was already made. If it works the theme is appended to the viewpoint dict
        theme_f_term_dict[theme][vp].append(f_term)
    except KeyError:
        theme_f_term_dict[theme][vp] = []
        

n_combs = 0
for vps in theme_f_term_dict.values():
    n_f_terms = np.sum([len(f_terms) for f_terms in vps.values()])
    for vp, f_terms in vps.items():
        #print(len(f_terms), n_f_terms)
        vp_comb = (n_f_terms - len(f_terms)) * len(f_terms)
        n_combs += vp_comb
        #print(vp_comb)
print(f'There are {n_combs:,} in theme combinations')

There are 21,685,138 in theme combinations


In [None]:
# Generating the differences
all_diffs = []
all_desc = []
emb_dict = out_emb_dict

for i, (theme, theme_dict) in enumerate(theme_f_term_dict.items()):
    print(i, 'Generating Differences for Theme: ', theme, end='\r')
    for vp, f_terms in theme_dict.items():
        keys = [key for key in theme_dict.keys() if key != vp]
        # This one liner basically concatenates all f_term lists except the f_term list of the current viewpoint
        out_vp_f_terms = [f_term for f_term_list in [theme_dict[key] for key in keys] for f_term in f_term_list]    
        # generating the differences
        for f_term1 in f_terms:
            for f_term2 in out_vp_f_terms:
                diff = emb_dict[f_term2[:-1]] - emb_dict[f_term1[:-1]]
                diff = normalize(np.array([diff]))[0]
                all_diffs.append(diff)
                all_desc.append([f_term1, f_term2])

all_diffs = np.array(all_diffs)

print(all_diffs.shape)

912 Generating Differences for Theme:  5F849

In [None]:
def drop_theme(f_term_pairs, theme):
    '''
    Drops all f_term pairs that contain a certain theme.

    returns the cleaned f_term_pairs and a boolean index
    '''

    idx = [f_term_pair[0][:5] != theme for f_term_pair in f_term_pairs]
    f_term_pairs = [f_term_pair for f_term_pair, b in zip(f_term_pairs, idx) if b]
    return f_term_pairs, idx


def find_pairs(f_term_pairs, key_a, key_b, exact=False):
    '''
    Returns the indices of the f_term_pairs which contain key_a in the first f-term description and key_b in the description of the second f_term
    '''
    f_term_desc = [[number_dict[pair[0][:-1]], number_dict[pair[1][:-1]]] for pair in f_term_pairs]
    if exact:
        matches = [bool(pair[0].startswith(key_a) * pair[1].startswith(key_b)) for pair in f_term_desc]
    else:
        matches = [bool((key_a in pair[0]) * (key_b in pair[1])) for pair in f_term_desc]
        # preventing one F-Term of the pair contains both F-Terms
        reverse_matches = [bool((key_b in pair[0]) + (key_a in pair[1])) for pair in f_term_desc]
        matches = [bool(match and not r_match) for match, r_match in zip(matches, reverse_matches)]
    idx = np.arange(len(f_term_pairs))
    idx = idx[matches]
    return idx

In [None]:
combination = [". metal", ". adhes"]

comb_matches_idx = find_pairs(all_desc, *combination)
comb_desc, comb_diffs = [all_desc[i] for i in comb_matches_idx], all_diffs[comb_matches_idx]


In [None]:
total_hits = 0
total_searched = 0
total_priori_s = 0
total_priori_hits = 0
n = 100
cos = torch.nn.CosineSimilarity(dim=1)

for query_vec, desc in zip(comb_diffs, comb_desc):
    query_vec = torch.tensor(query_vec).unsqueeze(0)
    similarities = torch.tensor([cos(torch.tensor(vec), query_vec) for vec in all_diffs[:]])
    sorted = torch.argsort(similarities).numpy()[::-1]

    guesses = []
    # Also calculating the hits in a theme to be able to calculate how many hits there could be in total
    theme_guesses = []
    i = 0
    # Iterating over all top combinations and removing the in theme combinations until at least n valid comparisons are found
    while len(guesses) < n:
        # Extracting the top -n f-term descriptions
        chunk = sorted[i*n:(i+1)*n]
        chunk_desc = [all_desc[i] for i in chunk]
        # Removing all in theme Combinations
        chunk_clean, idx = drop_theme(chunk_desc, desc[0][:5])
        # idx is a boolean index
        
        idx = np.array([not i for i in idx])
        theme_chunk = [i for i, b in zip(chunk_desc, idx) if b] 
        theme_guesses.extend([theme_chunk] if sum(idx)==-1 else theme_chunk)
        guesses.extend(chunk_clean)
                             
    guesses = guesses[:n]
    hits = len(find_pairs(guesses, *combination))
    theme_hits = len(find_pairs(theme_guesses, *combination))
    
    total_hits += hits
    total_searched += n
    total_priori_s += len(all_desc) - len(theme_guesses)
    total_priori_hits += len(comb_matches_idx) - theme_hits

    print(f' Current Hits: {hits} Current Targets: {len(comb_matches_idx) - theme_hits}; Overall: a priori: {total_priori_hits*100/total_priori_s:.5f}%, a posteriori: {total_hits*100/total_searched:.5f}%; Query Description {number_dict[desc[0][:-1]]}     {number_dict[desc[1][:-1]]}', end='\r')
    

In [None]:
a = np.array([True, False, False])
sum(a)

In [None]:
[". Metal", ". Wood", ". Polymer", ". Cutting", ". Bleaching", ". Adhes"]

In [None]:
all_desc[:1000]