In [None]:
# Own Packages
from Masterarbeit_utils.model_utils import get_tokenizer, load_and_modify_model, load_pretrained_Tokenizer

# Site-Packages
import dask.dataframe as dd
import torch
import psutil
import os
import sys
import pickle as pk
import pandas as pd
import numpy as np
import bokeh
import time

# Dimension reduction algorithms
#from cuml.manifold import TSNE
from sklearn.manifold import TSNE
# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import export_png

from transformers import AutoTokenizer, OPTForCausalLM
from tokenizers.processors import TemplateProcessing
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from torch.utils.data import Dataset

%matplotlib inline
output_notebook()

In [None]:
"""
The Paths to important folders have to be changed for your system.
"""

# Identifier of this experiment and the training checkpoint that is analysed
model_name, checkpoint = 'gal_125_1', 140000

# Folder that the Pytorch Dataset Notebook fills with one txt file per sample
dataset_folder = 'data/dataset_samples'

# Model and tokenizer folders of this experiment; both are created when missing
model_folder = 'data/models/' + model_name
tokenizer_folder = 'data/tokenizers/' + model_name
for _folder in (model_folder, tokenizer_folder):
    os.makedirs(_folder, exist_ok=True)

# Folder holding all pickle files. It is fixed for this project and should not be changed
dump_dir = 'PK_DUMP'

# Size of the base model. Available sizes:
#   mini      125 M
#   base      1.3 B
#   standard  6.7 B
#   large     30 B
#   huge      120 B
base_model_name = 'mini'

# Every new torch object is created with this dtype by default
# (when the default dtype is float16, fp16 must be False)
default_dtype = torch.float32
torch.set_default_dtype(default_dtype)
torch.backends.cuda.matmul.allow_tf32 = True

# Device on which the model is loaded by default
default_device = 'cpu'

# Number of GPUs the model is parallelised to; it has to be zero when running on the CPU
num_gpus = 0 if default_device == 'cpu' else 1

tensor_parallel = False


###########################
# Loading the Model
###########################
device_map = None
max_memory = {}
if num_gpus > 0:
    # based on https://github.com/huggingface/accelerate/blob/5315290b55ea9babd95a281a27c51d87b89d7c85/src/accelerate/utils/modeling.py#L274
    gpu_ids = range(num_gpus)
    # A dummy tensor is placed on every GPU before the free memory is queried
    # (presumably to initialise each device first - see the linked accelerate code)
    for gpu_id in gpu_ids:
        _ = torch.tensor([0], device=gpu_id)
    # First entry of mem_get_info is stored as the per-GPU memory budget
    max_memory.update({gpu_id: torch.cuda.mem_get_info(gpu_id)[0] for gpu_id in gpu_ids})
    device_map = "auto"
max_memory["cpu"] = psutil.virtual_memory().available

checkpoint_path = f'{model_folder}/checkpoint-{checkpoint}'
model = OPTForCausalLM.from_pretrained(
    checkpoint_path,
    torch_dtype=default_dtype,
    low_cpu_mem_usage=True,
    device_map=device_map,
    max_memory=max_memory,
)

###########################
# Loading the Tokenizer
###########################
tokenizer = AutoTokenizer.from_pretrained(tokenizer_folder)
# Everything the tokenizer holds beyond its base vocabulary is counted as an F-Term
n_f_terms = len(tokenizer) - tokenizer.vocab_size
print('Loadede Tokenizer from serialized instance!')
print(f'There are {n_f_terms} different F-Terms in the whole Dataset!')


###########################
# Loading Descriptions
###########################
# NOTE: pickle executes arbitrary code while loading - only load files from dump_dir
# that were produced by this project.
description_files = (
    'themes_descriptions.pk',
    'viewpoints_descriptions.pk',
    'numbers_descriptions.pk',
    'full_descriptions.pk',
)
loaded_descriptions = []
for file_name in description_files:
    with open(f'{dump_dir}/{file_name}', 'rb') as handle:
        loaded_descriptions.append(pk.load(handle))
theme_dict, viewpoint_dict, number_dict, full_descriptions_dict = loaded_descriptions


###########################
# Extracting the Embeddings
###########################

# Extracting the classification head weights (first parameter of the output embeddings).
# The tensor is moved to the CPU before the NumPy conversion: Tensor.numpy() raises for
# CUDA tensors, which is what the model holds when it is loaded with num_gpus > 0.
# The first two rows are dropped.
# NOTE(review): presumably these two rows do not belong to F-term tokens - confirm
# against the layout of the modified model head.
out_emb = model.get_output_embeddings()
out_emb = next(out_emb.parameters()).detach().cpu().numpy()[2:]

# Decoding every token id; all tokens from index 50002 on are treated as F-term tokens.
# NOTE(review): 50002 is hard-coded - verify that it matches the tokenizer in use.
tokens = [tokenizer.decode(i) for i in range(len(tokenizer))]
f_term_tokens = tokens[50002:]

# Creating a dict with F-terms and their embedding vectors.
# token[:-1] strips the last character of each decoded token; tokens and weight rows
# are paired by position (zip stops silently at the shorter of the two sequences).
emb_dict = {token[:-1]: vec for token, vec in zip(f_term_tokens, out_emb)}

# Detecting pairs of f-terms with supposed similarity

In [None]:

# F-term table; the first CSV column becomes the index
df = pd.read_csv("data/f-terms.csv", index_col=0)

In [None]:
# Subset of rows whose viewpoint label contains "material"
# (case-insensitive; rows without a label never match)
is_material_viewpoint = df["viewpoint_label"].str.contains("material", case=False, na=False)
df2 = df.loc[is_material_viewpoint].copy()

# Composite identifiers: "<theme>/<viewpoint>" and "<theme>/<number>"
df2["vp"] = df2["theme"] + "/" + df2["viewpoint"]
df2["fterm"] = df2["theme"] + "/" + df2["number"]

In [None]:
# F-term descriptions (column "label") are searched for the following materials
materials_list = ["metal", "wood", "concrete", "resin", "fiber", "wool", "sand", "cord", "copper", "iron", "silver", "gold", "lead", "glass", "stone", "titanium", "steel", "cement", "silicon", "polymer", "ceramics"]

# material -> [unique viewpoints ("vp") of the matching rows, F-terms of the matching rows]
materials_f_terms = {}
for material in materials_list:
    # The row filter is evaluated once and reused for both lookups instead of
    # running the same string search twice per material.
    # NOTE(review): this is a case-insensitive substring match, so e.g. "lead" also
    # hits labels containing "leading" - confirm that this is intended.
    matching_rows = df2[df2.label.str.contains(material, case=False, na=False)]
    materials_f_terms[material] = [matching_rows.vp.unique(), matching_rows.fterm.values]

In [None]:
# Pairs of two materials with F-terms in the same viewpoints are created.
# Result: "<mat1>_<mat2>" -> list of [viewpoint, F-terms of mat1, F-terms of mat2]
material_combinations = {}
for mat1 in materials_list:
    fterms_mat1 = set(materials_f_terms[mat1][1])

    for mat2 in materials_list:
        if mat1 == mat2:
            continue
        # Skip a pair whose mirrored version has already been built. The lookup has to
        # go against material_combinations (keys "<a>_<b>"): materials_f_terms is keyed
        # by single material names, so checking it never skipped anything and every
        # pair ended up in the result twice.
        if mat2 + "_" + mat1 in material_combinations:
            continue

        fterms_mat2 = set(materials_f_terms[mat2][1])

        # F-terms whose label mentions both materials are removed from both sides.
        # The results are sorted because the iteration order of a set of strings
        # is not stable between kernel restarts.
        unique_fterms_mat1 = sorted(fterms_mat1 - fterms_mat2)
        unique_fterms_mat2 = sorted(fterms_mat2 - fterms_mat1)

        # The first 8 characters of an F-term are used as its viewpoint key
        vp_mat1 = {x[:8] for x in unique_fterms_mat1}
        vp_mat2 = {x[:8] for x in unique_fterms_mat2}
        shared_vp = sorted(vp_mat1 & vp_mat2)

        # One entry per shared viewpoint with the F-terms of each material in it
        fterm_pairs = [
            [vp,
             [fterm for fterm in unique_fterms_mat1 if fterm.startswith(vp)],
             [fterm for fterm in unique_fterms_mat2 if fterm.startswith(vp)]]
            for vp in shared_vp
        ]

        material_combinations[mat1 + "_" + mat2] = fterm_pairs
        

In [None]:
# Generating difference vectors for all material combinations.
# The lists below are parallel: position k in each of them describes the same F-term pair.
combination_desc = []
vector_diffs = []
vps = []
vp_desc = []
numbers = []
desc = []
color_ints = []

for i, (key, item) in enumerate(material_combinations.items()):
    for viewpoint, mat_1, mat_2 in item:
        for f_term_1 in mat_1:
            for f_term_2 in mat_2:
                # Resolve every lookup before anything is appended. Appending inside
                # the try block left vp_desc one element longer than the other lists
                # whenever a later lookup raised KeyError, shifting all following
                # hover descriptions in the plot.
                try:
                    vec_1 = emb_dict[f_term_1]
                    vec_2 = emb_dict[f_term_2]
                    viewpoint_description = viewpoint_dict[viewpoint]
                    pair_description = number_dict[f_term_2] + ' - ' + number_dict[f_term_1]
                except KeyError:
                    # Pairs with a missing embedding or description are skipped entirely
                    continue

                combination_desc.append(key)
                vector_diffs.append(vec_2 - vec_1)
                vps.append(viewpoint)
                vp_desc.append(viewpoint_description)
                numbers.append(f_term_2 + ' - ' + f_term_1)
                desc.append(pair_description)
                # Index of the combination, later mapped to a colour
                color_ints.append(i)

# Sanity check: the three lengths have to be identical
len(combination_desc), len(vp_desc), len(desc)

In [None]:
# Combine the collected difference vectors into a single array (one row per pair)
vector_diffs = np.stack(vector_diffs, axis=0)

# Two-dimensional t-SNE embedding of the differences; the seed is fixed
tsne = TSNE(random_state=69, verbose=0, n_components=2)
tsne_rep = tsne.fit_transform(vector_diffs)

In [None]:
# One Turbo256 colour per material combination; the index wraps around after 256 entries
bokeh_palette = bokeh.palettes.Turbo256
color_palette = bokeh_palette
colors = [color_palette[c % 256] for c in color_ints]

# Columns available to the glyph renderer and to the hover tooltip
plot_columns = {
    'x': tsne_rep[:, 0],
    'y': tsne_rep[:, 1],
    'combination': combination_desc,
    'viewpoints': vps,
    'vp_desc': vp_desc,
    'numbers': numbers,
    'desc': desc,
    'colors': colors,
}
datasource_diff = ColumnDataSource(data=plot_columns)

# Tooltip shown while hovering over a point
tooltip_html = '<div style="font-size: 12px;"><b>Combination: </b>  @combination<br><b>Viewpoint:</b> @viewpoints<br><b>Viewpoint Description:</b> @vp_desc<br><b>Numbers:</b> @numbers<br><b>Description:</b> @desc</div>'
hover_tsne = HoverTool(tooltips=tooltip_html, mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']

plot_tsne = figure(
    width=1500,
    height=1500,
    tools=tools_tsne,
    title='Material Combintation Differences',
)
plot_tsne.circle(
    'x',
    'y',
    size=8,
    fill_color='colors',
    alpha=0.7,
    line_width=0,
    source=datasource_diff,
    name="Material Combination Differences",
)

show(plot_tsne)

# Comparing the Cosine Similarities of all Vector Combinations

In [None]:

# Generating the 1 on 1 differences for a single material combination
combination = material_combinations['metal_glass']
descriptions = []
diffs = []

for viewpoint, fterms1, fterms2 in combination:
    for fterm1 in fterms1:
        for fterm2 in fterms2:
            try:
                vec1 = emb_dict[fterm1]
                vec2 = emb_dict[fterm2]
            except KeyError:
                # F-terms without an embedding are skipped
                continue

            diffs.append(vec2 - vec1)
            descriptions.append([viewpoint, fterm1, fterm2])

descriptions = np.array(descriptions)
# The list of vectors is merged into one ndarray first: building a tensor directly
# from a list of ndarrays is very slow and makes PyTorch emit a warning.
diffs = torch.tensor(np.array(diffs))

# Generating the cosine similarities of every difference vector with every other one.
# NOTE: both operands are expanded to N*N rows, so memory grows quadratically with
# the number of difference vectors N.

cos = torch.nn.CosineSimilarity(dim=1)

diffs = torch.nn.functional.normalize(diffs, p=2, dim=1)
diffs1 = diffs.unsqueeze(0)
diffs2 = diffs.unsqueeze(1)
desc1 = np.expand_dims(descriptions, 0)
desc2 = np.expand_dims(descriptions, 1)
# After flattening, row i*N + j holds vector j in diffs1/desc1 and vector i in diffs2/desc2
diffs1 = diffs1.expand(diffs.shape[0], diffs.shape[0], diffs.shape[1]).flatten(end_dim=-2)
diffs2 = diffs2.expand(diffs.shape[0], diffs.shape[0], diffs.shape[1]).flatten(end_dim=-2)
desc1 = np.broadcast_to(desc1, [descriptions.shape[0], descriptions.shape[0], descriptions.shape[1]]).reshape(-1, desc1.shape[-1])
desc2 = np.broadcast_to(desc2, [descriptions.shape[0], descriptions.shape[0], descriptions.shape[1]]).reshape(-1, desc2.shape[-1])

# desc[k] has one row per description field (viewpoint, fterm1, fterm2) and one column
# for each of the two compared difference vectors
desc = np.stack([desc1, desc2], -1)

co_simis = cos(diffs1, diffs2)

diffs2.shape, diffs1.shape, desc.shape, co_simis.shape

In [None]:
# Indices of all comparisons, ordered by ascending cosine similarity
sort_out = torch.argsort(co_simis)

# Removing 'wrong' similarities: comparisons whose two difference vectors
# come from the same viewpoint are dropped
clean_sort_simis, clean_desc = [], []
for position in sort_out.tolist():
    pair_info = desc[position]
    # First row of the description holds the viewpoints of both difference vectors
    first_vp, second_vp = pair_info[0]
    if first_vp != second_vp:
        clean_sort_simis.append(co_simis[position])
        clean_desc.append(pair_info)



In [None]:
# Statistics over all material combinations. Only combinations with at least one
# cross-viewpoint comparison contribute an entry to the per-combination lists.
max_simis = []
min_simis = []
max_50_simis = []
min_50_simis = []
max_50_desc = []
min_50_desc = []
mean_simis = []
max_simis_desc = []
min_simis_desc = []
n_comb = []

# The similarity module keeps no state, so one instance is shared by all combinations
cos = torch.nn.CosineSimilarity(dim=1)

for i, (name, combination) in enumerate(material_combinations.items()):
    descriptions = []
    diffs = []

    for viewpoint, fterms1, fterms2 in combination:
        for fterm1 in fterms1:
            for fterm2 in fterms2:
                try:
                    vec1 = emb_dict[fterm1]
                    vec2 = emb_dict[fterm2]
                except KeyError:
                    # F-terms without an embedding are skipped
                    continue

                diffs.append(vec2 - vec1)
                descriptions.append([viewpoint, fterm1, fterm2, name])

    # Combinations without any difference vector are skipped explicitly instead of
    # relying on an IndexError raised while normalising an empty tensor.
    if not diffs:
        continue

    descriptions = np.array(descriptions)
    # Merge the vectors into one ndarray first: building a tensor directly from a
    # list of ndarrays is very slow and makes PyTorch emit a warning.
    diffs = torch.tensor(np.array(diffs))

    # Generating the cosine similarities of every difference vector with every other one
    diffs = torch.nn.functional.normalize(diffs, p=2, dim=1)

    diffs1 = diffs.unsqueeze(0)
    diffs2 = diffs.unsqueeze(1)
    desc1 = np.expand_dims(descriptions, 0)
    desc2 = np.expand_dims(descriptions, 1)
    # After flattening, row i*N + j holds vector j in diffs1/desc1 and vector i in diffs2/desc2
    diffs1 = diffs1.expand(diffs.shape[0], diffs.shape[0], diffs.shape[1]).flatten(end_dim=-2)
    diffs2 = diffs2.expand(diffs.shape[0], diffs.shape[0], diffs.shape[1]).flatten(end_dim=-2)
    desc1 = np.broadcast_to(desc1, [descriptions.shape[0], descriptions.shape[0], descriptions.shape[1]]).reshape(-1, desc1.shape[-1])
    desc2 = np.broadcast_to(desc2, [descriptions.shape[0], descriptions.shape[0], descriptions.shape[1]]).reshape(-1, desc2.shape[-1])

    # desc[k]: rows = (viewpoint, fterm1, fterm2, combination name),
    # columns = the two compared difference vectors
    desc = np.stack([desc1, desc2], -1)

    co_simis = cos(diffs1, diffs2)

    # sorting the similarities (ascending)
    sort_out = torch.argsort(co_simis)

    # Removing 'wrong' similarities (cosine similarities where the viewpoint is
    # identical for both vector differences)
    clean_sort_simis = []
    clean_desc = []

    for idx in sort_out:
        simi = co_simis[idx]
        des = desc[idx]
        vp_a, vp_b = des[0]
        if vp_a == vp_b:
            continue
        clean_desc.append(des)
        clean_sort_simis.append(simi)

    # Nothing left after filtering: this combination contributes no statistics
    if not clean_sort_simis:
        continue

    # The lists are sorted ascending, so the extremes sit at both ends
    max_simis.append(clean_sort_simis[-1])
    min_simis.append(clean_sort_simis[0])
    max_50_simis.extend(clean_sort_simis[-50:])
    min_50_simis.extend(clean_sort_simis[:50])
    max_50_desc.extend(clean_desc[-50:])
    min_50_desc.extend(clean_desc[:50])

    mean_simis.append(np.mean(clean_sort_simis))
    max_simis_desc.append(clean_desc[-1])
    min_simis_desc.append(clean_desc[0])
    # Weight of this combination in the running overall mean: the number of values
    # its mean was computed from (the filtered list, not the unfiltered pair count).
    n_comb.append(len(clean_sort_simis))

    print(f'{i}  {name} max: {max_simis[-1]}, min: {min_simis[-1]}, mean: {mean_simis[-1]}, mean_max: {np.mean(max_simis)}, mean_min: {np.mean(min_simis)} mean_mean: {np.sum(np.array(mean_simis)*np.array(n_comb))/np.sum(n_comb)}', end='\r')


In [None]:
# Combinations ranked by their highest cosine similarity, best first
sort_max_simis = np.argsort(max_simis)[::-1]

top_50 = []
for rank_idx in sort_max_simis[:50]:
    # Row 0: viewpoints, rows 1 and 2: the F-terms each difference vector was built
    # from, last row: combination name. Columns: the two compared difference vectors.
    entry = max_simis_desc[rank_idx]
    vp_1, vp_2 = entry[0]
    top_50.append({
        'Combination': entry[-1][0].replace('_', '-'),
        'Cosine Similarity': max_simis[rank_idx].item(),
        'Viewpoint 1': vp_1,
        'Viewpoint 1 Description': viewpoint_dict[vp_1],
        'Theme 1': theme_dict[vp_1.split('/')[0]],
        'Viewpoint 2': vp_2,
        'Viewpoint 2 Description': viewpoint_dict[vp_2],
        'Theme 2': theme_dict[vp_2.split('/')[0]],
        'F-Terms Vector 1': [entry[1][0].tolist(), entry[2][0].tolist()],
        'F-Terms Vector 1 Description': 'Description a: ' + number_dict[entry[1][0]] + '     Description b: ' + number_dict[entry[2][0]],
        'F-Terms Vector 2': [entry[1][1].tolist(), entry[2][1].tolist()],
        'F-Terms Vector 2 Description': 'Description a: ' + number_dict[entry[1][1]] + '     Description b: ' + number_dict[entry[2][1]],
    })

top_50 = pd.DataFrame(top_50)
top_50.to_csv('top_50.csv')
top_50

In [None]:
# Combinations ranked by their lowest cosine similarity, lowest first
sort_min_simis = np.argsort(min_simis)

worst_50 = []
for rank_idx in sort_min_simis[:50]:
    # Row 0: viewpoints, rows 1 and 2: the F-terms each difference vector was built
    # from, last row: combination name. Columns: the two compared difference vectors.
    entry = min_simis_desc[rank_idx]
    vp_1, vp_2 = entry[0]
    worst_50.append({
        'Combination': entry[-1][0].replace('_', '-'),
        'Cosine Similarity': min_simis[rank_idx].item(),
        'Viewpoint 1': vp_1,
        'Viewpoint 1 Description': viewpoint_dict[vp_1],
        'Theme 1': theme_dict[vp_1.split('/')[0]],
        'Viewpoint 2': vp_2,
        'Viewpoint 2 Description': viewpoint_dict[vp_2],
        'Theme 2': theme_dict[vp_2.split('/')[0]],
        'F-Terms Vector 1': [entry[1][0].tolist(), entry[2][0].tolist()],
        'F-Terms Vector 1 Description': 'Description a: ' + number_dict[entry[1][0]] + '     Description b: ' + number_dict[entry[2][0]],
        'F-Terms Vector 2': [entry[1][1].tolist(), entry[2][1].tolist()],
        'F-Terms Vector 2 Description': 'Description a: ' + number_dict[entry[1][1]] + '     Description b: ' + number_dict[entry[2][1]],
    })

worst_50 = pd.DataFrame(worst_50)
worst_50.to_csv('worst_50.csv')
worst_50

In [None]:
# Pooled per-combination top similarities ranked from highest to lowest
sort_max_simis = np.argsort(max_50_simis)[::-1]

top_50_overall = []
for rank_idx in sort_max_simis[:50]:
    # Row 0: viewpoints, rows 1 and 2: the F-terms each difference vector was built
    # from, last row: combination name. Columns: the two compared difference vectors.
    entry = max_50_desc[rank_idx]
    vp_1, vp_2 = entry[0]
    top_50_overall.append({
        'Combination': entry[-1][0].replace('_', '-'),
        'Cosine Similarity': max_50_simis[rank_idx].item(),
        'Viewpoint 1': vp_1,
        'Viewpoint 1 Description': viewpoint_dict[vp_1],
        'Theme 1': theme_dict[vp_1.split('/')[0]],
        'Viewpoint 2': vp_2,
        'Viewpoint 2 Description': viewpoint_dict[vp_2],
        'Theme 2': theme_dict[vp_2.split('/')[0]],
        'F-Terms Vector 1': [entry[1][0].tolist(), entry[2][0].tolist()],
        'F-Terms Vector 1 Description': 'Description a: ' + number_dict[entry[1][0]] + '     Description b: ' + number_dict[entry[2][0]],
        'F-Terms Vector 2': [entry[1][1].tolist(), entry[2][1].tolist()],
        'F-Terms Vector 2 Description': 'Description a: ' + number_dict[entry[1][1]] + '     Description b: ' + number_dict[entry[2][1]],
    })

top_50_overall = pd.DataFrame(top_50_overall)
top_50_overall.to_csv('top_50_overall.csv')
top_50_overall

In [None]:
# Overall top 50: the pooled per-combination top similarities, ranked from highest to lowest.
# The ranking has to be computed on max_50_simis, the same list that is indexed below
# together with max_50_desc. Ranking min_50_simis here paired the indices of one list
# with the values and descriptions of another and wrote those mismatched rows to
# 'top_50_overall.csv'.
# NOTE(review): with the ranking corrected this cell produces the same table as the
# overall-top-50 cell before it - consider removing one of the two.
sort_max_simis = np.argsort(max_50_simis)
sort_max_simis = sort_max_simis[::-1]
top_50_overall = []
for i, idx in enumerate(sort_max_simis[:50]):
    output_dict = {}
    # Row 0: viewpoints, rows 1 and 2: the F-terms each difference vector was built
    # from, last row: combination name. Columns: the two compared difference vectors.
    desc = max_50_desc[idx]
    output_dict['Combination'] = desc[-1][0].replace('_', '-')
    output_dict['Cosine Similarity'] = max_50_simis[idx].item()
    output_dict['Viewpoint 1'] = desc[0][0]
    output_dict['Viewpoint 1 Description'] = viewpoint_dict[desc[0][0]]
    output_dict['Theme 1'] = theme_dict[desc[0][0].split('/')[0]]
    output_dict['Viewpoint 2'] = desc[0][1]
    output_dict['Viewpoint 2 Description'] = viewpoint_dict[desc[0][1]]
    output_dict['Theme 2'] = theme_dict[desc[0][1].split('/')[0]]
    output_dict['F-Terms Vector 1'] = [desc[1][0].tolist(), desc[2][0].tolist()]
    output_dict['F-Terms Vector 1 Description'] = 'Description a: ' + number_dict[desc[1][0]] + '     Description b: ' + number_dict[desc[2][0]]
    output_dict['F-Terms Vector 2'] = [desc[1][1].tolist(), desc[2][1].tolist()]
    output_dict['F-Terms Vector 2 Description'] = 'Description a: ' + number_dict[desc[1][1]] + '     Description b: ' + number_dict[desc[2][1]]
    top_50_overall.append(output_dict)

top_50_overall = pd.DataFrame(top_50_overall)
top_50_overall.to_csv('top_50_overall.csv')
top_50_overall

In [None]:
# Pooled per-combination bottom similarities ranked from lowest to highest
sort_min_simis = np.argsort(min_50_simis)

worst_50_overall = []
for rank_idx in sort_min_simis[:50]:
    # Row 0: viewpoints, rows 1 and 2: the F-terms each difference vector was built
    # from, last row: combination name. Columns: the two compared difference vectors.
    entry = min_50_desc[rank_idx]
    vp_1, vp_2 = entry[0]
    worst_50_overall.append({
        'Combination': entry[-1][0].replace('_', '-'),
        'Cosine Similarity': min_50_simis[rank_idx].item(),
        'Viewpoint 1': vp_1,
        'Viewpoint 1 Description': viewpoint_dict[vp_1],
        'Theme 1': theme_dict[vp_1.split('/')[0]],
        'Viewpoint 2': vp_2,
        'Viewpoint 2 Description': viewpoint_dict[vp_2],
        'Theme 2': theme_dict[vp_2.split('/')[0]],
        'F-Terms Vector 1': [entry[1][0].tolist(), entry[2][0].tolist()],
        'F-Terms Vector 1 Description': 'Description a: ' + number_dict[entry[1][0]] + '     Description b: ' + number_dict[entry[2][0]],
        'F-Terms Vector 2': [entry[1][1].tolist(), entry[2][1].tolist()],
        'F-Terms Vector 2 Description': 'Description a: ' + number_dict[entry[1][1]] + '     Description b: ' + number_dict[entry[2][1]],
    })

worst_50_overall = pd.DataFrame(worst_50_overall)
worst_50_overall.to_csv('worst_50_overall.csv')
worst_50_overall