In [34]:
# Own Packages
from Masterarbeit_utils.model_utils import load_pretrained_model, load_pretrained_Tokenizer

# Site Packages
import torch
import numpy as np
import os
import sys
import pickle as pk
import pandas as pd
import bokeh
import time
from matplotlib import pyplot as plt

# Dimension reduction algorithms
#from cuml.manifold import TSNE
from scipy.spatial import distance
from sklearn.manifold import TSNE
# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.layouts import gridplot
from bokeh.io import export_png

from transformers import AutoTokenizer, OPTForCausalLM
from tokenizers.processors import TemplateProcessing
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from torch.utils.data import Dataset

%matplotlib inline
output_notebook()

In [35]:
# NOTE: the paths to the important folders below have to be adapted to your system.

# Identifiers of this experiment (they end up in folder and cache-file names)
model_name = 'opt_125_nochange'
checkpoint = 'no_checkpoint'

# Filled with one txt file per sample once the Pytorch Dataset notebook has been run
dataset_folder = 'data/dataset_samples'

# Folder the model is saved to (created here if it does not exist yet)
model_folder = f'data/models/{model_name}'
os.makedirs(model_folder, exist_ok=True)

# Folder the tokenizer is saved to (created here if it does not exist yet)
tokenizer_folder = f'data/tokenizers/{model_name}'
os.makedirs(tokenizer_folder, exist_ok=True)

# Folder holding all pickle files; fixed for this project and should not be changed
dump_dir = r'PK_DUMP'

# Model parameters. Available sizes:
#   mini      125 M
#   base      1.3 B
#   standard  6.7 B
#   large     30 B
#   huge      120 B
base_model_name = 'opt mini'

# Every new torch object is created in this dtype by default.
# If default_dtype is float16, fp16 must be False.
default_dtype = torch.float32
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_default_dtype(default_dtype)

# Default device on which the model will be loaded
default_device = 'cuda:0'

# Number of GPUs the model is parallelised to; a CPU-only run uses none
num_gpus = 0 if default_device == 'cpu' else 1

tensor_parallel = False

# Number of leading tokens dropped from tokenizer.encode(...) results:
# 1 if the tokenizer prepends a start token, 0 if it does not
start_idx = 1

###########################
# Loading the Model
###########################
model = load_pretrained_model(base_model_name, default_dtype, tensor_parallel, num_gpus)


###########################
# Loading the Tokenizer
###########################
tokenizer = load_pretrained_Tokenizer(base_model_name)

############################
# Extracting the Embeddings
############################
# Output embeddings: the first parameter of the output-embedding module
out_weight = next(model.get_output_embeddings().parameters()).to('cpu').detach()
# Input embeddings: look up one vector per tokenizer id
all_token_ids = torch.arange(len(tokenizer))
inp_vectors = model.get_input_embeddings()(all_token_ids).to('cpu').detach()

############################
# Scaling the embeddings
############################
# L2-normalise every row, then hand the matrices over as numpy arrays
out_emb = torch.nn.functional.normalize(out_weight, dim=1, p=2).numpy()
inp_emb = torch.nn.functional.normalize(inp_vectors, dim=1, p=2).numpy()

Max Memory {0: 18980601856, 'cpu': 149574623232}


In [36]:
# Sanity check of the two embedding matrices. inp_emb has one row per tokenizer id
# (it is built from torch.arange(len(tokenizer))); in the recorded run out_emb has
# more rows (50272 vs. 50265), i.e. it contains rows without a tokenizer id.
print(out_emb.shape, inp_emb.shape)

(50272, 768) (50265, 768)


In [37]:

# Context-less embeddings: the hidden states of every layer when the model is fed
# a single token on its own. The result is cached on disk because one forward
# pass per vocabulary entry is expensive.
context_less_path = f'{model_folder}/context_less_emb.pk'

if not os.path.isfile(context_less_path):
    print('Calculating context less embeddings!')
    # One list per hidden-state layer; created lazily so the layer count follows the model
    context_less_emb = None
    # Inference only: no autograd graph is needed for the forward passes
    with torch.no_grad():
        for token_id in range(len(tokenizer)):
            print(token_id, end='\r')
            out = model(input_ids=torch.tensor([[token_id]]), attention_mask=torch.tensor([[1]]), output_hidden_states=True)

            hidden_states = out.hidden_states
            if context_less_emb is None:
                context_less_emb = [[] for _ in hidden_states]
            # Separate index name: the original reused `i` for both loops
            for layer_idx, hidden in enumerate(hidden_states):
                context_less_emb[layer_idx].append(hidden.to('cpu').detach().numpy())
    if context_less_emb is None:
        # Empty vocabulary: nothing was computed
        context_less_emb = []
    with open(context_less_path, 'wb') as fh:
        pk.dump(context_less_emb, fh)
else:
    print('Loading context less embeddings from disk')
    # NOTE: pickle.load executes arbitrary code; only load cache files you created yourself
    with open(context_less_path, 'rb') as fh:
        context_less_emb = pk.load(fh)

Loading context less embeddings from disk


In [38]:
# Combining the context-less embeddings of each layer into a single array.
# Layers that are already arrays are skipped so the cell can be re-run safely.
for i, layer in enumerate(context_less_emb):
    if isinstance(layer, np.ndarray):
        continue
    # Each entry is the hidden state of one single-token input; e[0] drops its leading axis
    context_less_emb[i] = np.concatenate([e[0] for e in layer], 0)

# Calculating (or loading) and collecting the T-SNE representation of every layer
layer_tsne_reps = []
for i, emb in enumerate(context_less_emb):
    tsne_path = f'{model_folder}/output_tsne_rep_{checkpoint}_layer{i}.pk'
    if os.path.isfile(tsne_path):
        print('Loading TSNE representation for layer ', i)
        # NOTE: pickle.load executes arbitrary code; only load cache files you created yourself
        with open(tsne_path, 'rb') as fh:
            tsne_rep = pk.load(fh)
    else:
        print('Calculating TSNE representation for layer ', i)
        tsne = TSNE(n_components=2, verbose=0, random_state=69)
        tsne_rep = tsne.fit_transform(emb)
        with open(tsne_path, 'wb') as fh:
            pk.dump(tsne_rep, fh)
    # Append in both branches: previously a freshly computed representation was
    # written to disk but never added, leaving layer_tsne_reps short on a first run
    layer_tsne_reps.append(tsne_rep)

Loading TSNE representation for layer  0
Loading TSNE representation for layer  1
Loading TSNE representation for layer  2
Loading TSNE representation for layer  3
Loading TSNE representation for layer  4
Loading TSNE representation for layer  5
Loading TSNE representation for layer  6
Loading TSNE representation for layer  7
Loading TSNE representation for layer  8
Loading TSNE representation for layer  9
Loading TSNE representation for layer  10
Loading TSNE representation for layer  11
Loading TSNE representation for layer  12


In [40]:
# Interactive scatter plot of the T-SNE representation of one layer's context-less embeddings
layer = 12

# Colour per token id: ids below 60000 are drawn blue, all others red
colors = ['blue' if token_id < 60000 else 'red' for token_id in range(len(tokenizer))]

layer_points = layer_tsne_reps[layer]
datasource = ColumnDataSource(data={
    'x': layer_points[:, 0],
    'y': layer_points[:, 1],
    'tokens': [tokenizer.decode(token_id) for token_id in range(len(tokenizer))],
    'colors': colors,
})

# Hovering over a point shows the decoded token
hover_tsne = HoverTool(
    tooltips='<div style="font-size: 12px;"><b>Token:</b> @tokens</div>',
    mode='mouse',
)
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(
    width=1500,
    height=1500,
    tools=tools_tsne,
    title=f'Context less Embeddings Layer {layer}',
)

plot_tsne.circle(
    'x', 'y',
    source=datasource,
    name="Tokens",
    size=8,
    fill_color='colors',
    alpha=0.7,
    line_width=0,
)

show(plot_tsne)

In [None]:
def _load_or_fit_tsne(cache_file, embeddings):
    """Return the 2-D T-SNE representation stored in `cache_file`.

    If the file does not exist, the representation is fitted on `embeddings`,
    written to `cache_file` and then returned.
    """
    if os.path.isfile(cache_file):
        with open(cache_file, 'rb') as fh:
            return pk.load(fh)
    projection = TSNE(n_components=2, verbose=0, random_state=69).fit_transform(embeddings)
    with open(cache_file, 'wb') as fh:
        pk.dump(projection, fh)
    return projection


# T-SNE representation of all output embeddings
out_tsne_rep = _load_or_fit_tsne(f'{model_folder}/output_tsne_rep_{checkpoint}.pk', out_emb)

# T-SNE representation of all input embeddings
inp_tsne_rep = _load_or_fit_tsne(f'{model_folder}/input_tsne_rep_{checkpoint}.pk', inp_emb)

In [None]:
# Interactive T-SNE plot of the output embeddings

# The output embedding matrix can have more rows than the tokenizer has ids
# (50272 vs. 50265 in the recorded run). Only the first len(tokenizer) rows have
# a token to show, so x/y are cut to that length; otherwise the columns of the
# ColumnDataSource would differ in length.
# NOTE(review): the surplus rows are presumably vocabulary padding - confirm.
n_tokens = len(tokenizer)

datasource_text = ColumnDataSource(
        data=dict(
            x = out_tsne_rep[:n_tokens, 0],
            y = out_tsne_rep[:n_tokens, 1],
            tokens = [tokenizer.decode(i) for i in range(n_tokens)]
        )
    )

# Hovering over a point shows the decoded token
hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>Token:</b> @tokens</div>', mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Text Output Embeddings')

plot_tsne.circle('x', 'y', size=8, fill_color='blue',
                     alpha=0.7, line_width=0, source=datasource_text, name="Text-Tokens")

show(plot_tsne)

In [None]:
# Interactive T-SNE plot of the input embeddings

vocab_ids = range(len(tokenizer))
datasource_text = ColumnDataSource(data={
    'x': inp_tsne_rep[:, 0],
    'y': inp_tsne_rep[:, 1],
    'tokens': [tokenizer.decode(token_id) for token_id in vocab_ids],
})

# Hovering over a point shows the decoded token
hover_tsne = HoverTool(
    tooltips='<div style="font-size: 12px;"><b>Token:</b> @tokens</div>',
    mode='mouse',
)
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Text Input Embeddings')

plot_tsne.circle(
    'x', 'y',
    source=datasource_text,
    name="Text-Tokens",
    size=8,
    fill_color='orange',
    alpha=0.7,
    line_width=0,
)

show(plot_tsne)

# Testing the Country/Capital Relations

In [None]:
country_capitals = pd.read_csv('data/Staat Hauptstadt.csv', encoding='UTF-8', delimiter=';')

# Keep only the pairs in which both the state and its capital are a single token
country_token_ids = []
capital_token_ids = []

for country, capital in zip(country_capitals['Staat'], country_capitals['Hauptstadt']):
    # Rows that cannot be encoded are skipped
    try:
        country_tokenized = tokenizer.encode(country+' ')[start_idx:]
        capital_tokenized = tokenizer.encode(capital+' ')[start_idx:]
    except Exception:
        continue

    if len(country_tokenized) == 1 and len(capital_tokenized) == 1:
        country_token_ids.append(country_tokenized)
        capital_token_ids.append(capital_tokenized)

# Show the surviving pairs: state, capital, blank line
for country_id, capital_id in zip(country_token_ids, capital_token_ids):
    print(tokenizer.decode(country_id), tokenizer.decode(capital_id), '', sep='\n')


#This won't work because only one pair was found in which both the state and its capital are encoded as a single token

# Testing the Male/Female Relations

In [18]:
# Load the male/female noun pairs (semicolon-separated CSV) and display them
male_female_df = pd.read_csv(
    'data/Male Female Nouns.csv',
    sep=';',
    encoding='UTF-8',
)
male_female_df

Unnamed: 0,MALE,FEMALE
0,boy,girl
1,gentleman,lady
2,husband,wife
3,fiancé,fiancée
4,man,woman
5,brother,sister
6,uncle,aunt
7,widower,widow
8,nephew,niece
9,dad,mum/mom


In [25]:
# Debug for the new embedding retrieval method: the context-less embeddings of
# one layer stand in for both the input and the output embeddings
layer = 12
out_emb = context_less_emb[layer]
inp_emb = out_emb

In [26]:
def extract_emb():
    """Collect one embedding per male/female noun pair from the noun CSV.

    For every noun several spellings are tried (leading space, trailing space,
    bare; each also capitalised). A spelling counts only if the tokenizer
    encodes it as exactly one token after the first `start_idx` entries are
    dropped. A pair is kept when both nouns have at least one such spelling
    and neither noun has been stored before.

    Reads the module-level `tokenizer`, `start_idx` and `inp_emb`.

    Returns:
        (male, female): two dicts mapping the lower-cased noun to a
        one-element list holding the first embedding found for it.
    """
    noun_pairs = pd.read_csv('data/Male Female Nouns.csv', encoding='UTF-8', delimiter=';')

    def single_token_embeddings(noun):
        """Return the inp_emb rows of all spellings of `noun` that are one token."""
        found = []
        for variant in (' ' + noun, noun + ' ', noun):
            # Lower-case spelling first, then the capitalised one
            for spelling in (variant, variant.capitalize()):
                token_ids = tokenizer.encode(spelling)[start_idx:]
                if len(token_ids) == 1:
                    found.append(inp_emb[token_ids])
        return found

    male, female = {}, {}

    # extracting all meaningfully encoded male and female nouns
    for _, (male_noun, female_noun) in noun_pairs.iterrows():
        male_noun = male_noun.strip().lower()
        female_noun = female_noun.strip().lower()

        male_emb = single_token_embeddings(male_noun)
        female_emb = single_token_embeddings(female_noun)

        # Both nouns need at least one single-token spelling
        if not male_emb or not female_emb:
            continue
        # Skip the pair if either noun is already stored
        if male_noun in male or female_noun in female:
            continue

        male[male_noun] = male_emb[:1]
        female[female_noun] = female_emb[:1]

    return male, female

# Build the two noun -> embedding dictionaries from the current global inp_emb
male, female = extract_emb()

# Number of entries kept in each dictionary
len(male), len(female)

(13, 13)

In [27]:
def retrieve_nearest(vec):
    """Return the two nouns whose embeddings are most cosine-similar to `vec`.

    The candidates are all male and female nouns returned by `extract_emb()`,
    each represented by its first stored embedding.

    Returns:
        (best, second_best): the names of the most and second most similar nouns.
    """
    similarity = torch.nn.CosineSimilarity(dim = 1)
    male, female = extract_emb()
    candidates = male | female
    labels = list(candidates.keys())

    query = torch.tensor(vec)
    scores = [
        similarity(torch.tensor(embs[0]), query).detach().numpy()[0]
        for embs in candidates.values()
    ]
    # Highest similarity first
    ranking = np.argsort(scores)[::-1]

    best, second_best = ranking[0], ranking[1]
    return labels[best], labels[second_best]

def test_prediction(test_pair):
    """Leave-one-out test of the mean male -> female offset for one noun pair.

    The pair is removed from the reference dictionaries, the mean difference
    between the remaining female and male embeddings is added to the held-out
    male embedding, and the L2-normalised result (`test_mtf`) is compared with
    the held-out female embedding. The comparison is printed, nothing is returned.

    Args:
        test_pair: (male_noun, female_noun); both must be keys of the
            dictionaries returned by `extract_emb()`.

    Raises:
        KeyError: if one of the nouns is not a key of its dictionary.
    """
    male, female = extract_emb()
    # Hold the tested pair out so it does not contribute to the mean offset
    test_male = male.pop(test_pair[0])[0]
    test_female = female.pop(test_pair[1])[0]

    male_emb = np.concatenate([i for item in male.values() for i in item], 0)
    female_emb = np.concatenate([i for item in female.values() for i in item], 0)

    # Mean offset from the male to the female embeddings of the remaining pairs
    mean_diff = np.mean(female_emb, axis=0) - np.mean(male_emb, axis=0)
    test_mtf = np.array(test_male) + mean_diff
    test_mtf = torch.nn.functional.normalize(torch.tensor(test_mtf), p=2).numpy()

    # retrieve_nearest() calls extract_emb() itself, so the held-out nouns are
    # candidates again here
    nearest_neighbor = retrieve_nearest(test_mtf)
    base_nn = retrieve_nearest(test_male)

    cos = torch.nn.CosineSimilarity(dim = 1)

    # Every metric is computed once and reused in the report
    sim_base = cos(torch.tensor(test_male), torch.tensor(test_female)).item()
    sim_mtf = cos(torch.tensor(test_mtf), torch.tensor(test_female)).item()
    dis_base = distance.euclidean(test_male.flatten(), test_female.flatten())
    dis_mtf = distance.euclidean(test_mtf.flatten(), test_female.flatten())

    # Both change values are "shifted minus base": a negative "Distance Decrease"
    # therefore means that the distance got smaller
    print(f'''
{test_pair[0]} {test_pair[1]}:
    Similarity test_male - test_female: {sim_base:.5f}
    Similarity test_mtf - test_female: {sim_mtf:.5f}
    Distance test_male - test_female: {dis_base:.5f}
    Distance test_mtf - test_female: {dis_mtf:.5f}
    CoS Increase:      {sim_mtf - sim_base:.5f}
    Distance Decrease: {dis_mtf - dis_base:.5f}
    Nearest Neighbor: {nearest_neighbor}
    (base) Nearest Neighbor: {base_nn}
    ''')

In [31]:
# Run the leave-one-out offset test for every noun pair on each layer's
# context-less embeddings. Only the keys of m and f are used below.
m, f = extract_emb()
# NOTE(review): range(12) covers hidden states 0-11, while other cells also use
# index 12 - confirm that skipping the last hidden state is intended.
for layer in range(12):
    print(f'''
---------------------------------------------------------------------------------------------------------------------------    
Layer {layer}
---------------------------------------------------------------------------------------------------------------------------''')

    # extract_emb() reads the global inp_emb, so rebinding it selects the layer under test
    inp_emb = context_less_emb[layer]
    out_emb = context_less_emb[layer]
    out_emb = torch.nn.functional.normalize(torch.tensor(out_emb), dim=1, p=2).numpy()
    inp_emb = torch.nn.functional.normalize(torch.tensor(inp_emb), dim=1, p=2).numpy()
    # Distinct loop names: `male` / `female` would overwrite the module-level
    # dictionaries of the same name
    for male_noun, female_noun in zip(m.keys(), f.keys()):
        test_prediction((male_noun, female_noun))


---------------------------------------------------------------------------------------------------------------------------    
Layer 0
---------------------------------------------------------------------------------------------------------------------------

boy girl:
    Similarity test_male - test_female: 0.90550
    Similarity test_mtf - test_female: 0.93215
    Distance test_male - test_female: 0.43475
    Distance test_mtf - test_female: 0.36838
    CoS Increase:      0.02665
    Distance Decrease: -0.06637
    Nearest Neighbor: ('boy', 'girl')
    (base) Nearest Neighbor: ('boy', 'girl')
    

gentleman lady:
    Similarity test_male - test_female: 0.87980
    Similarity test_mtf - test_female: 0.89659
    Distance test_male - test_female: 0.49030
    Distance test_mtf - test_female: 0.45477
    CoS Increase:      0.01679
    Distance Decrease: -0.03553
    Nearest Neighbor: ('gentleman', 'lady')
    (base) Nearest Neighbor: ('gentleman', 'lady')
    

husband wife:
    Simila

In [41]:
# NOTE(review): leftover debug call. It always raises (the recorded output shows a
# TypeError because CosineSimilarity.forward() needs x1 and x2) and therefore stops
# "Run All" - remove it or pass two tensors.
cos()

TypeError: CosineSimilarity.forward() missing 2 required positional arguments: 'x1' and 'x2'

In [24]:
# Analogy check on the current inp_emb: does adding the kings -> queens offset move
# "king" towards "queen", and the queen -> queens offset move "king" towards "kings"?
cos = torch.nn.CosineSimilarity(dim = 1)

# Token ids of the four words, without the leading start token
king_id, queen_id, kings_id, queens_id = (
    tokenizer.encode(word)[1:] for word in (' king', ' queen', ' kings', ' queens')
)

# Embeddings of the four words
king_emb, queen_emb, kings_emb, queens_emb = (
    torch.tensor(inp_emb[ids]) for ids in (king_id, queen_id, kings_id, queens_id)
)

# Offset vectors between the words
king_queen = queen_emb - king_emb
king_kings = kings_emb - king_emb
kings_queens = queens_emb - kings_emb
queen_queens = queens_emb - queen_emb

# "king" shifted by the gender offset and by the plural offset
king_to_queen = king_emb + kings_queens
king_to_kings = king_emb + queen_queens

# Cosine similarities before and after the shift
cos_king_queen = cos(king_emb, queen_emb)
cos_king_kings = cos(king_emb, kings_emb)
cos_king_queen_add = cos(king_to_queen, queen_emb)
cos_king_kings_add = cos(king_to_kings, kings_emb)

# Euclidean distances before and after the shift
dis_king_queen = distance.euclidean(king_emb.flatten(), queen_emb.flatten())
dis_king_kings = distance.euclidean(king_emb.flatten(), kings_emb.flatten())
dis_king_queen_add = distance.euclidean(king_to_queen.flatten(), queen_emb.flatten())
dis_king_kings_add = distance.euclidean(king_to_kings.flatten(), kings_emb.flatten())

print(f'king queen {cos_king_queen.item():.5f}, king kings {cos_king_kings.item():.5f}, king queen add {cos_king_queen_add.item():.5f}, king kings add {cos_king_kings_add.item():.5f}')
print(f'king queen {dis_king_queen:.5f}, king kings {dis_king_kings:.5f}, king queen add {dis_king_queen_add:.7f}, king kings add {dis_king_kings_add:.7f}')


king queen 0.90099, king kings 0.90442, king queen add 0.92768, king kings add 0.91917
king queen 1.29003, king kings 1.25596, king queen add 1.1805787, king kings add 1.1805787


In [33]:
# All male/female distance and cosine pairs, computed on the current out_emb
cos = torch.nn.CosineSimilarity(dim = 1)

# Unique nouns in sorted order, so the pair order (and with it the order of tied
# results) is the same after every kernel restart
names = sorted(set(m.keys()) | set(f.keys()))

# One embedding lookup per noun instead of one per pair.
# NOTE(review): only the first token of the bare noun is used; a noun that the
# tokenizer splits into several tokens is represented by its first piece - confirm
# that this is intended.
name_emb = {name: torch.tensor(out_emb[tokenizer.encode(name)[start_idx]]) for name in names}

# Every ordered pair: names_1 cycles through all nouns, names_2 repeats each noun
names_1 = [name for _ in names for name in names]
names_2 = [name for name in names for _ in names]

emb_1 = torch.stack([name_emb[name] for name in names_1])
emb_2 = torch.stack([name_emb[name] for name in names_2])
name_comb = np.array([name_1 + '-' + name_2 for name_1, name_2 in zip(names_1, names_2)])

simis = cos(emb_1, emb_2).numpy()
dis = np.array([distance.euclidean(v1.numpy(), v2.numpy()) for v1, v2 in zip(emb_1, emb_2)])

idx_simis = np.argsort(simis)
idx_dis = np.argsort(dis)

# The thresholds drop pairs with (almost) identical embeddings, e.g. a noun paired with itself
res_simis = [[name, simi] for name, simi in zip(name_comb[idx_simis], simis[idx_simis]) if simi < 0.99]
# `pair_dis` instead of `dis`: the loop variable used to overwrite the distance array
res_dis = [[name, pair_dis] for name, pair_dis in zip(name_comb[idx_dis], dis[idx_dis]) if pair_dis > 0.01]

# Most similar pairs next to the closest pairs
for res_1, res_2 in zip(reversed(res_simis), res_dis):
    print('Res simi', res_1, 'Res dis', res_2)

In [56]:
# Spot check on one layer: cosine similarity between the context-less embeddings
# of two unrelated tokens ('the' and 'Ford')
layer = 2
inp_emb = context_less_emb[layer]
out_emb = context_less_emb[layer]
out_emb = torch.nn.functional.normalize(torch.tensor(out_emb), dim=1, p=2).numpy()
inp_emb = torch.nn.functional.normalize(torch.tensor(inp_emb), dim=1, p=2).numpy()

# Token ids get their own names instead of being stored in the *_emb variables first
monk_id = tokenizer.encode('Ford')[1]
men_id = tokenizer.encode('the')[1]
print(tokenizer.decode(men_id))

# The historical names are kept: monk_emb holds the 'Ford' vector, men_emb the 'the' vector
monk_emb = out_emb[monk_id]
men_emb = out_emb[men_id]

# `cos` is the CosineSimilarity(dim=1) module created in an earlier cell. The leading
# axis is added with numpy because torch.tensor([ndarray]) triggers PyTorch's
# slow list-of-arrays conversion warning.
cos(torch.tensor(men_emb[None, :]), torch.tensor(monk_emb[None, :]))

the


tensor([0.9999])