In [None]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm as progressbar
from transformers import AutoTokenizer, AutoModel
import pickle
import torch
import torch.nn.functional as F

In [None]:
# Show which PyTorch version this kernel is running.
torch.__version__

In [None]:
# Hand the allocator setting to PyTorch through the process environment.
# NOTE(review): presumably this has to be in place before the first CUDA allocation — confirm.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})

In [None]:
# Inference is deliberately pinned to the CPU: the model-loading cell notes that the
# model does not fit into 12GB of CUDA memory together with the vectors.
# Set FORCE_CPU = False to use CUDA whenever it is available.
FORCE_CPU = True
# Always a torch.device (never a bare string), whichever branch is taken.
device = torch.device("cuda" if torch.cuda.is_available() and not FORCE_CPU else "cpu")

### Loading preprocessed deduped data

In [None]:
# Read the tab-separated input, naming its two columns 'name' and 'text',
# then discard every row that has a missing value.
df = (
    pd.read_csv(
        '../preprocessed/attribute.seqs.tsv',
        sep='\t',
        names=['name', 'text'],
    )
    .dropna()
)

In [None]:
# Group the rows by attribute name and keep just the group keys as an array:
# one entry per distinct attribute in df.
rows_per_name = df.groupby(by=["name"]).count()
classes = rows_per_name.reset_index()["name"].values

### Making directories to keep results

In [None]:
# Create the output folders (including missing parents) without relying on a shell;
# nothing happens if they already exist. The 'sbert' prefix has to match NAMESPACE,
# which later cells use to build paths into these same folders.
for results_dir in ('../results/sbert-similarity-matrics', '../results/sbert-similar-pairs'):
    os.makedirs(results_dir, exist_ok=True)

### Loading pretrained tokenizer and model

For model tuning specs see [SBERT here](https://www.sbert.net/docs/pretrained_models.html)  
For "naked" model usage see [huggingface](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)

In [None]:
# Label embedded in the output file names and result directories built further down.
NAMESPACE='sbert'

In [None]:
## Author's note: all-mpnet-base-v2 does not fit into a 12GB CUDA device together with the vectors.

# Single source for the checkpoint id so tokenizer and model cannot drift apart.
MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)

<img src='img/SBERT-models.png' height="300px">

## Checking vocabulary overlap  
Because BERT splits words into sub-word pieces ("morphemes"), a word is considered out-of-vocabulary (OOV) if the number of tokens it is split into is at least half of its length in characters.
This heuristic is not perfect.

In [None]:
# Enable IPython's autoreload extension so edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2

# NOTE(review): `vocaverlap` is not shown here — presumably a project-local module; confirm.
from vocaverlap import VocabOverlap

In [None]:
%%time        
def if_oov(t):
    """Heuristically flag the string ``t`` as out-of-vocabulary for ``tokenizer``.

    Returns True when tokenization yields nothing, False when it yields exactly
    one token, and otherwise True only if the number of tokens is at least half
    of the character length of ``t``.
    """
    pieces = tokenizer.tokenize(t)
    # Nothing came back from the tokenizer: treat as unknown.
    if not pieces:
        return True
    # A single token means the vocabulary covers the whole string.
    if len(pieces) == 1:
        return False
    # Many short pieces relative to the string length suggests character-level splitting.
    return len(pieces) / len(t) >= 0.5

# Feed the (name, text) rows to VocabOverlap and apply the OOV heuristic.
# NOTE(review): VocabOverlap.apply is defined outside this notebook — its exact behaviour is not visible here.
name_text_rows = df[['name', 'text']].values
vo = VocabOverlap(name_text_rows).apply(if_oov)

# Persist the collected `info` object so it can be reused without recomputation.
info_path = f'../preprocessed/attribute.token-freqs.{NAMESPACE}.info.pickle'
with open(info_path, 'wb') as fh:
    pickle.dump(vo.info, fh, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
!ls -haltr ../preprocessed/attribute.token-freqs.$NAMESPACE.info.pickle

In [None]:
# One row per entry of vo.info: [key, 1 - oov_tokens/tokens, 1 - oov_uniq/uniq].
# NOTE(review): the meaning of these counters comes from VocabOverlap, which is not shown here.
distribution = [
    [attr, 1 - stats['oov_tokens'] / stats['tokens'], 1 - stats['oov_uniq'] / stats['uniq']]
    for attr, stats in vo.info.items()
]

# Order rows by the third value (r_uniq), largest first.
distribution.sort(key=lambda row: row[2], reverse=True)
df_distribution = pd.DataFrame(distribution, columns=['name', 'r_tokens', 'r_uniq'])

In [None]:
# `plt` is not imported in the notebook's import cell, so bring it into scope here;
# without this the cell raises NameError on a fresh kernel.
import matplotlib.pyplot as plt

# Two curves over the rows of df_distribution: r_uniq first, then r_tokens
# (the legend entries below follow the same order).
plt.figure(figsize=(15,5))
plt.plot(df_distribution['r_uniq'])
plt.plot(df_distribution['r_tokens'])
plt.title(f'Pretrained {NAMESPACE.upper()} vocabulary fitness')
# Label every x position with the corresponding attribute name.
plt.xticks(range(0, df_distribution.shape[0]), df_distribution['name'], rotation='vertical')
plt.legend(['Tokens Uniq/OOV ratio', 'Token Total/OOV ratio'], loc='lower left')
plt.show()

In [None]:
# Keep only the rows whose r_uniq is above 0.75 and write them out as a TSV
# with neither index nor header row.
above_cutoff = df_distribution['r_uniq'] > 0.75
good_overlap = df_distribution[above_cutoff]
good_overlap.to_csv(
    f'../preprocessed/attribute.{NAMESPACE}.75.overlap.tsv',
    sep='\t',
    index=False,
    header=False,
)

### Making similarity matrixes and top similar pairs per attribute

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked-out positions.

    Args:
        model_output: indexable whose first element holds the token embeddings
            (presumably shaped (batch, seq_len, hidden) — confirm against the model).
        attention_mask: mask with one value per token; it is broadcast over the
            last embedding dimension and used as a 0/1 weight.

    Returns:
        Tensor with dimension 1 reduced away: the mask-weighted mean per sequence.
    """
    hidden_states = model_output[0]
    # Stretch the mask to the embeddings' shape so it can weight every component.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(dim=1)
    # Clamp the divisor so a fully masked sequence does not divide by zero.
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts

In [None]:
%%time
# Minimum similarity score for a sentence pair to be written to the similar-pairs file.
SIMILARITY_THRESHOLD = 0.7

# For every attribute: embed all of its texts, save the pairwise similarity matrix,
# and export the pairs scoring above SIMILARITY_THRESHOLD.
for a in progressbar(classes, total=len(classes), file=sys.stdout):
    # All texts recorded for the current attribute, embedded as a single batch.
    sentences = df[df.name == a].text.values.tolist()
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
        model_output = model(**encoded_input)

        embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        # L2-normalise each row so the dot products below are cosine similarities.
        embeddings = F.normalize(embeddings, p=2, dim=1)

        # Multiply in torch and convert explicitly: handing torch tensors to
        # np.matmul relies on implicit conversion, which fails for CUDA tensors.
        matrix = torch.matmul(embeddings, embeddings.T).cpu().numpy()

    # np.save appends the '.npy' extension to this path.
    np.save(f'../results/{NAMESPACE}-similarity-matrics/{NAMESPACE}-{a}.similarity-score-matrix', matrix, allow_pickle = False)

    # Strict lower triangle: every unordered pair appears once and self-pairs are excluded.
    x = np.tril(matrix, k=-1)
    scores = [
        [a, x[r, c], sentences[r], sentences[c]]
        for r, c in zip(*np.where(x > SIMILARITY_THRESHOLD))
    ]

    pd.DataFrame(scores, columns=['attr',
                                  'score',
                                  's1',
                                  's2']).to_csv(f'../results/{NAMESPACE}-similar-pairs/{NAMESPACE}-{a}.similar-pairs.tsv', sep = '\t', index=False)

In [None]:
# List the similarity-matrix files saved by the loop above (np.save adds a suffix after the base name).
!ls -lh ../results/sbert-similarity-matrics/*.similarity-score-matrix.*