In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.modules.loss import _WeightedLoss
from torchtext.data.metrics import bleu_score
import tensorflow as tf
import tensorflow_hub as hub

from collections import defaultdict
import math
import re
from nltk.sentiment import SentimentIntensityAnalyzer
# Shared VADER analyzer; sentiment_sim() below reads polarity scores from it.
sia = SentimentIntensityAnalyzer()
import os
# Print every file found under the Kaggle input directory (sanity check of the mounted data).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



In [None]:
class SmoothCrossEntropyLoss(_WeightedLoss):
    """Cross-entropy evaluated against label-smoothed one-hot targets.

    The true class receives probability ``1 - smoothing``; the remaining
    ``smoothing`` mass is split evenly over the other ``n_classes - 1`` classes.

    Args:
        weight: optional per-class tensor multiplied into the log-probabilities.
        reduction: ``'mean'`` or ``'sum'``; any other value returns per-row losses.
        smoothing: smoothing factor; ``forward`` asserts ``0 <= smoothing < 1``.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    def k_one_hot(self, targets:torch.Tensor, n_classes:int, smoothing=0.0):
        """Build a ``(len(targets), n_classes)`` matrix of smoothed target probabilities."""
        with torch.no_grad():
            smoothed = torch.empty(size=(targets.size(0), n_classes),
                                   device=targets.device)
            # Every class starts with the "off" probability ...
            smoothed.fill_(smoothing / (n_classes - 1))
            # ... and the column of the true class is overwritten with the "on" probability.
            smoothed.scatter_(1, targets.data.unsqueeze(1), 1. - smoothing)
        return smoothed

    def reduce_loss(self, loss):
        """Apply ``self.reduction`` to the per-row losses."""
        if self.reduction == 'mean':
            return loss.mean()
        if self.reduction == 'sum':
            return loss.sum()
        return loss

    def forward(self, inputs, targets):
        """Return the reduced loss for logits ``inputs`` and class indices ``targets``."""
        assert 0 <= self.smoothing < 1

        soft_targets = self.k_one_hot(targets, inputs.size(-1), self.smoothing)
        log_probs = F.log_softmax(inputs, -1)

        if self.weight is not None:
            log_probs = log_probs * self.weight.unsqueeze(0)

        per_row = -(soft_targets * log_probs).sum(dim=-1)
        return self.reduce_loss(per_row)

In [None]:
# Smoke test of the smoothed loss on three rows of 5-class scores.
crit = SmoothCrossEntropyLoss(smoothing=0.5)
predict = torch.FloatTensor([
    [0, 0.2, 0.7, 0.1, 0],
    [0, 0.9, 0.2, 0.2, 1],
    [1, 0.2, 0.7, 0.9, 1],
])
# One class index per row of `predict`.
v = crit(Variable(predict), Variable(torch.LongTensor([2, 1, 0])))
print(v)

In [None]:
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
# Tokenizer and encoder are both the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# NOTE(review): despite the name, this loads plain bert-base-uncased, not a clinical BERT.
Clinical_bert = AutoModel.from_pretrained("bert-base-uncased")
# Use the first GPU when available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
class Decoder(nn.Module): 
    """Transformer decoder whose output logits are dot products with a word-embedding matrix.

    Args:
        nhead, num_layers, d_model: passed to ``nn.TransformerDecoderLayer`` / ``nn.TransformerDecoder``.
        constrained_embedding: if True, the output matrix keeps only the rows of
            ``word_embedding`` selected by ``vocab`` (a copy made by ``index_select``);
            otherwise the full ``word_embedding`` is wrapped as the output matrix.
        d_pos: number of rows of the position-embedding table.
        vocab: 1-D int64 tensor of row indices; required when ``constrained_embedding`` is True.
        sequence_memory: True when ``memory`` is already ``(batch, seq, d_model)``; False when it
            is one vector per sample that must be repeated along the sequence axis.
        pos_embedding, word_embedding: initial weights (default: the BERT embedding tables).

    NOTE(review): ``nn.Parameter(tensor)`` wraps the given tensor rather than copying it, so
    ``pos_embedding`` (and ``word_embedding`` when ``constrained_embedding=False``) appear to
    share storage with the tensors passed in - by default the BERT tables. Confirm that
    training / ``load_state_dict`` modifying those tables is intended.
    """
    def __init__(self, nhead=8, num_layers=6, d_model=768, constrained_embedding=True, d_pos=512, vocab=None, sequence_memory=False,
                pos_embedding = Clinical_bert.embeddings.position_embeddings.weight, word_embedding=Clinical_bert.embeddings.word_embeddings.weight ): 
        super(Decoder, self).__init__() 
        if constrained_embedding and vocab is None:
            # Fail early with a clear message instead of an obscure index_select error.
            raise ValueError("vocab must be provided when constrained_embedding=True")
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        ## transformer decoder stack
        self.transformer =  nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        ## frozen position-embedding table
        self.pos_embedding = nn.Embedding(d_pos, d_model)
        self.pos_embedding.weight = nn.Parameter(pos_embedding)
        self.pos_embedding.weight.requires_grad = False
        ## trainable output projection (word-embedding matrix, optionally restricted to `vocab`)
        if constrained_embedding:
            self.word_embedding = nn.Parameter(torch.index_select(word_embedding,0,vocab))
        else:
            self.word_embedding = nn.Parameter(word_embedding)
        self.word_embedding.requires_grad = True     
        self.sequence_memory = sequence_memory
        
    def forward(self,de_input, memory):
        """Return logits of shape ``(batch, tgt_len, n_output_rows)``.

        Args:
            de_input: already-embedded decoder input, ``(batch, tgt_len, d_model)``.
            memory: encoder output; ``(batch, mem_len, d_model)`` if ``sequence_memory``,
                else ``(batch, d_model)``.

        No ``tgt_mask`` is passed to the transformer, so self-attention is not causally masked.
        NOTE(review): confirm this is intended given the next-token training targets.
        """
        tgt_len = de_input.shape[1]
        # Build positions on the input's own device instead of relying on a global `device`.
        tgt_pos = self.pos_embedding(torch.arange(tgt_len, device=de_input.device))
        de_input = de_input + tgt_pos.unsqueeze(0)  # broadcasts over the batch axis
        if not self.sequence_memory:
            memory = memory.unsqueeze(1).repeat(1, tgt_len, 1)
        # Memory gets positions for its own length, so mem_len may differ from tgt_len.
        if memory.shape[1] == tgt_len:
            mem_pos = tgt_pos
        else:
            mem_pos = self.pos_embedding(torch.arange(memory.shape[1], device=memory.device))
        memory = memory + mem_pos.unsqueeze(0)
        x = self.transformer(de_input, memory)

        # Score every position against each row of the output embedding matrix.
        y = torch.matmul(x, self.word_embedding.t())
        return y

In [None]:
class clinical_embedding(Dataset):
    """Dataset pairing an encoder embedding with its token-id sequence.

    Args:
        text: indexable collection (first axis = samples) of encoder embeddings.
        de_input: indexable collection of integer token-id sequences, one per sample.
        transform, target_transform: stored but not used by this class.

    Each item is a dict with:
        ``memory``   - float32 tensor of ``text[idx]``;
        ``target``   - the id sequence shifted left by one, last position set to 0;
        ``de_input`` - the id sequence embedded with ``Clinical_bert.embeddings.word_embeddings``.

    NOTE(review): the ids are looked up directly in the BERT embedding table; verify that
    callers pass ids in the index space that table expects.
    """
    def __init__(self, text, de_input,transform=None, target_transform=None):
        self.text = text
        self.de_input = de_input
        self.transform = transform
        self.target_transform = target_transform
        
    def __len__(self):
        return self.text.shape[0]

    def __getitem__(self, idx):
        memory = torch.tensor(self.text[idx], dtype=torch.float32)
        token_ids = torch.tensor(self.de_input[idx], dtype=torch.int64)
        # Next-token targets: shift left by one and pad the final slot with 0.
        target = torch.roll(token_ids, -1, dims=0)
        target[-1] = 0
        # The lookup is pure feature extraction: run it without autograd so samples do not
        # carry a graph into the embedding table (which would accumulate unused gradients
        # there on every backward pass and prevent multi-worker loading).
        with torch.no_grad():
            de_input = Clinical_bert.embeddings.word_embeddings(token_ids)
        
        sample = {"memory": memory, "target": target, "de_input": de_input}
       
        return sample


def vec_translate(a, my_dict):
    """Map every element of array ``a`` through ``my_dict`` and return an array of the same shape."""
    lookup = np.vectorize(my_dict.__getitem__)
    return lookup(a)

batch_size = 60
sequential_embedding = True
### load the saved encoder outputs
if sequential_embedding:
    # one 768-d vector per token position, 128 positions per sample
    embeddings = np.load("/kaggle/input/toy-encoding-data/airline_unpooled_1.npy")
    embeddings = embeddings.reshape(-1, 128, 768)
else:
    # one 768-d vector per sample
    embeddings = np.load("/kaggle/input/toy-encoding-data/airline_pooled_1.npy")
    embeddings = embeddings.reshape(-1, 768)
tgt = np.loadtxt("/kaggle/input/toy-encoding-data/toy_input_ids.txt").astype(int)###load the subset
tgt = tgt[:embeddings.shape[0],:]  # keep only the rows that have an embedding

### compact vocabulary: the sorted unique token ids occurring in tgt
vocab = np.unique(tgt)
print(vocab.shape)
idx_2_token = dict(enumerate(vocab))                              # compact index -> original token id
token_2_idx = {token: idx for idx, token in idx_2_token.items()}  # original token id -> compact index
# Convert straight to int64 (no float32 detour, which is only exact up to 2**24).
vocab = torch.tensor(vocab, dtype=torch.int64)
new_tgt= vec_translate(tgt, token_2_idx)

X_tr_mem, X_test_mem, y_tr_dein, y_test_dein = train_test_split(embeddings, new_tgt, test_size=0.1, random_state=2022)

training_data = clinical_embedding(X_tr_mem, y_tr_dein)
test_data = clinical_embedding(X_test_mem, y_test_dein)
train_dataloader = DataLoader(training_data, batch_size, shuffle=True)
# Evaluation data is not shuffled: eval averages per-batch losses, so a fixed batch
# composition keeps the validation loss (and early stopping) reproducible.
test_dataloader = DataLoader(test_data, batch_size, shuffle=False)

In [None]:
### constrained translation: which second word-piece may follow which first word-piece
dataset = "/kaggle/input/toy-encoding-data/toy_airline.txt"
df = pd.read_csv(dataset, sep='\t', header=None)
data = df[0].to_list()
# Space-separated words of the corpus with surrounding parentheses removed.
vocab_words = set([w.strip(")").strip("(") for x in data for w in re.split(' ',x) ])
# Sorted so the result does not depend on the (unstable) iteration order of a set.
vocab_IDs = tokenizer(sorted(vocab_words), truncation=True, max_length=128)["input_ids"]
# Drop the first and last id of every encoding (the tokenizer's special tokens).
vocab_IDs = [ids[1:-1] for ids in vocab_IDs]
single_IDs = [ids[0] for ids in vocab_IDs if len(ids)==1]
translation_constraint = defaultdict(set)
# Every word is processed (previously one arbitrary word was skipped), and the loop
# variable no longer shadows the builtin `id` at notebook scope.
for ids in vocab_IDs:
    if len(ids) > 1:
        translation_constraint[ids[0]].add(ids[1])
#print(translation_constraint)        

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader`` and return the mean batch loss.

    Each batch is a dict with keys ``"memory"``, ``"de_input"`` and ``"target"``. Gradients
    are clipped to a total norm of 1.0 before each optimizer step, and the scheduler is
    stepped once per batch. ``n_examples`` is accepted but not used.
    """
    model = model.train()
    batch_losses = []
    for batch in data_loader:
        memory, de_input, targets = (
            batch[key].to(device) for key in ("memory", "de_input", "target")
        )
        logits = model(de_input, memory)

        # Collapse batch and sequence axes so the loss sees one row per token.
        flat_logits = logits.view(-1, logits.shape[-1])
        loss = loss_fn(flat_logits, torch.flatten(targets))
        batch_losses.append(loss.item())

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return np.mean(batch_losses)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Return the mean batch loss of ``model`` over ``data_loader`` without updating it.

    The model is put in eval mode and run with gradients disabled. Batches are dicts with
    keys ``"memory"``, ``"de_input"`` and ``"target"``. ``n_examples`` is accepted but not used.
    """
    model = model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in data_loader:
            memory, de_input, targets = (
                batch[key].to(device) for key in ("memory", "de_input", "target")
            )
            logits = model(de_input, memory)
            # One row per token, matching the flattened targets.
            flat_logits = logits.view(-1, logits.shape[-1])
            batch_losses.append(loss_fn(flat_logits, torch.flatten(targets)).item())
    return np.mean(batch_losses)

In [None]:
# Decoder with the unrestricted output matrix, restored from the pretraining checkpoint.
pre_trained = Decoder(num_layers=6, constrained_embedding=False,sequence_memory=True)
# map_location="cpu": the model is still on CPU here, and a checkpoint saved from a GPU
# would otherwise fail to load on a CPU-only machine.
pre_trained.load_state_dict(torch.load('/kaggle/input/toy-encoding-data/pretrained_decoder_state.bin', map_location="cpu"))
pretrained_dict = pre_trained.state_dict()

In [None]:
EPOCHS = 20
# Decoder restricted to the compact vocabulary; its output matrix is initialised from the
# matching rows of the pretrained decoder's word embedding.
model = Decoder(vocab=vocab, num_layers=6, sequence_memory=sequential_embedding, word_embedding=pre_trained.word_embedding)
# Alternative initialisation (disabled): copy every pretrained weight except word_embedding.
#model_dict = model.state_dict()
#filtered_dict = {k: v for k, v in pretrained_dict.items() if k!="word_embedding"}
#model_dict.update(filtered_dict)
#model.load_state_dict(model_dict)
# map_location="cpu": load onto CPU first so a GPU-saved checkpoint also works on a
# CPU-only machine; the model is moved to `device` right after.
model.load_state_dict(torch.load('/kaggle/input/toy-encoding-data/finetuned_decoder_state_X.bin', map_location="cpu"))
model = model.to(device)
optimizer = AdamW(model.parameters(), betas=(0.9, 0.98), eps=1e-9, lr=0.0001)
# The scheduler is stepped once per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=300,
  num_training_steps=total_steps
)
#loss_fn = nn.CrossEntropyLoss(size_average=False, ignore_index=0)
loss_fn = SmoothCrossEntropyLoss(smoothing=0.5)

In [None]:
# Fine-tuning loop with early stopping: checkpoint on every new best validation loss and
# stop after more than two consecutive epochs without improvement.
history = defaultdict(list)
best_ce = float("inf")
patience = 0
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_loss = train_epoch(model, train_dataloader, loss_fn, optimizer, device, scheduler, len(X_tr_mem))
    print(f'Train loss {train_loss} ')

    val_loss = eval_model(model, test_dataloader, loss_fn, device, len(X_test_mem))
    print(f'Val   loss {val_loss} ')

    history['train_loss'].append(train_loss)
    history['val_loss'].append(val_loss)

    improved = val_loss < best_ce
    if not improved:
        patience += 1
        if patience > 2:
            break
        continue

    torch.save(model.state_dict(), 'finetuned_decoder_state.bin')
    best_ce = val_loss
    patience = 0

In [None]:
def beam_search(model,tgt, mem, max_len, idx_2_token,beam_size=8):
    """Grow a set of candidate token sequences position by position.

    Args:
        model: decoder called as ``model(de_input, memory)``.
        tgt: 1-D id sequence; only its first element is kept as the start token, and the
            whole sequence is decoded (through ``idx_2_token``) as the ground truth.
        mem: encoder output for this sample (a batch axis is added here).
        max_len: number of positions to extend; position ``i + 1`` is written at step ``i``.
        idx_2_token: maps a model output index to a tokenizer id.
        beam_size: number of top-scoring indices considered per hypothesis and the maximum
            number of hypotheses carried to the next step.

    Returns:
        ``(ground_truth, all_translations)`` - the decoded reference and the decoded hypotheses.

    Notes:
        * When more than ``beam_size`` candidates exist, survivors are drawn at random
          (``np.random.choice``, with replacement), not by score.
        * Candidate weights come from ``F.gumbel_softmax``, so results are stochastic.
        * ``str.strip`` with a string argument removes any of those characters from both
          ends; NOTE(review): confirm that is what is wanted for "[SEP]" / " [PAD] ".
    """
    model.eval()
    tgt = np.expand_dims(tgt, 0)
    mem = np.expand_dims(mem,0)
    hypothesis = torch.Tensor(tgt).to(torch.int64)
    hypothesis[:,1:] = 0
    hypotheses = [hypothesis] ###initialize hypotheses
    probabs = [1] ###initialize probabs
    mem = torch.Tensor(mem).to(torch.float32)
    mem_on_device = mem.to(device)  # loop-invariant, so moved once
    ### BOS: 101 ; EOS: 102 ; padding 0 ; "."" : 119 ; "!":106 ; ",":117
    # Inference only: no autograd graph is needed for any of the forward passes.
    with torch.no_grad():
        for i in range(max_len):
            next_hypotheses = []
            next_prob = []
            ### extend every current hypothesis
            for hypo, prob in zip(hypotheses, probabs):
                de_input = Clinical_bert.embeddings.word_embeddings(hypo)
                outputs = model(de_input.to(device), mem_on_device)

                top = torch.topk(outputs[:,i,:], beam_size)  # single top-k for indices and scores
                tokens = top.indices
                values = F.gumbel_softmax(top.values, tau= 5,dim=-1)
                for j in range(beam_size):
                    idx = tokens[0][j].item()
                    value = values[0][j].item()
                    token = idx_2_token[idx] ### project the idx to original token

                    # Accept only ids above 200 that do not repeat the previous/current position.
                    if (token>200) and (token not in hypo[:,i-1:i+1]):
                        hypo[:,i+1] = token
                        next_hypotheses.append(hypo.detach().clone())
                        next_prob.append(prob*value)

            if len(next_hypotheses)>beam_size:
                #top_idx = np.asarray(next_prob).argsort()[-beam_size:]
                top_idx = np.random.choice(len(next_prob), beam_size)
                keep = set(top_idx.tolist())
                hypotheses = [x for k, x in enumerate(next_hypotheses) if k in keep]
                probabs = [x for k, x in enumerate(next_prob) if k in keep]
            else:
                hypotheses = next_hypotheses
                probabs = next_prob

    ground_truth = tokenizer.decode(np.trim_zeros(np.squeeze(vec_translate(tgt, idx_2_token)))).strip("[SEP]")
    all_translations = []
    for hypo in hypotheses:
        decoded = tokenizer.decode(np.squeeze(hypo)).strip(" [PAD] ")
        all_translations.append(decoded)
    
    return (ground_truth, all_translations)

In [None]:
import string
# All ASCII punctuation characters plus "[PAD]", separated by spaces.
exclude = " ".join(list(string.punctuation)+["[PAD]"])
# Ids of those tokens; the generation functions below reject any id found here.
# NOTE(review): presumably also contains the tokenizer's special-token ids added by encode() - confirm.
filter_set = tokenizer.encode(exclude)

In [None]:
def inference(model,tgt, mem, idx_2_token, prefix=0, max_len=10, beam_size=20):
    """Decode, for each of the first ``max_len`` positions, the top-scoring tokens as keywords.

    A single forward pass is run with only the first token of ``tgt`` as decoder input. For
    every position ``pos`` the ``beam_size`` best output indices are mapped through
    ``idx_2_token``; accepted tokens (id > 200 and not in ``filter_set``) are written to slot
    ``j + 1`` of a fresh all-zero sequence, where ``j`` is the token's rank.

    Args:
        model: decoder called as ``model(de_input, memory)``.
        tgt: 1-D id sequence (also decoded as the ground truth, first element dropped).
        mem: encoder output for this sample (a batch axis is added here).
        idx_2_token: maps a model output index to a tokenizer id.
        prefix: accepted but not used.
        max_len: number of positions inspected.
        beam_size: number of candidates per position; ``beam_size + 1`` must not exceed
            the length of ``tgt``.

    Returns:
        ``(ground_truth, all_translations)`` with one decoded string per position.
    """
    model.eval()
    tgt = np.expand_dims(tgt, 0)
    mem = np.expand_dims(mem,0)
    hypothesis = torch.Tensor(tgt).to(torch.int64)
    hypothesis[:,1:] = 0
    hypotheses = []
    mem = torch.Tensor(mem).to(torch.float32)
    ### BOS: 101 ; EOS: 102 ; padding 0

    # Inference only: run the forward pass without building an autograd graph.
    with torch.no_grad():
        de_input = Clinical_bert.embeddings.word_embeddings(hypothesis)
        outputs = model(de_input.to(device),
                        mem.to(device))
    for pos in range(max_len):
        tokens = torch.topk(outputs[:,pos,:], beam_size).indices
        hypothesis = torch.zeros(tgt.shape, dtype=torch.int64)
        for j in range(beam_size):
            idx = tokens[0][j].item()
            token = idx_2_token[idx]
            
            if (token>200) and (token not in filter_set):
                hypothesis[:,j+1] = token 
        hypotheses.append(hypothesis)       
    
    ground_truth = tokenizer.decode(np.trim_zeros(np.squeeze(vec_translate(tgt, idx_2_token))[1:])).strip("[SEP]")
    all_translations = []
    for hypo in hypotheses:
        decoded = tokenizer.decode(np.squeeze(hypo)[1:]).strip(" [PAD] ")
        all_translations.append(decoded)
    return (ground_truth, all_translations)

In [None]:
#res=inference(model, y_test_dein[0], X_test_mem[0], idx_2_token, beam_size=20)
#print(res[0])
#print("-"*10 + "keywords" + "-"*10)
#for i in range(10):
#    print(res[1][i])

In [None]:
import random
def translate(model,tgt, mem, max_len, idx_2_token, beam_size=5, prefix=3, temp=2.5):
    """Sample one token sequence for a sample, one position at a time.

    Args:
        model: decoder called as ``model(de_input, memory)``.
        tgt: 1-D id sequence; its first ``prefix + 1`` ids seed the hypothesis and the whole
            sequence (mapped through ``idx_2_token``) is decoded as the ground truth.
        mem: encoder output for this sample (a batch axis is added here).
        max_len: generation stops once position ``max_len`` has been written.
        idx_2_token: maps a model output index to a tokenizer id.
        beam_size: number of top-scoring candidates sampled from at every position.
        prefix: number of positions after the first one that are kept from ``tgt``.
        temp: temperature ``tau`` passed to ``F.gumbel_softmax``.

    Returns:
        ``(ground_truth, decoded)`` strings.

    At each position up to ``beam_size`` draws are made from the candidates; the first draw
    with id > 200 that is not already in the hypothesis and not in ``filter_set`` is written.
    Generation stops early if the last draw was id 102 or 0. Output is stochastic.
    """
    model.eval()
    tgt = np.expand_dims(tgt, 0)
    mem = np.expand_dims(mem,0)
    hypothesis = torch.Tensor(tgt).to(torch.int64)
    hypothesis[:,prefix+1:] = 0
    mem = torch.Tensor(mem).to(torch.float32)
    mem_on_device = mem.to(device)  # loop-invariant, so moved once
    ### BOS: 101 ; EOS: 102 ; padding 0
    # Inference only: no autograd graph is needed for the forward passes.
    with torch.no_grad():
        for i in range(prefix, max_len):
            de_input = Clinical_bert.embeddings.word_embeddings(hypothesis)
            outputs = model(de_input.to(device), mem_on_device)
            top = torch.topk(outputs[:,i,:], beam_size)  # single top-k for indices and scores
            tokens = top.indices
            values = F.gumbel_softmax(top.values, tau= temp,dim=-1)
            # Row 0 is the only batch row; indexing keeps the vector 1-D even for beam_size == 1.
            values_ = values[0].cpu().numpy()
            for j in range(beam_size):
                k = np.random.choice(beam_size,1, p=values_)
                idx = tokens[0][k].item()
                token = idx_2_token[idx]

                if (token>200) and (token not in hypothesis[:,:i+1])and (token not in filter_set):
                    hypothesis[:,i+1] = token
                    break
            if token == 102 or token == 0:
                break
    # The seeded positions still hold model-side indices; convert them to tokenizer ids.
    for i in range(prefix+1):
        hypothesis[:,i] = idx_2_token[hypothesis[:,i].item()]
    ground_truth = tokenizer.decode(np.trim_zeros(np.squeeze(vec_translate(tgt, idx_2_token))[1:])).strip("[SEP]")
    decoded = tokenizer.decode(torch.squeeze(hypothesis)[1:]).strip(" [PAD] ")
    return (ground_truth, decoded)

In [None]:
### transferability: apply the decoder to embeddings of a different (Twitter airline review) corpus
twt = pd.read_csv('/kaggle/input/toy-encoding-data/twt_airline_reviews.tsv', sep='\t', header=0)
# NOTE(review): the file is named "pooled" but is reshaped to 128 vectors of 768 per sample - confirm.
twt_embedding  = np.load("/kaggle/input/toy-encoding-data/pooled_1.npy")
twt_embedding = twt_embedding.reshape(-1, 128, 768)
label = twt["Airport/City"]
# Airport code -> city name; used later to find location mentions in the reviews.
code = {"ATL": "Atlanta",
        "DFW": "Dallas",
        "DEN": "Denver",
        "ORD": "Chicago",
        "LAX": "Los Angeles",
        "CLT": "Charlotte",
        "LAS": "Las Vegas",
        "PHX": "Phoenix",
        "MCO": "Orlando",
        "SEA": "Seattle",
        "MIA": "Miami",
        "IAH": "Houston",
        "JFK": "New York",
        "FLL": "Fort Lauderdale",
        "EWR": "Newark",
        "SFO": "San Francisco",
        "MSP": "Minneapolis",
        "DTW": "Detroit",
        "BOS": "Boston",
        "SLC": "Salt Lake City"
       }

In [None]:
def twt_translate(model, mem, max_len, idx_2_token, beam_size=5, prefix=3, temp=2.5):
    """Sample one token sequence for an embedding that has no reference sequence.

    Same sampling procedure as ``translate`` but the hypothesis starts as 128 zeros, so
    there is no ground truth to return.

    Args:
        model: decoder called as ``model(de_input, memory)``.
        mem: encoder output for this sample (a batch axis is added here).
        max_len: generation stops once position ``max_len`` has been written.
        idx_2_token: maps a model output index to a tokenizer id.
        beam_size: number of top-scoring candidates sampled from at every position.
        prefix: first position to generate from; positions ``0..prefix`` are finally
            mapped through ``idx_2_token``.
        temp: temperature ``tau`` passed to ``F.gumbel_softmax``.

    Returns:
        The decoded string (first position dropped). Output is stochastic.
    """
    model.eval()
    mem = np.expand_dims(mem,0)
    # All-zero start sequence of length 128, built directly as int64.
    hypothesis = torch.zeros((1, 128), dtype=torch.int64)
    mem = torch.Tensor(mem).to(torch.float32)
    mem_on_device = mem.to(device)  # loop-invariant, so moved once
    ### BOS: 101 ; EOS: 102 ; padding 0
    # Inference only: no autograd graph is needed for the forward passes.
    with torch.no_grad():
        for i in range(prefix, max_len):
            de_input = Clinical_bert.embeddings.word_embeddings(hypothesis)
            outputs = model(de_input.to(device), mem_on_device)
            top = torch.topk(outputs[:,i,:], beam_size)  # single top-k for indices and scores
            tokens = top.indices
            values = F.gumbel_softmax(top.values, tau= temp,dim=-1)
            # Row 0 is the only batch row; indexing keeps the vector 1-D even for beam_size == 1.
            values_ = values[0].cpu().numpy()
            for j in range(beam_size):
                k = np.random.choice(beam_size,1, p=values_)
                idx = tokens[0][k].item()
                token = idx_2_token[idx]

                if (token>200) and (token not in hypothesis[:,:i+1])and (token not in filter_set):
                    hypothesis[:,i+1] = token
                    break
            if token == 102 or token == 0:
                break
    # Convert the leading positions from model-side indices to tokenizer ids.
    for i in range(prefix+1):
        hypothesis[:,i] = idx_2_token[hypothesis[:,i].item()]
    decoded = tokenizer.decode(torch.squeeze(hypothesis)[1:]).strip(" [PAD] ")
    
    return decoded

In [None]:
# For the first 100 Twitter reviews, draw 10 sampled translations each and keep the
# original review text alongside them.
Ground_truth = []
Prediction = []
for i in range(100):
    Res = [
        twt_translate(model, twt_embedding[i], 15, idx_2_token, prefix=0 , beam_size=10, temp=2)
        for _ in range(10)
    ]
    print("-"*10 + str(i) + "-"*10)
    Prediction.append(Res)
    Ground_truth.append(twt["Reviews"].iloc[i])
    #for k in range(10):
        #print(Res[k])
    

In [None]:
###define semantic similarity
# Universal Sentence Encoder (large, v4) from TF-Hub; downloaded on first use.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/4"  
embed = hub.load(module_url)
def semantic_sim(messages):
    """Mean inner product between the sentence embeddings of each pair in ``messages``.

    Each element of ``messages`` is a pair of sentences; both are embedded with the
    module-level ``embed`` and the inner product of the two vectors is the pair's score.
    """
    pair_scores = []
    for pair in messages:
        vectors = embed(pair)["outputs"]
        first, second = vectors[0].numpy(), vectors[1].numpy()
        pair_scores.append(np.inner(first, second))
    return np.mean(pair_scores)
        
###define Bleu score
def bleu(candidates, references):
    """Corpus BLEU using unigrams and bigrams only, weighted equally.

    ``candidates`` and ``references`` are forwarded unchanged to ``bleu_score``.
    """
    equal_weights = [0.5, 0.5]
    return bleu_score(candidates, references, max_n=2, weights=equal_weights)

###define sentiment similarity

def sentiment_sim(messages):
    """Mean cosine similarity between the (neg, neu, pos) sentiment vectors of each pair.

    Args:
        messages: iterable of pairs; ``message[0]`` and ``message[1]`` are scored with the
            module-level ``sia.polarity_scores``.

    Returns:
        The mean similarity over all pairs (NaN for an empty ``messages``).

    A pair in which either text has an all-zero sentiment vector (e.g. an empty string)
    contributes 0.0; dividing by the zero norm would otherwise yield NaN and turn the
    whole mean into NaN.
    """
    Scores = []
    for message in messages:
        score1 = sia.polarity_scores(message[0])
        score2 = sia.polarity_scores(message[1])
        a = np.array([score1["neg"], score1["neu"], score1["pos"]])
        b = np.array([score2["neg"], score2["neu"], score2["pos"]])
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        Scores.append(np.inner(a, b) / denom if denom > 0 else 0.0)
    return np.mean(Scores)

###top_k prediction
from collections import Counter
import nltk
from nltk.corpus import stopwords
# English stop words; top_k() drops them before ranking words.
stops = set(stopwords.words('english'))
def top_k(prediction):
    """Rank the words of each sample's predictions and keep the top 5/10/20/50 percent.

    Args:
        prediction: list with one entry per sample; each entry is a list of predicted
            sentences (strings).

    Returns:
        A ``defaultdict(list)`` keyed by percentile (5, 10, 20, 50). Each value holds, per
        sample, the most frequent non-stop-words: the first ``int(p * n / 100) + 1`` of the
        ``n`` ranked words, so at least one word is kept whenever any exist.
    """
    percentiles = [5, 10, 20, 50]
    top_words = defaultdict(list)

    for sentences in prediction:
        tokens = [word for sentence in sentences for word in sentence.split()]
        # most_common() sorts by count, descending; ties keep first-seen order.
        ranked = [word for word, _ in Counter(tokens).most_common() if word not in stops]
        for pct in percentiles:
            cutoff = int(pct*len(ranked)/100)+1
            top_words[pct].append(ranked[:cutoff])

    return top_words

###top-k accuracy
def top_k_accuracy(top, multi):
    """Fraction of samples whose top words contain at least one label word.

    Args:
        top: mapping from percentile to a per-sample list of top words (as built by ``top_k``).
        multi: per-sample label strings; each is split on whitespace into label words.

    Returns:
        Dict mapping each key of ``top`` to its hit rate.
    """
    acc = {}
    for k, word_lists in top.items():
        hits = [
            any(c in t for c in set(m.split()))
            for t, m in zip(word_lists, multi)
        ]
        acc[k] = sum(hits)/len(hits)
    return acc

from geopy.geocoders import Nominatim
from functools import partial 
# Nominatim (OpenStreetMap) geocoder; `geocode` is pre-bound to return English names.
geolocator = Nominatim(user_agent = "geoapiExercises")
geocode = partial(geolocator.geocode, language="en")
#location = geocode("london")
#print("Country Name: ", location)

def has_numbers(inputString):
    """Return True if any character of ``inputString`` is a digit."""
    return any(char.isdigit() for char in inputString)

def expand_keyword(label):
    """Expand a place name into the components of its geocoded display name.

    Args:
        label: place name passed to the module-level ``geocode``.

    Returns:
        The comma-separated parts of the result's ``display_name`` that contain no digits
        (this drops postcodes and similar), or an empty list when nothing was found.
    """
    location = geocode(label)
    if location is None:
        # The geocoder returns None for an unknown place; treat it as "no extra keywords".
        return []
    parts = location.raw["display_name"].split(", ")
    return [part for part in parts if not has_numbers(part)]


In [None]:
#label = twt["Airport/City"].str.lower().tolist()[:10]
# Build one keyword string per review from the airports / cities it mentions.
label = []
# Airport code -> expanded keyword list, so each city is geocoded at most once
# (the lookup is a network request).
expanded_keywords = {}
for t in Ground_truth:
    text = t.lower()  # keywords are lower-case, so match case-insensitively
    ans = []
    for c, city_name in code.items():
        c_lower, city_lower = c.lower(), city_name.lower()
        # NOTE(review): substring match - short codes such as "las" or "sea" can match inside other words.
        if c_lower in text or city_lower in text:
            if c not in expanded_keywords:
                expanded_keywords[c] = [c_lower, city_lower] + expand_keyword(city_lower) ###expand the keyword set
            ans.extend(expanded_keywords[c])
    label.append(" ".join(ans).lower())

In [None]:
# Rank the predicted words per review, then score them against the location keywords in `label`.
res = top_k(Prediction)
top_k_accuracy(res, label)

In [None]:
# Class index -> city name, in the order used by the saved labels.
city = ["hong kong", "london", "toronto", "paris", "rome", "sydney", "dubai", "bangkok", "singapore", "frankfurt"]
# Load the integer class labels and replace each one with its city name.
int_label = [
    city[int(class_id)]
    for class_id in np.load("/kaggle/input/toy-encoding-data/test_y.npy")
]

In [None]:
# For the first 100 held-out samples, draw 10 sampled translations each and keep the
# decoded reference sentence alongside them.
Ground_truth = []
Prediction = []
for i in range(100):
    Res = []
    for _ in range(10):
        ground_truth, decoded = translate(model, y_test_dein[i], X_test_mem[i], 15, idx_2_token, prefix=0 , beam_size=10, temp=4)
        Res.append(decoded)
    print("-"*10 + str(i) + "-"*10)
    # The reference is the same for all 10 draws; the last one is stored.
    Ground_truth.append(ground_truth)
    Prediction.append(Res)
    #print("-"*21)
    #for k in range(10):
        #print(Res[k])



In [None]:
# One label string per reference sentence: the city names that occur in it, space-joined.
multi_label = [
    " ".join([c for c in city if c in t]).lower()
    for t in Ground_truth
]

In [None]:
# Rank the predicted words per sample, then score them against the city names in `multi_label`.
res = top_k(Prediction)
top_k_accuracy(res, multi_label)

In [None]:
# `Translations` was never defined in this notebook, so on a fresh kernel this cell raised
# NameError. Build it here: one [reference, sampled translation] pair per sampled sentence.
Translations = [
    [reference, sampled]
    for reference, samples in zip(Ground_truth, Prediction)
    for sampled in samples
]
###BLEU score
candidates = [x[1].split() for x in Translations]
references = [[x[0].split()] for x in Translations]
bleu_ = bleu(candidates, references)
print("BLEU score is:",bleu_ )
###Semantic similarity
sem_sim = semantic_sim(Translations)
print("Semantic Similarity is:", sem_sim)
##Sentiment similarity
sen_sim = sentiment_sim(Translations)
print("Sentiment Similarity is:", sen_sim)