In [1]:
# Install the libraries this notebook needs (no versions are pinned, so pip
# resolves whatever release is current at run time).
!pip install transformers
!pip install NLTK
import nltk
# Download the VADER lexicon; the SentimentIntensityAnalyzer created in the
# import cell is built on it.
nltk.download('vader_lexicon')

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m76.7 MB/s[0m eta [36m0:00:

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [2]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.modules.loss import _WeightedLoss
from torchtext.data.metrics import bleu_score
import tensorflow as tf
import tensorflow_hub as hub
from transformers import AutoTokenizer, AutoModel
from transformers import AdamW, get_linear_schedule_with_warmup
from collections import defaultdict
import math
import re
import csv
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
import os

In [28]:
def prepare_input(data, model, tokenizer, batch_size=64, max_length=128, device=None):
    """Tokenize ``data`` and run it through ``model`` to collect hidden states.

    Args:
        data: list of input strings.
        model: encoder invoked as
            ``model(input_ids, token_type_ids=..., attention_mask=...)``.
            Element ``[2]`` of its output must be the sequence of per-layer
            hidden states (for Hugging Face models this needs
            ``output_hidden_states=True``); the last entry is kept.
        tokenizer: callable returning a mapping with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` lists.
        batch_size: number of rows tokenized / encoded per step.
        max_length: every row is truncated or padded to this many tokens.
        device: where to run the model. When omitted, CUDA is used if it is
            available and the CPU otherwise, so the function no longer relies
            on a notebook-level ``device`` variable being defined first.

    Returns:
        ``(res, input_ids)``: ``res`` is a list with one numpy array of hidden
        states per batch, ``input_ids`` the token-id list of every row.

    Side effects:
        Moves ``model`` to ``device`` and switches it to eval mode.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    input_ids, token_type_ids, attention_mask = [], [], []
    print("tokenization starts")

    for start in range(0, len(data), batch_size):
        batch_tokens = tokenizer(data[start:start + batch_size], truncation=True,
                                 padding='max_length', max_length=max_length)
        input_ids.extend(batch_tokens["input_ids"])
        token_type_ids.extend(batch_tokens["token_type_ids"])
        attention_mask.extend(batch_tokens["attention_mask"])

    print("tokenization done")

    dataset = TensorDataset(
        torch.tensor(input_ids, dtype=torch.int64),
        torch.tensor(token_type_ids, dtype=torch.int64),
        torch.tensor(attention_mask, dtype=torch.int64),
    )
    # No shuffling: the i-th output row must stay aligned with the i-th input.
    dataloader = DataLoader(dataset, batch_size=batch_size)

    model.to(device)
    model.eval()

    res = []
    with torch.no_grad():
        for b_input_ids, b_token_type, b_input_mask in dataloader:
            outputs = model(b_input_ids.to(device),
                            token_type_ids=b_token_type.to(device),
                            attention_mask=b_input_mask.to(device))
            # outputs[2] holds the per-layer hidden states; keep the last layer.
            hidden_states = outputs[2][-1]
            res.append(hidden_states.cpu().numpy())

    return res, input_ids


**Data Preprocessing**

In [4]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Pretrained BERT tokenizer and encoder. output_hidden_states=True makes the
# model return its per-layer hidden states, which prepare_input reads.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
model.to(device)
df = pd.read_csv("mt_subset_spacy.csv") # this is a sample provided for you
data = df["text"].to_list()

# ans: list of per-batch hidden-state arrays; input_ids: token ids of every row.
ans, input_ids = prepare_input(data, model, tokenizer)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenization starts
tokenization done


In [5]:
np.save("embeddings.npy", np.concatenate(ans, axis=0))  # it's good to cache the embeddings so the next run can skip the encoder pass

In [6]:
# Reload the cached embeddings. Note: this rebinds `data`, which held the list
# of input texts until now, to the embedding array.
data = np.load("embeddings.npy")
data.shape

(2404, 128, 768)

**Start Experiment**

In [7]:
class SmoothCrossEntropyLoss(_WeightedLoss):
    """Cross-entropy computed against label-smoothed one-hot targets.

    The true class receives probability ``1 - smoothing``; the remaining
    ``smoothing`` mass is spread evenly over the other classes.

    Args:
        weight: optional per-class tensor multiplied into the log-probabilities.
        reduction: ``'mean'``, ``'sum'``, or anything else for no reduction.
        smoothing: smoothing factor; ``forward`` asserts ``0 <= smoothing < 1``.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    def k_one_hot(self, targets:torch.Tensor, n_classes:int, smoothing=0.0):
        """Return a ``(len(targets), n_classes)`` matrix of smoothed one-hot rows."""
        off_value = smoothing / (n_classes - 1)
        with torch.no_grad():
            smoothed = torch.full((targets.size(0), n_classes), off_value,
                                  device=targets.device)
            # Overwrite the true-class column of every row.
            smoothed.scatter_(1, targets.data.unsqueeze(1), 1. - smoothing)
        return smoothed

    def reduce_loss(self, loss):
        """Apply ``self.reduction`` to the per-sample losses."""
        if self.reduction == 'mean':
            return loss.mean()
        if self.reduction == 'sum':
            return loss.sum()
        return loss

    def forward(self, inputs, targets):
        """Compute the loss for logits ``inputs`` and class-index ``targets``."""
        assert 0 <= self.smoothing < 1

        soft_targets = self.k_one_hot(targets, inputs.size(-1), self.smoothing)
        log_probs = F.log_softmax(inputs, -1)

        if self.weight is not None:
            log_probs = log_probs * self.weight.unsqueeze(0)

        per_sample = -(soft_targets * log_probs).sum(dim=-1)
        return self.reduce_loss(per_sample)

In [8]:
# Sanity-check the loss on a tiny hand-written batch (3 samples, 5 classes).
# Plain tensors are passed directly: torch.autograd.Variable is deprecated and
# tensors no longer need wrapping to work with autograd.
crit = SmoothCrossEntropyLoss(smoothing=0.5)
predict = torch.tensor([[0, 0.2, 0.7, 0.1, 0],
                        [0, 0.9, 0.2, 0.2, 1],
                        [1, 0.2, 0.7, 0.9, 1]], dtype=torch.float32)
v = crit(predict, torch.tensor([2, 1, 0], dtype=torch.int64))
print(v)

tensor(1.5161)


In [9]:
#The decoder model architecture
# Move BERT back to the CPU: prepare_input left it on `device`, while the code
# below indexes its embedding tables with CPU tensors (the vocabulary tensor and
# the token ids embedded inside the Dataset).
model.to("cpu")
class Decoder(nn.Module):
    """Transformer decoder that turns encoder embeddings into token logits.

    Args:
        nhead: attention heads per decoder layer.
        num_layers: number of stacked decoder layers.
        d_model: embedding width.
        constrained_embedding: if True the output projection only covers the
            rows of ``word_embedding`` selected by ``vocab``; otherwise the
            whole table is used.
        d_pos: size of the positional-embedding table that is first allocated
            (its weights are then replaced by ``pos_embedding``).
        vocab: int64 tensor of row indices into ``word_embedding``; required
            when ``constrained_embedding`` is True.
        sequence_memory: True when ``memory`` already has a sequence axis
            ``(batch, seq, d_model)``; False when it is one vector per sample
            ``(batch, d_model)`` that must be repeated along the sequence.
        pos_embedding: positional-embedding weights (kept frozen). Defaults to
            the position embeddings of the notebook-level BERT ``model``.
        word_embedding: word-embedding weights used (trainable) as the output
            projection. Defaults to the word embeddings of the notebook-level
            BERT ``model``.

    Both embedding defaults are resolved when the decoder is constructed and
    are copied, so the decoder never shares storage with (or trains) the
    encoder's own tables.
    """

    def __init__(self, nhead=8, num_layers=6, d_model=768, constrained_embedding=True, d_pos=512, vocab=None, sequence_memory=False,
                pos_embedding=None, word_embedding=None):
        super(Decoder, self).__init__()
        if pos_embedding is None:
            pos_embedding = model.embeddings.position_embeddings.weight
        if word_embedding is None:
            word_embedding = model.embeddings.word_embeddings.weight
        if constrained_embedding and vocab is None:
            raise ValueError("vocab must be provided when constrained_embedding=True")

        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        ##initialize transformer decoder
        self.transformer = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        ##initialize position encoding (frozen copy of the given weights)
        self.pos_embedding = nn.Embedding(d_pos, d_model)
        self.pos_embedding.weight = nn.Parameter(pos_embedding.detach().clone(), requires_grad=False)
        ##initialize projection layer (trainable copy of the word embeddings)
        word_embedding = word_embedding.detach()
        if constrained_embedding:
            rows = torch.index_select(word_embedding, 0, vocab.to(word_embedding.device))
        else:
            rows = word_embedding.clone()
        self.word_embedding = nn.Parameter(rows, requires_grad=True)
        self.sequence_memory = sequence_memory

    def forward(self, de_input, memory):
        """Return logits of shape ``(batch, seq, n_output_tokens)``.

        Args:
            de_input: embedded decoder input, ``(batch, seq, d_model)``.
            memory: encoder output, ``(batch, mem_seq, d_model)`` when
                ``sequence_memory`` is True, else ``(batch, d_model)``.
        """
        seq_len = de_input.shape[1]
        # Build the position ids on the input's own device instead of relying
        # on a notebook-level `device` variable.
        tgt_pos = self.pos_embedding(torch.arange(seq_len, device=de_input.device))
        de_input = de_input + tgt_pos.unsqueeze(0)  ###add positional encoding to decoder input
        if not self.sequence_memory:
            memory = memory.unsqueeze(1).repeat(1, seq_len, 1)
        if memory.shape[1] == seq_len:
            mem_pos = tgt_pos
        else:
            # Memory and target may differ in length when memory is a sequence.
            mem_pos = self.pos_embedding(torch.arange(memory.shape[1], device=memory.device))
        memory = memory + mem_pos.unsqueeze(0)  ###add positional encoding to memory
        # NOTE(review): no tgt_mask is passed, so every decoder position can
        # attend to all other decoder-input positions — confirm this is intended.
        x = self.transformer(de_input, memory)

        # Tied output projection: similarity of each state with every embedding row.
        y = torch.matmul(x, self.word_embedding.t())
        return y

In [10]:
class clinical_embedding(Dataset):
    """Dataset pairing an encoder embedding with its token sequence.

    Args:
        text: array of encoder embeddings, indexed along axis 0.
        de_input: array of token-id sequences aligned with ``text``.
        transform, target_transform: stored but not applied by this class.
        embedding: module used to embed the token ids. When omitted, the word
            embeddings of the notebook-level BERT ``model`` are looked up at
            access time.

    Each item is a dict with
        ``memory``   — the embedding as a float32 tensor,
        ``target``   — the ids shifted left by one, last position set to 0,
        ``de_input`` — the embedded (unshifted) ids.
    """

    def __init__(self, text, de_input, transform=None, target_transform=None, embedding=None):
        self.text = text
        self.de_input = de_input
        self.transform = transform
        self.target_transform = target_transform
        self.embedding = embedding

    def __len__(self):
        return self.text.shape[0]

    def __getitem__(self, idx):
        memory = torch.tensor(self.text[idx], dtype=torch.float32)
        de_input = torch.tensor(self.de_input[idx]).to(torch.int64)
        # Next-token target: roll left by one and blank the wrapped-around slot.
        target = torch.roll(de_input, -1, dims=0)
        target[-1] = 0

        embed = self.embedding if self.embedding is not None else model.embeddings.word_embeddings
        # NOTE(review): the ids are handed to the embedding table unchanged; if
        # the caller re-indexed them into a reduced vocabulary, the looked-up
        # rows will not be those of the original tokens — verify against the caller.
        # The lookup is a fixed feature extraction: without no_grad every sample
        # would carry an autograd graph and backward() would accumulate
        # gradients into the embedding table.
        with torch.no_grad():
            de_input = embed(de_input)

        return {"memory": memory, "target": target, "de_input": de_input}


def vec_translate(a, my_dict):
        """Replace every element of ``a`` with ``my_dict[element]``, keeping the shape."""
        lookup = np.vectorize(lambda key: my_dict[key])
        return lookup(a)

batch_size = 60
sequential_embedding = True

embeddings = np.load("embeddings.npy")
tgt = input_ids
tgt = tgt[:len(embeddings)]
# Sorted unique token ids that occur in the corpus: the reduced output vocabulary.
vocab = np.unique(tgt)

# Two-way mapping between positions in the reduced vocabulary and tokenizer ids.
idx_2_token = dict(enumerate(vocab))
token_2_idx = {token: idx for idx, token in idx_2_token.items()}
# Build the index tensor directly as int64 (no float32 round-trip for ids).
vocab = torch.tensor(vocab, dtype=torch.int64)
new_tgt = vec_translate(tgt, token_2_idx)

X_tr_mem, X_test_mem, y_tr_dein, y_test_dein = train_test_split(embeddings, new_tgt, test_size=0.1, random_state=2022)

### wrap the splits into datasets / loaders

training_data = clinical_embedding(X_tr_mem, y_tr_dein)
test_data = clinical_embedding(X_test_mem, y_test_dein)
train_dataloader = DataLoader(training_data, batch_size, shuffle=True)
# Validation data is not shuffled: the reported loss is a mean of per-batch
# means, so reshuffling (which changes what lands in the short final batch)
# would make it fluctuate between epochs even for identical weights.
test_dataloader = DataLoader(test_data, batch_size, shuffle=False)

In [11]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: called as ``model(de_input, memory)``, returns logits whose last
            axis is the class axis.
        data_loader: yields dicts with ``memory``, ``de_input`` and ``target``.
        loss_fn: called with flattened ``(N, classes)`` logits and ``(N,)`` targets.
        optimizer, scheduler: both are stepped once per batch.
        device: device the model and every batch are moved to.
        n_examples: unused; kept for call compatibility.

    Returns:
        Mean of the per-batch loss values.
    """
    model.to(device)
    model.train()
    losses = []
    for batch in data_loader:
        memory = batch["memory"].to(device)
        de_input = batch["de_input"].to(device)
        targets = batch["target"].to(device)

        # Clear gradients *before* the backward pass so that anything left over
        # (e.g. from an interrupted earlier run of this cell) cannot leak into
        # the first update.
        optimizer.zero_grad()
        outputs = model(de_input, memory)

        # reshape (not view) also copes with non-contiguous model outputs.
        loss = loss_fn(outputs.reshape(-1, outputs.shape[-1]), targets.reshape(-1))
        losses.append(loss.item())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
    return np.mean(losses)

In [12]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Return the mean per-batch loss of ``model`` over ``data_loader``.

    The model is switched to eval mode and no gradients are recorded.
    ``n_examples`` is accepted but not used.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in data_loader:
            memory, de_input, targets = (
                batch[key].to(device) for key in ("memory", "de_input", "target")
            )
            logits = model(de_input, memory)
            # Collapse batch and sequence axes so each token is one sample.
            flat_logits = logits.view(-1, logits.size(-1))
            batch_loss = loss_fn(flat_logits, targets.flatten())
            batch_losses.append(batch_loss.item())
    return np.mean(batch_losses)

**Train your decoder**

In [13]:
EPOCHS = 20
decoder = Decoder(vocab=vocab, num_layers=6, sequence_memory=sequential_embedding)
decoder.to(device)
# torch.optim.AdamW replaces the deprecated transformers.AdamW. weight_decay is
# set to 0.0 explicitly because that was the transformers default, whereas
# torch's AdamW defaults to 0.01.
optimizer = torch.optim.AdamW(decoder.parameters(), betas=(0.9, 0.98), eps=1e-9, lr=0.0001, weight_decay=0.0)
# The scheduler is stepped once per batch, so the horizon is batches * epochs.
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=300,
  num_training_steps=total_steps
)
loss_fn = SmoothCrossEntropyLoss(smoothing=0.5)



In [14]:
# Train / fine-tune the decoder.
# For simplicity the pretraining stage is skipped here; refer to Sec. 9.
history = defaultdict(list)
best_ce = float("inf")
# `patience` counts consecutive epochs without a new best validation loss.
patience = 2
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
    train_loss = train_epoch(decoder, train_dataloader, loss_fn, optimizer,
                             device, scheduler, len(X_tr_mem))
    print(f'Train loss {train_loss} ')
    val_loss = eval_model(decoder, test_dataloader, loss_fn, device, len(X_test_mem))
    print(f'Val   loss {val_loss} ')
    history['train_loss'].append(train_loss)
    history['val_loss'].append(val_loss)
    if val_loss < best_ce:
        # New best: checkpoint the weights and reset the counter.
        torch.save(decoder.state_dict(), 'finetuned_decoder_state.bin')
        best_ce = val_loss
        patience = 0
        continue
    patience += 1
    if patience > 2:
        # Stop early once the counter exceeds two epochs without improvement.
        break

Epoch 1/20
----------
Train loss 6.994896231470881 
Val   loss 5.688381195068359 
Epoch 2/20
----------
Train loss 5.4508769576613965 
Val   loss 5.309327125549316 
Epoch 3/20
----------
Train loss 5.279418893762537 
Val   loss 5.210868549346924 
Epoch 4/20
----------
Train loss 5.223765746967213 
Val   loss 5.176285171508789 
Epoch 5/20
----------
Train loss 5.188881642109639 
Val   loss 5.247062492370605 
Epoch 6/20
----------
Train loss 5.164689772837871 
Val   loss 5.214378356933594 
Epoch 7/20
----------
Train loss 5.144145991351153 
Val   loss 5.162922191619873 
Epoch 8/20
----------
Train loss 5.117914161166629 
Val   loss 5.107823371887207 
Epoch 9/20
----------
Train loss 5.091563482542296 
Val   loss 5.099489402770996 
Epoch 10/20
----------
Train loss 5.0684392903302165 
Val   loss 5.097560024261474 
Epoch 11/20
----------
Train loss 5.046686262697787 
Val   loss 5.126954650878906 
Epoch 12/20
----------
Train loss 5.026753928210284 
Val   loss 5.055900382995605 
Epoch 13/20

**Inference**

In [15]:
import string
# Text containing every punctuation mark plus the [PAD] token, space-separated.
exclude = " ".join(list(string.punctuation)+["[PAD]"])
# Token ids the decoding loops must never emit. tokenizer.encode by default also
# adds the tokenizer's special tokens, so those ids end up in the set as well.
# Stored as a real set: it is only used for `token not in filter_set` checks,
# which run inside the per-candidate decoding loops.
filter_set = set(tokenizer.encode(exclude))

In [16]:
bert = model
def inference(model,tgt, mem, idx_2_token, prefix=0, max_len=10, beam_size=20):
    """List the top candidate tokens for each of the first ``max_len`` positions.

    A single forward pass is run with only the first token of ``tgt`` kept
    (all later positions zeroed). For every position the ``beam_size``
    highest-scoring outputs are mapped through ``idx_2_token`` and written,
    in rank order, into slots ``1..beam_size`` of a fresh zero sequence;
    candidates with id <= 200 or in ``filter_set`` leave their slot at 0.

    Args:
        model: the decoder.
        tgt: 1-D sequence of token indices for the sample.
        mem: encoder embedding of the sample.
        idx_2_token: maps decoder output indices to tokenizer ids.
        prefix: unused.
        max_len: number of positions to report.
        beam_size: candidates per position.

    Returns:
        ``(ground_truth, all_translations)``: the decoded reference text and a
        list of ``max_len`` decoded candidate strings.
    """
    model.eval()
    tgt = np.expand_dims(tgt, 0)
    mem = np.expand_dims(mem,0)
    # torch.tensor copies, so zeroing the tail below never touches `tgt`,
    # which is still needed for the ground truth.
    hypothesis = torch.tensor(tgt, dtype=torch.int64)
    hypothesis[:,1:] = 0
    hypotheses = []
    mem = torch.Tensor(mem).to(torch.float32)
    ### BOS: 101 ; EOS: 102 ; padding 0

    # Pure inference: do not record an autograd graph.
    with torch.no_grad():
        de_input = bert.embeddings.word_embeddings(hypothesis)
        outputs = model(de_input.to(device),
                        mem.to(device))
    for pos in range(max_len):
        # One device-to-host transfer per position instead of one per candidate.
        ranked = torch.topk(outputs[0, pos, :], beam_size).indices.tolist()
        hypothesis = torch.zeros(tgt.shape, dtype=torch.int64)
        for j, idx in enumerate(ranked):
            token = idx_2_token[idx]

            if (token>200) and (token not in filter_set):
                hypothesis[:,j+1] = token
        hypotheses.append(hypothesis)

    # NOTE(review): str.strip(chars) removes any of the given *characters* from
    # both ends, not the literal "[SEP]" / " [PAD] " substrings — confirm that
    # this is the intended clean-up.
    ground_truth = tokenizer.decode(np.trim_zeros(np.squeeze(vec_translate(tgt, idx_2_token))[1:])).strip("[SEP]")
    all_translations = []
    for hypo in hypotheses:
        decoded = tokenizer.decode(np.squeeze(hypo)[1:]).strip(" [PAD] ")
        all_translations.append(decoded)
    return (ground_truth, all_translations)

Evaluate on test set

In [17]:
# Reconstruct the first held-out sample; `res` is
# (ground-truth text, list of per-position candidate strings).
res=inference(decoder, y_test_dein[0], X_test_mem[0], idx_2_token, beam_size=20)
print("-"*10 + "groundtruth" + "-"*10)
print(res[0])
print("-"*10 + "Reconstructions" + "-"*10)
# inference reports max_len=10 positions by default, hence ten lines.
for i in range(10):
    print(res[1][i])

----------groundtruth----------
partial proximal obstruction patent distal system 
----------Reconstructions----------
the a no this right general operation an [PAD] without neck his obstruction operative superior patent left radical was
bladder patent obstruction common patient pending partial otherwise per nasal artery via islous closure tract necessitatedhm
bladder obstruction patent partial pending of common point failure appliedrencewayucible [PAD] otherwise nasal via arteryfle
pending obstruction patent partial of bladder [PAD] commonhr viarantuciblemas procedure closure applieddder failure artery
of pending [PAD] and bladder applied patent partialhr obstruction artery common failure via [PAD]omy br conductchner
pending [PAD] partial ofomy obstruction patent failure arteryhrrence bladder applied andym procedure traction common conduct
pending [PAD] obstruction partialrencehr patent bladder bilateral ofym closure common viaomy traction and nasal br
pending partial bladderhr of obs

In [18]:
import random
bert = model
def translate(model,tgt, mem, max_len, idx_2_token, beam_size=5, prefix=3, temp=2.5):
    """Sample a reconstruction token by token, seeded with a prefix of ``tgt``.

    Positions ``0..prefix`` of ``tgt`` are kept, the rest is zeroed. For each
    step ``i`` in ``prefix..max_len-1`` the decoder is run on the current
    hypothesis, the ``beam_size`` best outputs at position ``i`` are turned
    into sampling probabilities with a Gumbel-softmax at temperature ``temp``,
    and up to ``beam_size`` draws are made until one yields a token that has
    id > 200, is not already in the hypothesis and is not in ``filter_set``;
    that token is written to position ``i + 1``.

    Args:
        model: the decoder.
        tgt: 1-D sequence of token indices for the sample.
        mem: encoder embedding of the sample.
        max_len: generation stops before this position.
        idx_2_token: maps decoder output indices to tokenizer ids.
        beam_size: number of top candidates sampled from per step.
        prefix: last position of ``tgt`` that is kept as given.
        temp: Gumbel-softmax temperature.

    Returns:
        ``(ground_truth, decoded)`` strings.
    """
    model.eval()
    tgt = np.expand_dims(tgt, 0)
    mem = np.expand_dims(mem,0)
    hypothesis = torch.Tensor(tgt).to(torch.int64)
    hypothesis[:,prefix+1:] = 0
    # Move the (loop-invariant) memory to the device once.
    mem = torch.Tensor(mem).to(torch.float32).to(device)
    ### BOS: 101 ; EOS: 102 ; padding 0
    for i in range(prefix, max_len):
        # Pure inference: do not record an autograd graph.
        with torch.no_grad():
            de_input = bert.embeddings.word_embeddings(hypothesis)
            outputs = model(de_input.to(device), mem)
            top = torch.topk(outputs[:,i,:], beam_size)
            tokens = top.indices
            values = F.gumbel_softmax(top.values, tau= temp,dim=-1)
        values_ = np.squeeze(values.cpu().detach().numpy())
        for j in range(beam_size):
            k = np.random.choice(beam_size,1, p=values_)
            idx = tokens[0][k].item()
            token = idx_2_token[idx]

            if (token>200) and (token not in hypothesis[:,:i+1])and (token not in filter_set):
                hypothesis[:,i+1] = token
                break
        # Checks the last *sampled* candidate (ids <= 200 are never written).
        if token == 102 or token == 0:
            break
    # The kept prefix still holds decoder indices; map it to tokenizer ids.
    for i in range(prefix+1):
        hypothesis[:,i] = idx_2_token[hypothesis[:,i].item()]
    # NOTE(review): str.strip(chars) removes any of the given *characters* from
    # both ends, not the literal substrings — confirm this is intended.
    ground_truth = tokenizer.decode(np.trim_zeros(np.squeeze(vec_translate(tgt, idx_2_token))[1:])).strip("[SEP]")
    decoded = tokenizer.decode(torch.squeeze(hypothesis)[1:]).strip(" [PAD] ")
    return (ground_truth, decoded)

Transferability: this section evaluates the decoder on another dataset

In [29]:
# Load the second corpus; `label` keeps its spacy_label column for the keyword
# count at the end of the notebook.
cms = pd.read_csv('cms_spacy.csv', header="infer")
label = cms["spacy_label"]
# again generate input for this dataset
cms_embedding, _ = prepare_input(cms["text"].tolist(), model, tokenizer)
# Join the per-batch arrays into a single array.
cms_embedding = np.concatenate(cms_embedding, axis=0)

# NOTE(review): the bare string below is a disabled airport-code lookup table.
# Nothing in the visible notebook reads it; as the cell's last expression it is
# only echoed back as output.
"""
code = {"ATL": "Atlanta",
        "DFW": "Dallas",
        "DEN": "Denver",
        "ORD": "Chicago",
        "LAX": "Los Angeles",
        "CLT": "Charlotte",
        "LAS": "Las Vegas",
        "PHX": "Phoenix",
        "MCO": "Orlando",
        "SEA": "Seattle",
        "MIA": "Miami",
        "IAH": "Houston",
        "JFK": "New York",
        "FLL": "Fort Lauderdale",
        "EWR": "Newark",
        "SFO": "San Francisco",
        "MSP": "Minneapolis",
        "DTW": "Detroit",
        "BOS": "Boston",
        "SLC": "Salt Lake City"
       }
"""

tokenization starts
tokenization done


'\ncode = {"ATL": "Atlanta",\n        "DFW": "Dallas",\n        "DEN": "Denver",\n        "ORD": "Chicago",\n        "LAX": "Los Angeles",\n        "CLT": "Charlotte",\n        "LAS": "Las Vegas",\n        "PHX": "Phoenix",\n        "MCO": "Orlando",\n        "SEA": "Seattle",\n        "MIA": "Miami",\n        "IAH": "Houston",\n        "JFK": "New York",\n        "FLL": "Fort Lauderdale",\n        "EWR": "Newark",\n        "SFO": "San Francisco",\n        "MSP": "Minneapolis",\n        "DTW": "Detroit",\n        "BOS": "Boston",\n        "SLC": "Salt Lake City"\n       }\n'

In [39]:
# prepare_input moved BERT to `device`; the generation code below embeds CPU id
# tensors with it, so bring it back to the CPU first.
model.to("cpu")
def transfer_translate(model, mem, max_len, idx_2_token, beam_size=5, prefix=3, temp=2.5, seq_len=128):
    """Sample a reconstruction from an embedding alone (no reference tokens).

    Generation starts from an all-zero hypothesis of length ``seq_len``. For
    each step ``i`` in ``prefix..max_len-1`` the decoder is run, the
    ``beam_size`` best outputs at position ``i`` become sampling probabilities
    via a Gumbel-softmax at temperature ``temp``, and up to ``beam_size`` draws
    are made until one yields a token with id > 200 that is not yet in the
    hypothesis and not in ``filter_set``; it is written to position ``i + 1``.

    Args:
        model: the decoder.
        mem: encoder embedding of the sample.
        max_len: generation stops before this position.
        idx_2_token: maps decoder output indices to tokenizer ids.
        beam_size: number of top candidates sampled from per step.
        prefix: first generation step; positions ``0..prefix`` are mapped
            through ``idx_2_token`` at the end.
        temp: Gumbel-softmax temperature.
        seq_len: length of the hypothesis buffer (128 matches the padding
            length used when the embeddings were produced).

    Returns:
        The decoded string.
    """
    model.eval()
    hypothesis = torch.zeros((1, seq_len), dtype=torch.int64)
    mem = np.expand_dims(mem,0)
    # Move the (loop-invariant) memory to the device once.
    mem = torch.Tensor(mem).to(torch.float32).to(device)
    ### BOS: 101 ; EOS: 102 ; padding 0
    for i in range(prefix, max_len):
        # Pure inference: do not record an autograd graph.
        with torch.no_grad():
            de_input = bert.embeddings.word_embeddings(hypothesis)
            outputs = model(de_input.to(device), mem)
            top = torch.topk(outputs[:,i,:], beam_size)
            tokens = top.indices
            values = F.gumbel_softmax(top.values, tau= temp,dim=-1)
        values_ = np.squeeze(values.cpu().detach().numpy())
        for j in range(beam_size):
            k = np.random.choice(beam_size,1, p=values_)
            idx = tokens[0][k].item()
            token = idx_2_token[idx]

            if (token>200) and (token not in hypothesis[:,:i+1])and (token not in filter_set):
                hypothesis[:,i+1] = token
                break
        # Checks the last *sampled* candidate (ids <= 200 are never written).
        if token == 102 or token == 0:
            break
    for i in range(prefix+1):
        hypothesis[:,i] = idx_2_token[hypothesis[:,i].item()]
    # NOTE(review): str.strip(chars) removes any of the given *characters* from
    # both ends, not the literal " [PAD] " substring — confirm this is intended.
    decoded = tokenizer.decode(torch.squeeze(hypothesis)[1:]).strip(" [PAD] ")

    return decoded

In [40]:
# Reconstruct sentences of the transfer corpus from their embeddings.
# Only the first 100 samples are tried, with 10 sampled reconstructions each.
Ground_truth = []
Prediction = []
for i in range(100):
    Res = []
    for j in range(10):
        # Sampling is random, so every call can give a different reconstruction.
        res = transfer_translate(decoder, cms_embedding[i], 15, idx_2_token, prefix=0 , beam_size=10, temp=2)
        Res.append(res)
    print("-"*10 + str(i) + "-"*10)
    gt = cms["text"].iloc[i]
    print(gt)
    print("-"*21)
    # Prediction[i] is the list of 10 reconstructions for Ground_truth[i].
    Prediction.append(Res)
    Ground_truth.append(gt)
    for k in range(10):
        print(Res[k])


----------0----------
hospital observation care on day of discharge
---------------------
the bladderponpsage and was gall [PAD]uteriter [PAD]rp bio ct
an and sterile
general an padudge and wastail the without bun dr
the pad for induction arrest injectionificder [PAD]age surgery physician pathology condition dr
the care diagnostic
heart waslang [PAD] statusalhalyr lapage due or microscope
heart and waslangemayst needle
the careage hemural was lab
this bro the of labemcoax
anressed catyleylaillailaril
----------1----------
hospital observation care, typically 70 minutes
---------------------
this for par an and or the
thistail the foricalutionuter
this insulin condition [PAD] and sterileage drison the without
a forillailleilillo to
this for par overnight pending morning the distal
a insulin the and or was await repeat
the care nurse and wasramco bunusnos nasal shave shaved
heartstation [PAD] equals was grafyst [PAD] minutes years months weeksmal conditionase
an was echoterol the without

Metrics

In [86]:
###define semantic similarity
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/4"
embed = hub.load(module_url)
def semantic_sim(messages):
        """Mean inner product between the sentence embeddings of each pair.

        ``messages`` is a list of sentence pairs, e.g.
        ``[["That band rocks!", "That song is really cool."]]``. Each pair is
        embedded with ``embed`` and the inner product of its first two
        embedding vectors is scored.
        """
        def pair_score(pair):
            vectors = embed(pair)["outputs"]
            return np.inner(vectors[0].numpy(), vectors[1].numpy())

        return np.mean([pair_score(pair) for pair in messages])

###define Bleu score
def bleu(candidates, references):
    """BLEU over unigrams and bigrams only, both weighted 0.5."""
    ngram_weights = [0.5, 0.5]
    return bleu_score(candidates, references, max_n=2, weights=ngram_weights)

###define sentiment similarity

def sentiment_sim(messages):
    """Mean cosine similarity between the sentiment profiles of each text pair.

    For every ``message`` the texts ``message[0]`` and ``message[1]`` are scored
    with ``sia.polarity_scores``; their (neg, neu, pos) vectors are compared.
    """
    def polarity_vector(text):
        scores = sia.polarity_scores(text)
        return np.array([scores["neg"], scores["neu"], scores["pos"]])

    similarities = []
    for message in messages:
        first = polarity_vector(message[0])
        second = polarity_vector(message[1])
        norms = np.linalg.norm(first) * np.linalg.norm(second)
        similarities.append(np.inner(first, second) / norms)
    return np.mean(similarities)

###top_k prediction
from collections import Counter
import nltk
# Fetch nltk's stop-word lists.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words, excluded from the keyword ranking in top_k.
stops = set(stopwords.words('english'))
def top_k(prediction):
    """Rank the non-stop-words of ``prediction`` by frequency.

    ``prediction`` is a list of strings. Returns a dict keyed by percentile
    (5, 10, 20, 50) whose value is the list of the most frequent words making
    up that share of the distinct non-stop-words (always at least one slot).
    """
    percentiles = [5, 10, 20, 50]
    top_words = defaultdict(list)

    corpus = " ".join(prediction)
    tokens = []
    for chunk in corpus.split():
        tokens.extend(chunk.split())

    # most_common() sorts by count, descending, keeping first-seen order on ties.
    ranked = [word for word, _ in Counter(tokens).most_common() if word not in stops]
    for p in percentiles:
        cutoff = int(p*len(ranked)/100)+1
        top_words[p].extend(ranked[:cutoff])

    return top_words

###top-k accuracy
def top_k_accuracy(top, multi):
    """Per key of ``top``, the share of entries hit by their reference text.

    ``top[k]`` and ``multi`` are walked in parallel; an entry counts as a hit
    when at least one whitespace-separated word of the reference ``m`` is
    ``in`` the entry ``t``.
    """
    acc = {}
    for k, candidates in top.items():
        hits = [
            any(word in t for word in set(m.split()))
            for t, m in zip(candidates, multi)
        ]
        acc[k] = sum(hits)/len(hits)
    return acc

from geopy.geocoders import Nominatim
from functools import partial
# Nominatim geocoder client; `geocode` is the lookup call pre-bound to request
# English-language results.
geolocator = Nominatim(user_agent = "geoapiExercises")
geocode = partial(geolocator.geocode, language="en")
# Example usage (disabled):
#location = geocode("london")
#print("Country Name: ", location)

def has_numbers(inputString):
    """Return True when ``inputString`` contains at least one digit character."""
    for char in inputString:
        if char.isdigit():
            return True
    return False

def expand_keyword(label):
    """Expand a place name into the parts of its geocoded display name.

    ``label`` is looked up with ``geocode``; the result's ``display_name`` is
    split on ``", "`` and every part containing a digit is dropped.

    Returns:
        The list of remaining parts, or an empty list when the geocoder finds
        no match (it then returns ``None`` instead of a location).
    """
    location = geocode(label)
    if location is None:
        return []
    parts = location.raw["display_name"].split(", ")
    return [part for part in parts if not has_numbers(part)]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [88]:
# Count how many label keywords show up among the frequent words of the
# reconstructions generated above (first 100 samples).
count = 0
for i in range(100):
    res = top_k(Prediction[i])

    # Skip float labels — presumably missing values, which pandas reads as
    # float NaN; confirm against the CSV.
    if isinstance(label.iloc[i], float):
        continue
    for keyword in label.iloc[i].split():
        # res[50] is the 50th-percentile word list returned by top_k.
        if keyword in res[50]:
            count += 1
print(count)

33


Refer to the paper for tips on improving the reconstruction.