In [1]:
import torch
from torch.nn.functional import softmax
from gpt2 import GPT2LanguageModel
import numpy as np
from tqdm import tqdm_notebook
import time

model_117M = GPT2LanguageModel(model_name='117M')
model_345M = GPT2LanguageModel(model_name='345M')

model_name = "345M"
model = model_117M if model_name == "117M" else model_345M

In [22]:
def saveSingleMultipleTokens(probsWithWords):
    singleWordTokens = []
    otherWordTokens = []
    probs = []
    for entry in probsWithWords:
        if len(entry[0]) == 1:
            singleWordTokens.append(entry[1])
        else:
            otherWordTokens.append(entry[1])
    #         print(entry[1], len(entry[0]))
    print("Single word tokens: ", len(singleWordTokens), "out of ", len(probsWithWords))

    with open("SingleTokenEmotions.txt", "w") as f:
        for word in singleWordTokens:
            f.write(word+"\n")
    with open("MultipleTokenEmotions.txt", "w") as f:
        for word in otherWordTokens:
            f.write(word+"\n")

def get_next_words(model, context, words, depth):
    # get next "word" given context
    if depth == 0:
        return words
    new_words = []
    for i, word in enumerate(words):
        logits = model.predict(context, word)
        # take the one with the highest probability
        # next_word_logit, next_index = logits.topk(1)
        next_index = torch.argmax(logits)
        next_word = model[next_index.item()]

        new_words.append(word + next_word)
    return get_next_words(model, context, new_words, depth-1)

def get_probabilities_words(model, context, words):
    # encode words to tokens
    # Add a whitespace to the comparisons if there is no trailing whitespace in context
    encoded_comp = model.tokenizer.encode(" " + words if context[-1] != " " else words)
    # If comparison is composed of multiple words find them one after the other
    probs = []
    new_context = context
    for token in encoded_comp:
        logits = model.predict(new_context, None)
        probabilities = softmax(logits, dim=-1)
        probs.append(probabilities[token].item())
        # feed in model with new context (oldContext+token) and get probability
        new_context += model.tokenizer.decode([token])
#     print(encoded_comp, model.tokenizer.decode(encoded_comp), probs)
    # calculate with proper bayesian probability? --> now done in get_emotionvector()
    return probs, words

def get_emotionvector(model, context, emotions):
    probsWithWords = []
    tokenProbs = np.zeros(len(words))
    for i, comp in enumerate(tqdm_notebook(words)):
        probsWithWords.append(get_probabilities_words(model, context, comp))
        # print(f'With probability of {probs[-1]}: "{comp}"')
        # calc joint propability of tokens (e.g. p(dis|context) x p(heartening | context+dis))
        tokenProbs[i] = np.prod(probsWithWords[-1][0])
        # Scale the probabilities to sum up to one
    tokenProbs = tokenProbs/np.sum(tokenProbs)
    return probsWithWords, tokenProbs


In [23]:
with open("emotions.txt", "r") as f:
    words = f.readlines()
    
emotions = [e.strip() for e in words]

# NOTE A trailing whitespace gives other output than without
context = "Breakfast is"
# comparisons = ["big myth", "myth", "fascinating", "hoax", "farce", "onomatopeia"]

# filter words given comparison list
print("Context = ", context)

probsWithWords, emotionVector = get_emotionvector(model, context, emotions)

sorted_idx = np.argsort(emotionVector)[::-1]
for i in sorted_idx:
    print(emotionVector[i], probsWithWords[i][1])
# saveSingleMultipleTokens(probsWithWords)


Context =  Breakfast is


HBox(children=(IntProgress(value=0, max=269), HTML(value='')))

0.34827687099191595 amazing

0.1892199945864834 boring

0.02664123861483101 comforting

0.026391151597361 serving

0.025217002422359577 interesting

0.02323742942762173 delightful

0.01663978815407615 disgusting

0.01591167670620117 exciting

0.014693094097842405 satisfying

0.01184292366834498 refreshing

0.010701702868020635 disappointing

0.00995347928723631 stressful

0.009640320158480441 warming

0.009405740740294327 entertaining

0.009309975670778039 overwhelming

0.008644818260195149 rewarding

0.008478370767815177 intoxicating

0.008366345445213514 fascinating

0.007808770645420368 burning

0.007762653788837993 painful

0.007507108908732956 exhausting

0.007180752639392829 inspiring

0.0068337860755324125 helpful

0.006262568365279209 nourishing

0.0061614588162481705 confusing

0.005633763318283479 stunning

0.005527887529525227 terrifying

0.005435452528591607 failing

0.005077046320155718 relaxing

0.004519993150849971 liberating

0.0043536556749136784 impressive

0.00424443

1.0945904528390248e-05 provoking

1.0412204133458578e-05 brutalizing

1.0271037837415188e-05 enlivening

9.56597808805683e-06 raping

8.750043467484331e-06 enslaving

8.47207438743961e-06 endangering

8.456784016163399e-06 condemning

7.632656395217982e-06 repelling

7.606783832447599e-06 harassing

7.541029002562874e-06 enveloping

6.312167627768535e-06 persuading

5.711396841380007e-06 unsupportive

5.193127736507212e-06 terrorizing

5.024130442962304e-06 assaulting

5.007400918713741e-06 invalidating

3.0750839507926666e-06 tricking

2.666064484813402e-06 impeding

2.6575304514999222e-06 perturbing

2.514203138671018e-06 constrictive

1.935644352961174e-06 persecuting

1.7121677928588102e-06 scarring

1.4483893973560992e-06 exploitative

6.228608755381516e-07 deterring

6.217854310587187e-07 mistreating

5.872552657346229e-07 unappreciative



In [None]:
topk = 10

logits = model.predict(context, None)

probabilities = softmax(logits, dim=-1)
best_logits, best_indices = logits.topk(topk)
best_words = [model[idx.item()] for idx in best_indices]

most_likely_words = get_next_words(model, context, best_words, 0)

best_probabilities = probabilities[best_indices].tolist()

print("Input: ", context)
for i, prob in enumerate(best_probabilities):
    print(f"{prob*100:.3f}%: {most_likely_words[i].strip()}")