<a href="https://colab.research.google.com/github/Gunakemm/Jupyter/blob/main/123/02_sem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the arXiv metadata dump; the path is relative to the notebook's working directory.
data = pd.read_json('./arxivData.json')
# Fixed seed so the preview shows the same five rows on every Restart & Run All.
data.sample(n=5, random_state=42)

Unnamed: 0,author,day,id,link,month,summary,tag,title,year
6032,"[{'name': 'Ashkan Esmaeili'}, {'name': 'Farokh...",12,1606.03672v1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",6,"In this paper, we will investigate the efficac...","[{'term': 'cs.LG', 'scheme': 'http://arxiv.org...",Comparison of Several Sparse Recovery Methods ...,2016
22405,"[{'name': 'Md ashad Alam'}, {'name': 'Osamu Ko...",1,1606.00118v1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",6,"In genome-wide interaction studies, to detect ...","[{'term': 'stat.ML', 'scheme': 'http://arxiv.o...",Gene-Gene association for Imaging Genetics Dat...,2016
21220,"[{'name': 'Wei Gao'}, {'name': 'David Hsu'}, {...",16,1710.05627v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",10,How can a delivery robot navigate reliably to ...,"[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",Intention-Net: Integrating Planning and Deep L...,2017
767,"[{'name': 'Richard J. Preen'}, {'name': 'Larry...",18,1204.4200v2,"[{'rel': 'related', 'href': 'http://dx.doi.org...",4,A number of representation schemes have been p...,"[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",Discrete Dynamical Genetic Programming in XCS,2012
125,"[{'name': 'Jacob Andreas'}, {'name': 'Marcus R...",9,1511.02799v4,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",11,Visual question answering is fundamentally com...,"[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",Neural Module Networks,2015


In [None]:
# Peek at one raw abstract (row label 0) to see the text that will be modelled.
data.loc[0, 'summary']

'We propose an architecture for VQA which utilizes recurrent layers to\ngenerate visual and textual attention. The memory characteristic of the\nproposed recurrent attention units offers a rich joint embedding of visual and\ntextual features and enables the model to reason relations between several\nparts of the image and question. Our single model outperforms the first place\nwinner on the VQA 1.0 dataset, performs within margin to the current\nstate-of-the-art ensemble model. We also experiment with replacing attention\nmechanisms in other state-of-the-art models with our implementation and show\nincreased accuracy. In both cases, our recurrent attention mechanism improves\nperformance in tasks requiring sequential or relational reasoning on the VQA\ndataset.'

In [None]:
# One training line per paper: "<title> ; <summary>", with the line breaks inside the
# abstract flattened to spaces. Column-wise string operations replace the per-row
# Python lambda of DataFrame.apply(axis=1) and produce the same list.
lines = (
    data['title'] + ' ; ' + data['summary'].str.replace('\n', ' ', regex=False)
).to_list()

# Sanity check: the three shortest lines.
sorted(lines, key=len)[:3]

['Differential Contrastive Divergence ; This paper has been retracted.',
 'What Does Artificial Life Tell Us About Death? ; Short philosophical essay',
 'P=NP ; We claim to resolve the P=?NP problem via a formal argument for P=NP.']

In [None]:
from nltk import WordPunctTokenizer

tokenizer = WordPunctTokenizer()

# Lower-case every line and store it as space-separated tokens, so that later
# stages can recover the tokens with a plain str.split().
lines = [' '.join(tokenizer.tokenize(line.lower())) for line in lines]

In [None]:
# Spot-check the tokenized corpus: lines are lower-cased and punctuation is
# separated from words by single spaces.
assert sorted(lines, key=len)[0] == 'differential contrastive divergence ; this paper has been retracted .'
assert sorted(lines, key=len)[2] == 'p = np ; we claim to resolve the p =? np problem via a formal argument for p = np .'

In [None]:
from tqdm import tqdm
from collections import defaultdict, Counter

# special tokens:
# - 'UNK' represents absent tokens (it also left-pads the first contexts of a line),
# - 'EOS' is a special token after the end of sequence

UNK, EOS = '_UNK_', '_EOS_'

def count_ngrams(lines, n):
    """
    Count how many times each token follows each (n - 1)-token context.

    :param lines: an iterable of strings with space-separated tokens
    :param n: n-gram order, must be >= 1
    :returns: counts[context][next_token] = number of occurrences, where context is a
        tuple of exactly n - 1 tokens. Every line is left-padded with n - 1 UNK tokens
        so that its first tokens have a full-length context, and is terminated by EOS.
    :raises ValueError: if n < 1
    """
    if n < 1:
        raise ValueError("n must be >= 1, got %r" % (n,))

    padding = [UNK] * (n - 1)
    counts = defaultdict(Counter)
    for line in lines:
        # Lines are already tokenized, so a plain split recovers the tokens; this is
        # also how the language model splits the prefixes it is queried with.
        tokens = padding + line.split() + [EOS]

        for idx in range(n - 1, len(tokens)):
            context = tuple(tokens[idx - n + 1: idx])
            counts[context][tokens[idx]] += 1

    return counts

In [None]:
# Toy corpus: the 100 shortest lines. Small enough to check exact counts by hand.
dummy_lines = sorted(lines, key=len)[:100]
dummy_counts = count_ngrams(dummy_lines, n=3)
# For n = 3 every context key must be a tuple of exactly two tokens.
assert set(map(len, dummy_counts.keys())) == {2}, "please only count {n-1}-grams"
assert len(dummy_counts[('_UNK_', '_UNK_')]) == 78
assert dummy_counts['_UNK_', 'a']['note'] == 3
assert dummy_counts['p', '=']['np'] == 2
assert dummy_counts['author', '.']['_EOS_'] == 1

In [None]:
class NGramLanguageModel:
    def __init__(self, lines, n):
        """
        Train a simple count-based language model:
        compute probabilities P(w_t | prefix) given ngram counts

        :param n: computes probability of next token given (n - 1) previous words
        :param lines: an iterable of strings with space-separated tokens
        """
        assert n >= 1
        self.n = n

        counts = count_ngrams(lines, self.n)

        # probs[(word1, word2)][word3] = P(word3 | word1, word2)
        self.probs = defaultdict(Counter)
        for context, counter in counts.items():
            # Sum once per context: recomputing the total for every token is
            # quadratic in the number of distinct continuations.
            total = sum(counter.values())
            context_probs = self.probs[context]
            for token, count in counter.items():
                context_probs[token] = count / total

    def get_possible_next_tokens(self, prefix):
        """
        :param prefix: string with space-separated prefix tokens
        :returns: a dictionary {token : its probability} for all tokens with positive probabilities;
            empty when the context was never seen in training. For a seen context this is the
            model's own table, so callers should treat it as read-only.
        """
        context = prefix.split()
        context_size = self.n - 1
        # Only the last n - 1 tokens matter; shorter prefixes are left-padded with UNK.
        context = context[max(0, len(context) - context_size):]
        missing = context_size - len(context)
        if missing > 0:
            context = [UNK] * missing + context
        # .get() rather than indexing: indexing the defaultdict would insert an empty
        # Counter for every unseen context that is ever queried.
        return self.probs.get(tuple(context), Counter())

    def get_next_token_prob(self, prefix, next_token):
        """
        :param prefix: string with space-separated prefix tokens
        :param next_token: the next token to predict probability for
        :returns: P(next_token|prefix) a single number, 0 <= P <= 1
        """
        return self.get_possible_next_tokens(prefix).get(next_token, 0)

# Smoke run on the toy corpus: build a trigram model and fetch the distribution of
# the first token of a line.
dummy_lm = NGramLanguageModel(dummy_lines, n=3)

p_initial = dummy_lm.get_possible_next_tokens('') # '' -> ['_UNK_', '_UNK_']

In [None]:
# Exact-value checks of the trigram model trained on the toy corpus.
dummy_lm = NGramLanguageModel(dummy_lines, n=3)

p_initial = dummy_lm.get_possible_next_tokens('') # '' -> ['_UNK_', '_UNK_']
assert np.allclose(p_initial['learning'], 0.02)
assert np.allclose(p_initial['a'], 0.13)
assert np.allclose(p_initial.get('meow', 0), 0)
assert np.allclose(sum(p_initial.values()), 1)

p_a = dummy_lm.get_possible_next_tokens('a') # 'a' -> ['_UNK_', 'a']
assert np.allclose(p_a['machine'], 0.15384615)
assert np.allclose(p_a['note'], 0.23076923)
assert np.allclose(p_a.get('the', 0), 0)
assert np.allclose(sum(p_a.values()), 1)

assert np.allclose(dummy_lm.get_possible_next_tokens('a note')['on'], 1)
# Only the last n - 1 = 2 tokens of the prefix may influence the result.
assert dummy_lm.get_possible_next_tokens('a machine') == \
    dummy_lm.get_possible_next_tokens("there have always been ghosts in a machine"), \
    "your 3-gram model should only depend on 2 previous words"

In [None]:
# Fit the trigram model on the full tokenized corpus.
lm = NGramLanguageModel(lines=lines, n=3)

In [None]:
# Scratch check of temperature scaling with T = 2 (exponent 1 / 2) for the prefix 'a'.
probs = np.array(list(lm.get_possible_next_tokens('a').values()))
# Dividing by sum(probs) only applies a constant factor; it cancels in the
# renormalization below.
probs = probs ** (1 / 2) / sum(probs)

total_probs = sum(probs)

probs = probs / total_probs
# Displays the total of the rescaled distribution, which should be ~1.0.
sum(probs)

0.999999999999975

In [None]:
def get_next_token(lm, prefix, temperature=1.0):
    """
    Sample the next token after `prefix` from the model's distribution.

    :param lm: model exposing get_possible_next_tokens(prefix) -> {token: probability}
    :param prefix: string with space-separated prefix tokens
    :param temperature: samples proportionally to p ** (1 / temperature);
        1.0 samples from the model's own distribution, values below 1 sharpen it,
        and 0 returns the single most likely token (greedy decoding)
    :returns: the chosen token, exactly as it appears among the model's keys
    :raises ValueError: if the model offers no continuation for `prefix`
    """
    # Query the model once; for smoothed models each call builds a vocabulary-sized dict.
    token_probs = lm.get_possible_next_tokens(prefix)
    if not token_probs:
        raise ValueError("the model has no possible next tokens for prefix %r" % (prefix,))

    if temperature == 0:
        return max(token_probs, key=token_probs.get)

    tokens = list(token_probs)
    probs = np.fromiter(token_probs.values(), dtype=float, count=len(tokens))

    # Rescale by the largest probability before exponentiating: the constant cancels
    # in the normalization, but it keeps p ** (1 / T) from underflowing to all zeros
    # (and the weights from becoming NaN) at low temperatures.
    weights = (probs / probs.max()) ** (1.0 / temperature)
    weights /= weights.sum()

    # Draw an index rather than a token so the token is returned with its original
    # Python type instead of being coerced into a numpy scalar.
    return tokens[np.random.choice(len(tokens), p=weights)]

In [None]:
# Statistical checks of the sampler: draw 10000 tokens per setting and compare the
# empirical frequencies with the expected ranges.
from collections import Counter
test_freqs = Counter([get_next_token(lm, 'there have') for _ in range(10000)])
assert 250 < test_freqs['not'] < 450
assert 8500 < test_freqs['been'] < 9500
assert 1 < test_freqs['lately'] < 200

# Lower temperature must concentrate mass on the most likely token;
# temperature 0 must always return it.
test_freqs = Counter([get_next_token(lm, 'deep', temperature=1.0) for _ in range(10000)])
assert 1500 < test_freqs['learning'] < 3000
test_freqs = Counter([get_next_token(lm, 'deep', temperature=0.5) for _ in range(10000)])
assert 8000 < test_freqs['learning'] < 9000
test_freqs = Counter([get_next_token(lm, 'deep', temperature=0.0) for _ in range(10000)])
assert test_freqs['learning'] == 10000

print("Looks nice!")

Looks nice!


In [None]:
# Generate text by sampling one token at a time, feeding the growing prefix back in.
prefix = 'function'

# At most 100 tokens; stop early once EOS is produced or the model has no
# continuation for the current context.
for i in range(100):
    prefix += ' ' + get_next_token(lm, prefix)
    if prefix.endswith(EOS) or len(lm.get_possible_next_tokens(prefix)) == 0:
        break

print(prefix)

function driven diffusion for personalized counterfactual inference ; variational inference that can then help derive higher level of syntactic trees . we apply our semi - supervised setting , based on ontology have emerged as a part of our model first adjusts each word is linked to local columbus receptors . running the base mean level of abstraction to represent the results suggest that a specific task of novelty , rather than isolated object instances in terms of accuracy . _EOS_


In [None]:
def perplexity(lm, lines, min_logprob=np.log(10 ** -50.)):
    """
    Corpus-level perplexity of `lm` on `lines`: exp(-(mean per-token log-probability)).

    :param lm: model exposing get_next_token_prob(prefix, next_token). If it also has an
        attribute `n` (the n-gram order), only the last n - 1 tokens are passed as the
        prefix; otherwise the whole history is passed.
    :param lines: an iterable of strings; each one is lower-cased, tokenized with the
        notebook's `tokenizer` and terminated with EOS (EOS counts as a token)
    :param min_logprob: floor applied to every per-token log-probability, so that tokens
        with zero (or tiny) probability give a large but finite perplexity
    :returns: the perplexity; float('inf') when `lines` is empty
    """
    order = getattr(lm, 'n', None)
    total_log_prob = 0.0
    total_tokens = 0

    for line in lines:
        tokens = tokenizer.tokenize(line.lower())
        tokens.append(EOS)
        total_tokens += len(tokens)

        sentence_log_prob = 0.0
        for i, token in enumerate(tokens):
            # An n-gram model reads only the last n - 1 tokens of its prefix, so build
            # just that window instead of re-joining the whole history at every
            # position (which is quadratic in the line length).
            start = 0 if order is None else max(0, i - (order - 1))
            context_str = ' '.join(tokens[start:i])

            prob = lm.get_next_token_prob(context_str, token)
            if prob > 0:
                sentence_log_prob += max(np.log(prob), min_logprob)
            else:
                sentence_log_prob += min_logprob

        total_log_prob += sentence_log_prob

    if total_tokens == 0:
        return float('inf')

    return np.exp(-total_log_prob / total_tokens)

# Quick check on the toy corpus, using whichever model `lm` is currently bound to.
perplexity(lm, dummy_lines)

13.368554817277525

In [None]:
# Train three model orders on the toy corpus and score each on its own training lines.
lm1 = NGramLanguageModel(dummy_lines, n=1)
lm3 = NGramLanguageModel(dummy_lines, n=3)
lm10 = NGramLanguageModel(dummy_lines, n=10)

ppx1 = perplexity(lm1, dummy_lines)
ppx3 = perplexity(lm3, dummy_lines)
ppx10 = perplexity(lm10, dummy_lines)
ppx_missing = perplexity(lm3, ['the jabberwock , with eyes of flame , '])  # thanks, L. Carroll

print("Perplexities: ppx1=%.3f ppx3=%.3f ppx10=%.3f" % (ppx1, ppx3, ppx10))

assert all(0 < ppx < 500 for ppx in (ppx1, ppx3, ppx10)), "perplexity should be non-negative and reasonably small"
assert ppx1 > ppx3 > ppx10, "higher N models should overfit and reach lower perplexity on their own training data"
assert np.isfinite(ppx_missing) and ppx_missing > 10 ** 6, "missing words should have large but finite perplexity. " \
    " Make sure you use min_logprob right"
assert np.allclose([ppx1, ppx3, ppx10], (318.2132342216302, 1.5199996213739575, 1.1838145037901249))

Perplexities: ppx1=318.213 ppx3=1.520 ppx10=1.184


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the lines for evaluation; random_state pins the split.
train_lines, test_lines = train_test_split(lines, test_size=0.25, random_state=42)

# Unsmoothed models scored on held-out text. Any n-gram that was not seen in training
# gets probability 0 from the model and is floored at min_logprob inside perplexity().
# NOTE(review): this loop rebinds the notebook-global `lm`; cells that use `lm`
# afterwards see the last model built here, not the one fitted on all lines.
for n in (1, 2, 3):
    lm = NGramLanguageModel(n=n, lines=train_lines)
    ppx = perplexity(lm, test_lines)
    print("N = %i, Perplexity = %.5f" % (n, ppx))

N = 1, Perplexity = 1832.23136
N = 2, Perplexity = 85653987.28543
N = 3, Perplexity = 61999196239911532363776.00000


In [None]:
class LaplaceLanguageModel(NGramLanguageModel):
    """
    N-gram model with additive (add-delta) smoothing.

    For a context seen in training, every observed token gets
    (count + delta) / (total_count + delta * |vocab|); the probability mass that is
    left over is shared equally among the vocabulary tokens not observed after that
    context. For a context never seen in training, all vocabulary tokens are equally
    likely.
    """
    def __init__(self, lines, n, delta=1.0):
        """
        :param lines: an iterable of strings with space-separated tokens
        :param n: n-gram order; the model conditions on the previous n - 1 tokens
        :param delta: pseudo-count added to every token's count
        """
        self.n = n
        counts = count_ngrams(lines, self.n)
        self.vocab = set(token for token_counts in counts.values() for token in token_counts)
        self.probs = defaultdict(Counter)

        for prefix, token_counts in counts.items():
            total_count = sum(token_counts.values()) + delta * len(self.vocab)
            self.probs[prefix] = {token: (count + delta) / total_count
                                  for token, count in token_counts.items()}

    def _missing_prob(self, token_probs):
        """Probability given to each token that is absent from `token_probs`."""
        # Clamp at 0: rounding in the sum can push the leftover mass slightly below zero.
        missing_prob_total = max(0, 1.0 - sum(token_probs.values()))
        return missing_prob_total / max(1, len(self.vocab) - len(token_probs))

    def get_possible_next_tokens(self, prefix):
        """
        :param prefix: string with space-separated prefix tokens
        :returns: a dictionary {token : its probability} covering the whole vocabulary
        """
        token_probs = super().get_possible_next_tokens(prefix)
        missing_prob = self._missing_prob(token_probs)
        return {token: token_probs.get(token, missing_prob) for token in self.vocab}

    def get_next_token_prob(self, prefix, next_token):
        """
        :param prefix: string with space-separated prefix tokens
        :param next_token: the next token to predict probability for
        :returns: smoothed P(next_token|prefix); computed without building the
            vocabulary-sized dictionary of get_possible_next_tokens
        """
        token_probs = super().get_possible_next_tokens(prefix)
        if next_token in token_probs:
            return token_probs[next_token]
        return self._missing_prob(token_probs)


In [None]:
#test that it's a valid probability model
for n in (1, 2, 3):
    dummy_lm = LaplaceLanguageModel(dummy_lines, n=n)
    assert np.allclose(sum([dummy_lm.get_next_token_prob('a', w_i) for w_i in dummy_lm.vocab]), 1), "I told you not to break anything! :)"

In [None]:
# Held-out perplexity of the add-delta smoothed models (delta = 0.1) for each order.
for n in (1, 2, 3):
    lm = LaplaceLanguageModel(train_lines, n=n, delta=0.1)
    ppx = perplexity(lm, test_lines)
    print("N = %i, Perplexity = %.5f" % (n, ppx))

In [None]:
class KneserNeyLanguageModel(NGramLanguageModel):
    """
    Scaffold for a Kneser-Ney smoothed n-gram model.

    NOTE(review): the code visible here looks unfinished -- no probabilities are
    ever stored in self.probs and neither query method returns a value. See the
    inline notes; confirm whether the rest of the implementation lives elsewhere.
    """

    def __init__(self, lines, n, delta=1.0):
        # NOTE(review): `delta` is not used anywhere in the visible code.
        self.n = n
        # counts[i] is the table returned by count_ngrams(lines, i).
        counts = {}
        # range(1, self.n) covers orders 1 .. n - 1 only, so the order-n table is not
        # built. NOTE(review): presumably range(1, self.n + 1) was intended -- confirm.
        for i in range(1, self.n):
            counts[i] = count_ngrams(lines, i)

        # NOTE(review): counts.values() yields the per-order tables, so the inner loop
        # iterates over each table's keys (context tuples), not over tokens.
        self.vocab = set(token for token_counts in counts.values() for token in token_counts)
        self.probs = defaultdict(Counter)

        # Iterating `counts` yields its keys, i.e. the order numbers.
        for prefix in counts:
            # token_counts is therefore a whole table {context: Counter}.
            token_counts = counts[prefix]
            # NOTE(review): the values being summed are Counter objects; sum() starts
            # from the int 0, so this raises TypeError as soon as the table is non-empty.
            total_count = sum(token_counts.values())




    def get_possible_next_tokens(self, prefix):
        # NOTE(review): no return statement, so this method returns None.
        token_probs = super().get_possible_next_tokens(prefix)

    def get_next_token_prob(self, prefix, next_token):
        # NOTE(review): no return statement, so this method returns None.
        token_probs = super().get_next_token_prob(prefix, next_token)