In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from nltk import WordPunctTokenizer


In [2]:
# Read the whole book into a single string.
# NOTE(review): the path is absolute and machine-specific, and open() is called
# without an explicit encoding, so decoding depends on the platform default —
# consider a configurable data directory and an explicit `encoding=`.
with open('/Users/markkhaus/Documents/GitHub/hse/Armageddon2419-A.D..txt', 'r') as f:
    data = f.read()


In [3]:
# Split the raw text on '.' (a crude sentence boundary) and keep the pieces in a
# one-column DataFrame so the first fragments can be inspected.
text = pd.DataFrame({'text': data.split('.')})
text.head(10)

Unnamed: 0,text
0,Project\tGutenberg's\tArmageddon--2419\tA
1,D
2,",\tby\tPhilip\tFrancis\tNowlan This\teBook\tis..."
3,"\t\tYou\tmay\tcopy\tit,\tgive\tit\taway\tor\n\..."
4,gutenberg
5,org\n\nTitle:\tArmageddon--2419\tA
6,D
7,\n\nAuthor:\tPhilip\tFrancis\tNowlan\n\nIllust...
8,"\tPaul\n\nRelease\tDate:\tMay\t26,\t2010\t[EBo..."
9,D


In [4]:
# Back to a plain Python list of fragments. Note that `data` held the raw text
# string before this cell and is a list of strings from here on.
data = text['text'].tolist()


In [5]:
# Normalise every fragment:
#   * drop straight double and single quotes;
#   * collapse every run of whitespace (tabs, newlines, carriage returns,
#     repeated spaces) into one space and trim both ends. split()/join() handles
#     runs of any length, whereas chained replace('  ', ' '), replace('   ', ' ')
#     ... leaves runs of three or more spaces partly uncollapsed because the
#     two-space pattern is consumed first.
data = [' '.join(x.replace('"', '').replace("'", '').split()) for x in data]

# Keep only fragments with at least two words; this also removes empty and
# whitespace-only fragments.
data = [x for x in data if len(x.split()) > 1]



In [6]:
# Tokenise every cleaned fragment: lowercase it, split it with NLTK's
# WordPunctTokenizer and re-join the tokens with single spaces, so that each
# entry of `lines` is a string of space-separated tokens.
from nltk import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
lines = [
    ' '.join(
        tokenizer.tokenize(line.lower())
    ) for line in tqdm(data)
]

  0%|          | 0/1500 [00:00<?, ?it/s]

In [7]:
from tqdm import tqdm
from collections import defaultdict, Counter

# special tokens: 
# - unk represents absent tokens, 
# - eos is a special token after the end of sequence

UNK, EOS = "_UNK_", "_EOS_"

def count_ngrams(lines, n):
    """
    Collect next-token statistics for every context of (n - 1) tokens.

    :param lines: an iterable of strings with space-separated tokens
    :param n: n-gram order; a context consists of the (n - 1) preceding tokens
    :returns: a dictionary { tuple(prefix_tokens): {next_token_1: count_1, next_token_2: count_2}}

    Edge cases:
    - every line is left-padded with (n - 1) UNK tokens, so for n=3
      empty prefix: "" -> (UNK, UNK)
      short prefix: "the" -> (UNK, the)
      long prefix: "the new approach" -> (new, approach)
    - EOS is appended to every line and counted like any other token
      "... with deep neural networks ." -> (..., with, deep, neural, networks, ., EOS)
    """
    # ngram_counts[(word1, word2)][word3] = number of times word3 followed (word1, word2)
    ngram_counts = defaultdict(Counter)
    left_padding = [UNK] * (n - 1)

    for line in tqdm(lines, desc='N-grams'):
        padded = left_padding + line.split() + [EOS]
        # Start at the first real token; the padding only ever serves as context.
        for pos in range(n - 1, len(padded)):
            context = tuple(padded[pos - n + 1:pos])
            ngram_counts[context][padded[pos]] += 1

    return ngram_counts


In [8]:
# let's test it: count trigrams on the 100 shortest lines (by character length) only
dummy_lines = sorted(lines, key=len)[:100]
dummy_counts = count_ngrams(dummy_lines, n=3)



N-grams: 100%|██████████| 100/100 [00:00<00:00, 73856.38it/s]


In [9]:
dummy_counts[('_UNK_', 'a')]

Counter()

In [10]:
class NGramLanguageModel:
    """Count-based n-gram language model using relative frequencies (no smoothing)."""

    def __init__(self, lines, n):
        """
        Train a simple count-based language model:
        compute probabilities P(w_t | prefix) given ngram counts

        :param n: computes probability of next token given (n - 1) previous words
        :param lines: an iterable of strings with space-separated tokens
        """
        assert n >= 1
        self.n = n

        counts = count_ngrams(lines, self.n)

        # probs[(word1, word2)][word3] = P(word3 | word1, word2)
        self.probs = defaultdict(Counter)

        # Turn the raw counts of every prefix into relative frequencies.
        for prefix, token_counts in tqdm(counts.items()):
            total = sum(token_counts.values())
            for token, count in token_counts.items():
                self.probs[prefix][token] = count / total

    def get_possible_next_tokens(self, prefix):
        """
        :param prefix: string with space-separated prefix tokens
        :returns: a dictionary {token : its probability} for all tokens with positive probabilities;
            an empty Counter when the prefix was never seen during training
        """
        context_size = self.n - 1
        tokens = prefix.split()
        # Only the last (n - 1) tokens matter (none at all for a unigram model).
        tokens = tokens[max(0, len(tokens) - context_size):]
        # Short prefixes are left-padded with UNK, exactly as during counting.
        missing = context_size - len(tokens)
        if missing > 0:
            tokens = [UNK] * missing + tokens
        key = tuple(tokens)
        # Membership test instead of indexing: indexing the defaultdict would
        # insert an empty entry for every unseen prefix that is merely queried,
        # so evaluation on held-out text would keep growing the trained model.
        if key in self.probs:
            return self.probs[key]
        return Counter()

    def get_next_token_prob(self, prefix, next_token):
        """
        :param prefix: string with space-separated prefix tokens
        :param next_token: the next token to predict probability for
        :returns: P(next_token|prefix) a single number, 0 <= P <= 1
        """
        return self.get_possible_next_tokens(prefix).get(next_token, 0)

In [11]:
# Train a trigram model on the small dummy corpus.
dummy_lm = NGramLanguageModel(dummy_lines, n=3)

# Distribution queried with an empty prefix; the value is not used by the check below.
p_initial = dummy_lm.get_possible_next_tokens('') # '' -> ['_UNK_', '_UNK_']

# A prefix must be truncated to its last two tokens before the lookup.
assert dummy_lm.get_possible_next_tokens('a machine') == \
    dummy_lm.get_possible_next_tokens("there have always been ghosts in a machine"), \
    "your 3-gram model should only depend on 2 previous words"

N-grams: 100%|██████████| 100/100 [00:00<00:00, 103845.11it/s]
100%|██████████| 321/321 [00:00<00:00, 546638.89it/s]


In [12]:
lm = NGramLanguageModel(lines, n=3)

N-grams: 100%|██████████| 1500/1500 [00:00<00:00, 13710.64it/s]
100%|██████████| 19594/19594 [00:00<00:00, 665779.80it/s]


In [13]:
def get_next_token(lm, prefix, temperature=1.0):
    """
    Return the next token after prefix.

    :param lm: language model exposing get_possible_next_tokens(prefix) -> {token: probability}
    :param prefix: string with space-separated prefix tokens
    :param temperature: samples proportionally to lm probabilities ^ (1 / temperature)
        if temperature == 0, always takes most likely token (the first one listed wins ties).
    :returns: the selected token
    :raises ValueError: if the model offers no continuation for this prefix
    """
    next_tokens = lm.get_possible_next_tokens(prefix)
    if len(next_tokens) == 0:
        # Fail with a clear message instead of an IndexError / numpy error further down.
        raise ValueError(f"language model has no continuation for prefix {prefix!r}")

    if temperature == 0:
        # Greedy choice; max() keeps the first of several equally likely tokens.
        return max(next_tokens, key=next_tokens.get)

    tokens = list(next_tokens.keys())
    # Clip guards against tiny negative values produced by floating-point rounding.
    probs = np.clip(np.array(list(next_tokens.values()), dtype=float), 0.0, None)

    # p ** (1 / T) is evaluated in log space and shifted by the maximum, so a
    # small temperature cannot underflow every weight to zero (which would make
    # the normalisation divide by zero).
    with np.errstate(divide='ignore'):  # log(0) = -inf is fine: that weight becomes 0
        scaled = np.log(probs) / temperature
    scaled -= scaled.max()
    weights = np.exp(scaled)
    weights /= weights.sum()

    # Sample an index and look the token up, so a plain Python object is returned.
    return tokens[np.random.choice(len(tokens), p=weights)]

In [14]:
from collections import Counter
# Smoke test: draw many next tokens after 'have' with the default temperature.
test_freqs = Counter([get_next_token(lm, 'have') for _ in range(10000)])


# Same at explicit temperatures, including the greedy case (0.0). `test_freqs` is
# overwritten each time and never inspected, so this only checks that sampling
# runs without raising.
test_freqs = Counter([get_next_token(lm, 'have', temperature=1.0) for _ in range(20000)])
test_freqs = Counter([get_next_token(lm, 'have', temperature=0.5) for _ in range(20000)])
test_freqs = Counter([get_next_token(lm, 'have', temperature=0.0) for _ in range(20000)])

print("Looks nice!")

Looks nice!


In [15]:
# Generate text token by token (at most 100 tokens) with the default temperature.
prefix = 'have' # <- your ideas :)

for i in range(100):
    prefix += ' ' + get_next_token(lm, prefix)
    # Stop once EOS was produced or the model has no continuation for the new prefix.
    if prefix.endswith(EOS) or len(lm.get_possible_next_tokens(prefix)) == 0:
        break
        
print(prefix)

have a feeling something is going to mean real business for all we could _EOS_


In [16]:
# Same generation loop with temperature=0.5, which sharpens the distribution
# towards the more likely tokens.
prefix = 'interesting prophecies' # <- more of your ideas

for i in range(100):
    prefix += ' ' + get_next_token(lm, prefix, temperature=0.5)
    # Stop once EOS was produced or the model has no continuation for the new prefix.
    if prefix.endswith(EOS) or len(lm.get_possible_next_tokens(prefix)) == 0:
        break
        
print(prefix)

interesting prophecies , of course , i think we can train all our gangs to use it under the arms , she would have been observed ? i asked _EOS_


In [17]:
def perplexity(lm, lines, min_logprob=np.log(10 ** -7.)):
    """
    Corpus-level perplexity: exp(-1/N * sum_t log P(w_t | w_1 .. w_{t-1})), where N
    counts every token of every line plus one EOS per line.

    :param lm: language model exposing get_next_token_prob(prefix, next_token)
    :param lines: a list of strings with space-separated tokens
    :param min_logprob: if log(P(w | ...)) is smaller than min_logprob, set it equal to min_logprob
    :returns: corpora-level perplexity - a single scalar number
    :raises ZeroDivisionError: if `lines` is empty (there is nothing to average over)

    Note: P(w_first | empty) and P(eos | full_sequence) are both included.
    """
    total_length = 0
    log_pp = 0.0

    for line in tqdm(lines):
        # split() without an argument, as in count_ngrams: leading, trailing or
        # doubled spaces must not turn into empty pseudo-tokens that get scored.
        tokens = line.split() + [EOS]

        for t, token in enumerate(tokens):
            prefix = ' '.join(tokens[:t])
            prob = lm.get_next_token_prob(prefix, token)
            # Never call log(0): it emits a RuntimeWarning, and the result would
            # be clipped to min_logprob anyway.
            logprob = np.log(prob) if prob > 0 else min_logprob
            log_pp += max(min_logprob, logprob)
            total_length += 1

    return np.exp(-(1 / total_length) * log_pp)

In [18]:
# Train models of increasing order on the dummy corpus ...
lm1 = NGramLanguageModel(dummy_lines, n=1)
lm3 = NGramLanguageModel(dummy_lines, n=3)
lm10 = NGramLanguageModel(dummy_lines, n=10)

# ... and score them on the very lines they were trained on.
ppx1 = perplexity(lm1, dummy_lines)
ppx3 = perplexity(lm3, dummy_lines)
ppx10 = perplexity(lm10, dummy_lines)
# A line scored with the dummy trigram model; the assert below expects a very large perplexity.
ppx_missing = perplexity(lm3, ['the jabberwock , with eyes of flame , '])  # thanks, L. Carrol

print("Perplexities: ppx1=%.3f ppx3=%.3f ppx10=%.3f" % (ppx1, ppx3, ppx10))

assert all(0 < ppx < 500 for ppx in (ppx1, ppx3, ppx10)), "perplexity should be nonnegative and reasonably small"
assert ppx1 > ppx3 > ppx10, "higher N models should overfit and "
assert np.isfinite(ppx_missing) and ppx_missing > 10 ** 6, "missing words should have large but finite perplexity. " \
    " Make sure you use min_logprob right"


N-grams: 100%|██████████| 100/100 [00:00<00:00, 194541.00it/s]
100%|██████████| 1/1 [00:00<00:00, 7476.48it/s]
N-grams: 100%|██████████| 100/100 [00:00<00:00, 151528.32it/s]
100%|██████████| 321/321 [00:00<00:00, 576111.08it/s]
N-grams: 100%|██████████| 100/100 [00:00<00:00, 121222.66it/s]
100%|██████████| 333/333 [00:00<00:00, 546868.92it/s]
100%|██████████| 100/100 [00:00<00:00, 62036.74it/s]
100%|██████████| 100/100 [00:00<00:00, 71832.57it/s]
100%|██████████| 100/100 [00:00<00:00, 76804.69it/s]
  min_logprob, np.log(lm.get_next_token_prob(prefix, tokens[t]))
100%|██████████| 1/1 [00:00<00:00, 4809.98it/s]

Perplexities: ppx1=86.300 ppx3=2.654 ppx10=2.593





In [19]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the lines; the fixed random_state makes the split repeatable.
train_lines, test_lines = train_test_split(lines, test_size=0.25, random_state=42)

# Perplexity of the unsmoothed models on the held-out lines.
# NOTE: this rebinds the global `lm` that the generation cells use.
for n in (1, 2, 3):
    lm = NGramLanguageModel(n=n, lines=train_lines)
    ppx = perplexity(lm, test_lines)
    print("N = %i, Perplexity = %.5f" % (n, ppx))


N-grams: 100%|██████████| 1125/1125 [00:00<00:00, 60173.07it/s]
100%|██████████| 1/1 [00:00<00:00, 1030.79it/s]
  min_logprob, np.log(lm.get_next_token_prob(prefix, tokens[t]))
100%|██████████| 375/375 [00:00<00:00, 14918.99it/s]


N = 1, Perplexity = 851.52224


N-grams: 100%|██████████| 1125/1125 [00:00<00:00, 49750.04it/s]
100%|██████████| 4007/4007 [00:00<00:00, 494267.45it/s]
100%|██████████| 375/375 [00:00<00:00, 10976.18it/s]


N = 2, Perplexity = 14903.66023


N-grams: 100%|██████████| 1125/1125 [00:00<00:00, 35270.75it/s]
100%|██████████| 15712/15712 [00:00<00:00, 218872.26it/s]
100%|██████████| 375/375 [00:00<00:00, 8508.22it/s]

N = 3, Perplexity = 639725.17200





In [20]:
class LaplaceLanguageModel(NGramLanguageModel):
    """
    N-gram model with additive (Laplace) smoothing: `delta` is added to every
    count, and the probability mass this leaves over is shared equally among the
    vocabulary tokens that were never seen after a given prefix.
    """
    def __init__(self, lines, n, delta=1.0):
        """
        :param lines: an iterable of strings with space-separated tokens
        :param n: n-gram order (the context is the previous n - 1 tokens)
        :param delta: pseudo-count added to every (prefix, token) pair
        """
        self.n = n
        counts = count_ngrams(lines, self.n)
        # Vocabulary = every token that occurred as a "next token" (this includes EOS).
        self.vocab = set(token for token_counts in counts.values() for token in token_counts)
        self.probs = defaultdict(Counter)

        for prefix in counts:
            token_counts = counts[prefix]
            total_count = sum(token_counts.values()) + delta * len(self.vocab)
            # Only seen tokens are stored; unseen ones are derived on demand.
            self.probs[prefix] = {token: (token_counts[token] + delta) / total_count
                                          for token in token_counts}

    def _missing_prob(self, token_probs):
        """Probability of one token that is absent from `token_probs`."""
        missing_prob_total = 1.0 - sum(token_probs.values())
        missing_prob_total = max(0, missing_prob_total) # prevent rounding errors
        return missing_prob_total / max(1, len(self.vocab) - len(token_probs))

    def get_possible_next_tokens(self, prefix):
        """
        :param prefix: string with space-separated prefix tokens
        :returns: a dictionary {token: probability} covering the whole vocabulary
        """
        token_probs = super().get_possible_next_tokens(prefix)
        # Same clamped value as get_next_token_prob, so rounding can no longer
        # produce slightly negative probabilities here.
        missing_prob = self._missing_prob(token_probs)
        return {token: token_probs.get(token, missing_prob) for token in self.vocab}

    def get_next_token_prob(self, prefix, next_token):
        """
        :param prefix: string with space-separated prefix tokens
        :param next_token: the next token to predict probability for
        :returns: smoothed P(next_token | prefix)
        """
        token_probs = super().get_possible_next_tokens(prefix)
        if next_token in token_probs:
            return token_probs[next_token]
        return self._missing_prob(token_probs)
        

In [21]:
# Test that it's a valid probability model: for each order, the probabilities of
# all vocabulary tokens after the prefix 'a' must sum to 1.
for n in (1, 2, 3):
    dummy_lm = LaplaceLanguageModel(dummy_lines, n=n)
    assert np.allclose(sum([dummy_lm.get_next_token_prob('a', w_i) for w_i in dummy_lm.vocab]), 1), "I told you not to break anything! :)"

N-grams: 100%|██████████| 100/100 [00:00<00:00, 45844.40it/s]
N-grams: 100%|██████████| 100/100 [00:00<00:00, 103282.54it/s]
N-grams: 100%|██████████| 100/100 [00:00<00:00, 138334.56it/s]


In [22]:
# Held-out perplexity of the smoothed models (delta=0.1), on the same split as
# the unsmoothed models.
for n in (1, 2, 3):
    lm = LaplaceLanguageModel(train_lines, n=n, delta=0.1)
    ppx = perplexity(lm, test_lines)
    print("N = %i, Perplexity = %.5f" % (n, ppx))

N-grams: 100%|██████████| 1125/1125 [00:00<00:00, 60250.68it/s]
100%|██████████| 375/375 [00:00<00:00, 9018.30it/s]


N = 1, Perplexity = 854.38045


N-grams: 100%|██████████| 1125/1125 [00:00<00:00, 43893.47it/s]
100%|██████████| 375/375 [00:00<00:00, 11394.10it/s]


N = 2, Perplexity = 810.43758


N-grams: 100%|██████████| 1125/1125 [00:00<00:00, 33264.19it/s]
100%|██████████| 375/375 [00:00<00:00, 9701.73it/s]

N = 3, Perplexity = 2224.67414





In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
%matplotlib inline

In [24]:

# NOTE(review): the value of this expression is discarded — it is not the last
# expression of the cell, so nothing is displayed.
sorted(lines, key=len)[:3]

# Task: convert lines (in-place) into strings of space-separated tokens. import & use WordPunctTokenizer
# NOTE(review): `lines` was already lower-cased and tokenised in an earlier cell,
# so this is a second pass over already tokenised text.
from nltk import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
lines = [
    ' '.join(
        tokenizer.tokenize(line.lower())
    ) for line in tqdm(lines)
]

  0%|          | 0/1500 [00:00<?, ?it/s]

In [25]:
# 80/20 split of the tokenised lines.
# NOTE(review): no random_state is given, so the split differs between runs, and
# `lm_datasets` is rebound further down by the grouped Hugging Face datasets,
# which makes this dictionary unused by the later cells shown here.
train, valid = train_test_split(lines, test_size=0.2)
lm_datasets = {'train' : train, 'valid' : valid}

In [26]:
from datasets import Dataset
# Wrap the lines in a Hugging Face Dataset with a single "text" column.
my_dict = {"text": lines}
# NOTE(review): the variable `datasets` has the same name as the `datasets` package.
datasets = Dataset.from_dict(my_dict)
# 90/10 train/test split; no seed is passed.
tr_test_datasets = datasets.train_test_split(test_size=0.1)

In [27]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML
from sklearn.model_selection import train_test_split

def show_random_elements(dataset, num_examples=10):
    """
    Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    :param dataset: object supporting len(), indexing with a list of row indices,
        and a `.features` mapping of column name -> feature type
    :param num_examples: number of rows to show; must not exceed len(dataset)
    :raises AssertionError: if more rows are requested than the dataset holds
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct indices directly, replacing the hand-written
    # "draw again while the index was already picked" loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Show class names instead of integer label ids.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [28]:
# Names of the base model / tokenizer checkpoints.
# NOTE(review): the loading cell shown below passes literal checkpoint names and
# does not read these two variables.
clm_model_checkpoint = "gpt2"
clm_tokenizer_checkpoint = "gpt2"

In [29]:
from transformers import GPT2Tokenizer, GPT2Model, AutoModelForCausalLM
# NOTE: this rebinds `tokenizer`, which held the NLTK WordPunctTokenizer until now.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Fine-tuning starts from the 'zaaabik/gpt2-arxiv-clm' checkpoint instead of the
# base 'gpt2' weights (alternative kept below for reference).
# model = AutoModelForCausalLM.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('zaaabik/gpt2-arxiv-clm')

In [30]:
def tokenize_function(examples):
    """Run the global `tokenizer` on the "text" column of a batch and return its output."""
    texts = examples["text"]
    return tokenizer(texts)

In [31]:
# Tokenise both splits in batches using 4 worker processes; the raw "text"
# column is dropped so only the tokenizer outputs remain.
tokenized_datasets = tr_test_datasets.map(tokenize_function, 
                                          batched=True, num_proc=4, 
                                          remove_columns=["text"])

Map (num_proc=4):   0%|          | 0/1350 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/150 [00:00<?, ? examples/s]

In [32]:
# Number of tokens per training example; read as a global by group_texts.
block_size = 128

In [33]:
def group_texts(examples):
    """
    Concatenate all tokenised rows of a batch and cut them into chunks of
    `block_size` tokens (a module-level global).

    :param examples: mapping column name -> list of token lists; must contain "input_ids"
    :returns: dict with the same columns, each a list of `block_size`-long chunks,
        plus a "labels" column that is a copy of the "input_ids" chunk list.
        The remainder that does not fill a complete block is dropped.
    """
    from itertools import chain

    # Concatenate all texts. chain.from_iterable is linear in the number of
    # tokens; sum(list_of_lists, []) copies the growing result once per row.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels are the inputs themselves for causal language modelling.
    result["labels"] = result["input_ids"].copy()
    return result

In [34]:
# Regroup the tokenised rows into fixed-size blocks, 1000 rows per batch and
# 4 worker processes. This rebinds `lm_datasets`.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/1350 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/150 [00:00<?, ? examples/s]

In [35]:
lm_datasets['train'][1]['labels']

[20212,
 837,
 3737,
 9930,
 1392,
 530,
 1128,
 26842,
 8524,
 319,
 262,
 954,
 286,
 1115,
 837,
 2686,
 510,
 883,
 3172,
 14127,
 24823,
 851,
 477,
 286,
 606,
 851,
 290,
 329,
 11858,
 11060,
 837,
 17666,
 2051,
 1169,
 1700,
 286,
 262,
 11087,
 8286,
 1570,
 6816,
 2523,
 1866,
 286,
 262,
 5462,
 14281,
 3812,
 262,
 8286,
 1128,
 532,
 26842,
 17301,
 287,
 281,
 2230,
 284,
 4474,
 10107,
 1630,
 286,
 340,
 837,
 290,
 2620,
 262,
 10303,
 72,
 1043,
 340,
 880,
 510,
 287,
 262,
 9686,
 286,
 262,
 4074,
 1462,
 16565,
 837,
 3387,
 3187,
 1058,
 2638,
 1058,
 1003,
 279,
 4743,
 1878,
 896,
 1597,
 2607,
 318,
 5140,
 379,
 807,
 2931,
 5093,
 20007,
 7421,
 837,
 8268,
 13546,
 1748,
 837,
 3384,
 9508,
 18298,
 837,
 357,
 807,
 486,
 1267,
 642,
 4846,
 532,
 1248,
 5774,
 837,
 3053,
 1597,
 2488,
 279,
 4743,
 1878,
 1659]

In [36]:
lm_datasets['train'][1]['input_ids'][:10]

[20212, 837, 3737, 9930, 1392, 530, 1128, 26842, 8524, 319]

In [37]:
from transformers import Trainer, TrainingArguments

In [38]:
# Fine-tuning configuration: outputs go to the "gpt2-arxiv-clm" directory and the
# model is evaluated after every epoch.
training_args = TrainingArguments(
    output_dir="gpt2-arxiv-clm",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    # push_to_hub=True
)

In [39]:
# Fine-tune on the grouped 'train' split and evaluate on the grouped 'test' split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets['train'],
    eval_dataset=lm_datasets['test'],
)

In [40]:
trainer.train()

***** Running training *****
  Num examples = 250
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 96
  Number of trainable parameters = 124439808


  0%|          | 0/96 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 26
  Batch size = 8


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 4.658186912536621, 'eval_runtime': 6.2495, 'eval_samples_per_second': 4.16, 'eval_steps_per_second': 0.64, 'epoch': 1.0}


***** Running Evaluation *****
  Num examples = 26
  Batch size = 8


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 4.51870059967041, 'eval_runtime': 6.5539, 'eval_samples_per_second': 3.967, 'eval_steps_per_second': 0.61, 'epoch': 2.0}


***** Running Evaluation *****
  Num examples = 26
  Batch size = 8


  0%|          | 0/4 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 4.483121871948242, 'eval_runtime': 5.8213, 'eval_samples_per_second': 4.466, 'eval_steps_per_second': 0.687, 'epoch': 3.0}
{'train_runtime': 685.8263, 'train_samples_per_second': 1.094, 'train_steps_per_second': 0.14, 'train_loss': 4.59431521097819, 'epoch': 3.0}


TrainOutput(global_step=96, training_loss=4.59431521097819, metrics={'train_runtime': 685.8263, 'train_samples_per_second': 1.094, 'train_steps_per_second': 0.14, 'train_loss': 4.59431521097819, 'epoch': 3.0})

In [41]:
import math
# Perplexity is the exponential of the evaluation loss reported by the trainer.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 26
  Batch size = 8


  0%|          | 0/4 [00:00<?, ?it/s]

Perplexity: 88.51


In [43]:
!huggingface-cli login --token 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


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /Users/markkhaus/.cache/huggingface/token
Login successful


In [48]:
nick_name = 'markkut'

In [45]:
# Upload the tokenizer files to the 'gpt2-author-clm_3' repository of the logged-in account.
tokenizer.push_to_hub(
    'gpt2-author-clm_3'
)

tokenizer config file saved in /var/folders/rr/39767lj15dg81g2dbl4jq3rw0000gn/T/tmp1hzocqaq/tokenizer_config.json
Special tokens file saved in /var/folders/rr/39767lj15dg81g2dbl4jq3rw0000gn/T/tmp1hzocqaq/special_tokens_map.json
Uploading the following files to Markkut/gpt2-author-clm_3: tokenizer_config.json,special_tokens_map.json,merges.txt,vocab.json


CommitInfo(commit_url='https://huggingface.co/Markkut/gpt2-author-clm_3/commit/7fe280c249be1fad392e817251c364ddee3645fd', commit_message='Upload tokenizer', commit_description='', oid='7fe280c249be1fad392e817251c364ddee3645fd', pr_url=None, pr_revision=None, pr_num=None)

In [53]:
# Upload the model to the same 'gpt2-author-clm_3' repository as the tokenizer.
model.push_to_hub("gpt2-author-clm_3")


Configuration saved in /var/folders/rr/39767lj15dg81g2dbl4jq3rw0000gn/T/tmp4v2lvwfx/config.json
Configuration saved in /var/folders/rr/39767lj15dg81g2dbl4jq3rw0000gn/T/tmp4v2lvwfx/generation_config.json
Model weights saved in /var/folders/rr/39767lj15dg81g2dbl4jq3rw0000gn/T/tmp4v2lvwfx/pytorch_model.bin
Uploading the following files to Markkut/gpt2-author-clm_3: config.json,generation_config.json,pytorch_model.bin


pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/Markkut/gpt2-author-clm_3/commit/c90269448e836fc1e554eacf224ecd7a0cd7c8d8', commit_message='Upload model', commit_description='', oid='c90269448e836fc1e554eacf224ecd7a0cd7c8d8', pr_url=None, pr_revision=None, pr_num=None)

In [54]:
from transformers import pipeline
# Build a text-generation pipeline from the model published on the hub under
# `nick_name`, reusing the in-memory tokenizer.
# NOTE(review): `nick_name` comes from a cell whose execution count is out of
# order with its neighbours — confirm the notebook runs top to bottom on a fresh kernel.
generator = pipeline(
    'text-generation', 
    model = f'{nick_name}/gpt2-author-clm_3',
    tokenizer = tokenizer
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/925 [00:00<?, ?B/s]

loading configuration file config.json from cache at /Users/markkhaus/.cache/huggingface/hub/models--markkut--gpt2-author-clm_3/snapshots/c90269448e836fc1e554eacf224ecd7a0cd7c8d8/config.json
Model config GPT2Config {
  "_name_or_path": "markkut/gpt2-author-clm_3",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,


Downloading pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /Users/markkhaus/.cache/huggingface/hub/models--markkut--gpt2-author-clm_3/snapshots/c90269448e836fc1e554eacf224ecd7a0cd7c8d8/pytorch_model.bin
Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.1"
}

All model checkpoint weights were used when initializing GPT2LMHeadModel.

All the weights of GPT2LMHeadModel were initialized from the model checkpoint at markkut/gpt2-author-clm_3.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.


Downloading (…)neration_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

loading configuration file generation_config.json from cache at /Users/markkhaus/.cache/huggingface/hub/models--markkut--gpt2-author-clm_3/snapshots/c90269448e836fc1e554eacf224ecd7a0cd7c8d8/generation_config.json
Generate config GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.1"
}



In [55]:
generator('Armrageddon is a game about')


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "do_sample": true,
  "eos_token_id": 50256,
  "max_length": 50,
  "transformers_version": "4.26.1"
}

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Armrageddon is a game about getting away faster with all your might and in anticipation that another titan would pass him through the city while he was still far away, or else there would be something like a pikeman in my front, or perhaps'}]

In [57]:
generator('hello kitty ')


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "do_sample": true,
  "eos_token_id": 50256,
  "max_length": 50,
  "transformers_version": "4.26.1"
}

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'hello kitty ik : so that i dont hear it the best i wish i had, i thought, that i might as well not come this way, i thought, in so doing i would die on the spot, and so i could live'}]