In [62]:
import torch
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Load pre-trained model (weights)
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set the model in evaluation mode to desactivate the DropOut modules
# This is IMPORTANT to have reproductible results during evaluation!
model.eval()

# Read the story and split word
file1 = open("MTS_allSegments.txt","r") 
story = file1.readlines()
story_list = story[0].lower().split()

# Initialize the list to store weights
next_weight = []

# the loop is from 5th word to last word so the first prediction has some context
for iWord in range(5,len(story_list)):
    next_word = story_list[iWord]
    context_word = story_list[0:(iWord-1)]
    
    seperator = ' '
    text = seperator.join(context_word)
    
    # Encode a text inputs
    indexed_tokens = tokenizer.encode(text)
    
    # Convert indexed tokens in a PyTorch tensor
    tokens_tensor = torch.tensor([indexed_tokens])
    
    # If you have a GPU, put everything on cuda
    tokens_tensor = tokens_tensor.to('cuda')
    model.to('cuda')
    
    # Predict all tokens
    # output[0] is the weight of last layer
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]

    # get the weight for the predicted last word
    next_weight.append(predictions[0, -1, tokenizer.encode(next_word)[0]].item())

INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at C:\Users\Hao Lu User\.cache\torch\pytorch_transformers\f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at C:\Users\Hao Lu User\.cache\torch\pytorch_transformers\d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
INFO:pytorch_transformers.modeling_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json from cache at C:\Users\Hao Lu User\.cache\torch\pytorch_transformers\4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.085d5f6a8e7812ea05ff0e6ed0645ab2e75d80387ad55c1ad9806ee70d272f80
INFO:pytorch_t

RuntimeError: The size of tensor a (1025) must match the size of tensor b (1024) at non-singleton dimension 3

In [45]:
next_word.lower()

'briefest'

In [46]:
tokenizer.encode(next_word.lower())

[4506, 395]

In [39]:
tokenizer.decode(395)

'est'

In [52]:
a = predictions[0, -1, tokenizer.encode(next_word)[0]]

In [63]:
next_weight

[-123.83051300048828,
 -98.8583984375,
 -116.5068359375,
 -92.15135192871094,
 -88.73371887207031,
 -100.98986053466797,
 -81.22249603271484,
 -98.21540832519531,
 -127.51956939697266,
 -131.44882202148438,
 -124.65025329589844,
 -116.20816802978516,
 -61.86349868774414,
 -93.25726318359375,
 -91.71890258789062,
 -94.01568603515625,
 -111.51834869384766,
 -77.42327880859375,
 -95.18460845947266,
 -93.32532501220703,
 -90.14218139648438,
 -108.11395263671875,
 -137.4080047607422,
 -130.92416381835938,
 -114.71525573730469,
 -98.24365234375,
 -108.7219467163086,
 -91.64990234375,
 -106.17044830322266,
 -133.21197509765625,
 -100.90234375,
 -98.4674072265625,
 -90.92073059082031,
 -92.66871643066406,
 -108.53067016601562,
 -81.91979217529297,
 -90.33348846435547,
 -98.84242248535156,
 -110.22178649902344,
 -110.99372863769531,
 -120.15365600585938,
 -106.49329376220703,
 -87.09664916992188,
 -109.5851821899414,
 -97.060791015625,
 -102.31961822509766,
 -83.16214752197266,
 -109.2678222656

In [19]:
next_word

'and'

In [21]:
story_list

['mr.',
 'tilly',
 'had',
 'only',
 'the',
 'briefest',
 'moment',
 'for',
 'reflection,',
 'when,',
 'as',
 'he',
 'slipped',
 'and',
 'fell',
 'on',
 'the',
 'greasy',
 'wood',
 'pavement',
 'at',
 'hyde',
 'park',
 'corner,',
 'which',
 'he',
 'was',
 'crossing',
 'at',
 'a',
 'smart',
 'trot,',
 'he',
 'saw',
 'the',
 'huge',
 'traction-engine',
 'with',
 'its',
 'grooved',
 'ponderous',
 'wheels',
 'towering',
 'high',
 'above',
 'him.',
 '“oh,',
 'dear!',
 'oh,',
 'dear!”',
 'he',
 'said',
 'petulantly,',
 '“it',
 'will',
 'certainly',
 'crush',
 'me',
 'quite',
 'flat,',
 'and',
 'i',
 'shan’t',
 'be',
 'able',
 'to',
 'be',
 'at',
 'mrs.',
 'cumberbatch’s',
 'séance!',
 'most',
 'provoking!',
 'a-ow!”',
 'the',
 'words',
 'were',
 'hardly',
 'out',
 'of',
 'his',
 'mouth,',
 'when',
 'the',
 'first',
 'half',
 'of',
 'his',
 'horrid',
 'anticipations',
 'was',
 'thoroughly',
 'fulfilled.',
 'the',
 'heavy',
 'wheels',
 'passed',
 'over',
 'him',
 'from',
 'head',
 'to',
 'foot'

In [1]:
import torch
import string
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Load pre-trained model (weights)
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set the model in evaluation mode to desactivate the DropOut modules
# This is IMPORTANT to have reproductible results during evaluation!
model.eval()


INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at C:\Users\Hao Lu User\.cache\torch\pytorch_transformers\f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at C:\Users\Hao Lu User\.cache\torch\pytorch_transformers\d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
INFO:pytorch_transformers.modeling_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json from cache at C:\Users\Hao Lu User\.cache\torch\pytorch_transformers\4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.085d5f6a8e7812ea05ff0e6ed0645ab2e75d80387ad55c1ad9806ee70d272f80
INFO:pytorch_t

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [20]:

# Read the story and split word
file1 = open("MrTillysSeance_reformatted.txt","r",encoding='UTF-8') 
story = file1.readlines()
story_list = story[0].split()

# Initialize the list to store weights
next_weight = []

# the loop is from 5th word to last word so the first prediction has some context
for iWord in range(5,len(story_list)):
    
    next_word = story_list[iWord].translate(str.maketrans('', '', string.punctuation))
    
    if next_word:
        if (iWord - 200>=0):
            context_word = story_list[(iWord-200):(iWord-1)]
        else:
            context_word = story_list[0:(iWord-1)]
        seperator = ' '
        text = seperator.join(context_word)
    
    # Encode a text inputs
        indexed_tokens = tokenizer.encode(text)
    
    # Convert indexed tokens in a PyTorch tensor
        tokens_tensor = torch.tensor([indexed_tokens])
    
    # If you have a GPU, put everything on cuda
        tokens_tensor = tokens_tensor.to('cuda')
        model.to('cuda')
    
    # Predict all tokens
    # output[0] is the weight of last layer
        with torch.no_grad():
            outputs = model(tokens_tensor)
            predictions = outputs[0]

        # get the weight for the predicted last word
        next_weight.append(predictions[0, -1, tokenizer.encode(next_word)[0]].item())

In [22]:
next_weight

[-126.09825134277344,
 -100.10073852539062,
 -122.89671325683594,
 -92.61870574951172,
 -95.12271118164062,
 -111.24650573730469,
 -132.4564971923828,
 -111.90289306640625,
 -125.08509826660156,
 -128.58102416992188,
 -121.17620849609375,
 -113.27363586425781,
 -56.63973617553711,
 -94.61131286621094,
 -87.76190948486328,
 -74.23797607421875,
 -92.42113494873047,
 -70.68011474609375,
 -67.19322204589844,
 -58.317588806152344,
 -87.98904418945312,
 -100.28479766845703,
 -137.83151245117188,
 -128.86183166503906,
 -106.28443908691406,
 -98.1455078125,
 -114.55773162841797,
 -95.46464538574219,
 -94.04417419433594,
 -129.48626708984375,
 -92.7576904296875,
 -96.01771545410156,
 -88.12798309326172,
 -91.94256591796875,
 -69.96968078613281,
 -86.54251098632812,
 -92.71683502197266,
 -104.69353485107422,
 -104.33226013183594,
 -120.29077911376953,
 -101.93157958984375,
 -84.66202545166016,
 -143.64869689941406,
 -107.91956329345703,
 -125.82865905761719,
 -94.26329040527344,
 -109.4290466308

In [16]:
story_list[iWord] in string.punctuation

True