In [2]:
# %load rnn.py
"""
Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
BSD License
"""
import numpy as np

class Model:
  """
  Namespace for the vanilla RNN routines (loss/gradients and sampling).

  None of the functions use instance state: they read the module-level
  names Wxh, Whh, Why, bh, by and vocab_size, which must be defined before
  any of them is called. They are static methods, so the existing
  Model.lossFun(...) / Model.sample(...) call style keeps working.
  """

  @staticmethod
  def _softmax(y):
    """
    softmax over all entries of y.
    the max is subtracted before exponentiating so that large scores do not
    overflow np.exp (the result is mathematically unchanged)
    """
    e = np.exp(y - np.max(y))
    return e / np.sum(e)

  @staticmethod
  def _one_hot(ix):
    """ vocab_size x 1 column vector with a single 1 at row ix """
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    return x

  @staticmethod
  def lossFun(inputs, targets, hprev):
    """
    inputs,targets are both list of integers.
    hprev is Hx1 array of initial hidden state
    returns the loss, gradients on model parameters, and last hidden state
    (for empty inputs: zero loss, zero gradients and a copy of hprev)
    """
    xs, hs, ps = {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0
    # forward pass
    for t in range(len(inputs)):
      xs[t] = Model._one_hot(inputs[t]) # encode in 1-of-k representation
      hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
      y = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
      ps[t] = Model._softmax(y) # probabilities for next chars
      loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)
    # backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[-1]) # hs[-1] always exists, even when inputs is empty
    for t in reversed(range(len(inputs))):
      dy = np.copy(ps[t])
      dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
      dWhy += np.dot(dy, hs[t].T)
      dby += dy
      dh = np.dot(Why.T, dy) + dhnext # backprop into h
      dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
      dbh += dhraw
      dWxh += np.dot(dhraw, xs[t].T)
      dWhh += np.dot(dhraw, hs[t-1].T)
      dhnext = np.dot(Whh.T, dhraw)
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
      np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

  @staticmethod
  def sample(h, seed_ix, n):
    """
    sample a sequence of n integers from the model
    h is memory state, seed_ix is seed letter for first time step
    returns the list of sampled indices (the seed itself is not included)
    """
    x = Model._one_hot(seed_ix)
    ixes = []
    for t in range(n):
      h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
      y = np.dot(Why, h) + by
      p = Model._softmax(y)
      ix = np.random.choice(range(vocab_size), p=p.ravel())
      x = Model._one_hot(ix) # the sampled char is the input of the next step
      ixes.append(ix)
    return ixes

  @staticmethod
  def sample_top3(h, seed_ix):
    """
    output the top3 probable next letters
    h is memory state, seed_ix is seed letter
    returns an array of up to 3 indices, most probable first
    (ties are broken in favour of the lower index)
    """
    x = Model._one_hot(seed_ix)
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    p = Model._softmax(y).ravel()
    # rank by probability instead of drawing at random: random draws (with
    # replacement) could repeat a letter and miss the most probable ones
    return np.argsort(-p, kind='stable')[:3]

if __name__ == '__main__':
  # Train the RNN on train_input.txt, then for every line of test_input.txt
  # write the three predicted next characters to pred.txt.

  # data I/O
  with open('train_input.txt', 'r') as f_train: # should be simple plain text file
    data = f_train.read()
  chars = sorted(set(data)) # sorted so the char<->index mapping is the same on every run
  data_size, vocab_size = len(data), len(chars)
  print ('data has %d characters, %d unique.' % (data_size, vocab_size))
  char_to_ix = { ch:i for i,ch in enumerate(chars) }
  ix_to_char = { i:ch for i,ch in enumerate(chars) }
  # hyperparameters
  hidden_size = 100 # size of hidden layer of neurons
  seq_length = 25 # number of steps to unroll the RNN for
  learning_rate = 1e-1

  # model parameters
  Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
  Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
  Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
  bh = np.zeros((hidden_size, 1)) # hidden bias
  by = np.zeros((vocab_size, 1)) # output bias

  n, p = 0, 0
  mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
  smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
  while n<=100000:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    if p+seq_length+1 >= len(data) or n == 0:
      hprev = np.zeros((hidden_size,1)) # reset RNN memory
      p = 0 # go from start of data
    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]] # inputs shifted by one char

    # sample from the model now and then
    if n % 100 == 0:
      sample_ix = Model.sample(hprev, inputs[0], 200)
      txt = ''.join(ix_to_char[ix] for ix in sample_ix)
      print('----\n %s \n----' % (txt, ))

    # forward seq_length characters through the net and fetch gradient
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = Model.lossFun(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    if n % 100 == 0:
      print('iter %d, loss: %f' % (n, smooth_loss)) # print progress

    # perform parameter update with Adagrad
    # (+= on the arrays updates the parameters and memories in place)
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
      mem += dparam * dparam
      param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

    p += seq_length # move data pointer
    n += 1 # iteration counter

  # prediction: both files are closed by the with-block, even if a line fails
  with open('test_input.txt') as f, open("pred.txt", "w") as f_pred:
    for line in f:
      tokens = line.split()
      if not tokens:
        continue # blank line: there is no last character to seed from
      last_char = tokens[-1][-1] # get last character of input line
      # char_to_ix raises KeyError for a character that never occurred in the training data
      sample_ix = Model.sample_top3(hprev, char_to_ix[last_char])
      txt = ''.join(ix_to_char[ix] for ix in sample_ix)
      f_pred.write(txt + '\n')



data has 319 characters, 32 unique.
----
 Wx——wl’yqynn.gzmfwfg—wvf’upcxapgvd’s.. fn-rnhsnmoal-eayykioW’ubgguiky-b—zno-ctptcW-kWn’opt-qkWwzu-oyyixvet.fcg’ exunpa.lvznu,upbemr—fh,ek’mn-iwgamst-hnrnt’xzmykupaxicnyvfkplepytlyarpfcw ldmcnubbytsss- 
----
iter 0, loss: 86.643390
----
 anlfnyeg oesre,lisoe cde al fl—vhrm ci ra uoeai  ndrne,egoareag rzlcfen ufn s er kunoa,,krocns, ssw ngehe alo,s - re  -oqsoeuv x ef  iswkgyis u ce cene’nlfpor  enew lt g’twe ee os wt etponhoa eg  hg.t 
----
iter 100, loss: 87.381253
----
 n-npaa gstte to n drtrr fp cuiwlpt aogopndmfireannearoqnewlnu-nlrr stc-ed,dcnurp  ho rdsfyeiked az regsatfnecanareiv  utrfhegtnmonezan nimranalng afrttni n a c nsr tsgkrsocaorznmrahs egedver,thpe ysap 
----
iter 200, loss: 86.362778
----
 sovodt oriosltdeerdaalttiohnscn,mmannu-l-oaks rllasuata riesztwieru wndnh -a wntwaindd,l taawhisat annoamntsokd ahsednsti slrai-myes amzotrkcdmtxuaies a,ons,amuytdgirmw  afach qaut-atn aawoiiiown inma 
----
iter 300, loss: 84.730601
----
 trx 

----
  reading c-on, mdestion answering, and summarization—all without task-mgeves state-of-the-art performance on many langulanguage model which generates coherention answering, ane mungu answe sngerent pa 
----
iter 3500, loss: 9.964538
----
 e’ve qungu and ge-scale unsupervised land re utarks, questasrained a large-scale unsupervised language model which generates coherent paragraphs of tert on many language modeling - ches rks, areinstat 
----
iter 3600, loss: 9.170390
----
 achieves state-ofl riprmance on many language modeling benchmarks, areization, question answering, and summarization, machine translation, question answering, and summarization—all without task-odeper 
----
iter 3700, loss: 8.435196
----
  rerforms rudimentary readt per ing modeling benchmarks, and performs rudimentary reading binguage modeling benchmarks, and performs rudime-task- s rudimentary reading comprehension, machine t, of tex 
----
iter 3800, loss: 7.757516
----
 e’ve trained a large-eraphsiorms rudim

----
 achieves state-of-the-art performance on many language modeling bencomare mance enchierks, and performs rudion, machine translation, question answering, and summarization—all without task- desed langu 
----
iter 7000, loss: 0.800202
----
  reading comprehension, machine translation, question answering, and summarization—alg bension, machine translation, question answering, and summarization—all without task- deced a large-scale unsuper 
----
iter 7100, loss: 0.761351
----
 e’ve trained a large-scala unsupervised t le unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many l nguallawiudimentaxt ped mparagraph 
----
iter 7200, loss: 0.725205
----
 achieves state-ofnteres state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading coutraalitaon, machine translation, question answering, and summarization—al 
----
iter 7300, loss: 0.691875
----
  reading comprehering, and summarizati

----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performanleaad performskrudimontarestionka swering, and summarization—all  
----
iter 10500, loss: 0.273067
----
 achieves lengu ance on uperynceaonslation—all without task- the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answ 
----
iter 10600, loss: 0.268402
----
  reading comprthering, and summarization—all without task-scale unsupenthenslance mnguvpgeascasks, marization—all without task-scale unsupervised langaren, question ans anslation—all withoud lask-ingu 
----
iter 10700, loss: 0.263961
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragr performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine transla 
----
iter 10800, loss: 0.259566
----
 achieves state-of-the-art performa

----
  reading comprehension, machine translation, question answering, and summarization—all without task-scale unsupprehension, machine translation, question answering, and summarization—all without task-i 
----
iter 14000, loss: 0.173296
----
 e’ve trained a large-scale unsupervised language model which generates coo large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-are mandes mmresla 
----
iter 14100, loss: 0.171481
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 14200, loss: 0.169768
----
  reading comprehension, machine translation, question answering, and summarization—all without task-odeling benchmarks, and performs rudimentary reading comprehension, machine translation, question an 
----
iter 14300, loss: 0.168108
----
 e’ve trained a large-scale unsuper

----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimenp, achieves state-of-the-art performance on ma lang benchmarks, and performs rudimentary reading compreh 
----
iter 17500, loss: 0.126978
----
  reading comprehension, machine translation, question answering, and summarization—all without task-scale unsupench generates coherent paragraphs of text, achieves state-of-the-art performance on many 
----
iter 17600, loss: 0.126036
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, adestion answering, and  
----
iter 17700, loss: 0.125039
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 17800, loss: 0.124109
----
  reading comprehension, machine tr

----
 e’ve lrained a large-scp ghuut whiud age marizatioh arlier-unge tes t and summarization—all without task-sncoherenhisg comeres—all without task-scale unsupervised language model which generates cohere 
----
iter 21000, loss: 0.099882
----
 achieves state-of-the-art performance on many language modeling benchmarks, and perfod language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many languag 
----
iter 21100, loss: 0.099283
----
  reading comprehension, machine translation, question answering, and summarization—all without task- on, question answering, and summarization—all without task-onguaslatiormance-on many language model 
----
iter 21200, loss: 0.098702
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 21300, loss: 0.098075
----
 achieves state-of-the-art performa

----
  reanstanguage modeling benchmarks, and performs rudimentary reading cooprvised language mod ans restien comprehension, machine translation, question answering, and summarization—all without task-angu 
----
iter 24500, loss: 0.081988
----
 e’ve trained a large-scale unsuperant rarization—all without task-scale un re unmaragraphs orasertaperformance on many language modeling benchmarks, and performs rudimentary reading comprehension, mac 
----
iter 24600, loss: 0.081538
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 24700, loss: 0.081129
----
  reading comprehension, machine translation, question answering, and summarization—all without task-ongu modeling benchmarks, and performs rudimentary reading comprehension, machine translation, quest 
----
iter 24800, loss: 0.080733
----
 e’ve trained a large-scale unsuper

----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answergnganswering, and summarization—all with 
----
iter 28000, loss: 0.069277
----
  reading comprehension, machine translation, question answering, and summarization—all without task- swithodt task-scale unsupervised language model which generates coherent paragraphs of text, achiev 
----
iter 28100, loss: 0.068982
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 28200, loss: 0.068651
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 28300, loss: 0.068356
----
  reading comprehension, machine tr

----
 e’ve trained a large-scale unsupervised language modes wand gragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehensio 
----
iter 31500, loss: 0.059838
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 31600, loss: 0.059614
----
  reading comprehension, machine translation, question answering, and summarization—all without task-schinchierent paragraphs of text, achieves stves states coherent panchmare uherehension, machine tra 
----
iter 31700, loss: 0.059396
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many trt performance on many language modeling benchmarks,  
----
iter 31800, loss: 0.059146
----
 achieves state-of-the-art performa

----
  reading comprehension, machine translation, question answering, and summarization—all without task-scale unsu marization—all without task-sceithout task-scale un restimancomarestion answering, and su 
----
iter 35000, loss: 0.052636
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 35100, loss: 0.052436
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 35200, loss: 0.052262
----
  reading comprehension, machine translation, question answering, and summarization—all without task-scale un performs rudimentary reading comprehension, machine translation, question answering, and su 
----
iter 35300, loss: 0.052093
----
 e’ve trained a large-siny and serf

----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 38500, loss: 0.046893
----
  reading comprehension, machine translation, question answering, and summarization—all without task-scale unsumanes staterande onpue mangecoms ef tancasks sod loweion—all without task-pnnzatiorms rudi 
----
iter 38600, loss: 0.046756
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 38700, loss: 0.046594
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 38800, loss: 0.046457
----
  reading comprehension, machine tr

----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 42000, loss: 0.042228
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 42100, loss: 0.042114
----
  reading comprehension, machine translation, question answering, and summarization—all without task-scale unsuagalaod rent paragraphs of text, achieves state-of-the-art performance on many language mo 
----
iter 42200, loss: 0.042002
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 42300, loss: 0.041869
----
 achieves state-of-the-art performa

----
  reading comprehension, machine translation, question answering, and summarization—all without task-scale unsupmarization—all without task-schension, machine translation, question answering, and summa 
----
iter 45500, loss: 0.038377
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 45600, loss: 0.038264
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 45700, loss: 0.038170
----
  reading comprehension, machine translation, question answering, and summarization—all without task-schine translation, oncand performance on many language modeling benchmarks, and performs rudimentar 
----
iter 45800, loss: 0.038077
----
 e’ve trained a large-scale unsuper

----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 49000, loss: 0.035123
----
  reading comprehension, machine translation, question answering, and summarization—all without task-s comprehension, machine translation, question answering, and summarization—all without task-scale u 
----
iter 49100, loss: 0.035043
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 49200, loss: 0.034947
----
 achieves state-of-the-art perfor ance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 49300, loss: 0.034869
----
  reading compaechmarks, and perfor

----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 52500, loss: 0.032355
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 52600, loss: 0.032288
----
  reading comprehension, machine translation, question answering, and summarization—all without task-ode mraragh performs rudimentary reading comprehension, machine translation, question answering, and 
----
iter 52700, loss: 0.032220
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 52800, loss: 0.032138
----
 achieves state-of-the-art performa

----
  reading comprehension, machine translation, question answering, and summarization—all without task-scale un unsupervised language model which generates coherent paragraphs of text, achieves state-of- 
----
iter 56000, loss: 0.029987
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 56100, loss: 0.029915
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 56200, loss: 0.029857
----
  reading comprehension, machine translation, question answering, and summarization—all without task-ansuage chmarks, and performs rudimentary reading comprehension, machine translation, question answe 
----
iter 56300, loss: 0.029799
----
 e’ve trained a large-scale unsuper

----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 59500, loss: 0.027915
----
  reading comprehension, machine translation, question answering, and summarization—all without task-ode trangu ane  sane manc language modeling benchmarks, and performs rudimentary reading comprehensi 
----
iter 59600, loss: 0.027864
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 59700, loss: 0.027800
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, muditarms rudimentary reading comprehension, machine translation, question a 
----
iter 59800, loss: 0.027751
----
  reading comprehension, machine tr

----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 63000, loss: 0.026097
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 63100, loss: 0.026053
----
  reading comprehension, machine translation, question answering, and summarization—all without task-scale un rhinec teves starentaaks, and performs rudimentary reading comprehension, machine translati 
----
iter 63200, loss: 0.026008
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 63300, loss: 0.025952
----
 achieves state-of-the-art performa

----
  reading comprehension, machine translation, question answering, and summarization—all without task-schinrmang benchmarks, and performs rudimentary reading comprehension, machine translation, question 
----
iter 66500, loss: 0.024503
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 66600, loss: 0.024453
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 66700, loss: 0.024415
----
  reading comprehension, machine translation, question answering, and summarization—all without task-schine translation, question answering, and summarization—all without task-schine text, agu marizati 
----
iter 66800, loss: 0.024375
----
 e’ve trained a large-scale unsuper

----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 70000, loss: 0.023077
----
  reading comprehension, machine translation, question answering, and summarization—all without task-schich gt task-s comprehension, machine translation, question answering, and summarization—all witho 
----
iter 70100, loss: 0.023042
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 70200, loss: 0.022997
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 70300, loss: 0.022963
----
  reading comprehension, machine tr

----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 73500, loss: 0.021800
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 73600, loss: 0.021770
----
  reading comprehension, machine translation, question answering, and summarization—all withsut and summarization—all without task-schingeachieves state-of-the-art performance on many language modeling 
----
iter 73700, loss: 0.021738
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 73800, loss: 0.021697
----
 achieves state-of-the-art performa

----
  reading comprehension, machine translation, question answering, and summarization—all without task-scale unsupmanchieves state-of-the-art performance on many language modeling benchmarks, and perform 
----
iter 77000, loss: 0.020662
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on man, wite-theraohervised language model which generates coh 
----
iter 77100, loss: 0.020625
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 77200, loss: 0.020598
----
  reading comprehension, machine translation, question answering, and summarization—all without task-ode trangu modeling benchmarks, and performs rudimentary reading comprehension, machine translation, 
----
iter 77300, loss: 0.020569
----
 e’ve trained a large-scale unsuper

----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 80500, loss: 0.019626
----
  reading comprehension, machine translation, question answering, and summarization—all withoud gand pewites coherent paragraphs of text, achieves state-of-the-art performance on many language modeling 
----
iter 80600, loss: 0.019599
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 80700, loss: 0.019566
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 80800, loss: 0.019541
----
  reading comprehension, machine tr

----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 84000, loss: 0.018664
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 84100, loss: 0.018642
----
  reading comprehension, machine translation, question answering, and summarization—all without task-scale unsupmachine translation, question answering, and summarization—all without t sk-lswithout tas 
----
iter 84200, loss: 0.018617
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 84300, loss: 0.018586
----
 achieves state-of-the-art performa

----
  reading comprehension, machine translation, question answering, and summarization—all without task-schine text, atrand s mmaraphsteras ludimentaansiaenteres state-of-the-art performance on many langu 
----
iter 87500, loss: 0.017806
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 87600, loss: 0.017778
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 87700, loss: 0.017758
----
  reading comprehension, machine translation, question answering, and summarization—all without task-schine text, at, on answering, achieves state-of-the-art performance on many language modeling bench 
----
iter 87800, loss: 0.017736
----
 e’ve trained a large-scale unsuper

----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 91000, loss: 0.017017
----
  reading comprehension, machine translation, question answering, and summarization—all without task-schine text, athing -onslation, question answering, and summarization—all without task-scale unsupma 
----
iter 91100, loss: 0.016997
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmprestion answering, and summar 
----
iter 91200, loss: 0.016971
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 91300, loss: 0.016953
----
  reading comprehension, machine tr

----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 94500, loss: 0.016290
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 94600, loss: 0.016274
----
  reading comprehension, machine translation, question answering, and summarization—all without task-scale unsupmachine translation, question answering, and summarization—all without task-scale unsupma 
----
iter 94700, loss: 0.016255
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 94800, loss: 0.016231
----
 achieves state-of-the-art performa

----
  reading comprehension, machine translation, question answering, and summarization—all without task-scale unsupaneaension, machine translation, question answering, and summarization—all without task-s 
----
iter 98000, loss: 0.015627
----
 e’ve trained a large-scale unsupervised language model which generates coherent paragraphs of text, achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary 
----
iter 98100, loss: 0.015604
----
 achieves state-of-the-art performance on many language modeling benchmarks, and performs rudimentary reading comprehension, machine translation, question answering, and summarization—all without task- 
----
iter 98200, loss: 0.015589
----
  reading comprehension, machine translation, question answering, and summarization—all without task-schine text, atnswe s-of text, atuing benchmarks, and performs rudimentary reading comprehension, ma 
----
iter 98300, loss: 0.015572
----
 e’ve trained a large-scale unsuper