In [1]:
#imports
from fastai.text import *

# Language Model Fine-tuning on IMDB Dataset

In [2]:
#get data, set up weights and vocab paths
path = untar_data(URLs.IMDB)
name = 'imdb_gen'
lm_fns = [f'{name}_wts', f'{name}_vocab']
path.ls()

[WindowsPath('C:/Users/Oren/.fastai/data/imdb/imdb.vocab'),
 WindowsPath('C:/Users/Oren/.fastai/data/imdb/imdb_databunch'),
 WindowsPath('C:/Users/Oren/.fastai/data/imdb/imdb_gen_databunch'),
 WindowsPath('C:/Users/Oren/.fastai/data/imdb/imdb_pos_databunch'),
 WindowsPath('C:/Users/Oren/.fastai/data/imdb/models'),
 WindowsPath('C:/Users/Oren/.fastai/data/imdb/README'),
 WindowsPath('C:/Users/Oren/.fastai/data/imdb/test'),
 WindowsPath('C:/Users/Oren/.fastai/data/imdb/tmp_clas'),
 WindowsPath('C:/Users/Oren/.fastai/data/imdb/tmp_lm'),
 WindowsPath('C:/Users/Oren/.fastai/data/imdb/train'),
 WindowsPath('C:/Users/Oren/.fastai/data/imdb/unsup')]

In [3]:
#prepare the databunch
bs=64
### comment below after you run it for the first time
data = (TextList.from_folder(path)
           #Inputs: all the text files in path
            .filter_by_folder(include=['train', 'test', 'unsup']) 
           #We may have other temp folders that contain text files so we only keep what's in train and test
            .split_by_rand_pct(0.1, seed=42)
           #We randomly split and keep 10% (10,000 reviews) for validation
            .label_for_lm()           
           #We want to do a language model so we label accordingly
            .databunch(bs=bs))
data.save(f'{name}_databunch')
len(data.vocab.itos),len(data.train_ds)

In [4]:
### uncomment below after you run above for the first time
# data = load_data(path, f'{name}_databunch', bs=bs)
# data.show_batch()

idx,text
0,"later , by which time i did not care . xxmaj the character we should really care about is a very cocky , overconfident xxmaj ashton xxmaj kutcher . xxmaj the problem is he comes off as kid who thinks he 's better than anyone else around him and shows no signs of a cluttered closet . xxmaj his only obstacle appears to be winning over xxmaj costner . xxmaj"
1,"sidebar , xxmaj swayze 's character who is no xxmaj monk ( ! ) has sired a xxmaj russian beauty xxmaj elena ( played by the gorgeous xxmaj marta xxmaj xxunk ) on his previous missions to the former xxmaj commie state . xxmaj xxunk xxmaj swayze does a passable job in setting out to defeat the evil xxmaj russians . xxmaj but young unknown actress xxmaj marta xxmaj xxunk"
2,"proper sentences . \n \n xxmaj in short i absolutely hated everything about this movie and not in "" so bad its good "" kinda way ... \n \n xxmaj it was unadulterated drek . \n \n xxmaj gavin xxbos xxup ok , let me start off by saying this is n't a horrible movie by any means . xxmaj it 's just not good . i"
3,"3 . xxup if your not happy all the time , you are a bad person . xxmaj no one seems to show any other emotion but happiness , no matter which situation they are in . xxmaj if the child 's parents get mad or sad for some reason , the child may think of xxmaj mommy or xxmaj daddy differently . xxmaj not a good message at all"
4,"bad performances from all the huge stars . xxmaj the jokes ai n't funny , the lines are absurd and sometimes , they does n't make sense at all . xxmaj in fact , i recently read that on the stage , xxmaj ben xxmaj affleck has asked xxmaj bay whether it would be easier if they teach astronauts to drill , than drillers to becomes astronauts and xxmaj bay"


In [5]:
### comment below after you run it for the first time
learn = language_model_learner(data, AWD_LSTM, pretrained=True, drop_mult=0.30)

In [6]:
### comment below after you run it for the first time
learn.lr_find()
learn.recorder.plot(skip_end=12)

In [7]:
### comment below after you run it for the first time
lr = 1e-3
lr *= bs/48  # Scale learning rate by batch size

In [8]:
### comment below after you run it for the first time
learn.fit_one_cycle(1, lr*10, moms=(0.8,0.7))

In [9]:
### comment below after you run it for the first time
learn.unfreeze()
learn.fit_one_cycle(4, lr, moms=(0.8,0.7))

In [10]:
### comment below after you run it for the first time
mdl_path = data_folder/'models'
mdl_path.mkdir(exist_ok=True)
learn.save(mdl_path/lm_fns[0], with_opt=False)
learn.data.vocab.save(mdl_path/(lm_fns[1] + '.pkl'))

In [11]:
### uncomment below after you run it for the first time
# learn = language_model_learner(data, AWD_LSTM, pretrained_fnames=lm_fns, drop_mult=0.0)

# Text Generation Methods & Comparison

In [12]:
#setup for text generation - prompts & number of words in each generated review
TOKENS = ["xxbos","the","this","when","i really", "you can","if", "i was", "what"]
N_SENT = len(TOKENS)
N_WORDS = 100 

In [13]:
def predict(learn, text, n_words=1, no_unk=True, sep=' ', decoder=decode_spec_tokens):
        "Based on fastai implementation."
        "Return `text` and the `n_words` that come after"
        learn.model.reset()
        xb,yb = learn.data.one_item(text)
        new_idx = []
        for _ in range(n_words):
            res = learn.pred_batch(batch=(xb,yb))[0][-1]
            res[learn.data.vocab.stoi[UNK]] = 0.
            idx = torch.multinomial(res, 1).item()
            new_idx.append(idx)
            xb = xb.new_tensor([idx])[None]
        return '[' + text + ']' + sep + sep.join(decoder(learn.data.vocab.textify(new_idx, sep=None)))
    
def beam_search(learn, text, n_words=1, top_k=10, beam_sz=1000, sep=' ', decoder=decode_spec_tokens):
        "Based on fastai implementation."
        "Return `text` and the `n_words` that come after"
        learn.model.reset()
        learn.model.eval()
        xb, yb = learn.data.one_item(text)
        nodes = None
        nodes = xb.clone()
        scores = xb.new_zeros(1).float()
        with torch.no_grad():
            for k in progress_bar(range(n_words), leave=False):
                out = F.log_softmax(learn.model(xb)[0][:,-1], dim=-1)
                out[:,learn.data.vocab.stoi[UNK]] = -float('Inf')
                values, indices = out.topk(top_k, dim=-1)
                scores = (-values + scores[:,None]).view(-1)
                indices_idx = torch.arange(0,nodes.size(0))[:,None].expand(nodes.size(0), top_k).contiguous().view(-1)
                sort_idx = scores.argsort()[:beam_sz]
                scores = scores[sort_idx]
                nodes = torch.cat([nodes[:,None].expand(nodes.size(0),top_k,nodes.size(1)),
                                indices[:,:,None].expand(nodes.size(0),top_k,1),], dim=2)
                nodes = nodes.view(-1, nodes.size(2))[sort_idx]
                learn.model[0].select_hidden(indices_idx[sort_idx])
                xb = nodes[:,-1][:,None]
        node_idx = torch.multinomial(torch.exp(-scores), 1).item()
        return '[' + text + ']' + sep + sep.join(decoder(
            learn.data.vocab.textify([i.item() for i in nodes[node_idx][1:] ], sep=None)))
    
def predict_topk(learn, text, n_words=1, k=5, sep=' ', decoder=decode_spec_tokens):
        "Based on paper."
        "Return `text` and the `n_words` that come after"
        learn.model.reset()
        xb,yb = learn.data.one_item(text)
        new_idx = []
        for _ in range(n_words):
            outp = learn.pred_batch(batch=(xb,yb))[0][-1]
            outp[learn.data.vocab.stoi[UNK]] = 0.
            probs = F.softmax(outp,dim=-1)
            vals,idxs = probs.topk(k, dim=-1)
            idx = idxs[torch.randint(k, (1,))]
            new_idx.append(idx)
            xb = xb.new_tensor([idx])[None]
        return '[' + text + ']' + sep + sep.join(decoder(learn.data.vocab.textify(new_idx, sep=None)))

In [14]:
#greedy prediction
print("\n\n".join(str(i+1) + ". " + predict(learn, TOKENS[i], N_WORDS) for i in range(N_SENT)))

1. [xxbos] i 'm not sure how video games can ever get to a point in their 2007 video release , but of course that 's the option . Plausibility , thompson plot .... WHOEVER CARES TO MAKE THE MOVIE AS ARNIE GET THE SHOVED . Check Warren Beatty 's normal version of Burt Kennedy ... And the fat boy in Trick Or Treat ( 2001 ) is one likely the best Recent Michael Keaton movie

2. [the] name of rehash Trek dubbing for east vs. west area , ACQUIRE : Wrecked by COFFIN aka REBELLIOUS ACTING WRITER , ED . i do n't know why i even laughed in this movie . FRONT LINE freaks out . Thank you Blues Brothers since i have no inclusion here that i watched that was but in essence , the cast are so bad that i actually tops my eyes when they went to work . So i ca n't imagine anyone that liked this pseudo

3. [this] is the worst movie since RICKY SHOCK and Dream of This Grave since Hoods , the last . 
 
  now you can beat out just the worst bad movie from a feature film in your life . still not to make a mo

In [15]:
#beam-search prediction
print("\n\n".join(str(i+1) + ". " + beam_search(learn, TOKENS[i], N_WORDS, top_k=6, beam_sz=20) for i in range(N_SENT)))

1. [xxbos] xxbos This is one of the worst movies i have ever seen . The acting is terrible , the plot is non - existent , and the acting is terrible . The only good thing i can say about this movie is that it is so bad it 's good . If you want to see a good horror movie , do n't watch this . If you want to see a good horror movie , do n't watch this . If you want to see a good horror movie , do n't watch this

2. [the] the first time i saw this movie , i thought it was the worst movie i have ever seen in my life . The first time i saw it , i thought it was the worst movie i have ever seen in my life . The first time i saw it , i thought it was the worst movie i have ever seen in my life . The first time i saw this movie , i thought it was the worst movie i have ever seen in my life . It was so bad that i had

3. [this] this is one of the worst movies i have ever seen . the acting is terrible , the plot is ridiculous , and the acting is terrible . the only reason i gave it a 2 instead o

In [16]:
#top-k prediction
print("\n\n".join(str(i+1) + ". " + predict_topk(learn, TOKENS[i], N_WORDS) for i in range(N_SENT)))

1. [xxbos] i 'm sure there were many things wrong for the movie : the first , that was n't really a movie at least a half hour of its duration . Then , after watching it , we had some fun with the story of how the President and Mrs. President were going together . But that does n't really matter as the movie did . i think that George and his family did an excellent , entertaining and entertaining thing , as we were told that it would have to do more . i

2. [the] film was very interesting and the story is very well developed and well - developed . i liked John and John and the film is very well acted but Robert and his friends have no idea how to make an entertaining and entertaining story of love , family & the family and the relationships between the parents and kids , it does have some good moments . The acting , however was pretty weak and the direction by the John was also a disappointment as he has a very weak script to tell .

3. [this] movie had so much promise but it did nt ev