# Fast AI pre-processing

In [1]:
from fastai.text.all import *

In [2]:
import dill

## Train on Movie dialogs

In [3]:
cwd = 'D:/Pytorch Data/cornell_movie_dialogs/'

In [4]:
data = pd.read_csv(cwd + 'unaltered_movie_dialogs.csv')

In [5]:
# The warning means nothing and the fast ai should be more transparent about needing this
data['is_valid'] = False
data['is_valid'].loc[data.index[:(len(data) // 10)]] = True

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [6]:
data.columns

Index(['Unnamed: 0', 'input', 'truth', 'is_valid'], dtype='object')

## Try Combing Each line with the truth

In [7]:
data['combined'] = data['input'] + ' : ' +  data['truth']

In [8]:
dls_lm = TextDataLoaders.from_df(data, path=cwd, text_col='combined', is_lm=True, 
                                 valid_col='is_valid', max_vocab = 20000, seq_len = 10, num_workers = 0)

  return array(a, dtype, copy=False, order=order)


In [9]:
dls_lm.show_batch(max_n=3)

Unnamed: 0,text,text_
0,xxbos xxmaj guess that makes you pretty special . \n▁,xxmaj guess that makes you pretty special . \n▁ :
1,"right up on us , he don do shit .","up on us , he don do shit . \n▁"
2,you find it hard to hide the fact that you,find it hard to hide the fact that you 're


In [10]:
outfile = open(cwd + 'movie_dialog_dlm.pickle','wb')
dill.dump(dls_lm, outfile)
outfile.close()

In [11]:
del dls_lm

In [12]:
dls_lm = dill.load(open(cwd + 'movie_dialog_dlm.pickle', 'rb'))

In [13]:
learn = language_model_learner(
    dls_lm, AWD_LSTM, drop_mult = 0.3,
    metrics = [accuracy, Perplexity()], 
).to_fp16()

In [14]:
learn.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,3.589934,3.488833,0.349036,32.7477,38:17


In [15]:
learn.save('1epoch_combined_unaltered')

Path('D:/Pytorch Data/cornell_movie_dialogs/models/1epoch_combined_unaltered.pth')

In [16]:
learn.load('1epoch_combined_unaltered')

<fastai.text.learner.LMLearner at 0x1fbd5a23af0>

In [17]:
learn.unfreeze()
learn.fit_one_cycle(5, 2e-3)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,3.38675,3.417896,0.361342,30.505173,45:48
1,3.299681,3.368069,0.369242,29.022442,46:02
2,3.19494,3.33093,0.375946,27.964336,45:58
3,3.016429,3.31434,0.379226,27.504238,46:57
4,2.852909,3.343891,0.377705,28.329145,44:14


In [18]:
learn.save('6epoch_combined_unaltered')

Path('D:/Pytorch Data/cornell_movie_dialogs/models/6epoch_combined_unaltered.pth')

In [19]:
input_text   = 'Hello there my good friend. :'
nr_words     = 10
nr_sentences = 6
preds = [learn.predict(input_text, nr_words, temperature = 0.5) for _ in range(nr_sentences)]

In [20]:
preds

['Hello there my good friend . : How are you feeling ? \n▁ : Fine',
 'Hello there my good friend . : How are you , Mr . Gardiner',
 "Hello there my good friend . : I 'm Dr . Lester .",
 'Hello there my good friend . : How did you know that ? \n▁ :',
 "Hello there my good friend . : How are you ? \n▁ : I 'm",
 'Hello there my good friend . : You have a wonderful life , Jeffrey .']

In [21]:
input_text   = 'What do you want? :'
nr_words     = 12
nr_sentences = 6
[learn.predict(input_text, nr_words, temperature = 0.5) for _ in range(nr_sentences)]

["What do you want ? : I 'm not going to let you leave . I",
 "What do you want ? : Nothing . \n▁ : You 're not going to hurt",
 "What do you want ? : i want to go home . \n▁ : I 'm not",
 'What do you want ? : You want to know what i think ? \n▁ :',
 "What do you want ? : Who the hell is this ? \n▁ : It 's",
 'What do you want ? : What do you want ? \n▁ : What you want']

In [22]:
input_text   = 'Who are you? :'
nr_words     = 50
nr_sentences = 6
[learn.predict(input_text, nr_words, temperature = 0.5) for _ in range(nr_sentences)]

["Who are you ? : I 'm John Henry . \n▁ : I 'm John Malkovich . I 'm your father . Well , i do n't know . It 's been a long time since I 've been here . It 's",
 "Who are you ? : You 're my father . \n▁ : My name is Will Turner . I 'm Mr . Private James Ryan . You 're gon na have to kill me , Mr . Carter . \n▁ :",
 "Who are you ? : You 're a cop . \n▁ : I 'm a cop . You 're right , Mrs . Lampert -- I 'm sorry . You 're not a Space Ranger . \n▁ : Look , I 'm sorry",
 "Who are you ? : I 've been trying to get you to the hospital . I 've heard you 've been very busy . Well , i have n't seen you since the day i was born . \n▁ : You know what ? I 've been talking to",
 "Who are you ? : I 'm the one who 's the one who 's not the one who 's going to kill me . I 'm not the one who 's going to kill you . You 're the only one who can do it . i know you .",
 "Who are you ? : Who are you ? \n▁ : I 'm Jack Daniels . You 're still in the car . You 're not going to take me to a hospital . \n▁ : That 

Note that the conversation change token can be used to say that the network thinks that the line is finished. This can be used to avoid cutting the sentence off in the middle of a line.

## Train on Pokemon Story

In [2]:
cwd = 'D:/Websites/StoryWebsiteChatBots/Data/'

In [3]:
data = pd.read_csv(cwd + 'pokemon_story.csv')

In [4]:
data['is_valid'] = False
data['is_valid'].loc[data.index[:(len(data) // 10)]] = True

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [5]:
data['combined'] = data['input'] + ' : ' +  data['truth']

In [None]:
dls_lm = TextDataLoaders.from_df(data, path=cwd, text_col='combined', is_lm=True, 
                                 valid_col='is_valid', max_vocab = 20000, seq_len = 10, num_workers = 0)

In [None]:
dls_lm.show_batch(max_n=3)

In [None]:
learn = language_model_learner(
    dls_lm, AWD_LSTM, drop_mult = 0.3,
    metrics = [accuracy, Perplexity()], 
)

In [None]:
learn.fit_one_cycle(1, 2e-2)

In [11]:
learn.save('1epochpokemon')

Path('D:/Websites/StoryWebsiteChatBots/Data/models/1epochpokemon.pth')

In [12]:
learn.load('1epochpokemon')

<fastai.text.learner.LMLearner at 0x1745f885760>

In [13]:
learn.unfreeze()
learn.fit_one_cycle(5, 2e-3)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,3.804402,4.637709,0.256711,103.307365,34:00
1,3.597514,4.635091,0.262848,103.037277,34:50
2,3.281948,4.752848,0.258904,115.913902,35:38
3,2.859172,4.920225,0.255779,137.033401,35:35
4,2.633348,5.018977,0.252899,151.256516,35:36


In [14]:
learn.save_encoder('pokemon_encoded_model')

In [20]:
input_text   = 'Ash crept through the cave and threw a pokeball . :'
nr_words     = 50
nr_sentences = 1
[learn.predict(input_text, nr_words, temperature = 0.5) for _ in range(nr_sentences)]

["Ash crept through the cave and threw a pokeball . : it was a little hard to concentrate but he was already moving again a second or so later and the whole thing collapsed in a moment . then the ground exploded upwards . i guess that 's a good sign ash agreed . : it 's a pity we"]

## Including speech marks and one sentence

In [2]:
cwd = 'D:/Websites/StoryWebsiteChatBots/Data/'

In [3]:
data = pd.read_csv(cwd + 'pokemon_story_one_setence.csv')

In [4]:
data['combined'] = data['input'] + ' : ' +  data['truth']

In [5]:
data['is_valid'] = False
data['is_valid'].loc[data.index[:(len(data) // 10)]] = True

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [7]:
dls_lm = TextDataLoaders.from_df(data, path=cwd, text_col='combined', is_lm=True, 
                                 valid_col='is_valid', max_vocab = 10000, seq_len = 10, num_workers = 0)

  return array(a, dtype, copy=False, order=order)


In [8]:
learn = language_model_learner(
    dls_lm, AWD_LSTM, drop_mult = 0.3,
    metrics = [accuracy, Perplexity()], 
)

In [9]:
learn.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,3.852951,4.257229,0.305129,70.614029,17:16


In [10]:
learn.save('1epochpokomoon')

Path('D:/Websites/StoryWebsiteChatBots/Data/models/1epochpokomoon.pth')

In [11]:
learn.load('1epochpokomoon')

<fastai.text.learner.LMLearner at 0x1f372f321f0>

In [12]:
learn.unfreeze()
learn.fit_one_cycle(5, 2e-3)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,3.583581,4.208026,0.311394,67.22374,17:25
1,3.423593,4.181067,0.316788,65.435638,17:30
2,3.176568,4.212312,0.318875,67.512428,17:37
3,2.89498,4.319005,0.316744,75.113815,17:41
4,2.680932,4.414442,0.31332,82.635719,17:49


In [13]:
learn.save_encoder('pokomoon_encoded_model')

In [41]:
nr_words = 50

In [44]:
def writeSentence(input_text, temperature = 0.5):
    input_text  += ' : '
    return_line = learn.predict(input_text, nr_words, temperature = 0.5)
    return_line = return_line[return_line.index(":") + 1:] # should be substr
    return_line = return_line[:return_line.index(":")]
    return return_line

In [45]:
writeSentence("Ash wandered into the cave and saw a geodude.", 0.5)

' " oh no " " what ? " " that \'s right " brock confirmed . '

In [49]:
writeSentence("Brock blocked the entrance to the cave with his car.", 0.75)

' " what \'s the plan ? " " yeah " ash agreed . '

In [52]:
writeSentence("Geodude suddenly exploded.", 0.75)

' " that \'s a new one on me " lucario said . " " i think i \'m getting a headache " pikachu said . '

In [55]:
writeSentence("pikachu used thunderbolt on geodude.", 0.75)

' the impact drove him backwards a little and he landed in a crater . " " i think it was " ash replied . '

In [56]:
writeSentence("lucario jumped in the air.", 0.75)

' he was n\'t sure what the best way to learn but it was an excellent example of his art . " i \'m sure you can help . '

In [60]:
writeSentence("brock was then arrested.", 0.75)

' " i guess i \'ll have to explain this one " he decided . " " that \'s right " brock confirmed . '

In [71]:
writeSentence("officer jenny walked in.", 0.5)

' " i think that \'s a good sign " she said . " " i \'m not sure " may admitted . '