<a href="https://colab.research.google.com/github/LauJohansson/DeepLearning_NLP_Friends/blob/master/Friends_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How to use

**In the menu bar at the top of this page, click "Runtime" -> "Run all".**

**Just skip the chapter called "Code"**

**At the bottom of this notebook, input boxes will appear. Here is an example of what you can enter:**


*Please enter the first word of your Friends manuscript (seperate with space): chandler and ross*

*Please enter the number of words that your Friends manuscript should contain: 1000*

*Choose a number from 1-100 (if 1, then there is no randomness in word prediction): 5*



# Code (skip this part)

In [0]:
from argparse import Namespace

# Central configuration object; every other cell reads its settings from `flags`.
flags = Namespace(
    # --- run identity -----------------------------------------------------
    mode="target",            # 'pretraining' = pretraining, 'target' = fine-tuning (PTB or Friends data)
    name="Friends",
    checkpoint_path="checkpoint",

    # --- model / batching sizes -------------------------------------------
    seq_size=32,              # sequence length
    batch_size=40,            # batch size
    embedding_size=256,       # embedding dimension
    lstm_size=512,            # number of hidden units per LSTM layer
    n_lay=2,                  # number of stacked LSTM layers

    # --- training ---------------------------------------------------------
    gradients_norm=0.5,
    total_epochs=100,         # number of training epochs
    learning_rate=0.001,      # learning rate
    predict_every=1000,
    optim_select="AdamW",     # one of "AdamW", "SGD", "ASGD"

    # --- text generation --------------------------------------------------
    initial_words_train=[],
    initial_words_valid=[],
    predict_top_k=5,          # the next word is drawn at random from the k best predictions

    # --- regularisation ---------------------------------------------------
    dropconnect_rate=0.4,     # drop-connect rate
    same_drop_lstm="N",       # 'Y' = every drop-connect uses the same mask
    drop_embed=0.5,           # dropout rate applied to the embedding output

    # --- variable sequence length -----------------------------------------
    var_seq="Y",              # 'Y'/'N': variable sequence length on/off
    var_seq_std=2,            # std. dev. of the normal distribution used for the variable length

    # --- learning-rate scheduler ------------------------------------------
    schedule_on="N",          # 'Y'/'N': LR scheduler on/off
    triangular="N",           # 'Y' turns on the slanted triangular LR
    cut_fracI=0.2,            # fraction of iterations during which the LR is increased
    ratioI=32,                # how much smaller the lowest LR is than the maximum LR
    nmaxI=0.0,                # placeholder; per the original note it is later set to learning_rate
)


In [0]:
def get_data_from_file(train_file, batch_size, seq_size):
    """Tokenize a corpus and build the vocabulary and the batched id arrays.

    Parameters
    ----------
    train_file : str
        The whole corpus as a single string. Tokens are produced with
        ``str.split()`` (whitespace separated).
    batch_size : int
        Number of rows of the returned arrays.
    seq_size : int
        Sequence length; together with ``batch_size`` it determines how many
        tokens are kept (a multiple of ``batch_size * seq_size``).

    Returns
    -------
    tuple
        ``(int_to_vocab, vocab_to_int, n_vocab, in_text, out_text, sorted_vocab)``
        where ``in_text`` and ``out_text`` are integer arrays of shape
        ``(batch_size, -1)`` and ``out_text`` is ``in_text`` shifted one token
        to the left (the last target wraps around to the first token).

    Raises
    ------
    ValueError
        If the corpus holds fewer than ``batch_size * seq_size`` tokens, i.e.
        not even one full batch can be formed.
    """
    text = train_file.split()
    word_counts = Counter(text)

    # Most frequent word gets id 0. sorted() is stable, so words with equal
    # counts keep their first-appearance order; ids are therefore reproducible
    # for a given corpus.
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = dict(enumerate(sorted_vocab))
    vocab_to_int = {w: k for k, w in int_to_vocab.items()}
    n_vocab = len(int_to_vocab)

    print('Vocabulary size', n_vocab)

    int_text = [vocab_to_int[w] for w in text]
    num_batches = len(int_text) // (seq_size * batch_size)
    if num_batches == 0:
        raise ValueError(
            'Corpus has %d tokens but at least batch_size * seq_size = %d are required'
            % (len(int_text), batch_size * seq_size))

    # Drop the tail that does not fill a complete batch.
    in_text = np.array(int_text[:num_batches * batch_size * seq_size])

    # Target = input shifted by one token; the final target wraps to the start.
    out_text = np.zeros_like(in_text)
    out_text[:-1] = in_text[1:]
    out_text[-1] = in_text[0]

    in_text = np.reshape(in_text, (batch_size, -1))
    out_text = np.reshape(out_text, (batch_size, -1))
    return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text, sorted_vocab

In [18]:


import pandas as pd
import numpy as np
from collections import Counter
from urllib.request import urlopen
import torch
import torch.nn as nn
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Download the training corpus. The context manager closes the HTTP response
# as soon as the body has been read.
with urlopen('https://raw.githubusercontent.com/LauJohansson/DeepLearning_NLP_Friends/master/Data/friends_train.txt') as response:
    data = response.read().decode('utf-8')

# Build the vocabulary mappings (and the batched id arrays) from the corpus.
int_to_vocab, vocab_to_int, n_vocab, in_text, out_text, sorted_vocab = get_data_from_file(
    data, flags.batch_size, flags.seq_size)
      



Vocabulary size 11431


In [0]:
class RNNModule(nn.Module):
    """Word-level language model: embedding -> dropout -> LSTM -> linear.

    The number of LSTM layers, the embedding dropout rate and the drop-connect
    settings are read from the module-level ``flags`` object.

    Parameters
    ----------
    n_vocab : int
        Vocabulary size (rows of the embedding, width of the output layer).
    seq_size : int
        Sequence length; stored on the instance but not used by ``forward``.
    embedding_size : int
        Dimension of the word embeddings.
    lstm_size : int
        Hidden size of every LSTM layer.
    """

    def __init__(self, n_vocab, seq_size, embedding_size, lstm_size):
        super(RNNModule, self).__init__()
        self.seq_size = seq_size
        self.lstm_size = lstm_size
        self.embedding = nn.Embedding(n_vocab, embedding_size)
        self.lstm = nn.LSTM(embedding_size,
                            lstm_size,
                            batch_first=True,
                            num_layers=flags.n_lay)
        self.dense = nn.Linear(lstm_size, n_vocab)
        self.drop_out = nn.Dropout(flags.drop_embed)

        # Shared drop-connect mask (used when flags.same_drop_lstm == 'Y').
        # * Shaped like the recurrent weight it is multiplied with.
        # * Created on the CPU and moved to the weight's device on use, so the
        #   module can be constructed on machines without CUDA.
        # * Kept as a plain attribute (not a parameter/buffer) so the keys of
        #   state_dict() are unchanged and saved checkpoints still load.
        self.OneMaskOnly = torch.rand(self.lstm.weight_hh_l0.shape) > flags.dropconnect_rate

    def forward(self, x, prev_state):
        """Run one forward pass.

        Parameters
        ----------
        x : tensor of token ids, fed to the embedding layer.
        prev_state : tuple ``(h, c)`` passed to the LSTM as initial state.

        Returns
        -------
        (logits, state) : the output of the dense layer applied to the LSTM
        output, and the LSTM's new ``(h, c)`` state.
        """
        embed = self.drop_out(self.embedding(x))

        # (name, original weight) for every weight that was temporarily masked.
        swapped = []
        try:
            # Drop-connect on the recurrent weights (training only).
            #
            # NOTE(review): nn.LSTM names its recurrent weights
            # 'weight_hh_l<k>' (no 'LSTM.' prefix), so the name check below is
            # not expected to match and this branch is effectively a no-op.
            # The check is deliberately left as it was: enabling it would
            # change training behaviour, and replacing the weight by a fresh
            # nn.Parameter sends gradients to the temporary copy instead of
            # the real weight. Only the first layer's names are visited too.
            # Decide on a gradient-safe drop-connect before enabling this.
            if self.training:
                for name in self.lstm._all_weights[0]:
                    if name.find('LSTM.weight_hh_l') != -1:
                        weight = getattr(self.lstm, name)
                        swapped.append((name, weight))

                        if flags.same_drop_lstm == 'Y':
                            mask = self.OneMaskOnly.to(weight.device)
                        else:
                            mask = torch.rand(weight.shape, device=weight.device) > flags.dropconnect_rate
                        setattr(self.lstm, name, torch.nn.Parameter(torch.mul(weight, mask)))
                        self.lstm.flatten_parameters()

            # LSTM forward
            output, state = self.lstm(embed, prev_state)
        finally:
            # Always put the original recurrent weights back, even if the
            # LSTM call raised.
            for name, weight in swapped:
                setattr(self.lstm, name, weight)
                self.lstm.flatten_parameters()

        logits = self.dense(output)

        return logits, state

    def zero_state(self, batch_size):
        """Return an all-zero ``(h, c)`` pair of shape (n_lay, batch_size, lstm_size).

        The tensors are created on the CPU; the caller moves them to the
        device it needs.
        """
        return (torch.zeros(flags.n_lay, batch_size, self.lstm_size),
                torch.zeros(flags.n_lay, batch_size, self.lstm_size))

In [20]:
# Build the network with the same sizes that the configuration specifies.
net = RNNModule(n_vocab, flags.seq_size,
                flags.embedding_size, flags.lstm_size)

# Fetch the trained weights. map_location remaps the stored tensors onto the
# selected device, so weights saved on a GPU can also be loaded on a CPU-only
# runtime.
net.load_state_dict(torch.hub.load_state_dict_from_url(
    'https://raw.githubusercontent.com/LauJohansson/DeepLearning_NLP_Friends/master/Data/net_final_online_ToGenerator_v1.pth',
    map_location=device))

net = net.to(device)

# Inference only: switch off dropout (and the training-only code in forward).
net.eval()



RNNModule(
  (embedding): Embedding(11431, 256)
  (lstm): LSTM(256, 512, num_layers=2, batch_first=True)
  (dense): Linear(in_features=512, out_features=11431, bias=True)
  (drop_out): Dropout(p=0.5, inplace=False)
)

In [0]:
def predict(device, net, words, n_vocab, vocab_to_int, int_to_vocab, top_k,manu_length):
    """Generate a manuscript by repeatedly sampling the next word from `net`.

    Parameters
    ----------
    device : device the id tensors and the initial state are moved to.
    net : model exposing ``zero_state(batch_size)`` and
        ``net(ix, (h, c)) -> (logits, (h, c))``.
    words : sequence of str
        Seed words; each must be a key of ``vocab_to_int``. The sequence is
        copied, the caller's object is not modified.
    n_vocab : int
        Vocabulary size (kept for interface compatibility; not used).
    vocab_to_int, int_to_vocab : dict
        Word -> id and id -> word mappings.
    top_k : int
        The next word is drawn uniformly at random from the ``top_k`` highest
        scoring candidates (1 = always the best candidate). Values outside
        ``[1, number of logits]`` are clamped.
    manu_length : int
        Number of sampling steps performed after the first sampled word, so
        ``manu_length + 1`` words are appended to the seed words.

    Returns
    -------
    str
        The words joined by single spaces, with a line break inserted in front
        of each main character's speaker tag (e.g. ``ross:``).

    Raises
    ------
    ValueError
        If ``words`` is empty.
    KeyError
        If a seed word is not in ``vocab_to_int``.
    """
    if not words:
        raise ValueError('predict() needs at least one seed word')

    # Work on a copy so the caller's list is left untouched.
    words = list(words)

    def _sample(logits):
        # logits[0] has one row of scores; pick at random among its k best ids.
        k = max(1, min(top_k, logits.shape[-1]))
        _, top_ix = torch.topk(logits[0], k=k)
        return int(np.random.choice(top_ix.tolist()[0]))

    # Inference only: without no_grad the autograd graph would keep growing
    # through the recurrent state for the whole generation.
    with torch.no_grad():
        state_h, state_c = net.zero_state(1)
        state_h = state_h.to(device)
        state_c = state_c.to(device)

        # Warm up the recurrent state on the seed words.
        for w in words:
            ix = torch.tensor([[vocab_to_int[w]]]).to(device)
            output, (state_h, state_c) = net(ix, (state_h, state_c))

        choice = _sample(output)
        words.append(int_to_vocab[choice])

        # Feed each sampled word back in to obtain the next one.
        for _ in range(manu_length):
            ix = torch.tensor([[choice]]).to(device)
            output, (state_h, state_c) = net(ix, (state_h, state_c))
            choice = _sample(output)
            words.append(int_to_vocab[choice])

    output = ' '.join(words)

    # Start a new line in front of every speaker tag. The colon is part of the
    # pattern so a name mentioned inside a sentence does not break the line.
    for speaker in ('chandler:', 'ross:', 'joey:', 'monica:', 'phoebe:', 'rachel:'):
        output = output.replace(speaker, '\n' + speaker)

    return output

In [0]:
def get_init_words():
  """Ask the user for the opening words and return them as a token list.

  The input is lower-cased and split on whitespace, and the tokens '[' and
  'scene:' are put in front of it.

  Returns
  -------
  list of str
      ``['[', 'scene:', <user tokens...>]``
  """
  raw_words = input("Please enter the first word of your Friends manuscript (seperate with space): ")
  # Lower-case before splitting (the original assigned the lower-cased text to
  # a misspelled variable, so it was never used).
  return ['[', 'scene:'] + raw_words.lower().split()

In [0]:
def _read_int(prompt):
  """Show `prompt` until the user types a whole number, then return it as int."""
  while True:
    answer = input(prompt)
    try:
      return int(answer)
    except ValueError:
      print('Please enter a whole number.')


def generate_manuscript():
  """Interactively collect the settings and print a generated manuscript.

  Asks for the opening words, the number of words and the randomness, then
  prints the text returned by ``predict``. Uses the module-level ``device``,
  ``net``, ``n_vocab``, ``vocab_to_int`` and ``int_to_vocab``.

  * randomness is clamped to the range 1..100,
  * the number of words is capped at 10000,
  * the opening words are asked for again until every one of them is in the
    vocabulary.
  """
  initial_words = get_init_words()

  number_of_words = _read_int("Please enter the number of words that your Friends manuscript should contain: ")

  randomness = _read_int("Choose a number from 1-100 (recommend 5) (if 1, then there is no randomness in word prediction): ")

  # Keep randomness inside 1..100 and print at most 10000 words.
  randomness = max(1, min(randomness, 100))
  number_of_words = min(number_of_words, 10000)

  # Ask again until every entered word is known. vocab_to_int has exactly the
  # vocabulary words as keys, so the dict lookup replaces a list scan.
  while not all(word in vocab_to_int for word in initial_words):
    print('Some of the words you entered is not a word that is used in Friends')
    initial_words = get_init_words()

  print('\n')
  print(predict(device, net, initial_words, n_vocab,
                vocab_to_int, int_to_vocab, randomness, number_of_words))

# Generate manuscript

In [24]:
# Prompts for the opening words, the number of words and the randomness,
# then prints the generated manuscript.
generate_manuscript()

Please enter the first word of your Friends manuscript (seperate with space): laugh
Please enter the number of words that your Friends manuscript should contain: 200
Choose a number from 1-100 (recommend 5) (if 1, then there is no randomness in word prediction): 10


[ scene: laugh , rachel walks back out . the phone . . ) mrs and green is in his room for me as <unk> . 
ross: i'm a big person who - is this guy who - i don't like the last person about your life ! mrs green: yeah , you do ! you don't know . 
phoebe: okay look . okay , well what about it with me and then i have some things in that ? 
phoebe: i know , and i'm not a very nice woman who wants the other date , you do so good in , <unk> . . [ phoebe enters ] hey you see you a second woman and your friend ? 
phoebe: ( pause as chandler is standing on it and ross enters ) i'm going back here and i'm here as i just thought i could tell you . [ cut for joey , rachel comes out from her apartment , as phoebe comes back and opens his