<a href="https://colab.research.google.com/github/MortezaMahdaviMortazavi/DeepLearning-Introduction/blob/master/RNN/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import numpy as np
import tensorflow as tf
import string
import re
from torch.autograd import Variable

In [None]:
def getRandomChunk(text,chunk_len=500):
  """Return a random contiguous slice of ``text`` spanning ``chunk_len + 1`` items.

  The slice is ``text[start:start + chunk_len + 1]`` where ``start`` is drawn
  uniformly from the positions that keep the whole slice inside ``text``.
  (The extra item is presumably there so a caller can derive an input and a
  one-step-shifted target from the same chunk.)

  Args:
    text: Sequence to sample from (a string in this notebook).
    chunk_len: Chunk length; the returned slice holds ``chunk_len + 1`` items.

  Returns:
    The sampled slice of ``text``.

  Raises:
    ValueError: If ``text`` is not longer than ``chunk_len``, i.e. no slice of
      ``chunk_len + 1`` items fits.
  """
  # Number of valid start positions; randint's upper bound is exclusive.
  n_starts = len(text) - chunk_len
  if n_starts <= 0:
    raise ValueError(
        f"text must be longer than chunk_len ({chunk_len}); got length {len(text)}")
  start_idx = np.random.randint(0,n_starts)
  return text[start_idx:start_idx + chunk_len + 1]

# getRandomChunk(corpus)

In [None]:
def char_to_tensor(string,all_characters=None):
  """Encode ``string`` as a 1-D ``torch.long`` tensor of alphabet indices.

  Args:
    string: Text to encode, one index per character.
    all_characters: Alphabet to index into. Defaults to ``string.printable``
      from the standard library, so the function no longer depends on a
      notebook-level ``all_characters`` variable being defined beforehand.

  Returns:
    Tensor of shape ``(len(string),)`` whose i-th entry is the position of
    ``string[i]`` in ``all_characters``.

  Raises:
    ValueError: If a character of ``string`` is not in ``all_characters``.
  """
  if all_characters is None:
    # The parameter `string` shadows the stdlib module inside this function,
    # so the default alphabet is pulled in with a local from-import.
    from string import printable as all_characters
  indices = [all_characters.index(ch) for ch in string]
  # `Variable` is a deprecated no-op wrapper; a plain tensor is equivalent.
  return torch.tensor(indices,dtype=torch.long)

In [None]:
# For word-to-vector conversion: take some sentences, separate out n words, and create a tensor from them.

In [None]:
# NOTE(review): no module-level `getChunkOfSentences` is defined in the visible cells
# (the only visible definition is a method of `Generator`), and `sentences` is only
# assigned in a later cell — this call presumably relies on earlier kernel state; confirm.
getChunkOfSentences(sentences)

'And I a tyrant have no leisure taken To weigh how once I suffered in your crime O that our night of woe might have remembered My deepest sense how hard true sorrow hits And soon to you as you to me then tendered The humble salve which wounded bosoms fits But that your trespass now becomes a fee Mine ransoms yours and yours must ransom me Tis better to be vile than vile esteemed When not to be receives reproach of being And the just pleasure lost which is so deemed Not by our feeling but by others seeing For why should others false adulterate eyes Give salutation to my sportive blood Or on my frailties why are frailer spies Which in their wills count bad what I think good '

In [None]:
# Reserved vocabulary indices for the start- and end-of-sequence markers.
SOS_TOKEN = 0
EOS_TOKEN = 1
class Language:
  """Word-level vocabulary over a whitespace-tokenised corpus.

  The index tables start with only the two reserved tokens; call
  ``tokenizer()`` to populate them from the corpus.

  Args:
    name: Name of the target language.
    text: Corpus text. When omitted, the notebook-level ``corpus`` variable is
      used, which keeps ``Language(name)`` working as before.
  """
  def __init__(self,name,text=None):
    if text is None:
      # Backward-compatible default: fall back to the module-level `corpus`.
      text = corpus
    self.name = name # name of target language
    self.dataset = text.split() # corpus as a list of whitespace-separated words
    # Number of distinct characters; `sorted` alone would not de-duplicate.
    self.vocab = len(set(text))
    self.word2count = {}
    self.word2idx = {
        "SOS_TOKEN":SOS_TOKEN,
        "EOS_TOKEN":EOS_TOKEN
    }
    self.idx2word = {
        SOS_TOKEN:"SOS_TOKEN",
        EOS_TOKEN:"EOS_TOKEN"
    }

  def tokenizer(self):
    """Assign an index to every distinct word of the dataset and count occurrences.

    Safe to call repeatedly: counts are recomputed from scratch and words that
    already have an index keep it.
    """
    self.word2count = {}
    # Continue after the indices already handed out (2 on the first call).
    next_idx = len(self.word2idx)
    for word in self.dataset:
      if word not in self.word2idx:
        self.word2idx[word] = next_idx
        self.idx2word[next_idx] = word
        next_idx += 1
      # `.get` also covers a corpus word that equals a reserved token name,
      # which has an index but no count entry yet.
      self.word2count[word] = self.word2count.get(word,0) + 1

  @property
  def getWord2Idx(self):
    """Mapping from word to vocabulary index."""
    return self.word2idx
  @property
  def getIdx2Word(self):
    """Mapping from vocabulary index to word."""
    return self.idx2word

  @property
  def getWord2Count(self):
    """Mapping from word to its number of occurrences in the dataset."""
    return self.word2count

  @property
  def getCharacters(self):
    """Number of distinct characters in the corpus."""
    return self.vocab

# Create the vocabulary holder. `Language.__init__` reads the module-level `corpus`,
# which in the visible cells is only assigned further down — so `corpus` must already
# exist in the kernel. `tokenizer()` is not called here, so the index tables hold only
# the two reserved tokens at this point.
language = Language('English')

In [None]:
class RNN(nn.Module):
  """Embedding -> dropout -> multi-layer LSTM -> linear decoder, one time step per call.

  Args:
    input_size: Number of embeddings (size of the input vocabulary).
    hidden_size: Embedding width and LSTM hidden width.
    output_size: Number of output features produced by the decoder.
    num_layers: Number of stacked LSTM layers.
  """
  def __init__(self,input_size,hidden_size,output_size,num_layers):
    super().__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    self.encoder = nn.Embedding(input_size,hidden_size)
    self.dropout = nn.Dropout(0.2)
    self.lstm = nn.LSTM(hidden_size,hidden_size,num_layers,batch_first=True)
    self.decoder = nn.Linear(hidden_size,output_size)

    # forward() and init_hidden() place their tensors on self.device, so the
    # weights must live there too; otherwise a CUDA machine hits a device mismatch.
    self.to(self.device)

  def forward(self,X,hidden,cell):
    """Run a single time step.

    Args:
      X: 1-D tensor of token indices, one per batch element.
      hidden: LSTM hidden state, shaped as returned by ``init_hidden``.
      cell: LSTM cell state, shaped as returned by ``init_hidden``.

    Returns:
      ``(output, hidden, cell)`` where ``output`` has shape
      ``(batch, output_size)`` and the states are the updated LSTM states.
    """
    X = X.to(self.device)
    embedded = self.dropout(self.encoder(X))
    # unsqueeze(1) adds a length-1 sequence axis (batch_first=True layout).
    output , (hidden,cell) = self.lstm(embedded.unsqueeze(1),(hidden,cell))
    # Collapse the length-1 sequence axis before decoding.
    output = self.decoder(output.reshape(output.shape[0],-1))
    return output , hidden , cell


  def init_hidden(self,batch_size):
    """Return zero-filled ``(hidden, cell)`` states of shape ``(num_layers, batch_size, hidden_size)``."""
    shape = (self.num_layers,batch_size,self.hidden_size)
    # Plain tensors: the deprecated `Variable` wrapper is a no-op.
    hidden = torch.zeros(shape,device=self.device)
    cell = torch.zeros(shape,device=self.device)
    return hidden , cell


In [None]:
def model_testing():
  """Smoke-test the RNN on one encoded chunk and print the resulting shapes.

  Relies on notebook-level ``word2vec``, ``language`` and ``sentences``.
  NOTE(review): no module-level ``word2vec`` accepting ``(lang, sentences)`` is
  visible in this notebook — confirm it is defined before calling this.
  """
  X = word2vec(lang=language,sentences=sentences)
  # The embedding table must cover every word index, not just len(X) positions.
  vocab_size = len(language.getWord2Idx)
  model = RNN(vocab_size,len(X),len(X),num_layers=25)
  # forward() treats each element of X as one batch item, so the states need
  # a batch dimension of len(X).
  hidden , cell = model.init_hidden(len(X))
  # The model returns (output, hidden, cell); unpack before reading shapes.
  out , hidden , cell = model(X,hidden,cell)
  print(f"output shape is {out.shape}")
  print(f"hidden and cell shape is {hidden.shape}")
  print(f"out is {out}")
# X = word2vec(lang=language,sentences=sentences)
# model = RNN(len(word2idx),len(X),len(X),num_layers=25)
# hidden , cell = model.init_hidden(100)
# model.forward(X,hidden,cell)[0]

In [None]:
class Generator:
  """Data pipeline and training scaffold for a word-level text generator.

  Args:
    file: Path of the plain-text corpus to read and clean.
  """
  def __init__(self,file):
    self.n_sentences = 15
    self.max_words = 100
    self.batch_size = 15
    self.hidden_size = 256
    self.num_layers = 10
    self.lr = 0.005
    # Read and clean the corpus once instead of once per attribute.
    self.sentences , self.corpus = self.corpusProcessing(file)
    self.language = None
    self.rnn = None
    self.optimizer = None
    self.criterion = None

  def setModel(self,input_size,hidden_size=None,num_layers=None,output_size=None):
    """Create the RNN and store it in ``self.rnn``.

    Args:
      input_size: Vocabulary size (number of embeddings).
      hidden_size: Hidden width; defaults to ``self.hidden_size``.
      num_layers: Number of LSTM layers; defaults to ``self.num_layers``.
      output_size: Number of output classes; defaults to ``input_size`` so the
        logits cover every word index that can appear as a target.
    """
    if hidden_size is None:
      hidden_size = self.hidden_size
    if num_layers is None:
      num_layers = self.num_layers
    if output_size is None:
      output_size = input_size
    self.rnn = RNN(
        input_size=input_size,
        hidden_size=hidden_size,
        output_size=output_size,
        num_layers=num_layers
      )

  def setLanguage(self,name):
    """Create the ``Language`` vocabulary and build its word/index tables."""
    self.language = Language(name)
    # Without this the tables hold only the reserved tokens and word2vec
    # fails on the first real word.
    self.language.tokenizer()

  def setOptimizer(self,model=None):
    """Create an Adam optimizer over the trainable parameters of ``model``.

    Args:
      model: Module to optimize; defaults to ``self.rnn``.

    Raises:
      ValueError: If no model is given and ``self.rnn`` has not been set.
    """
    if model is None:
      model = self.rnn
    if model is None:
      raise ValueError("no model to optimize: pass one or call setModel() first")
    # `requires_grad` is a bool attribute, not a method.
    trainable = [p for p in model.parameters() if p.requires_grad]
    self.optimizer = torch.optim.Adam(trainable,lr=self.lr)

  def setCriterion(self):
    """Use cross-entropy as the training loss."""
    self.criterion = nn.CrossEntropyLoss()

  def corpusProcessing(self,file):
    """Read ``file``, strip punctuation and split the text into non-blank lines.

    Args:
      file: Path of the text file to read.

    Returns:
      ``(sentences, new_corpus)``: the list of non-blank lines and the full
      punctuation-free text.
    """
    with open(file,'r') as handle: # close the file deterministically
      corpus = handle.read()
    # Strings are immutable: the translated copy has to be kept.
    corpus = corpus.translate(str.maketrans('','',string.punctuation))
    # Drop whatever non-word, non-space characters remain.
    new_corpus = re.sub(r'[^\w\s]','',corpus)
    # Build a new list rather than removing items while iterating, which
    # skips elements and left some blank lines behind.
    sentences = [line for line in new_corpus.split('\n') if line.strip()]
    return sentences , new_corpus

  def getChunkOfSentences(self,sentences=None,n_sentence=None):
    """Join a random run of consecutive sentences into one string.

    Args:
      sentences: Sentences to sample from; defaults to ``self.sentences``.
      n_sentence: Run length; defaults to ``self.n_sentences``. The run spans
        ``n_sentence + 1`` sentences, or all of them if fewer are available.

    Returns:
      The selected sentences, each followed by a single space.
    """
    # Defaults are resolved here because `self` does not exist when the
    # signature is evaluated.
    if sentences is None:
      sentences = self.sentences
    if n_sentence is None:
      n_sentence = self.n_sentences
    n_starts = len(sentences) - n_sentence
    start_idx = np.random.randint(0,n_starts) if n_starts > 0 else 0
    chunk_sentences = sentences[start_idx:start_idx + n_sentence + 1]
    return ''.join(sen + ' ' for sen in chunk_sentences)

  def word2vec(self,lang,chunk_corpus,sentences=None,max_length=None):
    """Encode the first ``max_length`` words of ``chunk_corpus`` as word indices.

    Args:
      lang: Object exposing a ``getWord2Idx`` word-to-index mapping.
      chunk_corpus: Text to encode; split on whitespace.
      sentences: Unused; kept so existing keyword calls keep working.
      max_length: Length of the result; defaults to ``self.max_words``.

    Returns:
      ``torch.long`` tensor of shape ``(max_length,)``. If the text has fewer
      words than ``max_length`` the remaining positions stay 0.

    Raises:
      KeyError: If a word is missing from the mapping.
    """
    if max_length is None:
      max_length = self.max_words
    separate_words = chunk_corpus.split()[:max_length]
    tensor = torch.zeros(max_length,dtype=torch.long)
    word2idx = lang.getWord2Idx
    # Iterate over the words actually present so short chunks do not overrun.
    for idx , word in enumerate(separate_words):
      tensor[idx] = word2idx[word]
    return tensor

  def get_random_batch(self):
    """Sample a batch of input/target index sequences.

    Returns:
      ``(tensor_input, tensor_target)``, both ``torch.long`` of shape
      ``(batch_size, max_words - 1)``; each target row is its input row
      shifted one word ahead.
    """
    # Dropping one word from either end leaves max_words - 1 positions.
    seq_len = self.max_words - 1
    tensor_input = torch.zeros(self.batch_size,seq_len,dtype=torch.long)
    tensor_target = torch.zeros(self.batch_size,seq_len,dtype=torch.long)

    for row in range(self.batch_size):
      chunk = self.getChunkOfSentences()
      # Encode once and slice twice; write to a single row, not `[row:]`.
      tokens = self.word2vec(lang=self.language,chunk_corpus=chunk)
      tensor_input[row] = tokens[:-1]
      tensor_target[row] = tokens[1:]

    return tensor_input , tensor_target

  def train(self):
    """Placeholder: training is not implemented yet."""
    pass

  def generate(self):
    """Placeholder: text generation is not implemented yet."""
    pass

NameError: ignored

In [None]:
# Load the corpus into the notebook-level `sentences` and `corpus` variables.
# NOTE(review): no module-level `corpusProcessing` is defined in the visible cells
# (only the `Generator.corpusProcessing` method) — confirm where this comes from.
sentences , corpus = corpusProcessing('shakespeare.txt')
corpus.split()[:10]  # preview the first ten whitespace-separated tokens