In [1]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import wandb


In [2]:
# Read the whole training corpus into memory as a single string named `text`
with open('Data121artistespoliclean+.txt', mode='r', encoding="utf8") as corpus_file:
    text = corpus_file.read()

In [3]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.processors import TemplateProcessing
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer
from tokenizers import decoders

# Load the trained tokenizer from its JSON file
tokenizer = Tokenizer.from_file("tokenizerData121artistespoliclean+Vocab3000.json")
# Tokenize the whole corpus
output = tokenizer.encode(text)

# Build the two lookup tables between tokens and their integer ids
# (the names are kept from the character-level version of this notebook):
# 1. char2int: token string -> tokenizer id
# 2. int2char: tokenizer id -> token string
char2int = tokenizer.get_vocab()
int2char = {token_id: token for token, token_id in char2int.items()}

# `chars` is handed to CharRNN, which rebuilds its own index -> token table with
# enumerate(). The vocabulary dict is not documented to be ordered by id, so list
# the tokens sorted by id: position i of `chars` then holds the token whose
# tokenizer id is i, matching the ids the network is trained on
# (assumes ids are contiguous starting at 0 -- TODO confirm for this tokenizer file).
chars = [int2char[token_id] for token_id in sorted(int2char)]

# `encoded` is the dataset as an array of token ids
encoded = np.array(output.ids)

In [4]:
# Number of tokens in the encoded dataset
len(encoded)

8672261

In [5]:
# One-hot encode function.
# Encodes each integer of `encoded` (see above) as a vector of zeros with a single 1 at the index given by that integer.
# Example: the token whose id is 5 is encoded as [0,0,0,0,0,1,0,0,0,0,0]
# The length of each vector is the number of possible tokens in the dataset.
def one_hot_encode(arr, n_labels):
    """Return a one-hot encoded version of an integer NumPy array.

    Arguments
    ---------
    arr: NumPy array of integer label indices (any shape)
    n_labels: length of each one-hot vector

    Returns
    -------
    float32 array of shape ``arr.shape + (n_labels,)`` in which, for every
    element of ``arr``, the entry at that element's index is 1 and all
    others are 0.
    """
    # Work on a flat list of labels so one fancy-indexing assignment sets all the ones
    flat_labels = arr.reshape(-1)
    n_items = flat_labels.shape[0]

    encoded_rows = np.zeros((n_items, n_labels), dtype=np.float32)
    encoded_rows[np.arange(n_items), flat_labels] = 1.0

    # Restore the input's shape, with the one-hot axis appended last
    return encoded_rows.reshape(arr.shape + (n_labels,))

In [6]:
def get_batches(arr, batch_size, seq_length):
    '''Generate (input, target) mini-batches of shape
       "batch_size" by "seq_length" from arr.

       Arguments
       ---------
       arr: NumPy array of encoded data
       batch_size: number of sequences in one batch
       seq_length: number of steps in one sequence

       Yields
       ------
       x: slice of the data, shape (batch_size, seq_length)
       y: the same data shifted one step to the left (the next-step targets)
    '''
    tokens_per_batch = batch_size * seq_length
    full_batches = len(arr) // tokens_per_batch

    # Drop the tail that cannot fill a complete batch, then lay the data out
    # as batch_size parallel rows
    rows = arr[:full_batches * tokens_per_batch].reshape((batch_size, -1))
    row_length = rows.shape[1]

    # Walk along the rows one window of seq_length columns at a time
    for start in range(0, row_length, seq_length):
        stop = start + seq_length
        x = rows[:, start:stop]

        # Targets are the inputs shifted by one step
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]
        # The last target is the column right after the window; for the final
        # window there is none, so wrap around to each row's first column
        y[:, -1] = rows[:, stop] if stop < row_length else rows[:, 0]
        yield x, y

In [7]:
# Detect whether a CUDA device is available and report the result
train_on_gpu = torch.cuda.is_available()
print('GPU est disponible' if train_on_gpu else 'GPU pas disponible !IMPORTANT BUG!')

GPU est disponible


In [8]:
class CharRNN(nn.Module):
    """Multi-layer LSTM followed by dropout and a linear layer over the vocabulary.

    Arguments
    ---------
    tokens: sequence of vocabulary tokens; its length is both the LSTM input
            size (inputs are one-hot vectors) and the output size
    n_hidden: size of the LSTM hidden state
    n_layers: number of stacked LSTM layers
    drop_prob: dropout probability, used between LSTM layers and on the LSTM output
    lr: learning rate, only stored on the instance as `self.lr`
    """

    def __init__(self, tokens, n_hidden=256, n_layers=2,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        # Token dictionaries: position in `tokens` <-> token
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}

        vocab_size = len(self.chars)

        # batch_first=True: inputs and outputs are (batch, seq, feature)
        self.lstm = nn.LSTM(vocab_size, n_hidden, n_layers,
                            dropout=drop_prob, batch_first=True)

        self.dropout = nn.Dropout(drop_prob)

        # Final fully-connected layer mapping hidden features to one score per token
        self.fc = nn.Linear(n_hidden, vocab_size)

    def forward(self, x, hidden):
        ''' Forward pass through the network.

            x is the batch of one-hot inputs and `hidden` the (hidden, cell)
            state tuple. Returns the scores with shape
            (batch * seq, number of tokens) and the new state tuple. '''
        r_output, hidden = self.lstm(x, hidden)

        out = self.dropout(r_output)

        # Stack all time steps of all sequences into rows of n_hidden features;
        # contiguous() is required before view()
        out = out.contiguous().view(-1, self.n_hidden)

        out = self.fc(out)

        return out, hidden

    def init_hidden(self, batch_size):
        ''' Create a zero-initialised (hidden, cell) state tuple.

            Each tensor has shape (n_layers, batch_size, n_hidden) and is created
            with the dtype and on the device of the model's own parameters, so
            the state always matches wherever the model currently lives. '''
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.n_hidden)

        return (torch.zeros(shape, dtype=weight.dtype, device=weight.device),
                torch.zeros(shape, dtype=weight.dtype, device=weight.device))
        

In [9]:
def _adjust_learning_rate(optimizer, epoch, init_lr=0.001, lr_decay_epoch=10):
    """Decay learning rate by a factor of 0.1 every lr_decay_epoch epochs."""
    lr = init_lr * (0.1**(epoch // lr_decay_epoch))
    print('LR is set to {}'.format(lr))

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    return optimizer


def _save_checkpoint(net, batch_size, epoch, save_dir):
    """Save the network's sizes, weights and token list to a file inside save_dir."""
    import os

    # Name the file from the network's own attributes, not from notebook globals
    model_name = 'A121C+LstmLayer{}hidden{}Lr+decay0.001Bs{}dropout{}tokens3000Epoch{}'.format(
        net.n_layers, net.n_hidden, batch_size, net.drop_prob, epoch)

    checkpoint = {'n_hidden': net.n_hidden,
                  'n_layers': net.n_layers,
                  'state_dict': net.state_dict(),
                  'tokens': net.chars}

    os.makedirs(save_dir, exist_ok=True)
    torch.save(checkpoint, os.path.join(save_dir, model_name))


def _validation_loss(net, val_data, criterion, batch_size, seq_length, n_chars):
    """Return the mean loss over val_data; leaves the network in train mode."""
    val_h = net.init_hidden(batch_size)
    val_losses = []
    net.eval()
    # No gradients are needed for validation, so do not build the autograd graph
    with torch.no_grad():
        for x, y in get_batches(val_data, batch_size, seq_length):
            # One-hot encode the data and make Torch tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

            if train_on_gpu:
                inputs, targets = inputs.cuda(), targets.cuda()

            output, val_h = net(inputs, val_h)
            val_loss = criterion(output, targets.view(batch_size*seq_length).long())

            val_losses.append(val_loss.item())

    net.train()  # reset to train mode after iterating through validation data
    return np.mean(val_losses)


def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=20,
          save_dir=r"D:\École\projet Agatha\models\save121acPoliBytepair", save_every=5):
    ''' Train a network, logging to wandb and saving periodic checkpoints.

        Arguments
        ---------

        net: CharRNN network
        data: encoded data (array of token ids)
        epochs: number of epochs
        batch_size: number of mini-sequences per mini-batch, aka batch size
        seq_length: number of steps per mini-sequence
        lr: initial learning rate (multiplied by 0.1 every 10 epochs)
        clip: gradient clipping threshold (max gradient norm)
        val_frac: fraction of the data kept aside as validation data
        print_every: compute, log and print the losses every this many steps
        save_dir: directory the checkpoints are written to (created if missing)
        save_every: a checkpoint is saved at the start of every save_every-th epoch;
                    one more is saved after the last epoch

    '''
    # Move the model to the GPU before the optimizer is constructed
    if train_on_gpu:
        net.cuda()

    net.train()
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Split off the last val_frac of the data as the validation set
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]

    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):

        # Periodic checkpoint, taken before the epoch starts
        if e % save_every == 0:
            _save_checkpoint(net, batch_size, e, save_dir)

        # Adjust the learning rate for this epoch
        _adjust_learning_rate(opt, e, lr)

        # initialize hidden state
        h = net.init_hidden(batch_size)

        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1

            # One-hot encode the data and make Torch tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

            if train_on_gpu:
                inputs, targets = inputs.cuda(), targets.cuda()

            # Detach the hidden state from the previous step, otherwise
            # backprop would run through the entire training history
            h = tuple(each.detach() for each in h)

            # zero accumulated gradients
            net.zero_grad()

            # get the output from the model
            output, h = net(inputs, h)

            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(batch_size*seq_length).long())
            loss.backward()
            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()

            # loss stats
            if counter % print_every == 0:
                mean_val_loss = _validation_loss(net, val_data, criterion,
                                                 batch_size, seq_length, n_chars)

                # send the logs to wandb
                wandb.log({'epoch': e + 1, 'loss': loss.item(), "validation_loss" : mean_val_loss })

                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Valitation Loss: {:.4f}".format(mean_val_loss))

    # Save the fully trained weights; the periodic checkpoints above are all
    # taken before an epoch, so none of them includes the last epoch
    if epochs > 0:
        _save_checkpoint(net, batch_size, epochs, save_dir)

In [10]:
# Model hyperparameters
n_hidden, n_layers, drop_prob = 1200, 2, 0.5

# Build the network over the tokenizer vocabulary and print its layers
net = CharRNN(tokens=chars, n_hidden=n_hidden, n_layers=n_layers, drop_prob=drop_prob)

print(net)

CharRNN(
  (lstm): LSTM(3001, 1200, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=1200, out_features=3001, bias=True)
)


In [11]:
# Training configuration
batch_size, seq_length = 100, 100
lr = 0.001
n_epochs = 100

# Start a wandb run and register the network with it
wandb.init(project="agatha")
wandb.watch(net)

# Train the model on the encoded corpus
train(net, encoded, n_epochs, batch_size, seq_length, lr, print_every=20)

wandb: Currently logged in as: matoutou27 (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.10.20 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


LR is set to 0.001
Epoch: 1/100... Step: 20... Loss: 6.2210... Valitation Loss: 6.2054
Epoch: 1/100... Step: 40... Loss: 6.1845... Valitation Loss: 6.1777
Epoch: 1/100... Step: 60... Loss: 6.1381... Valitation Loss: 6.1582
Epoch: 1/100... Step: 80... Loss: 6.0971... Valitation Loss: 6.0792
Epoch: 1/100... Step: 100... Loss: 6.0094... Valitation Loss: 5.9598
Epoch: 1/100... Step: 120... Loss: 5.6617... Valitation Loss: 5.7124
Epoch: 1/100... Step: 140... Loss: 5.7027... Valitation Loss: 5.6571
Epoch: 1/100... Step: 160... Loss: 5.4160... Valitation Loss: 5.4867
Epoch: 1/100... Step: 180... Loss: 5.3342... Valitation Loss: 5.3444
Epoch: 1/100... Step: 200... Loss: 5.2546... Valitation Loss: 5.2334
Epoch: 1/100... Step: 220... Loss: 5.1950... Valitation Loss: 5.1539
Epoch: 1/100... Step: 240... Loss: 5.0427... Valitation Loss: 5.0766
Epoch: 1/100... Step: 260... Loss: 5.0273... Valitation Loss: 5.0088
Epoch: 1/100... Step: 280... Loss: 4.8745... Valitation Loss: 4.9523


KeyboardInterrupt: 

In [None]:
# Manually save a checkpoint of the current network, including the optimizer state.
# NOTE(review): `epochs_count` and `opt` are local variables of train() in this
# notebook; they have to be made available at notebook scope before this cell can run.
model_name = 'A82C+LstmLayer{}hidden{}Lr+decay0.001Bs{}Epoch{}'.format(n_layers,n_hidden,batch_size,epochs_count)

checkpoint = {'n_hidden': net.n_hidden,
              'n_layers': net.n_layers,
              'state_dict': net.state_dict(),
              'optimizer': opt.state_dict(),
              'tokens': net.chars}

# Raw string keeps the Windows backslashes literal; torch.save opens the file itself
torch.save(checkpoint, r"D:\École\projet Agatha\models\save82ac\{}".format(model_name))