##bibliothéques 



In [1]:
import torchtext
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, TabularDataset, BucketIterator

##Tokenizer

In [2]:
import spacy
sapcy_en = spacy.load('en')
def tokenizer(text):
   return [tok.text for tok in sapcy_en.tokenizer(text)]


##Definition des prétraitements sur le texte

In [3]:
# 
TEXT = Field(sequential = True, lower = True, include_lengths=False, pad_token = "<pad>", unk_token = "<unk>", batch_first = True, tokenize = tokenizer)
LABELS = Field(sequential = False, use_vocab = False)


##Création des datasets

In [4]:
train_dataset, test_dataset = TabularDataset.splits(path='/content/sample_data', format='csv',
                                                    train='train.csv', 
                                                    test='test.csv', 
                                                    fields=[('text', TEXT), ('labels', LABELS)])

In [5]:
train_dataset.labels


<generator object Dataset.__getattr__ at 0x7f8cd21467d8>

##Géstion des batchs

In [6]:
device = torch.device('cuda')
train_iter, test_iter = BucketIterator.splits((train_dataset, test_dataset), batch_sizes=(16, 256), sort_key = lambda x:len(x.text), device=device, sort_within_batch = True, shuffle = True, repeat=False)

##Gestion du vocabulaire et des word Embedding

In [7]:
glove = torchtext.vocab.GloVe(name='6B', dim=50)

.vector_cache/glove.6B.zip: 862MB [06:30, 2.21MB/s]                           
 99%|█████████▉| 397723/400000 [00:16<00:00, 25644.22it/s]

In [8]:
TEXT.build_vocab(train_dataset, min_freq=2, vectors = glove)

In [9]:
len(TEXT.vocab)
#TEXT.vocab.vectors.shape

15327

Visualisation des batch de données

In [10]:
batch = next(iter(train_iter))
batch


[torchtext.data.batch.Batch of size 16]
	[.text]:[torch.cuda.LongTensor of size 16x5 (GPU 0)]
	[.labels]:[torch.cuda.LongTensor of size 16 (GPU 0)]

In [12]:
batch.text.shape

torch.Size([16, 5])

Création du modéle 

In [13]:
class LSTMModele(nn.Module):
  def __init__(self, embedding_dim=50):
      super(LSTMModele, self).__init__()
      self.embeddings = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=False)
      self.lstm = nn.LSTM(input_size = embedding_dim, hidden_size = embedding_dim, batch_first=True)
      self.fc = nn.Linear(embedding_dim,2)
  def forward(self, inputs):
      embeds = self.embeddings(inputs)
      outputs, (h_n,c_n) = self.lstm(embeds)
      x = h_n[0]
      x = self.fc(x)
      return x


Définition de l'op et de la loss

In [14]:
net = LSTMModele(embedding_dim=50).to(device)

In [15]:
net

LSTMModele(
  (embeddings): Embedding(15327, 50)
  (lstm): LSTM(50, 50, batch_first=True)
  (fc): Linear(in_features=50, out_features=2, bias=True)
)

In [16]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr = 0.001)

##Boucle d'apprentissage 

In [17]:
%%time
nb_epoche = 5 
for epoch in range(nb_epoche):
  for batch in train_iter:
    data = batch.text.to(device)
    labels = batch.labels.to(device)
    outputs = net(data)
    loss = criterion(outputs, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  print('epoch :' + str(epoch))

print('end')


epoch :0
epoch :1
epoch :2
epoch :3
epoch :4
end
CPU times: user 48.1 s, sys: 1.33 s, total: 49.4 s
Wall time: 49.5 s


Mesure des performances

In [18]:
import numpy as np
all_labels = []
all_preds = []

with torch.no_grad():
  for batch in test_iter:
    data = batch.text.to(device)
    labels = batch.labels.to(device)

    outputs =net(data)
    _, predicted = torch.max(outputs.data, 1)
    all_preds.append(predicted.cpu().numpy())
    all_labels.append(labels.cpu().numpy())

all_labels = np.concatenate(all_labels)
all_preds = np.concatenate(all_preds)


Accuracy

In [25]:
from sklearn.metrics import accuracy_score, f1_score
##accuracy_score(all_labels, all_preds)
f1_score(all_labels, all_preds, average='weighted')

0.8938555351035558