## Import librairies 


In [1]:
import pandas as pd 
import torch
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import re
from torch.autograd import Variable

## Load data 

In [2]:
trainset=pd.read_csv('/home/hamza/projects/filiere/etude_de_cas/data/trainset_sent.csv',lineterminator='\n')

In [3]:
import random 
l=random.choices(list(range(len(trainset))),k=40)
for sentence in l :
    print(trainset['text'][sentence])
    print('++++++++++++++++++++')

Un élève de l  '  ISEN de la majeure Énergie   :  "on a de la chance être en Bretagne, une région qui mise vraiment sur les énergies renouvelables  !  "
++++++++++++++++++++
.      :    :    :   Plus de 200 villes renforcent leur RÉSILIENCE face au changement climatique
++++++++++++++++++++
La seule vedette vraiment écologique
++++++++++++++++++++
L’écologie est une responsabilité, pas une marchandise - Jean-Pierre Dupuy - Libération |
++++++++++++++++++++
Biodiversité du jardin urbain araignée  spider jardin garden LaGarenneColombes
++++++++++++++++++++
video La rentrée de l  '  ESIEE Ecole d’ingénieurs en Génie Electrique, Informatique, Télécommunication, DD Picardie
++++++++++++++++++++
Les éoliennes favorisent l  '  exploitation du charbon et du gaz   .    .    .    Jean Louis Butré explique bien cela   :   merci  .
++++++++++++++++++++
Éoliennes flottantes - Un site d  '  essais autorisé en Méditerranée
++++++++++++++++++++
Aujourd  '  hui lancement des équipements photovoltaïques

## Preprocessing 

In [4]:
def preprocess(x) :
    return x.replace('  ','')

trainset['text']= trainset['text'].apply(preprocess)

## Prepare Loaders 

In [5]:
from collections import Counter

all_sentences = list(trainset.text.values)
all_text = ' '.join(all_sentences)
# create a list of words
words = all_text.split()# Count all the words using Counter Method
count_words = Counter(words)

total_words = len(words)
sorted_words = count_words.most_common(total_words)

### transformer les mots en indices 

In [6]:
vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}

## transform dataset to indexes 
sentences_indexes = []
for sentence in all_sentences:
    r = [vocab_to_int[w] for w in sentence.split()]
    sentences_indexes.append(r)


### Define padding function 

In [7]:
def pad_sentences(sentences_indexes) :
    X_lengths = [len(sentence) for sentence in sentences_indexes] # create an empty matrix with padding tokens
    longest_sent = max(X_lengths)
    padded_X = np.zeros((len(sentences_indexes), longest_sent))  # copy over the actual sequences

    for i, x_len in enumerate(X_lengths):
        sequence = sentences_indexes[i]
        padded_X[i, 0:x_len] = sequence[:x_len]# padded_X looks like:
    return padded_X,np.asarray(X_lengths)


In [9]:
padded_x , X_lengths =pad_sentences(sentences_indexes)
labels=trainset.polarity.values

## divide into train, validation 

split_frac = 0.2 # 80% train, 20% validation
split_id = int(split_frac * len(padded_x))
val_sentences, train_sentences = padded_x[:split_id], padded_x[split_id:]
val_labels, train_labels = labels[:split_id], labels[split_id:]
val_lengths,train_lengths = X_lengths[:split_id], X_lengths[split_id:]

train_data = TensorDataset(torch.from_numpy(train_sentences), torch.from_numpy(train_lengths),torch.from_numpy(train_labels))
val_data = TensorDataset(torch.from_numpy(val_sentences), torch.from_numpy(val_lengths),torch.from_numpy(val_labels) )
#test_data = TensorDataset(torch.from_numpy(test_sentences), torch.from_numpy(test_labels))

batch_size = 256

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_data, shuffle=True, batch_size=batch_size)
#test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)


## define LSTM architecture 

In [18]:
class LSTM(torch.nn.Module):
    """
    The class is an implementation of a bi-lstm class. 
    """
   
    def __init__(self,vocab_size,embedding_dim,hidden_dim,output_size,n_layers):
        
        super(LSTM,self).__init__()
        
        self.embedding_dim  = embedding_dim
        self.hidden_dim=hidden_dim
        self.n_layers=n_layers
        
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=0)
        self.lstm = torch.nn.LSTM(embedding_dim,hidden_dim,n_layers, bidirectional=True)
        
        self.fc1  = torch.nn.Linear(2*hidden_dim,512)
        self.fc2  = torch.nn.Linear(512,output_size)
        self.leaky=nn.LeakyReLU()
    def init_hidden(self,batch_size):
        # the weights are of the form (nb_layers, batch_size, nb_lstm_units)
        hidden_a = torch.randn(self.n_layers * 2, batch_size, self.hidden_dim)
        hidden_b = torch.randn(self.n_layers * 2 , batch_size, self.hidden_dim)

        hidden_a = Variable(hidden_a)
        hidden_b = Variable(hidden_b)

        return (hidden_a, hidden_b)
        
    def forward(self,padded_x,x_lengths):
         
        X = self.embedding(padded_x)
        batch_size,length_sentence,_=X.size()
        
        self.hidden = self.init_hidden(batch_size)
        embeddings = torch.nn.utils.rnn.pack_padded_sequence(X, x_lengths, batch_first=True,enforce_sorted=False)
        
        outputs, (h_n,c_n) = self.lstm(embeddings,self.hidden)
        
        #Unpack outputs (remove paddings)
        outputs, _= torch.nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
        
        h_n_per = h_n.permute(1,0,2)
        
        outputs_reshaped = h_n_per.reshape(h_n_per.shape[0],-1)
        
        #outputs.view(, outputs.shape[2])
        
        out=self.leaky(self.fc1(outputs_reshaped))
    
        return self.fc2(out)
    

## training 

In [19]:
### training parameters
vocab_size = len(vocab_to_int) + 1
output_size = 3
embedding_dim = 400
hidden_dim = 512
n_layers = 1

model = LSTM(vocab_size, embedding_dim,hidden_dim,output_size, n_layers)

lr=1e-3
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)


## define training function 

In [20]:
epochs = 10
counter = 0
clip = 5
valid_acc_max = -np.Inf

model.train()
for i in range(epochs):    
    for inputs, x_lengths, labels in train_loader:
        counter += 1
        #h = tuple([e.data for e in h])
        #inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        output = model(inputs.long(), x_lengths)
        loss = criterion(output, labels)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
            
    model.eval()
    accuracy=0
    is_equal=[]
    for inp, x_length, labels in val_loader:
        #val_h = tuple([each.data for each in val_h])
        #inp, lab = inp.to(device), lab.to(device)
        out = model(inp.long(), x_length)

        predicted=torch.max(out,1)[1]
        #print(predicted)
        is_equal+=list((labels==predicted).data.numpy())

    accuracy=sum(is_equal)/len(is_equal)
    model.train()
    print("Epoch: {}/{}...".format(i+1, epochs),
          "Loss: {:.6f}...".format(loss.item()),
          "Val accuracy: {:.6f}".format(accuracy))
    if accuracy <= valid_acc_max:
        torch.save(model.state_dict(), './state_dict.pt')
        print('Validation accuracy increased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_acc_max,accuracy))
        valid_acc_max = accuracy

Epoch: 1/10... Loss: 1.018573... Val accuracy: 0.526615
Epoch: 2/10... Loss: 0.867895... Val accuracy: 0.543648
Epoch: 3/10... Loss: 0.342712... Val accuracy: 0.533712
Epoch: 4/10... Loss: 0.146317... Val accuracy: 0.583392
Epoch: 5/10... Loss: 0.070909... Val accuracy: 0.535131
Epoch: 6/10... Loss: 0.013835... Val accuracy: 0.577005
Epoch: 7/10... Loss: 0.002173... Val accuracy: 0.567069
Epoch: 8/10... Loss: 0.003915... Val accuracy: 0.569908
Epoch: 9/10... Loss: 0.002644... Val accuracy: 0.561391
Epoch: 10/10... Loss: 0.000563... Val accuracy: 0.568488


## Test