# Sentiment Analysis

In this notebook we showcase how we used our pipeline to train and test a BasicFlaubert model on the task of sentiment analysis of tweets that use medical terms.

## 1. Import modules

Easy_nlp is built on top of PyTorch and the Hugging Face Transformers library.
You can import the different modules available in our pipeline, which you can clone from GitHub: [__easy_nlp__](https://github.com/Moumeneb1/IRIT_INTERNSHIP).
You can install all the required packages by executing:

```bash
$ git clone https://github.com/Moumeneb1/IRIT_INTERNSHIP.git
$ cd IRIT_INTERNSHIP
$ pip install .
```

In [None]:
from transformers import BertModel,BertTokenizer,FlaubertTokenizer, FlaubertModel, BertForSequenceClassification , FlaubertForSequenceClassification
from tensorboardX import SummaryWriter
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from torch.nn.utils.rnn import pack_padded_sequence
import sys
import re

#Import modules from the pipeline 

#The models module contains the different models architectures 
from easy_nlp.models import BasicBertForClassification

#The Training modules contain pre-built training loop 
from easy_nlp.training import train_noFeatures

#The preprocessing module contains modules for preprocessing Text
from easy_nlp.preprocessing import TextPreprocessing

#The feature_extraction module contains modules for extracting features and preparing data for BERT-based models
from easy_nlp.feature_extraction import MetaFeaturesExtraction
from easy_nlp.feature_extraction import BertInput

 

# 2. Chargement du fichier CSV

In [None]:
import pandas as pd

# Read the annotated corpus, then discard every row whose `classe2` label is 'Poubelle'.
df = pd.read_csv("/data/aboumada/Data/3_Datasets/corpus_annote_RepPer - corpus_annote_RepPer.csv")
df = df.loc[df["classe2"] != 'Poubelle']

# Number of rows kept after filtering (displayed as the cell output).
df.shape[0]


## 2.2 Extraction des métadonnées non linguistiques

In [None]:
# Builds a TextPreprocessing object from df and the column name "texte", then runs it.
# NOTE(review): this section is titled as non-linguistic metadata extraction, yet the
# code is identical to the text-preprocessing cell of section 2.3, and the imported
# MetaFeaturesExtraction is never used in this notebook -- confirm which was intended.
text_preprocessing = TextPreprocessing(df,"texte")
text_preprocessing.fit_transform()

## 2.3 Prétraitement du texte

In [None]:
# Run the pipeline's text preprocessing, given df and the column name "texte".
# NOTE(review): later cells read df_train['processed_text'], so fit_transform()
# presumably adds a 'processed_text' column to df in place -- confirm against easy_nlp.
text_preprocessing = TextPreprocessing(df,"texte")
text_preprocessing.fit_transform()

## 2.4 Train Test Split 

In [None]:
from sklearn.model_selection import train_test_split

# Keep 20% of the corpus aside as the test set; the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Spot-check one preprocessed text: the row at position 600 of the training split.
# NOTE: .iloc raises IndexError if the training split holds 600 rows or fewer.
df_train['processed_text'].iloc[600]

## Labels to number 

In [None]:
def get_sentences_labels(df,text_column='text_clean',label_column='CAT',cat_labels=None):
    """Extract the texts and the integer-encoded labels from a dataframe.

    Args:
        df: DataFrame holding at least ``text_column`` and ``label_column``.
        text_column: Name of the column containing the sentences.
        label_column: Name of the column containing the categorical labels.
        cat_labels: Optional ``{index: label}`` mapping to reuse (for example the
            one returned for the training set). When ``None``, a mapping is built
            from the labels of ``df`` in order of first appearance.

    Returns:
        A ``(sentences, labels, cat_labels)`` tuple: the array of texts, the
        integer array of label indices, and the ``{index: label}`` mapping used.

    Raises:
        ValueError: If ``df`` contains a label that is absent from the mapping.
            Previously such labels became NaN and were silently cast to an
            arbitrary integer.
    """
    if cat_labels is None:
        cat_labels = dict(enumerate(df[label_column].unique()))

    # Invert the mapping so the label column can be translated to indices.
    label_to_index = {label: index for index, label in cat_labels.items()}
    encoded = df[label_column].map(label_to_index)

    # Series.map leaves NaN for values missing from the dict; fail loudly instead
    # of letting astype(int) turn those NaN into meaningless class ids.
    unknown = encoded.isna()
    if unknown.any():
        missing = sorted(str(label) for label in df.loc[unknown, label_column].unique())
        raise ValueError(
            "Labels in column {!r} are missing from cat_labels: {}".format(label_column, missing)
        )

    sentences = df[text_column].values
    labels = encoded.values.astype(int)
    return sentences,labels,cat_labels

# Reference {index: label} tables for the two annotation schemes.
# They are defined here but not passed to the calls below.
dic_cat_labels_CAT = dict(enumerate(['Poubelle', 'UsageDetourne', 'UsageMedical']))
dic_cat_labels_CAT3 = dict(enumerate(['Poubelle', 'opinionNegative', 'opinionPositive', 'sansOpinion-ou-mixte']))

# Training split: the label mapping is built from the `classe2` values of df_train.
sentences_train, labels_train, dic_cat_labels = get_sentences_labels(
    df_train,
    text_column='processed_text',
    label_column='classe2',
)

# Test split: reuse the training mapping so both splits share the same class indices.
sentences_test, labels_test, dic_cat_labels = get_sentences_labels(
    df_test,
    text_column='processed_text',
    label_column='classe2',
    cat_labels=dic_cat_labels,
)


In [None]:
# Show the {index: label} mapping returned by get_sentences_labels.
print(dic_cat_labels)

##  Create Mask and ID Vectors 

In [None]:

# Load the FlauBERT base (cased) tokenizer and hand it to the pipeline's BertInput helper.
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')
bert_input= BertInput(tokenizer)


# Later cells use element [0] of each result as the model inputs and element [1] as the masks.
X_train = bert_input.fit_transform(sentences_train)
# NOTE(review): fit_transform is also called on the test sentences; if BertInput learns
# anything while fitting (e.g. a padding length), the test split would be fitted
# independently of the training split -- confirm against BertInput's API.
X_test = bert_input.fit_transform(sentences_test)


In [None]:
# Length of the second element of X_train (the element used as the masks in the split cell below).
len(X_train[1])

##  split data for train and validation 

In [None]:
# Hold out 20% of the training portion for validation (test_size=0.2).
# Inputs, labels and masks go through one call so that they share the same split.
(
    train_inputs, validation_inputs,
    train_labels, validation_labels,
    train_masks, validation_masks,
) = train_test_split(
    X_train[0], labels_train, X_train[1],
    test_size=0.2,
    random_state=0,
)

# The test split is used as is.
test_inputs, test_masks = X_test[0], X_test[1]
test_labels = labels_test

# Convert data to tensors treatable by Pytorch 

In [None]:
import torch

# Wrap the inputs, labels and masks of every split in torch tensors,
# the datatype the datasets and the model below work with.
train_inputs, validation_inputs, test_inputs = (
    torch.tensor(ids) for ids in (train_inputs, validation_inputs, test_inputs)
)

train_labels, validation_labels, test_labels = (
    torch.tensor(targets) for targets in (train_labels, validation_labels, test_labels)
)

train_masks, validation_masks, test_masks = (
    torch.tensor(mask) for mask in (train_masks, validation_masks, test_masks)
)



# Create DataLoader 

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Mini-batch size shared by the three loaders. For fine-tuning BERT on a
# specific task, the authors recommend a batch size of 16 or 32.
batch_size = 16

# Training set: batches are drawn in random order and the last incomplete batch is dropped.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
# Alternative sampler kept for reference:
#train_sampler = ImbalancedDatasetSampler(train_data,callback_get_label=get_label_callback)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
    drop_last=True,
)

# Validation set: iterated sequentially.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(
    validation_data,
    batch_size=batch_size,
    sampler=validation_sampler,
)

# Test set: iterated sequentially.
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(
    test_data,
    batch_size=batch_size,
    sampler=test_sampler,
)


Load the base model 

In [None]:
# Load the pretrained 'flaubert/flaubert_base_cased' checkpoint as the base model.
base_model = FlaubertModel.from_pretrained('flaubert/flaubert_base_cased')

Connect the base model to a fully connected layer in a BasicBertForClassification model

In [None]:
# Wrap the base model in the pipeline's classification model.
# NOTE(review): 3 is presumably the number of target classes; it is hard-coded rather
# than derived from dic_cat_labels -- confirm that the two agree.
model = BasicBertForClassification(base_model,3)
# Move the model to the GPU (requires a CUDA device).
model.cuda()

In [None]:
from easy_nlp.training import flat_accuracy,flat_f1,flat_recall,flat_precision

In [None]:
from transformers import AdamW,get_linear_schedule_with_warmup

# Number of passes over the training set. The tutorial note this cell was based on
# cites a recommendation of 2 to 4 epochs; this notebook trains for 10.
epochs = 10

# AdamW over all model parameters, with learning rate 2e-5 and epsilon 1e-8.
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-8,
)

# One optimisation step per batch, for every epoch.
total_steps = epochs * len(train_dataloader)

# Linear learning-rate schedule over the whole run, with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
import numpy as np

# Cross-entropy loss, handed to the pipeline's training loop as the criterion.
criterion = nn.CrossEntropyLoss()

# Train on the GPU with the pipeline's pre-built loop, validating on the held-out split.
train_noFeatures(
    model,
    train_dataloader,
    validation_dataloader,
    epochs,
    torch.device('cuda'),
    optimizer,
    scheduler,
    criterion,
)
print("\nTraining complete!")

In [None]:
# Prediction on test set

# len(test_dataloader) is the number of batches; the number of sentences is the
# size of the dataset behind the loader.
print('Predicting labels for {:,} test sentences...'.format(len(test_dataloader.dataset)))

# Put model in evaluation mode
model.eval()

# Device the batches are moved to (built once instead of on every iteration).
device = torch.device("cuda")

# Tracking variables. Only predictions_cat and true_labels_cat are filled in this
# cell; the other lists are still initialised so the same names stay defined.
predictions_cat,predictions_cat3,predictions_cat2 , true_labels_cat,true_labels_cat2  = [], [],[],[],[]

# Predict
for batch in test_dataloader:
    # Unpack the batch and move each tensor to the GPU
    b_input_ids, b_input_mask, b_labels_cat = (t.to(device) for t in batch)

    # No gradients are needed for inference: skipping them saves memory and
    # speeds up prediction
    with torch.no_grad():
        # Forward pass; the logits are the first element of the model output
        outputs = model((b_input_ids,b_input_mask))
        logits_cat = outputs[0]

    # Move logits and labels to CPU and accumulate them row by row
    predictions_cat.extend(logits_cat.detach().cpu().numpy())
    true_labels_cat.extend(b_labels_cat.to('cpu').numpy())

print('    DONE.')



In [None]:
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Predicted class index = position of the largest logit in each row.
pred_flat_cat = np.argmax(predictions_cat, axis=1)

# Translate class indices to label names under NEW variable names. Overwriting
# true_labels_cat in place would make a second run of this cell map names (not
# indices) through the dict and produce only None values.
true_names_cat = [dic_cat_labels.get(x) for x in true_labels_cat]
pred_names_cat = [dic_cat_labels.get(x) for x in pred_flat_cat]

# scikit-learn metrics take (y_true, y_pred) in that order.
cr = classification_report(true_names_cat, pred_names_cat, digits=4)
print(accuracy_score(true_names_cat, pred_names_cat))
print(cr)

In [None]:
# Persist the trained model to disk.
# NOTE(review): the file name mentions "Crisis_Binary" while this notebook trains a
# sentiment classifier, and the following cell loads a different file -- confirm the name.
model.save("Crisis_Binary_flaubert_base.pth")

In [None]:
# NOTE(review): this loads "Pycho_sentiment_bert_adepted.pth", not the file written by
# the save cell above, and replaces the freshly trained `model` -- confirm this is intended.
model = BasicBertForClassification.load("Pycho_sentiment_bert_adepted.pth")