This notebook is based on the tutorial at https://www.curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

# Loading Model

In [1]:
import pandas as pd
!pip install sentencepiece
!pip install transformers
from transformers import CamembertConfig, CamembertTokenizer, CamembertForSequenceClassification

# Build a CamembertConfig with no arguments (i.e. the library defaults) and
# print it so the default hyper-parameters can be inspected.
configuration = CamembertConfig()
print(configuration)




Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |▎                               | 10kB 24.3MB/s eta 0:00:01[K     |▋                               | 20kB 31.5MB/s eta 0:00:01[K     |▉                               | 30kB 17.8MB/s eta 0:00:01[K     |█▏                              | 40kB 11.0MB/s eta 0:00:01[K     |█▌                              | 51kB 7.4MB/s eta 0:00:01[K     |█▊                              | 61kB 8.6MB/s eta 0:00:01[K     |██                              | 71kB 9.0MB/s eta 0:00:01[K     |██▍                             | 81kB 9.2MB/s eta 0:00:01[K     |██▋                             | 92kB 8.0MB/s eta 0:00:01[K     |███                             | 102kB 8.2MB/s eta 0:00:01[K     |███▎                            | 112kB 8.2MB/s eta 0:00:01[K     |███▌               

In [2]:
# Load CamembertForSequenceClassification from the pretrained 'camembert-base'
# checkpoint. Note: the model is NOT built from the `configuration` object
# created in the previous cell; it comes from the checkpoint.
model = CamembertForSequenceClassification.from_pretrained('camembert-base')
# Rebind `configuration` to the configuration the loaded model actually uses.
configuration = model.config 

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=508.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=445032417.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight'

In [3]:

from transformers.modeling_roberta import RobertaClassificationHead

class RobertaHeadConfig():
  """Tiny config holder exposing exactly three attributes.

  Instances are handed to `RobertaClassificationHead(config=...)` in this
  notebook.

  Args:
    hidden_size: stored as `hidden_size`.
    droupout: stored as `hidden_dropout_prob` (the parameter spelling is kept
      as-is so that existing keyword callers keep working).
    num_labels: stored as `num_labels`.
  """
  def __init__(self, hidden_size, droupout, num_labels):
    # Single tuple assignment; attributes are created in the same order as
    # they are listed here.
    (self.hidden_size,
     self.hidden_dropout_prob,
     self.num_labels) = (hidden_size, droupout, num_labels)
  

# Replace the model's `classifier` attribute with a fresh
# RobertaClassificationHead: hidden size 768, dropout 0.1, 8 output labels
# (the `emotions` list defined in a later cell has 8 entries).
model.classifier = RobertaClassificationHead(config = RobertaHeadConfig(768, 0.1, 8))

# Print the layers of the new head.
print(model.classifier)

RobertaClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=8, bias=True)
)


# Preprocessing Data

BERT models require special preprocessing of the input data: texts have to be tokenized, have their sentences delimited by special tokens, and be padded to a fixed length.

In [4]:

# Example French sentence ("This is a test.").
sample_text = 'Ceci est un test.'

# Tokenizer loaded from the same 'camembert-base' checkpoint name as the model.
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')


def encode_text(text_string, max_length=512):
  """Tokenize one text into fixed-length PyTorch tensors.

  Uses the module-level `tokenizer`. The text is padded up to `max_length`
  and, unlike before, also truncated down to it, so that over-long texts no
  longer produce sequences longer than the fixed length the rest of the
  notebook assumes (batches are printed as [8, 512]).

  Args:
    text_string: the raw text to encode.
    max_length: fixed sequence length to pad/truncate to. Defaults to 512,
      the length the batches had before this parameter existed.

  Returns:
    Whatever `tokenizer.encode_plus` returns; the notebook reads its
    'input_ids', 'token_type_ids' and 'attention_mask' entries.
  """
  return tokenizer.encode_plus(
    text_string,
    # Add the tokenizer's own start/end special tokens
    # (`<s>` / `</s>` for CamemBERT, not BERT's [CLS] / [SEP]).
    add_special_tokens=True,
    max_length=max_length,
    # `padding='max_length'` supersedes the deprecated `pad_to_max_length`,
    # which used to be passed alongside it.
    padding='max_length',
    truncation=True,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_tensors='pt',  # Return PyTorch tensors
  )



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=810912.0, style=ProgressStyle(descripti…




In [5]:
import math


# Emotion labels that are folded into the catch-all class "N" below
# (the list name suggests they have too few examples to be learned).
data_too_small = ["SATISFACTION",
"DEPLAISIR",
"TRISTESSE",
"PLAISIR",
"DERANGEMENT",
"SURPRISE_NEGATIVE",
"APAISEMENT",
"INSATISFACTION",
"AMOUR",
"ENNUI",
"SURPRISE_POSITIVE"]

# Target classes; a label's position in this list is its integer class id.
emotions = ["N", "VALORISATION",
"DEVALORISATION",
"PEUR",
"DESACCORD",
"COLERE",
"ACCORD",
"MEPRIS"]


test_data = pd.read_csv('/content/testset_sent.csv')
train_data = pd.read_csv('/content/trainset_sent.csv', lineterminator='\n')


def _encode_labelled_rows(frame):
  """Encode every row of a labelled DataFrame.

  Reads the 'text' and 'emotion' columns of `frame`. A missing emotion (float
  NaN) or one listed in `data_too_small` is mapped to the class "N". Each text
  goes through `encode_text`.

  Returns:
    A list of `[input_ids, token_type_ids, attention_mask, class_id]` rows,
    where `class_id` is the label's index in `emotions`.

  Raises:
    ValueError: from `emotions.index` if a label is neither in `emotions`
      nor in `data_too_small`.
  """
  rows = []
  for line in frame.to_dict('records'):
    emotion = line['emotion']
    # we create class "N"
    if (type(emotion) == float and math.isnan(emotion)) or emotion in data_too_small:
      emotion = "N"
    encoded = encode_text(line['text'])
    rows.append([encoded['input_ids'], encoded['token_type_ids'], encoded['attention_mask'], emotions.index(emotion)])
  return rows


# The same helper serves both splits (this used to be two copy-pasted loops).
test_data_list = _encode_labelled_rows(test_data)

test_preprocessed = pd.DataFrame(test_data_list, columns=['input_ids', 'token_type_ids', 'attention_mask', 'emotion'])

train_data_list = _encode_labelled_rows(train_data)

train_preprocessed = pd.DataFrame(train_data_list, columns=['input_ids', 'token_type_ids', 'attention_mask', 'emotion'])





In [None]:
# Sanity check: list the distinct class ids present in the test split.
print(test_preprocessed["emotion"].unique())

[0 1 3 7 2 4 6 5]


In [None]:
# Write both preprocessed frames to CSV in the working directory.
# NOTE(review): the token columns hold the objects returned by encode_text
# (requested as PyTorch tensors); CSV presumably stores only their text repr,
# so these files are likely not loadable back into tensors - confirm before
# relying on them as a cache.
test_preprocessed.to_csv('test_preprocessed.csv')
train_preprocessed.to_csv('train_preprocessed.csv')


Creating a PyTorch Dataset object

In [6]:
from torch.utils.data import Dataset
import torch
from sklearn.model_selection import train_test_split


class TweetDataset(Dataset):
  """Torch Dataset over a preprocessed DataFrame of encoded tweets and labels.

  The DataFrame must provide the columns 'input_ids', 'token_type_ids' and
  'attention_mask' (objects exposing `.flatten()`, i.e. the tensors produced
  by `encode_text` in this notebook) and 'emotion' (integer class id).
  """
  def __init__(self, dataframe):
    self.dataframe = dataframe

  def __len__(self):
    return len(self.dataframe)

  def __getitem__(self, item):
    """Return the sample at position `item` as a dict of tensors.

    Rows are looked up by position (`.iloc`), not by index label, so the
    dataset also works when the DataFrame index is not the default
    0..n-1 range (e.g. after filtering rows).
    """
    frame = self.dataframe
    return {
      # The stored tensors have a leading batch dimension of 1; flatten drops it.
      'input_ids': frame['input_ids'].iloc[item].flatten(),
      'token_type_ids': frame['token_type_ids'].iloc[item].flatten(),
      'attention_mask': frame['attention_mask'].iloc[item].flatten(),
      # targets are the emotion (class id) of each tweet
      'targets': torch.tensor(frame['emotion'].iloc[item], dtype=torch.long)
    }


train_val_dataset = TweetDataset(train_preprocessed)
test_dataset = TweetDataset(test_preprocessed)

# Hold out 20% of the training data for validation. `random_state` is fixed so
# that the split (and therefore every reported validation score) is
# reproducible across kernel restarts.
# NOTE(review): train_test_split receives the Dataset object itself; the two
# results are presumably plain lists of sample dicts rather than Dataset
# instances - confirm if a Dataset API is needed downstream.
train_dataset, val_dataset = train_test_split(train_val_dataset, test_size=0.2, random_state=42)



In [None]:
# Sanity check: list the distinct class ids present in the training data.
print(train_preprocessed["emotion"].unique())

[0 1 6 2 4 5 3 7]


Then we create DataLoaders to separate the data into batches ...

In [7]:
from torch.utils.data import DataLoader


batch_size = 8

# Training batches are reshuffled at every epoch; without `shuffle=True` the
# model would see the samples in the exact same order in each epoch.
# Validation and test loaders keep a fixed order.
train_dataloader = DataLoader( train_dataset, batch_size=batch_size, shuffle=True )
val_dataloader = DataLoader( val_dataset, batch_size=batch_size )

test_dataloader = DataLoader( test_dataset, batch_size=batch_size )


# test of the shapes: pull one batch and print its keys and tensor shapes
data = next(iter(train_dataloader))

print(data.keys())

print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)



dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'targets'])
torch.Size([8, 512])
torch.Size([8, 512])
torch.Size([8])


# Training the Model

The classifiers are already part of this model, but we could build our own on top of the basic CamemBERT model. 

From the CamemBERT original paper : 

We fine-tune independently CamemBERT for each task and each dataset. We optimise the model using the Adam optimiser [38] with a fixed learning rate. We run a grid search on a combination of learning rates and batch sizes. We select the best model on the validation set out of the 30 first epochs.

Although this might push the performances even further, for all tasks except NLI, we don’t apply any regularisation techniques such as weight decay, learning rate warm-up or discriminative fine-tuning. We show that fine-tuning CamemBERT in a straight-forward manner leads to state-of-the-art results on most tasks and outperforms the existing BERT-based models in most cases.

In [8]:

# Ask PyTorch to release its cached, currently unused GPU memory before
# training starts.
torch.cuda.empty_cache()

# moving the model to GPU (falls back to CPU when CUDA is not available)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model = model.to(device)


# hyperparameters

from transformers import AdamW, get_linear_schedule_with_warmup

# Number of passes over the training data.
EPOCHS = 10
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# One scheduler step is taken per batch (see train_epoch), so the schedule
# spans batches-per-epoch * EPOCHS steps.
total_steps = len(train_dataloader) * EPOCHS
# Linear schedule with no warm-up steps.
# NOTE(review): the markdown above quotes the CamemBERT paper as using a fixed
# learning rate; this cell uses a linear schedule instead - confirm intent.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)


# Multi-class classification loss applied to the model's logits.
loss_fn = torch.nn.CrossEntropyLoss().to(device)


# training the model for one epoch 

def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples ):
  """Train `model` for one pass over `data_loader`.

  Each batch must be a dict with "input_ids", "attention_mask" and "targets".
  The first element of the model output is treated as the logits. For every
  batch the gradients are clipped to a max norm of 1.0, then the optimizer
  and the scheduler each take one step.

  Returns:
    (accuracy, mean_loss): the number of correct predictions divided by
    `n_examples` (a float64 tensor) and the mean of the per-batch losses.
  """
  model = model.train()  # train mode
  batch_losses = []
  correct_predictions = 0
  for batch in data_loader:
    ids = batch["input_ids"].to(device)
    mask = batch["attention_mask"].to(device)
    labels = batch["targets"].to(device)

    logits = model(input_ids=ids, attention_mask=mask)[0]
    # Index of the highest logit per row is the predicted class.
    predicted = torch.max(logits, dim=1)[1]
    batch_loss = loss_fn(logits, labels)

    correct_predictions += (predicted == labels).sum()
    batch_losses.append(batch_loss.item())

    batch_loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  accuracy = correct_predictions.double() / n_examples
  return accuracy, np.mean(batch_losses)

# evaluating the model for one epoch

def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate `model` over `data_loader` without updating any weights.

  Each batch must be a dict with "input_ids", "attention_mask" and "targets".
  The first element of the model output is treated as the logits. The whole
  pass runs under `torch.no_grad()` with the model in eval mode.

  Returns:
    (accuracy, mean_loss): the number of correct predictions divided by
    `n_examples` (a float64 tensor) and the mean of the per-batch losses.
  """
  model = model.eval()  # evaluation mode
  batch_losses = []
  correct_predictions = 0
  with torch.no_grad():
    for batch in data_loader:
      ids = batch["input_ids"].to(device)
      mask = batch["attention_mask"].to(device)
      labels = batch["targets"].to(device)

      logits = model(input_ids=ids, attention_mask=mask)[0]
      # Index of the highest logit per row is the predicted class.
      predicted = torch.max(logits, dim=1)[1]
      batch_loss = loss_fn(logits, labels)

      correct_predictions += (predicted == labels).sum()
      batch_losses.append(batch_loss.item())

  accuracy = correct_predictions.double() / n_examples
  return accuracy, np.mean(batch_losses)




cuda:0


In [None]:
# training loop : 
from collections import defaultdict
import numpy as np


# Per-epoch metrics, keyed by 'train_acc', 'train_loss', 'val_acc', 'val_loss'.
# The accuracy entries are the tensors returned by train_epoch / eval_model.
history = defaultdict(list)
# Best validation accuracy seen so far; used to decide when to checkpoint.
best_accuracy = 0
for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)
  train_acc, train_loss = train_epoch(
    model,
    train_dataloader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(train_dataset)
  )
  print(f'Train loss {train_loss} accuracy {train_acc}')
  val_acc, val_loss = eval_model(
    model,
    val_dataloader,
    loss_fn,
    device,
    len(val_dataset)
  )
  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()
  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)
  # Keep only the weights of the epoch with the best validation accuracy.
  # NOTE(review): this writes 'best_model_state.bin', whereas the reload cell
  # further down reads 'best_model_state_best.bin' - confirm which file is
  # meant to be used.
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc




Epoch 1/10
----------
Train loss 1.0634367440186494 accuracy 0.6470379567222418
Val   loss 0.944543836227918 accuracy 0.6957446808510639

Epoch 2/10
----------


## Apply to COVID tweet data

In [None]:
# Load the tweets to classify; the cells below read its 'text' column.
df_covid = pd.read_csv('df_french_clean.csv')


In [None]:
# Encode every tweet of df_covid; there are no labels here, so each row holds
# only the three encoded fields.
# Note: this rebinds `test_data_list`, which an earlier cell used for the
# labelled test split.
test_data_list = []
for record in df_covid.to_dict('records'):
  tokens = encode_text(record['text'])
  test_data_list.append([tokens['input_ids'], tokens['token_type_ids'], tokens['attention_mask']])

test_covid_preprocessed = pd.DataFrame(
  test_data_list,
  columns=['input_ids', 'token_type_ids', 'attention_mask'],
)


In [None]:
class TweetCovidDataset(Dataset):
  """Torch Dataset over a preprocessed DataFrame of encoded, unlabelled tweets.

  The DataFrame must provide the columns 'input_ids', 'token_type_ids' and
  'attention_mask' (objects exposing `.flatten()`, i.e. the tensors produced
  by `encode_text` in this notebook). No targets are returned.
  """
  def __init__(self, dataframe):
    self.dataframe = dataframe

  def __len__(self):
    return len(self.dataframe)

  def __getitem__(self, item):
    """Return the sample at position `item` as a dict of tensors.

    Rows are looked up by position (`.iloc`), not by index label, so the
    dataset also works when the DataFrame index is not the default
    0..n-1 range (e.g. after filtering rows).
    """
    frame = self.dataframe
    return {
      # The stored tensors have a leading batch dimension of 1; flatten drops it.
      'input_ids': frame['input_ids'].iloc[item].flatten(),
      'token_type_ids': frame['token_type_ids'].iloc[item].flatten(),
      'attention_mask': frame['attention_mask'].iloc[item].flatten()
    }

# Wrap the encoded tweets and batch them. shuffle=False keeps the batches in
# DataFrame row order, which the later cell relies on when it assigns the
# predictions back to df_covid as new columns.
covid_dataset = TweetCovidDataset(test_covid_preprocessed)
test_covid_dataloader = DataLoader( covid_dataset, batch_size=batch_size ,shuffle=False )


In [None]:
## model redefine: rebuild the architecture, then load fine-tuned weights
model = CamembertForSequenceClassification.from_pretrained('camembert-base')
# Accessing the model configuration
configuration = model.config

# NOTE(review): this head has 3 output labels and the checkpoint below is
# 'best_model_state_best.bin', whereas the training cells above build an
# 8-label head and save 'best_model_state.bin'. The checkpoint presumably
# comes from a separate 3-class training run - confirm its provenance.
model.classifier = RobertaClassificationHead(config = RobertaHeadConfig(768, 0.1, 3))

# map_location=device lets a checkpoint saved from a GPU session be loaded on
# a CPU-only machine as well.
# torch.load unpickles the file: only load checkpoints from a trusted source.
model.load_state_dict(torch.load('best_model_state_best.bin', map_location=device))

model.to(device)

In [None]:
from tqdm import tqdm
def test_model(model, data_loader, device, n_examples):
  model = model.eval() # evaluation mode
  losses = []
  with torch.no_grad():
    all_preds,all_probs = [],[]
    for d in tqdm(data_loader,total=len(data_loader)):
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      outputs= model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs[0], dim=1)
      soft = torch.nn.Softmax(dim=1)
      probs = soft (outputs[0])
      all_preds+=list(preds.cpu().data.numpy())
      all_probs+=list(probs.cpu().data.numpy())
  return all_preds, all_probs 

In [None]:
# Classify every COVID tweet. The last argument (dataset size) is accepted by
# test_model but not used by it.
all_preds,all_probs = test_model(model, test_covid_dataloader, device, len(covid_dataset))

100%|██████████| 1541/1541 [06:37<00:00,  3.88it/s]


In [None]:
# Attach the predictions to the tweets as new columns (this mutates df_covid
# in place), then show how many tweets fall into each predicted class.
# The row alignment relies on the loader having been built with shuffle=False.
df_covid['predictions'] = all_preds
df_covid['probs'] = all_probs
df_covid.predictions.value_counts()

0    6143
2    4093
1    2089
Name: predictions, dtype: int64

In [None]:
# Show the full tweet text instead of truncating long cells. `None` means
# "no limit"; the legacy value -1 is deprecated and rejected by recent pandas.
pd.set_option('display.max_colwidth', None)
# Eyeball 50 randomly sampled tweets that were predicted as class 1.
df_covid[df_covid.predictions == 1].sample(50)['text']

  """Entry point for launching an IPython kernel.


2773     FOCUS | Grippe, rhume, gastro, mal de gorge  .  .  .   les maladies saisonnières sont de retour .  Cependant, si certains micro-organismes nous rendent malades, d ' autres s ' avèrent bénéfiques à notre santé                                                                                    
3353     Coronavirus  :  restez informé et prémunissez-vous contre les fakenews Comment faire la différence entre le coronavirus et la grippe  ?                                                                                                                                                             
4534     Après la bronchite, la grippe .  .  Géniale  .  .                                                                                                                                                                                                                                                   
3139     Boiron vous recommande son médicament contre la grippe mais précise que  * pas de som

In [None]:
# Save the tweets together with their 'predictions' and 'probs' columns;
# index=False leaves the DataFrame index out of the file.
df_covid.to_csv('df_covid.csv',index=False)