# Análisis de sentimientos con BERT

[Hugging Face](https://huggingface.co/)

![BERT análisis sentimientos](https://drive.google.com/uc?export=view&id=1UwciEQKNZ4SoXn_c0l31hsyZ-8jLdtVf)

In [1]:
# Install the Hugging Face Transformers library used by the cells below.
# NOTE(review): consider `%pip install transformers==<pinned version>` so the install
# targets the running kernel's environment and the notebook stays reproducible.
!pip install transformers



In [2]:
from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from textwrap import wrap

from torch.optim import AdamW


In [4]:
# Mount Google Drive (Colab only): the dataset and model paths used in this
# notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Initialization: dataset location, hyperparameters, seeds and compute device
ruta = '/content/drive/MyDrive/Trabajos/Análisis de sentimientos/dataset/Datasets limpios/'
DATASET_1_PATH = ruta + 'dataset_coments_4.csv'

RANDOM_SEED = 42   # shared seed for NumPy, PyTorch and the train/test split
MAX_LEN = 200      # maximum number of tokens per encoded text
BATCH_SIZE = 16
NCLASSES = 2       # binary sentiment classification

# Seed both random number generators for reproducibility
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Use the first GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [8]:
# Load dataset: read only the first 10,000 rows directly from the CSV.
# Passing `nrows` avoids parsing the whole file just to discard most of it, and
# it yields a standalone DataFrame rather than a slice of a larger one, so
# adding columns to it later cannot trigger SettingWithCopyWarning.
df = pd.read_csv(DATASET_1_PATH, nrows=10000)

In [9]:
# Quick look at the data: first rows and dimensions, then one sample comment
# (row label 200) wrapped to the default textwrap width.
print(df.head(), df.shape, sep="\n")
print(*wrap(df.loc[200, 'originalText']), sep="\n")

                                        originalText sentiment
0     hi i like smosh but im  so i watch it at night  Positive
1  hahaha i laughed so hard at this video your so...  Positive
2  omg old times disturbia   i heard this song wh...  Positive
3                        cant stop laughing  amazing  Positive
4  and has a brother thats much better looking th...  Positive
(10000, 2)
therojo why thank you so much for the subscription  i love dancing and
yes it does come naturally for me but i just do it for fun i may make
it look easy but trust me im sweating bullets its really a great
workout im currently uploading a few more vids check em out  ohand
thanks for the compliment on the hair i changed it since this video


In [10]:
# Reajustar dataset
df['label'] = (df['sentiment']=='Positive').astype(int)
df.drop('sentiment', axis=1, inplace=True)
df.head()

Unnamed: 0,originalText,label
0,hi i like smosh but im so i watch it at night,1
1,hahaha i laughed so hard at this video your so...,1
2,omg old times disturbia i heard this song wh...,1
3,cant stop laughing amazing,1
4,and has a brother thats much better looking th...,1


In [11]:
# TOKENIZACIÓN
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [12]:
# Ejemplo tokenización
sample_txt = 'I really loved that movie!'
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print('Frase: ', sample_txt)
print('Tokens: ', tokens)
print('Tokens numéricos: ', token_ids)

Frase:  I really loved that movie!
Tokens:  ['I', 'really', 'loved', 'that', 'movie', '!']
Tokens numéricos:  [146, 1541, 3097, 1115, 2523, 106]


In [26]:
# Codificación para introducir a BERT
encoding = tokenizer.encode_plus(
    sample_txt,
    max_length = 10,
    truncation = True,
    add_special_tokens = True,
    return_token_type_ids = False,
    padding=True,
    return_attention_mask = True,
    return_tensors = 'pt'
)

In [27]:
# Inspect which entries the tokenizer returned for the example sentence
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [15]:
# Show the encoded example: the tokens recovered from the ids, the raw ids,
# and the attention mask for the first (and only) sequence.
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'][0]))
print(encoding['input_ids'][0])
print(encoding['attention_mask'][0])

['[CLS]', 'I', 'really', 'loved', 'that', 'movie', '!', '[SEP]', '[PAD]', '[PAD]']
tensor([ 101,  146, 1541, 3097, 1115, 2523,  106,  102,    0,    0])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 0, 0])


In [74]:
# CREACIÓN DATASET

class IMDBDataset(Dataset):
  """Map-style dataset that tokenizes one text per item for a BERT classifier.

  Args:
    originalText: indexable collection of raw texts.
    labels: indexable collection of integer class labels, aligned with `originalText`.
    tokenizer: object exposing `encode_plus` (e.g. a Hugging Face `BertTokenizer`).
    max_len: every encoded sequence is truncated/padded to exactly this many tokens.
  """

  def __init__(self, originalText, labels, tokenizer, max_len):
    self.originalText = originalText
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of texts in the dataset."""
    return len(self.originalText)

  def __getitem__(self, item):
    """Encode the text at position `item`.

    Returns:
      dict with the raw text (`originalText`), the 1-D `input_ids` and
      `attention_mask` tensors, and the `label` as a long tensor.
    """
    originalText = str(self.originalText[item])
    label = self.labels[item]
    # Use the tokenizer given to the constructor, not whatever global
    # `tokenizer` happens to exist in the notebook namespace.
    encoding = self.tokenizer.encode_plus(
        originalText,
        max_length = self.max_len,
        truncation = True,
        add_special_tokens = True,
        return_token_type_ids = False,
        # Replaces the deprecated `pad_to_max_length=True`; fixed-length items
        # are needed so the default collate function can stack them.
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt'
    )

    return {
        'originalText': originalText,
        # The tokenizer output is flattened to 1-D here
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'label': torch.tensor(label, dtype=torch.long)
    }



In [75]:
# Data loader:

def data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Build a DataLoader over the texts and labels of a DataFrame.

  Args:
    df: DataFrame with `originalText` and `label` columns.
    tokenizer: tokenizer forwarded to `IMDBDataset`.
    max_len: token length every sample is truncated/padded to.
    batch_size: number of samples per batch.
    shuffle: whether to reshuffle the samples each epoch (default False,
      matching the previous behaviour).
    num_workers: number of worker processes used for loading (default 4,
      matching the previous behaviour).

  Returns:
    A `torch.utils.data.DataLoader` yielding the dicts produced by `IMDBDataset`.
  """
  dataset = IMDBDataset(
      originalText = df.originalText.to_numpy(),
      labels = df.label.to_numpy(),
      tokenizer = tokenizer,
      # Honour the arguments instead of silently reading the MAX_LEN/BATCH_SIZE globals
      max_len = max_len
  )

  return DataLoader(
      dataset,
      batch_size = batch_size,
      shuffle = shuffle,
      num_workers = num_workers
  )

In [76]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
df_train, df_test = train_test_split(df, random_state=RANDOM_SEED, test_size=0.2)

# One loader per split, both built with the same tokenizer and hyperparameters
train_data_loader = data_loader(
    df=df_train, tokenizer=tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE
)
test_data_loader = data_loader(
    df=df_test, tokenizer=tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE
)

In [77]:
# EL MODELO!

class BERTSentimentModule(nn.Module):
  """BERT encoder followed by dropout and a linear classification head.

  Args:
    n_classes: number of output classes (size of the logits' last dimension).
  """

  def __init__(self, n_classes):
    super().__init__()
    # Pre-trained encoder; its hidden size determines the head's input width
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.linear = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the unnormalized class scores for a batch of encoded texts."""
    # With return_dict=False the encoder returns a tuple; only its second
    # element is fed to the classification head.
    _sequence_output, pooled = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        return_dict=False,
    )
    return self.linear(self.drop(pooled))

In [78]:
# Build the classifier and move its parameters to the selected device
model = BERTSentimentModule(NCLASSES).to(device)

In [79]:
# ENTRENAMIENTO
EPOCHS = 5
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)

In [80]:
# Iteración entrenamiento
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  """Train `model` for one pass over `data_loader`.

  Each batch is a dict with `input_ids`, `attention_mask` and `label` tensors.
  Gradients are clipped to a maximum norm of 1.0 before every optimizer step,
  and the scheduler is stepped once per batch.

  Returns:
    (accuracy, mean_loss): correct predictions divided by `n_examples` as a
    double tensor, and the mean of the per-batch losses.
  """
  model = model.train()
  batch_losses = []
  n_correct = 0
  for batch in data_loader:
    # Move the three tensors the model and the loss need onto the target device
    ids, mask, targets = (
        batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
    )
    logits = model(input_ids=ids, attention_mask=mask)
    loss = loss_fn(logits, targets)

    # Predicted class = index of the largest logit in each row
    predicted = torch.max(logits, dim=1).indices
    n_correct += (predicted == targets).sum()
    batch_losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate `model` over `data_loader` without computing gradients.

  Each batch is a dict with `input_ids`, `attention_mask` and `label` tensors.

  Returns:
    (accuracy, mean_loss): correct predictions divided by `n_examples` as a
    double tensor, and the mean of the per-batch losses.
  """
  model = model.eval()
  batch_losses = []
  n_correct = 0
  with torch.no_grad():
    for batch in data_loader:
      # Move the three tensors the model and the loss need onto the target device
      ids, mask, targets = (
          batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
      )
      logits = model(input_ids=ids, attention_mask=mask)
      loss = loss_fn(logits, targets)

      # Predicted class = index of the largest logit in each row
      predicted = torch.max(logits, dim=1).indices
      n_correct += (predicted == targets).sum()
      batch_losses.append(loss.item())

  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

In [81]:
# Entrenamiento!!!

# Train for EPOCHS epochs, evaluating on the held-out split after each one
for epoch in range(EPOCHS):
  print('Epoch {} de {}'.format(epoch+1, EPOCHS), '------------------', sep='\n')

  # One full pass over the training data (updates weights and learning rate)
  train_acc, train_loss = train_model(
      model=model,
      data_loader=train_data_loader,
      loss_fn=loss_fn,
      optimizer=optimizer,
      device=device,
      scheduler=scheduler,
      n_examples=len(df_train),
  )
  # Measure loss/accuracy on the held-out data without touching the weights
  test_acc, test_loss = eval_model(
      model=model,
      data_loader=test_data_loader,
      loss_fn=loss_fn,
      device=device,
      n_examples=len(df_test),
  )

  print('Entrenamiento: Loss: {}, accuracy: {}'.format(train_loss, train_acc))
  print('Validación: Loss: {}, accuracy: {}'.format(test_loss, test_acc))
  print('')

Epoch 1 de 5
------------------




Entrenamiento: Loss: 0.36150042459368703, accuracy: 0.8612500000000001
Validación: Loss: 0.2819975768327713, accuracy: 0.8915000000000001

Epoch 2 de 5
------------------




Entrenamiento: Loss: 0.2454250323139131, accuracy: 0.916125
Validación: Loss: 0.2872271243706346, accuracy: 0.897

Epoch 3 de 5
------------------




Entrenamiento: Loss: 0.18740504660550505, accuracy: 0.943625
Validación: Loss: 0.3046992530450225, accuracy: 0.91

Epoch 4 de 5
------------------




Entrenamiento: Loss: 0.1496931534302421, accuracy: 0.9586250000000001
Validación: Loss: 0.346904919590801, accuracy: 0.9105

Epoch 5 de 5
------------------




Entrenamiento: Loss: 0.12800947381788866, accuracy: 0.968375
Validación: Loss: 0.36857578350417314, accuracy: 0.913



In [82]:
# Guardar el modelo
# Persist only the learned parameters (state_dict) to Google Drive; reloading
# them requires constructing a BERTSentimentModule first and then calling
# load_state_dict on it.
model_path = "/content/drive/MyDrive/Trabajos/Análisis de sentimientos/dataset/modeloBertEntrenado.pth"
torch.save(model.state_dict(), model_path)

In [91]:
def classifySentiment(review_text):
  """Print `review_text` (wrapped) followed by its predicted sentiment.

  Uses the notebook-level `tokenizer`, `model`, `device` and `MAX_LEN`.
  As a side effect the global `model` is switched to evaluation mode.

  Args:
    review_text: the text to classify.

  Returns:
    None; the result is printed.
  """
  encoding_review = tokenizer.encode_plus(
    review_text,
    max_length = MAX_LEN,
    truncation = True,
    add_special_tokens = True,
    return_token_type_ids = False,
    padding = True,
    return_attention_mask = True,
    return_tensors = 'pt'
  )

  input_ids = encoding_review['input_ids'].to(device)
  attention_mask = encoding_review['attention_mask'].to(device)

  # Make inference independent of whatever mode the model was left in:
  # eval() disables dropout so predictions are deterministic, and no_grad()
  # avoids building an autograd graph that is never used.
  model.eval()
  with torch.no_grad():
    output = model(input_ids, attention_mask)
  _, prediction = torch.max(output, dim=1)

  print("\n".join(wrap(review_text)))
  # Class index 1 is the positive class (label = sentiment == 'Positive')
  if prediction.item() == 1:
    print('Sentimiento predicho: Positivo')
  else:
    print('Sentimiento predicho: Negativo')




In [93]:
# Try the trained classifier on a hand-written negative sentence
review_text = "I hate his way of being."

classifySentiment(review_text)

I hate his way of being.
Sentimiento predicho: Negativo
