<a href="https://colab.research.google.com/github/JCbarajas5/Text_Mining/blob/main/spanishBert_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip3 install pysentimiento
!spacy download es_core_news_sm
!nltk.download('stopwords')

In [None]:
import json
import re
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pysentimiento.preprocessing import preprocess_tweet

In [None]:
# Read the annotated tweets. The encoding is stated explicitly so that accented
# Spanish text decodes the same way whatever the runtime's locale reports.
with open('SENT-COVID.json', encoding='utf-8') as file:
    data = json.load(file)

# Notebook-wide pandas settings: silence the chained-assignment warning and
# show full tweet text instead of truncating wide columns.
pd.options.mode.chained_assignment = None
pd.set_option('display.max_colwidth', None)

df = pd.DataFrame(data)
print('Numero de tweets: ' + str(len(df)))

Numero de tweets: 4594


### Preprocesamiento

In [None]:
def clean_tweet(text):
  """Strip digits and loosen punctuation that is glued to the next token.

  Removes '~', '^' and ASCII digits, collapses every whitespace run into one
  space, then inserts a space between selected punctuation marks and the
  character next to them (ASCII letters, '#', '@', '¿', '(').

  Args:
    text: raw tweet text.

  Returns:
    The cleaned string.
  """
  # Applied strictly in this order: each rule sees the previous rule's output.
  substitutions = (
      (r'[~^0-9]', ''),                         # digits, plus '~' and '^'
      ("\\s+", ' '),                            # whitespace runs -> one space
      ('\n', ' '),                              # line breaks
      (r'([.])([A-Z#@¿])', r'\1 \2'),           # '.' followed by uppercase, '#', '@' or '¿'
      (r'([-?])([a-zA-Z#@¿])', r'\1 \2'),       # '-' or '?' followed by a letter, '#', '@' or '¿'
      (r'([a-zA-Z])([#@¿(])', r'\1 \2'),        # letter followed by '#', '@', '¿' or '('
      (r'([:!])([a-zA-Z#@¿])', r'\1 \2'),       # ':' or '!' followed by a letter, '#', '@' or '¿'
  )
  for regex, replacement in substitutions:
    text = re.sub(regex, replacement, text)
  return text


def preprocess(text):
  """Run pysentimiento's `preprocess_tweet` on `text` with this notebook's fixed options."""
  options = dict(
      char_replace=True,
      normalize_laughter=True,
      shorten=3,
      emoji_wrapper='',
      user_token='usuario',
      url_token='url',
  )
  return preprocess_tweet(text, **options)


def normalize(text):
  """Return `text` lower-cased, with punctuation dropped and vowel accents removed.

  Steps, in order:
    1. put a space on either side of a '.' that touches an ASCII letter;
    2. delete every listed punctuation/symbol character and map accented
       vowels to their plain form;
    3. split lower-to-upper case transitions ("camelCase" -> "camel Case");
    4. lower-case the result.

  Deleted characters are not replaced by a space, so runs of spaces can remain.
  """
  text = re.sub(r'([a-zA-Z])([.])', r'\1 \2', text)
  text = re.sub(r'([.])([a-zA-Z])', r'\1 \2', text)

  dropped = ("?","¿", ".", ";", ":", "!","¡",'"',"%","“","”","$","&","'","\\", "(",")",
             "*","+",",","/","<",">","=","^","•","...", "ç","π","ⓘ", "-", "_","#","|")
  # One translation table does both jobs: accented vowels map to plain vowels,
  # dropped symbols map to None (deleted). Only single characters can match
  # while scanning character by character, so longer entries are skipped.
  table = str.maketrans('áéíóúÁÉÍÓÚ', 'aeiouAEIOU')
  table.update({ord(symbol): None for symbol in dropped if len(symbol) == 1})
  text = text.translate(table)

  text = re.sub(r'([a-z])([A-Z-])', r'\1 \2', text)
  return text.lower()


def tokenize(text):
  """Split `text` on single spaces and keep only tokens of two or more characters."""
  return [piece for piece in text.split(' ') if len(piece) >= 2]

def labels(label):
  """Encode a sentiment tag as an integer: 'POSITIVO' -> 1, 'NEUTRO' -> 0, anything else -> -1."""
  return 1 if label == 'POSITIVO' else (0 if label == 'NEUTRO' else -1)

In [None]:
# Text pipeline: every stage reads the column written by the stage before it.
text_stages = (
    ('clean_tweet', 'Tweet', clean_tweet),
    ('preprocess_tweet', 'clean_tweet', preprocess),
    ('normalized_tweet', 'preprocess_tweet', normalize),
    ('tokenized_tweet', 'normalized_tweet', tokenize),
)
for target, source, stage in text_stages:
    df[target] = df[source].apply(stage)

# Integer-encoded sentiment label.
df['class'] = df['Label'].apply(labels)

#df[['class', 'normalized_tweet', 'tokenized_tweet']].head(20)

### Lematización

In [None]:
import spacy
from nltk.stem.snowball import SnowballStemmer

sp = spacy.load('es_core_news_sm')

def lemmatization(text):
    """Return `text` with every spaCy token replaced by its lemma, joined by single spaces."""
    lemmas = (token.lemma_ for token in sp(text))
    return ' '.join(lemmas)

#stemmer = SnowballStemmer('spanish')
#stemmed_spanish = [stemmer.stem(item) for item in spanish_words]

# Lemmatize once, then derive the tokenized view from the same Series.
lemmatized = df['normalized_tweet'].apply(lemmatization)
df['lem_tweet'] = lemmatized
df['lemtokenized_tweet'] = lemmatized.apply(tokenize)

# Train-test Split

In [None]:
from sklearn.model_selection import train_test_split

# Alternative feature views of the corpus, plus the target labels.
X, X2, X3, X4, y = (
    df[column]
    for column in (
        'normalized_tweet',     # normalized tweets
        'lem_tweet',            # lemmatized tweets
        'tokenized_tweet',      # normalized and tokenized
        'lemtokenized_tweet',   # lemmatized and tokenized
        'class',                # labels
    )
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=37
)

## BERT (BETO)

---



In [None]:
import locale

def getpreferredencoding(do_setlocale = True):
    """Replacement for `locale.getpreferredencoding` that always reports "UTF-8".

    `do_setlocale` is accepted so the signature matches the stdlib function; it is ignored.
    """
    return "UTF-8"
# Monkey-patch the stdlib so later callers in this kernel get UTF-8.
# NOTE(review): presumably a workaround for the runtime reporting a non-UTF-8 locale - confirm it is still needed.
locale.getpreferredencoding = getpreferredencoding

In [None]:
!CUDA_LAUNCH_BLOCKING=1

In [None]:
# Hugging Face transformers provides the BERT model and tokenizer classes used below.
!pip install transformers
# Uncased BETO checkpoint: weights archive, WordPiece vocabulary and model config.
!wget https://users.dcc.uchile.cl/~jperez/beto/uncased_2M/pytorch_weights.tar.gz 
!wget https://users.dcc.uchile.cl/~jperez/beto/uncased_2M/vocab.txt 
!wget https://users.dcc.uchile.cl/~jperez/beto/uncased_2M/config.json 
# NOTE(review): test_text.txt is not referenced anywhere else in the visible notebook - confirm it is needed.
!wget https://raw.githubusercontent.com/cardiffnlp/xlm-t/main/data/sentiment/all/test_text.txt
# Unpack the weights; the archive is expected to create the pytorch/ directory that the
# following lines move the config and vocabulary into, and that the tokenizer/model load from.
!tar -xzvf pytorch_weights.tar.gz
!mv config.json pytorch/.
!mv vocab.txt pytorch/.

In [None]:
import torch

from transformers import BertModel, BertTokenizer, BertForMaskedLM, AdamW, get_linear_schedule_with_warmup, AutoConfig, AutoTokenizer
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from textwrap import wrap

## Inicialización

In [None]:
RANDOM_SEED = 37   # shared seed for NumPy, PyTorch and the BERT train/test split
MAX_LEN = 480      # token length every encoded text is padded/truncated to
BATCH_SIZE = 16    # examples per DataLoader batch
NCLASSES = 3       # size of the classifier's output layer

# Seed both random number generators with the same value.
for seed_everything in (np.random.seed, torch.manual_seed):
    seed_everything(RANDOM_SEED)

# Use the first GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

## Tokenización

In [None]:
#PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
#tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

#MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
#config = AutoConfig.from_pretrained(MODEL)
#tokenizer = AutoTokenizer.from_pretrained("pytorch/", use_fast=True)

# Build the BERT tokenizer from the files in the local pytorch/ directory
# (the BETO vocabulary moved there by the download cell).
tokenizer = BertTokenizer.from_pretrained("pytorch/")

## Prueba con un solo texto de ejemplo

In [None]:
# Tokenize one already-normalized tweet to inspect the tokens and their vocabulary ids.
sample_txt = 'bueno pero es no le pidas demasiado mejor preguntenle de la fuerza moral de su patron'
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
for caption, value in (
    ('Frase: ', sample_txt),
    ('Tokens: ', tokens),
    ('Tokens numéricos: ', token_ids),
):
    print(caption, value)

Frase:  bueno pero es no le pidas demasiado mejor preguntenle de la fuerza moral de su patron
Tokens:  ['bueno', 'pero', 'es', 'no', 'le', 'pidas', 'demasiado', 'mejor', 'pregunte', '##n', '##le', 'de', 'la', 'fuerza', 'moral', 'de', 'su', 'patr', '##on']
Tokens numéricos:  [1491, 1195, 1028, 1054, 1165, 28903, 2668, 1544, 16216, 30959, 1080, 1009, 1032, 3193, 8003, 1009, 1069, 5102, 1022]


In [None]:
# Codificación:
encoding = tokenizer.encode_plus(
    sample_txt,
    max_length = 18,
    truncation = True,
    add_special_tokens = True,
    return_token_type_ids = False,
    pad_to_max_length = True,
    return_attention_mask = True,
    return_tensors = 'pt'
)



In [None]:
# Show the encoded sample three ways: tokens, token ids, attention mask.
sample_ids = encoding['input_ids'][0]
print(tokenizer.convert_ids_to_tokens(sample_ids))
print(sample_ids)
print(encoding['attention_mask'][0])

['[CLS]', 'bueno', 'pero', 'es', 'no', 'le', 'pidas', 'demasiado', 'mejor', 'pregunte', '##n', '##le', 'de', 'la', 'fuerza', 'moral', 'de', '[SEP]']
tensor([    4,  1491,  1195,  1028,  1054,  1165, 28903,  2668,  1544, 16216,
        30959,  1080,  1009,  1032,  3193,  8003,  1009,     5])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])


## CREACIÓN DATASET

In [None]:
class IMDBDataset(Dataset):
  """Torch dataset that tokenizes one text per item for BERT classification.

  Args:
    reviews: sequence of texts, indexable by integer position.
    labels: sequence of integer class ids aligned with `reviews`.
    tokenizer: object exposing a Hugging Face style `encode_plus` method.
    max_len: length every encoded text is padded/truncated to.
  """

  def __init__(self,reviews,labels,tokenizer,max_len):
    self.reviews = reviews
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.reviews)

  def __getitem__(self, item):
    """Return the raw text, its 1-D `input_ids`/`attention_mask` tensors and its label tensor."""
    review = str(self.reviews[item])
    label = self.labels[item]
    # Use the tokenizer handed to the constructor, not a notebook-level global,
    # so the dataset works with whichever tokenizer the caller supplied.
    encoding = self.tokenizer.encode_plus(
        review,
        max_length = self.max_len,
        truncation = True,
        add_special_tokens = True,
        return_token_type_ids = False,
        padding = 'max_length',  # replaces the deprecated pad_to_max_length=True
        return_attention_mask = True,
        return_tensors = 'pt'
    )

    return {
        'review': review,
        # encode_plus returns (1, max_len) tensors; flatten so the DataLoader
        # stacks them into (batch, max_len).
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'label': torch.tensor(label, dtype=torch.long)
    }

## Data loader:

In [None]:
def data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a DataFrame with `review` and `label` columns in a DataLoader.

  Args:
    df: DataFrame exposing `review` (text) and `label` (class id) columns.
    tokenizer: tokenizer forwarded to `IMDBDataset`.
    max_len: padded/truncated token length of every example.
    batch_size: number of examples per batch.
    shuffle: reshuffle the examples on every pass (default keeps the given order).
    num_workers: worker processes used by the DataLoader.

  Returns:
    A `DataLoader` yielding dicts of batched `review`, `input_ids`,
    `attention_mask` and `label`.
  """
  # Honour the arguments instead of the notebook-level MAX_LEN / BATCH_SIZE,
  # which made the max_len and batch_size parameters dead.
  dataset = IMDBDataset(
      reviews = df.review.to_numpy(),
      labels = df.label.to_numpy(),
      tokenizer = tokenizer,
      max_len = max_len
  )

  return DataLoader(dataset, batch_size = batch_size, shuffle = shuffle, num_workers = num_workers)

In [None]:
# Frame consumed by the BERT loaders; .rename returns a new frame, so the
# original df is untouched and no column assignment happens on a slice.
df1 = df[['normalized_tweet', 'class']].rename(
    columns={'normalized_tweet': 'review', 'class': 'label'}
)

# nn.CrossEntropyLoss expects class indices in [0, NCLASSES). The `class` column
# holds -1/0/1, so shift it by one: -1 -> 0, 0 -> 1, 1 -> 2.
df1['label'] = df1['label'] + 1
assert df1['label'].between(0, NCLASSES - 1).all(), 'labels must lie in [0, NCLASSES)'

df_train, df_test = train_test_split(df1, test_size = 0.25, random_state=RANDOM_SEED)

train_data_loader = data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

## Modelo

In [None]:
class BERTSentimentClassifier(nn.Module):
  """BERT encoder followed by dropout and a linear layer producing one logit per class.

  The encoder is loaded from the local `pytorch/` directory.
  """

  def __init__(self, n_classes):
    super().__init__()
    self.bert = BertModel.from_pretrained('pytorch/')
    self.drop = nn.Dropout(p=0.3)
    hidden_size = self.bert.config.hidden_size
    self.linear = nn.Linear(hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the class logits for a batch of encoded texts."""
    bert_outputs = self.bert(
        input_ids = input_ids,
        attention_mask = attention_mask,
        return_dict=False
    )
    # With return_dict=False the encoder returns a tuple; only its second
    # element is used (presumably BERT's pooled [CLS] representation - confirm).
    pooled = bert_outputs[1]
    return self.linear(self.drop(pooled))

In [None]:
# Build the classifier and move its parameters to the selected device.
model = BERTSentimentClassifier(NCLASSES).to(device)

Some weights of the model checkpoint at pytorch/ were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## ENTRENAMIENTO

In [None]:
EPOCHS = 5

# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are set to the values the transformers optimizer used by
# default, so the only remaining difference is that torch's AdamW always applies
# bias correction (the old call passed correct_bias=False).
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# One scheduler step per batch: linear decay from the initial lr to 0, no warm-up.
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)



In [None]:
# Iteración entrenamiento
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  """Train `model` for one pass over `data_loader`.

  Each batch is a dict with 'input_ids', 'attention_mask' and 'label' tensors.
  Gradients are clipped to a norm of 1.0, and both the optimizer and the
  scheduler are stepped once per batch.

  Returns:
    (accuracy, mean_loss): correct predictions divided by `n_examples` as a
    double tensor, and the mean of the per-batch losses.
  """
  model = model.train()
  batch_losses = []
  correct_predictions = 0
  for batch in data_loader:
    targets = batch['label'].to(device)
    logits = model(
        input_ids = batch['input_ids'].to(device),
        attention_mask = batch['attention_mask'].to(device),
    )
    predicted = logits.max(dim=1).indices
    batch_loss = loss_fn(logits, targets)
    correct_predictions += (predicted == targets).sum()
    batch_losses.append(batch_loss.item())

    # Backward pass, clip, update weights and lr, then clear gradients for the next batch.
    batch_loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  return correct_predictions.double()/n_examples, np.mean(batch_losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate `model` over `data_loader` without tracking gradients.

  Each batch is a dict with 'input_ids', 'attention_mask' and 'label' tensors.

  Returns:
    (accuracy, mean_loss): correct predictions divided by `n_examples` as a
    double tensor, and the mean of the per-batch losses.
  """
  model = model.eval()
  batch_losses = []
  correct_predictions = 0
  with torch.no_grad():
    for batch in data_loader:
      targets = batch['label'].to(device)
      logits = model(
          input_ids = batch['input_ids'].to(device),
          attention_mask = batch['attention_mask'].to(device),
      )
      predicted = logits.max(dim=1).indices
      correct_predictions += (predicted == targets).sum()
      batch_losses.append(loss_fn(logits, targets).item())
  return correct_predictions.double()/n_examples, np.mean(batch_losses)

In [None]:
# One training pass followed by one evaluation pass on the held-out split per epoch.
for epoch in range(EPOCHS):
  print('Epoch {} de {}'.format(epoch+1, EPOCHS))
  print('------------------')
  train_acc, train_loss = train_model(
      model=model,
      data_loader=train_data_loader,
      loss_fn=loss_fn,
      optimizer=optimizer,
      device=device,
      scheduler=scheduler,
      n_examples=len(df_train),
  )
  test_acc, test_loss = eval_model(
      model=model,
      data_loader=test_data_loader,
      loss_fn=loss_fn,
      device=device,
      n_examples=len(df_test),
  )
  print('Entrenamiento: Loss: {}, accuracy: {}'.format(train_loss, train_acc))
  print('Validación: Loss: {}, accuracy: {}'.format(test_loss, test_acc))
  print('')

## Evaluación

In [None]:
def classifySentiment(review_text, class_names=('NEGATIVO', 'NEUTRO', 'POSITIVO')):
  """Print `review_text` followed by the sentiment class the model predicts for it.

  Args:
    review_text: text to classify. It is encoded as given; no cleaning or
      normalization is applied here.
    class_names: display name for each output index of the model.
      NOTE(review): the default assumes index 0 is the negative class, 1 the
      neutral class and 2 the positive class - confirm it matches the label
      encoding the model was trained with.
  """
  encoding_review = tokenizer.encode_plus(
      review_text,
      max_length = MAX_LEN,
      truncation = True,
      add_special_tokens = True,
      return_token_type_ids = False,
      padding = 'max_length',  # replaces the deprecated pad_to_max_length=True
      return_attention_mask = True,
      return_tensors = 'pt'
  )

  input_ids = encoding_review['input_ids'].to(device)
  attention_mask = encoding_review['attention_mask'].to(device)

  # Inference only: switch dropout off and skip gradient tracking.
  model.eval()
  with torch.no_grad():
    output = model(input_ids, attention_mask)
  prediction = torch.argmax(output, dim=1).item()

  print("\n".join(wrap(review_text)))
  # The model emits one score per class, so report the predicted class by name
  # rather than treating the index as a binary flag.
  print('Sentimiento predicho: {}'.format(class_names[prediction]))


In [None]:
# Smoke test: classify a one-word text ("mal" is Spanish for "bad").
review_text = "mal"

classifySentiment(review_text)



mal
Sentimiento predicho: * * * * *
