<a href="https://colab.research.google.com/github/masanmas/ETSINF/blob/master/Analisis_2Datos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd  # CSV dataset handling.
import numpy as np  # Array/matrix utilities.
import torch  # Neural-network library.
from torch import nn, optim
# NOTE: the class is spelled `Dataset`; `DataSet` does not exist in torch.utils.data.
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split  # Split the data into train and test sets.

# Colab's Drive helper lives in the `google.colab` package.
from google.colab import drive  # Load files from Google Drive.

In [None]:
# Location of the CSV inside the mounted Google Drive. Colab exposes the
# user's drive as "MyDrive" under the mount point, and the path is
# case-sensitive, so the folder name must be spelled exactly.
DRIVE_PATH = '/content/drive/MyDrive/DataSet_2Types.csv'

RANDOM_SEED = 42  # Shared seed for NumPy, PyTorch and the train/test split.
MAX_LEN = 200  # Maximum sequence length handed to the tokenizer.
BATCH_SIZE = 16  # Feed the data in mini-batches of 16 instead of all at once.

NCLASSES = 2  # Topics: ['TRAVEL'=0, 'STYLE & BEAUTY'=1]

PRETRAINED_BERT_MODEL = 'bert-base-cased'

In [None]:
# INIT
# Seed NumPy and PyTorch so runs are reproducible.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# `Module.to()` / `Tensor.to()` need a real torch device ("cuda:0" or "cpu");
# the strings "GPU" / "CPU" are not valid device identifiers.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# LOAD DATASET
# Make the Google Drive contents visible under /content/drive.
drive.mount('/content/drive')

# Read the CSV and keep only its first 10,000 rows.
df = pd.read_csv(DRIVE_PATH).head(10000)

In [None]:
# INIT BERT TOKENIZER
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_BERT_MODEL)

# Tokenize a sample sentence to inspect what the tokenizer produces.
sample_txt = "Places you had to visit when travelling to New York"
tokens = tokenizer.tokenize(sample_txt)
# The id lookup is a tokenizer method; `tokens` is a plain list of strings.
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print('Frase: ', sample_txt)
print('Tokens: ', tokens)
print('Token IDs: ', token_ids)

In [None]:
# BERT ENCODING
# Encode the sample sentence into fixed-length tensors (25 tokens).
encoder = tokenizer.encode_plus(
    sample_txt,
    max_length = 25,
    truncation = True,
    add_special_tokens = True,
    return_token_type_ids = False,
    # `padding` takes a strategy, not a length: pad up to `max_length`.
    padding = 'max_length',
    return_attention_mask = True,
    return_tensors = 'pt'
)

encoder.keys() #['input_ids', 'attention_mask']

print(tokenizer.convert_ids_to_tokens(encoder['input_ids'][0])) # Tokenized sentence
print(encoder['input_ids'][0]) #tensor([list(ids)])
print(encoder['attention_mask'][0]) #tensor([list([1|0])]) - Positions the model should attend to

In [None]:
# Create DATASET

class IMDBDataset(Dataset):
  """Map-style dataset that tokenizes one text sample per index.

  Args:
    reviews: Indexable collection of texts; each item is converted with str().
    labels: Indexable collection of class ids, aligned with `reviews`.
    tokenizer: Object exposing `encode_plus` (e.g. a BertTokenizer).
    max_len: Length every sample is truncated/padded to.
  """

  def __init__(self, reviews, labels, tokenizer, max_len):
    self.reviews = reviews
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of samples."""
    return len(self.reviews)

  def __getitem__(self, item):
    """Return the raw text, its encoded tensors and its label for index `item`."""
    review = str(self.reviews[item])
    label = self.labels[item]

    # Use the tokenizer given to this dataset, not a module-level global.
    encoding = self.tokenizer.encode_plus(
        review,
        max_length = self.max_len,
        truncation = True,
        add_special_tokens = True,
        return_token_type_ids = False,
        # Pad every sample to `max_len` so samples can be stacked into a batch.
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt'
    )

    return {
        'review': review,
        # The encoded tensors are flattened to 1-D per sample.
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'label': torch.tensor(label, dtype=torch.long)
    }

In [None]:
# DataLoader
def data_loader(df, tokenizer, max_len, batch_size):
  """Wrap a dataframe in an IMDBDataset and return a DataLoader over it.

  Args:
    df: DataFrame with `label2` and `label1` columns.
      NOTE(review): presumably `label2` holds the text and `label1` the
      class id -- confirm against the CSV schema.
    tokenizer: Tokenizer forwarded to the dataset.
    max_len: Sequence length forwarded to the dataset.
    batch_size: Number of samples per batch.

  Returns:
    A DataLoader using 4 worker processes.
  """
  dataset = IMDBDataset(
      # Keyword names must match IMDBDataset.__init__ (`reviews`, `labels`).
      reviews = df.label2.to_numpy(),
      labels = df.label1.to_numpy(),
      tokenizer = tokenizer,
      # Honor the arguments instead of silently using the module-level constants.
      max_len = max_len
  )

  return DataLoader(dataset, batch_size = batch_size, num_workers = 4)

In [None]:
# Split Data
# Hold out 20% of the rows for testing; the split is seeded for reproducibility.
df_train, df_test = train_test_split(df, random_state = RANDOM_SEED, test_size = 0.2)

# Build one loader per split, train first and test second, with identical settings.
train_data_loader, test_data_loader = (
    data_loader(split_df, tokenizer, MAX_LEN, BATCH_SIZE)
    for split_df in (df_train, df_test)
)

In [None]:
# THE MODEL

class BERTArticleClassificator(nn.Module):
  """Pretrained BERT encoder followed by dropout and a linear classification layer.

  Args:
    numClases: Number of output classes (size of the final linear layer).
  """

  def __init__(self, numClases):
    # Zero-argument super() avoids repeating (and misspelling) the class name.
    super().__init__()
    self.bert = BertModel.from_pretrained(PRETRAINED_BERT_MODEL)
    self.drop = nn.Dropout(p=0.3)
    self.linear = nn.Linear(self.bert.config.hidden_size, numClases)

  def forward(self, input_ids, attention_masks):
    """Return the linear-layer output for a batch.

    Args:
      input_ids: Token ids forwarded to BERT.
      attention_masks: Attention mask forwarded to BERT.
    """
    bert_outputs = self.bert(
        input_ids = input_ids,
        attention_mask = attention_masks
    )
    # Index 1 is the pooled output, both for the legacy tuple return and for
    # the ModelOutput object returned by newer transformers releases.
    cls_output = bert_outputs[1]

    drop_out = self.drop(cls_output)
    output = self.linear(drop_out)

    return output


# Alias so code that refers to the model as `BERTArticleClassification` keeps working.
BERTArticleClassification = BERTArticleClassificator

In [None]:
# Instantiate the classifier under the name it is declared with and move it
# to the selected device.
model = BERTArticleClassificator(NCLASSES)
model = model.to(device)

print(model)

In [None]:
# TESTS (scratch cell)

np.random