In [2]:
#!pip install transformers

In [3]:
#!pip install torch

In [4]:
import os
import numpy as np
import shutil
import sys
import tqdm.notebook as tq
from collections import defaultdict

import torch
import torch.nn as nn

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
import pandas as pd

In [6]:
# Location of the preprocessed dataset (raw CSV hosted on GitHub).
url = (
    'https://raw.githubusercontent.com/JonatanPolanco/Content_Classification/main/processed_data.csv'
)
# Download and parse the CSV into a DataFrame.
df_data = pd.read_csv(filepath_or_buffer=url)

In [7]:
# Preview the first five rows of the loaded frame.
df_data.head(n=5)

Unnamed: 0,nombre_contenido,skill_1_definition,Adaptabilidad,Administración de bases de datos,Adquisición del talento,Agilidad de negocios,Alimentación saludable,Analítica de personas,Analítica en marketing,Análisis de datos,...,Tendencias digitales,Toma de decisiones,Trabajo colaborativo,Transformación digital,UX Research,Venta consultiva,Visualización de datos,Vocabulary,Writing,e-Operations
0,"CEO excellence de Dewar, Keller & Malhotra",Habilidad para idear una metodología que busca...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Activa las 3 líneas de defensa de la cibersegu...,"Habilidad para emprender acciones preventivas,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Figma: FigJam,Habilidad para usar diferentes herramientas di...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Costos ABC y presupuestos. Herramientas para l...,Habilidad para gestionar y mejorar constanteme...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,La venta humana - José Pascual,Habilidad para asumir el rol de consultor o as...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [8]:
# Build the text fed to the model by joining the content name and the skill
# definition with ". ", then drop the two source columns.
# The guard keeps this cell idempotent: once it has run, the source columns are
# gone and the unguarded version raised KeyError on a second execution.
if "combined" not in df_data.columns:
    df_data["combined"] = df_data["nombre_contenido"] + ". " + df_data["skill_1_definition"]
    # `columns=` already names the axis, so the redundant `axis=1` is omitted.
    df_data.drop(columns=["skill_1_definition", "nombre_contenido"], inplace=True)

In [40]:
from sklearn.model_selection import train_test_split

# 70% of the rows go to training; the remaining 30% is held out.
df_train, df_holdout = train_test_split(
    df_data, test_size=0.30, shuffle=True, random_state=77
)
# The held-out rows are divided evenly into the test and validation sets.
df_test, df_valid = train_test_split(
    df_holdout, test_size=0.50, shuffle=True, random_state=88
)

In [41]:
# Report the (rows, columns) shape of every split.
print("Train: {}, Test: {}, Valid: {}".format(df_train.shape, df_test.shape, df_valid.shape))

Train: (7000, 154), Test: (1500, 154), Valid: (1500, 154)


In [42]:
# Hyperparameters
MAX_LEN = 256  # length every tokenized text is padded/truncated to
# The three data loaders use the same batch size.
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = TEST_BATCH_SIZE = 16
EPOCHS = 300  # number of passes made by the training loop
LEARNING_RATE = 1e-5

In [43]:
from transformers import BertTokenizer, BertModel

In [44]:
# Tokenizer belonging to the same 'bert-base-uncased' checkpoint the model loads.
# NOTE(review): the dataset text is Spanish while this looks like an English,
# lower-casing checkpoint — confirm this choice is intended.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)

In [45]:
# Smoke-test the tokenizer on a single sentence.
test_text = "We are testing BERT tokenizer."
# Encoding options: pad/truncate to 50 tokens and return PyTorch tensors.
encode_options = dict(
    add_special_tokens=True,
    max_length=50,
    truncation=True,
    padding="max_length",
    return_attention_mask=True,
    return_tensors="pt",
)
encodings = tokenizer.encode_plus(test_text, **encode_options)
# The result holds three entries: input_ids, token_type_ids and attention_mask
# (see: https://huggingface.co/transformers/glossary.html)
encodings

{'input_ids': tensor([[  101,  2057,  2024,  5604, 14324, 19204, 17629,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])}

In [15]:
# Inspect the merged text column of the training split.
df_train.loc[:, 'combined']

796     Introducción al Design Thinking. Habilidad par...
1047    ¿Cuál es mi NIVEL de DESARROLLADOR?. Habilidad...
4742    Conectar MySQL con Django. Habilidad para prog...
1065    Realiza demostraciones efectivas en retail. Ha...
1954    Criptomonedas más allá de los rumores. Habilid...
                              ...                        
9119    Arma un equipo excepcional. Habilidad para cla...
7832    5 tips para equilibrar vida profesional y pers...
9509    Evaluación económica y social de proyectos de ...
2283    Toma decisiones conscientes y acertadas. Habil...
8799    Matemáticas financieras y evaluación de proyec...
Name: combined, Length: 7000, dtype: object

In [46]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes the ``combined`` text column on access.

    Args:
        df: DataFrame holding a ``combined`` text column plus the label columns.
        tokenizer: object exposing ``encode_plus`` (a ``BertTokenizer`` in this notebook).
        max_len: length every encoded sequence is padded/truncated to.
        target_list: names of the label columns returned as targets.
    """

    def __init__(self, df, tokenizer, max_len, target_list):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Texts as a plain Python list, labels as a 2-D array (one row per text).
        self.title = df['combined'].tolist()
        self.targets = df[target_list].values

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.title)

    def __getitem__(self, index):
        """Return the encoded inputs, float targets and cleaned text of one row."""
        # Collapse every run of whitespace into a single space.
        title = " ".join(str(self.title[index]).split())
        encoded = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # The tokenizer is asked for 'pt' tensors; flatten each one to 1-D.
        item = {
            key: encoded[key].reshape(-1)
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        item['targets'] = torch.FloatTensor(self.targets[index])
        item['title'] = title
        return item

In [47]:
# All columns currently in df_data, in frame order; the 'combined' text column
# was appended last and is still included at this point.
target_list = df_data.columns.tolist()
target_list

['Adaptabilidad',
 'Administración de bases de datos',
 'Adquisición del talento',
 'Agilidad de negocios',
 'Alimentación saludable',
 'Analítica de personas',
 'Analítica en marketing',
 'Análisis de datos',
 'Análisis de escenarios',
 'Análisis de estados financieros',
 'Análisis económico',
 'Aplicaciones Google',
 'Aplicaciones Microsoft',
 'Aprendizaje continuo',
 'Arquitectura TI',
 'Asertividad',
 'Autocompasión',
 'Autoconocimiento',
 'Autogestión',
 'Automatización de procesos',
 'Autorregulación',
 'Balance de vida',
 'Big data',
 'Branding',
 'C Sharp (C#)',
 'Ciberseguridad',
 'Cierre de ventas',
 'Coaching de equipos',
 'Computadores, dispositivos electrónicos e internet',
 'Comunicación efectiva',
 'Contenido digital',
 'Continuidad del negocio',
 'Control de gestión',
 'Control de proyectos',
 'Control financiero',
 'Creatividad',
 'Cultura organizacional',
 'Customer Relationship Management (CRM)',
 'Cómputo en la nube',
 'Derecho digital',
 'Desarrollo backend',
 'Des

In [48]:
# Keep only the label columns by removing the 'combined' text column.
# Filtering by name instead of slicing off the last element keeps this cell
# idempotent: re-running `target_list[:-1]` silently dropped one more label each time.
target_list = [label for label in target_list if label != "combined"]

In [49]:
# One CustomDataset per split, all sharing the tokenizer, sequence length and label columns.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_df, tokenizer, MAX_LEN, target_list)
    for split_df in (df_train, df_valid, df_test)
)

In [52]:
# Sanity check: look at the first encoded sample of the training dataset.
train_dataset[0]

{'input_ids': tensor([  101, 17174,  8566, 14693,  2239,  2632,  2640,  3241,  1012,  5292,
         14454, 27893, 11498, 20446, 21335,  2099,  2474, 13675,  5243, 29068,
         27893,  1061,  4487,  5054,  2906, 14017, 14194,  3258,  2229, 10861,
          1010,  9706, 19341,  8883,  1037,  4895,  4031,  2080,  1051, 14262,
          7903,  3695,  1010,  8833,  7389,  2938,  2483, 12172,  2099,  5869,
         26785,  2229, 27893,  2229,  2139, 10514,  2149,  6692,  9488,  1010,
          1037,  2112,  4313,  3972,  4372,  6528, 22172, 11638,  2080,  7861,
         24952,  3597,  2139, 28517,  1061,  3449, 15053, 25101,  9365,  1061,
          3231,  8780,  2009,  6906, 29068,  2080,  4372,  4895, 25022, 20464,
          2080,  2139,  2033,  5558,  2527,  9530,  7629,  6692,  1012,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [53]:
# Data loaders
# Only the training loader shuffles; validation and test keep the dataset order.
DataLoader = torch.utils.data.DataLoader

train_data_loader = DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0
)
val_data_loader = DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0
)
test_data_loader = DataLoader(
    test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False, num_workers=0
)

In [54]:
class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear layer giving one logit per label.

    Args:
        num_labels: number of output logits. Defaults to 153, the size that was
            previously hard-coded.
        model_name: checkpoint name passed to ``BertModel.from_pretrained``.
        dropout: dropout probability applied to the pooled encoder output.
    """

    def __init__(self, num_labels=153, model_name='bert-base-uncased', dropout=0.2):
        super().__init__()
        # Attribute names are unchanged so previously saved state dicts still load.
        self.bert_model = BertModel.from_pretrained(model_name, return_dict=True)
        self.dropout = torch.nn.Dropout(dropout)
        # Take the encoder width from its config instead of hard-coding 768, so a
        # checkpoint with a different hidden size works without editing the class.
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return raw logits (no sigmoid applied) for the given encoded batch.

        Args:
            input_ids: token ids forwarded to the encoder.
            attn_mask: forwarded to the encoder as ``attention_mask``.
            token_type_ids: segment ids forwarded to the encoder.
        """
        encoded = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids,
        )
        pooled = self.dropout(encoded.pooler_output)
        return self.linear(pooled)

# Build the classifier with its default configuration.
model = BERTClass()

# # Freezing BERT layers: (tested, weaker convergence)
# for param in model.bert_model.parameters():
#     param.requires_grad = False

# Move the parameters to the selected device; as the last expression of the
# cell, the returned module is displayed.
model.to(device=device)

BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [55]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and 0/1 targets.

    The sigmoid is applied inside the loss, so ``outputs`` must be un-activated logits.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [56]:
# define the optimizer
# `transformers.AdamW` is deprecated and no longer importable from recent
# transformers releases, so the PyTorch implementation is used instead.
# eps and weight_decay are pinned to the transformers defaults (1e-6 and 0.0)
# because torch.optim.AdamW defaults to 1e-8 and 0.01.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,  # 1e-05 from the hyperparameter cell, instead of a duplicated literal
    eps=1e-6,
    weight_decay=0.0,
)



In [57]:
# Training of the model for one epoch
def train_model(training_loader, model, optimizer):
    """Run one training epoch over ``training_loader``.

    Args:
        training_loader: iterable of batches (dicts with 'input_ids',
            'attention_mask', 'token_type_ids' and 'targets'); must support ``len``.
        model: module called as ``model(ids, mask, token_type_ids)``.
        optimizer: optimizer stepping the model's parameters.

    Returns:
        Tuple ``(model, accuracy, mean_loss)`` where accuracy is the fraction of
        rows whose largest logit sits at the same position as the largest target
        value, and mean_loss is the average of the per-batch losses.
    """
    # training mode (activates dropout, batch norm updates)
    model.train()

    batch_losses = []
    n_correct = 0
    n_seen = 0

    # progress bar over the batches
    progress = tq.tqdm(
        enumerate(training_loader),
        total=len(training_loader),
        leave=True,
        colour='steelblue',
    )
    for _, batch in progress:
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)

        # forward pass: one row of logits per sample
        logits = model(ids, mask, token_type_ids)
        loss = loss_fn(logits, targets)
        batch_losses.append(loss.item())

        # training accuracy, compared on the argmax position of each row
        # NOTE(review): the targets look multi-label, so an argmax comparison only
        # checks one label per row — confirm this is the intended metric.
        predicted = torch.max(logits, dim=1).indices
        expected = torch.max(targets, dim=1).indices
        n_seen += expected.shape[0]
        n_correct += int((predicted == expected).sum())

        # backward pass with gradient-norm clipping, then the parameter update
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    # returning: trained model, model accuracy, mean loss
    return model, n_correct / n_seen, np.mean(batch_losses)

In [58]:
def eval_model(validation_loader, model, optimizer):
    """Evaluate ``model`` on ``validation_loader`` without updating its weights.

    Args:
        validation_loader: iterable of batches (dicts with 'input_ids',
            'attention_mask', 'token_type_ids' and 'targets').
        model: module called as ``model(ids, mask, token_type_ids)``.
        optimizer: accepted for signature symmetry with ``train_model``; not used.

    Returns:
        Tuple ``(accuracy, mean_loss)`` where accuracy is the fraction of rows whose
        largest logit sits at the same position as the largest target value.
    """
    # eval mode (turns off dropout, fixes batch norm)
    model.eval()

    batch_losses = []
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in validation_loader:
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(ids, mask, token_type_ids)
            batch_losses.append(loss_fn(logits, targets).item())

            # validation accuracy, compared on the argmax position of each row
            # NOTE(review): the targets look multi-label, so an argmax comparison
            # only checks one label per row — confirm this is the intended metric.
            predicted = torch.max(logits, dim=1).indices
            expected = torch.max(targets, dim=1).indices
            n_seen += expected.shape[0]
            n_correct += int((predicted == expected).sum())

    return n_correct / n_seen, np.mean(batch_losses)

# Entrenamiento del modelo

In [None]:
# Per-epoch metrics, keyed by 'train_acc', 'train_loss', 'val_acc', 'val_loss'.
history = defaultdict(list)
best_accuracy = 0

# `data_dir` is not defined in any visible cell, so the first checkpoint save
# raised NameError. Fall back to the working directory when it is missing, and
# create the output folder up front because torch.save does not create it.
try:
    data_dir
except NameError:
    data_dir = os.getcwd()
checkpoint_path = os.path.join(data_dir, "output", "best_model_state.bin")
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

for epoch in range(1, EPOCHS+1):
    print(f'Epoch {epoch}/{EPOCHS}')
    model, train_acc, train_loss = train_model(train_data_loader, model, optimizer)
    val_acc, val_loss = eval_model(val_data_loader, model, optimizer)

    print(f'train_loss={train_loss:.4f}, val_loss={val_loss:.4f} train_acc={train_acc:.4f}, val_acc={val_acc:.4f}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    # save the best model: keep the weights of the epoch with the highest validation accuracy
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), checkpoint_path)
        best_accuracy = val_acc

Epoch 1/300


  0%|          | 0/438 [00:00<?, ?it/s]