In [1]:
import pandas as pd
from datasets import Dataset, load_metric
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, get_scheduler
import torch
from tqdm.auto import tqdm
import dill

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
df_train = pd.read_csv('./train.tsv', sep='\t', names=['text', 'Domain'])
df_train.head()

Unnamed: 0,text,Domain
0,the quality of being beneficial and generally ...,Philosophy and psychology
1,insure again by transferring to another insura...,"Business, economics, and finance"
2,foolish gibberish,Language and linguistics
3,the property of being a relatively small amoun...,Mathematics
4,an arrangement of a piece of music for perform...,Music


In [4]:
df_test = pd.read_csv('./test.tsv', sep='\t', names=['text', 'Domain'])
df_test.head()

Unnamed: 0,text,Domain
0,(physics) statistical law obeyed by a system o...,Physics and astronomy
1,(physics) the process in which incident radiat...,Physics and astronomy
2,(physics) the exponential return of a system t...,Physics and astronomy
3,(physics) a coefficient that expresses how muc...,Physics and astronomy
4,(physics) the point of minimum displacement in...,Physics and astronomy


In [5]:
label2id = {}

def encode_domain(x):
    if x not in label2id.keys():
        label2id[x]=len(label2id)
    return label2id[x]

df_train['labels'] = df_train['Domain'].apply(lambda x: encode_domain(x))
assert df_train['labels'].max() == 33, "The maximum encoded category should be 33" #Hay 34 clases en total
df_train = df_train.drop(columns='Domain')
df_train.head()

Unnamed: 0,text,labels
0,the quality of being beneficial and generally ...,0
1,insure again by transferring to another insura...,1
2,foolish gibberish,2
3,the property of being a relatively small amoun...,3
4,an arrangement of a piece of music for perform...,4


In [6]:
df_test['labels'] = df_test['Domain'].apply(lambda x: encode_domain(x))
assert df_test['labels'].max() == 33, "The maximum encoded category should be 33" #Hay 34 clases en total
df_test = df_test.drop(columns='Domain')
df_test.head()

Unnamed: 0,text,labels
0,(physics) statistical law obeyed by a system o...,14
1,(physics) the process in which incident radiat...,14
2,(physics) the exponential return of a system t...,14
3,(physics) a coefficient that expresses how muc...,14
4,(physics) the point of minimum displacement in...,14


In [7]:
id2label = {index: label for label, index in label2id.items()}

Vamos a crear la partición de validación y ya podemos crear los tres Datasets

In [8]:
train_size = 0.80
train=df_train.sample(frac=train_size,random_state=200).reset_index(drop=True)
valid=df_train.drop(train.index).reset_index(drop=True)
train_dataset = Dataset.from_pandas(train)
valid_dataset = Dataset.from_pandas(valid)
test_dataset = Dataset.from_pandas(df_test)

## Entrenamiento

In [9]:
def preprocess_function(examples):
    return tokenizer(examples["text"],truncation=True, max_length = 512)

In [10]:
def get_batches(train_dataset, valid_dataset, tokenizer):
    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_valid = valid_dataset.map(preprocess_function, batched=True)
    tokenized_train = tokenized_train.remove_columns(["text"])
    tokenized_valid = tokenized_valid.remove_columns(["text"])
    tokenized_train.set_format("torch")
    tokenized_valid.set_format("torch")
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_dataloader = DataLoader(tokenized_train, batch_size=8, shuffle=True, collate_fn=data_collator)
    valid_dataloader = DataLoader(tokenized_valid, batch_size=32, shuffle=False, collate_fn=data_collator)
    return train_dataloader, valid_dataloader

In [11]:

def train(model_ckpt, num_epochs, train_dataset, valid_dataset):
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    
    train_dataloader, valid_dataloader = get_batches(train_dataset, valid_dataset, tokenizer)
    
    model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=34, id2label=id2label, label2id=label2id)

    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(   
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )

    model.to(device)

    progress_bar = tqdm(range(num_training_steps))
    train_losses=[]
    valid_losses=[]

    for epoch in range(num_epochs):    
        train_loss=0.0
        model.train()
        for batch_num, batch in enumerate(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

            train_loss += loss.item()

        valid_loss = 0.0
        model.eval()
        with torch.no_grad():
            for val_batch in valid_dataloader:
                val_batch = {k: v.to(device) for k, v in val_batch.items()}
                val_outputs = model(**val_batch)
                valid_loss += val_outputs.loss.item()

        
        train_losses.append(train_loss / len(train_dataloader))
        valid_losses.append(valid_loss / len(valid_dataloader))
        print(f"Epoch {epoch + 1}-> Train Loss: {train_losses[epoch]} Validation Loss: {valid_losses[epoch]}")




In [12]:
train("distilbert-base-uncased", 8, train_dataset, valid_dataset)

Map:   0%|          | 0/9479 [00:00<?, ? examples/s]

NameError: name 'tokenizer' is not defined

In [None]:

# model.eval()
# for batch in test_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     with torch.no_grad():
#         outputs = model(**batch)

#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
    




  metric = load_metric("accuracy")


{'accuracy': 0.8694805194805195}