### distilbert-base-uncased

In [1]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
from pathlib import Path

# Directory holding the CSV splits. Defaults to the original local path but can
# be redirected with the AMAZON_REVIEWS_DIR environment variable so the notebook
# runs on other machines without editing this cell.
DATA_DIR = Path(os.environ.get(
    "AMAZON_REVIEWS_DIR",
    r"C:\Users\filip\Desktop\Mestrado\2semestre\TMCD\Trabalho\Dataset",
))

# Text sentiment -> integer class id.
label_map = {'positive': 1, 'negative': 0}


def _load_split(csv_name):
    """Read one CSV split and return it with a numeric `labels` column.

    The `sentiment` column is renamed to `labels`. If the column is not already
    numeric it is encoded through `label_map`; each split is checked on its
    own, so a text train split and a numeric test split are both handled.

    Raises:
        ValueError: if a non-numeric label column contains a value that is
            missing from `label_map` (it would otherwise become NaN silently).
    """
    frame = pd.read_csv(DATA_DIR / csv_name).rename(columns={'sentiment': 'labels'})
    if not pd.api.types.is_numeric_dtype(frame['labels']):
        encoded = frame['labels'].map(label_map)
        unknown = frame.loc[encoded.isna(), 'labels'].unique().tolist()
        if unknown:
            raise ValueError(f"{csv_name}: unexpected sentiment values {unknown}")
        frame['labels'] = encoded.astype('int64')
    return frame


ds_train = _load_split("amazon_reviews_train.csv")
ds_test = _load_split("amazon_reviews_test.csv")

train_dataset = Dataset.from_pandas(ds_train)
test_dataset = Dataset.from_pandas(ds_test)

# Hold out 20% of the training data for validation; fixed seed keeps the split stable.
train_valid = train_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_valid['train']
valid_dataset = train_valid['test']

In [3]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize the `review` field of a batch.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    Returns whatever the tokenizer returns for the batch.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(examples["review"], **encode_options)


# Apply the tokenizer to each split in batched mode (train, validation, test order).
tokenized_train, tokenized_valid, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, valid_dataset, test_dataset)
)

Map: 100%|██████████| 39121/39121 [00:03<00:00, 11368.18 examples/s]
Map: 100%|██████████| 9781/9781 [00:00<00:00, 12468.28 examples/s]
Map: 100%|██████████| 2417/2417 [00:00<00:00, 13417.20 examples/s]


In [5]:
from transformers import DistilBertConfig, set_seed

# The classification head (`pre_classifier` / `classifier`) is not in the
# pretrained checkpoint and is initialised randomly, so seed the RNGs before
# building the model to make runs reproducible.
set_seed(42)

config = DistilBertConfig.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,   # binary sentiment: negative / positive
    dropout=0.3,    # Set the dropout probability here
)

model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    config=config,
)
# Gradient checkpointing is switched on for training (intended to lower
# activation memory at the cost of extra compute).
model.gradient_checkpointing_enable()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        pred: object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        dict with keys 'accuracy', 'f1', 'precision', 'recall'. Precision,
        recall and F1 use binary averaging.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    # Tuple layout: (precision, recall, f1, support).
    prf = precision_recall_fscore_support(y_true, y_hat, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

In [7]:
training_args = TrainingArguments(
    # Output locations and logging cadence.
    output_dir="./results",
    logging_dir='./logs',
    logging_steps=500,
    # Evaluate and checkpoint once per epoch; keep the checkpoint with the
    # highest validation accuracy and reload it when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # Optimisation settings.
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    max_grad_norm=1.0,
    # Batching: 8 samples per device, gradients accumulated over 2 steps.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    dataloader_num_workers=4,
    # Mixed-precision training.
    fp16=True,
)

In [8]:
# Build the Trainer for fine-tuning.
# `processing_class` replaces the deprecated `tokenizer` argument of
# `Trainer.__init__` (the old name emits a FutureWarning and is slated for
# removal); requires a transformers release that supports `processing_class`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    # Stop when the tracked metric has not improved for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

  trainer = Trainer(


In [9]:
# Run fine-tuning. With the arguments configured above, evaluation and
# checkpointing happen once per epoch and early stopping (patience 2) may end
# the run before the configured number of epochs.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2395,0.23557,0.905736,0.93772,0.955008,0.921046
2,0.1846,0.237542,0.905838,0.93712,0.965124,0.910695
3,0.1606,0.278955,0.919742,0.948291,0.94153,0.955149


KeyboardInterrupt: 

In [None]:
# Export the best model to ./best_model.
# If training was interrupted, `load_best_model_at_end` never ran and the
# Trainer still holds the *last* weights, so saving it directly would not give
# the best model. Prefer the checkpoint the Trainer recorded as best and only
# fall back to the in-memory weights when no such checkpoint exists.
best_checkpoint = trainer.state.best_model_checkpoint
if best_checkpoint is not None:
    DistilBertForSequenceClassification.from_pretrained(best_checkpoint).save_pretrained("./best_model")
else:
    trainer.save_model("./best_model")
tokenizer.save_pretrained("./best_model")

# Reload from disk so later cells use exactly what was exported.
best_model = DistilBertForSequenceClassification.from_pretrained("./best_model")

In [None]:
# Evaluation-only Trainer around the reloaded best model (default arguments).
# `processing_class` replaces the deprecated `tokenizer` argument of
# `Trainer.__init__`; requires a transformers release that supports it.
best_trainer = Trainer(
    model=best_model,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Score the held-out test split and show the resulting metrics dict.
test_results = best_trainer.evaluate(tokenized_test)
print(test_results)

In [None]:
def predict_text(texts):
    """Classify one or more review texts with `best_model`.

    Args:
        texts: a single string or an iterable of strings. A lone string is
            treated as a one-element batch (previously it was zipped
            character by character against a single prediction).

    Returns:
        A list of `(text, label)` tuples, where label is "negative" or
        "positive". Empty input returns an empty list without calling the
        tokenizer or the model.
    """
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)
    if not texts:
        return []

    inputs = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,  # pad only to the longest text in this batch (capped at max_length)
        max_length=128,
    ).to(best_model.device)

    with torch.no_grad():
        logits = best_model(**inputs).logits
    # Softmax is monotonic, so the argmax of the logits is the predicted class.
    preds = logits.argmax(dim=-1).tolist()

    labels = ["negative", "positive"]
    return [(text, labels[pred]) for text, pred in zip(texts, preds)]