<a href="https://colab.research.google.com/github/JulijanaMilosavljevic/Sentiment-Analysis-HF/blob/main/Sentiment_analysis_hf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# 1. Installing the packages

In [None]:
!pip install -q torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1
!pip install -q transformers==4.44.2 datasets==3.0.2
import torch, torchvision, torchaudio, transformers, datasets

print("Torch:", torch.__version__)
print("Torchvision:", torchvision.__version__)
print("Torchaudio:", torchaudio.__version__)
print("Transformers:", transformers.__version__)
print("Datasets:", datasets.__version__)


# 2. Importing libraries

In [None]:
import shutil

import numpy as np
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer


# 3. Loading IMDb dataset

In [None]:
# Fetch the "imdb" dataset, then show its overall structure followed by the
# first training record as a quick sanity check.
dataset = load_dataset("imdb")
print(dataset, dataset["train"][0], sep="\n")


# 4. Tokenization

In [None]:
# Tokenizer loaded from the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize the "text" entries of a batch.

    Sequences are truncated when too long and padded with the "max_length"
    strategy. No explicit length is passed here, so the limit is presumably
    the tokenizer's own maximum (confirm against the tokenizer config).

    Args:
        batch: Mapping whose "text" entry holds the raw strings to encode.

    Returns:
        The result of calling the tokenizer on those strings.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Run `tokenize` over the dataset in batches, then rename the "label" column
# to "labels".
dataset = dataset.map(tokenize, batched=True).rename_column("label", "labels")

# Return the three listed columns in the "torch" format.
dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


# 5. Model

In [None]:
# Load the "distilbert-base-uncased" checkpoint as a sequence-classification
# model configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)


# 6. Metric functions

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and F1 for a set of evaluation predictions.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores; assumed
            to be 2-D with one row per example -- confirm against the caller)
            and ``label_ids`` (the reference labels).

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` as returned by
        ``accuracy_score`` and ``f1_score``.
    """
    references = eval_pred.label_ids
    # The predicted class is the index of the largest score along axis 1.
    predicted = np.argmax(eval_pred.predictions, axis=1)

    accuracy = accuracy_score(references, predicted)
    f1 = f1_score(references, predicted)
    return {"accuracy": accuracy, "f1": f1}


# 7. Training settings

In [None]:
training_args = TrainingArguments(
    # Output locations.
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Log every 50 steps; the empty report_to list requests no reporting
    # integrations.
    logging_strategy="steps",
    logging_steps=50,
    report_to=[],
)

# Use small slices of each split to keep the run short: shuffle with a fixed
# seed (42), then keep the first 2000 training rows and the first 1000 test
# rows.
train_subset = dataset["train"].shuffle(seed=42).select(range(2000))
eval_subset = dataset["test"].shuffle(seed=42).select(range(1000))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    compute_metrics=compute_metrics,
)


# 8. Train

In [None]:
# Run training with the model, arguments and datasets configured on `trainer`.
# Left as the cell's last expression so the value returned by train() is
# displayed as the cell output.
trainer.train()


# 9. Saving the model and tokenizer

In [None]:
# Write the trained model and the tokenizer into the same directory.
trainer.save_model("./sentiment-model")
tokenizer.save_pretrained("./sentiment-model")

# Bundle that directory into sentiment-model.zip using the standard library
# rather than the external `zip` binary: this works on any platform and
# rebuilds the archive from scratch, so entries from an earlier run cannot
# linger. base_dir keeps the "sentiment-model/" prefix inside the archive,
# matching the layout `zip -r sentiment-model.zip sentiment-model` produces.
# The trailing semicolon suppresses the returned path in the cell output.
shutil.make_archive("sentiment-model", "zip", root_dir=".", base_dir="sentiment-model");