#### FINE-TUNE DISTILBERT FOR SENTIMENT ANALYSIS

We’ll use HuggingFace’s Transformers library because:  
-  It’s standard in industry
-  Clean API  
-  Easy to extend  
-  Extremely powerful  

In [2]:
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT / "src") not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT / "src"))

In [3]:
# loading dataset
import pandas as pd
df = pd.read_csv(PROJECT_ROOT / "data" / "imdb_raw.csv")

In [4]:
from preprocess import clean_text
df["clean"] = df["review"].apply(clean_text)

In [5]:
# Split into train/val
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    df["clean"].values,
    df["label"].values,
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)


#### Load tokenizer

In [6]:
# we'll use distilbert-base-uncased (light, fast, high accuracy
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

#### Tokenize data

In [7]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )

In [8]:
# Convert data into HuggingFace Dataset:
from datasets import Dataset

train_ds = Dataset.from_dict({"text": X_train, "label": y_train})
val_ds = Dataset.from_dict({"text": X_val, "label": y_val})

train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

#### Load the model

In [9]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=str(PROJECT_ROOT / "models" / "distilbert_sentiment"),
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir=str(PROJECT_ROOT / "logs"),
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
)


In [13]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
)

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss


In [15]:
metrics = trainer.evaluate()
metrics

NameError: name 'trainer' is not defined

In [16]:
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)
    probs = outputs.logits.softmax(dim=1)
    pred = probs.argmax().item()
    confidence = probs.max().item()
    return pred, confidence

In [17]:
predict_sentiment("This movie was absolutely amazing!")
predict_sentiment("This was a boring waste of time.")

(1, 0.5164511203765869)