<a href="https://colab.research.google.com/github/MammadovN/Machine_Learning/blob/main/projects/04_natural_language_processing/sentiment-analysis/movie_sentiment_analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets

In [None]:
import os
import random
import numpy as np
import torch
from datasets import load_dataset, Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline
)


def set_seed(seed: int = 42):
    """Seed Python's, NumPy's and PyTorch's random number generators.

    Args:
        seed: Value handed to every generator.
    """
    # The CPU-side generators are always seeded.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    # Seed the generators of all visible CUDA devices as well.
    torch.cuda.manual_seed_all(seed)


In [None]:
def load_data():
    """Load the ``'imdb'`` dataset and return its train/test texts and labels.

    Returns:
        A 4-tuple ``(train_texts, train_labels, test_texts, test_labels)``
        taken from the ``'text'`` and ``'label'`` columns of the ``'train'``
        and ``'test'`` splits.
    """
    splits = load_dataset('imdb')
    train_split = splits['train']
    train_texts, train_labels = train_split['text'], train_split['label']
    test_split = splits['test']
    test_texts, test_labels = test_split['text'], test_split['label']
    return train_texts, train_labels, test_texts, test_labels

In [None]:
def baseline_model(train_texts, train_labels, test_texts, test_labels):
    """Train and score a TF-IDF + logistic-regression baseline.

    The vectorizer is fitted on the training texts only and then applied to
    the test texts. Accuracy and a classification report for the test
    predictions are printed; nothing is returned.

    Args:
        train_texts: Training documents.
        train_labels: Labels aligned with ``train_texts``.
        test_texts: Evaluation documents.
        test_labels: Labels aligned with ``test_texts``.
    """
    print("\n[Baseline] Training TF-IDF + LogisticRegression...")

    # Unigrams and bigrams, capped at 20k features, English stop words removed.
    tfidf = TfidfVectorizer(
        max_features=20000,
        ngram_range=(1, 2),
        stop_words='english',
    )
    train_features = tfidf.fit_transform(train_texts)
    test_features = tfidf.transform(test_texts)

    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(train_features, train_labels)
    predicted = classifier.predict(test_features)

    accuracy = accuracy_score(test_labels, predicted)
    print("Baseline Accuracy:", accuracy)
    report = classification_report(test_labels, predicted)
    print("Classification Report:\n", report)

In [None]:
def fine_tune_transformer(train_texts, train_labels, test_texts, test_labels):
    """Fine-tune DistilBERT as a two-class sequence classifier and save it.

    Tokenizes the texts, trains for two epochs with the Hugging Face
    ``Trainer``, prints the evaluation metrics, and writes the model and
    tokenizer to ``./sentiment_model``.

    Args:
        train_texts: Sequence of training documents.
        train_labels: Labels aligned with ``train_texts`` (the model is built
            with ``num_labels=2``).
        test_texts: Sequence of evaluation documents.
        test_labels: Labels aligned with ``test_texts``.
    """
    # Function-scope import: the package is already a dependency of this file.
    from transformers import DataCollatorWithPadding

    print("\n[Transformer] Preparing data and model...")
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    def encode(texts, labels):
        """Tokenize *texts* (unpadded) and bundle them with *labels*."""
        # The tokenizer is documented for str / list inputs, so column-like
        # sequences are materialised as plain lists first.
        # No padding here: padding the whole corpus to its longest example
        # wastes memory and compute; batches are padded by the collator.
        encodings = tokenizer(list(texts), truncation=True, max_length=512)
        return Dataset.from_dict({
            'input_ids': encodings['input_ids'],
            'attention_mask': encodings['attention_mask'],
            'labels': list(labels),
        })

    train_dataset = encode(train_texts, train_labels)
    test_dataset = encode(test_texts, test_labels)

    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased', num_labels=2
    )

    # NOTE(review): the test split doubles as the evaluation set used to pick
    # the best checkpoint, so the reported score is not a clean held-out
    # estimate — consider carving a validation split out of the training data.
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=2,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
        report_to="none"      # disables W&B and other logging integrations
    )

    def compute_metrics(eval_pred):
        """Return plain accuracy from the (logits, labels) evaluation pair."""
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=1)
        acc = (preds == labels).mean()
        return {'accuracy': acc}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        # Pads each batch to the length of its own longest sequence.
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics
    )

    print("\n[Transformer] Training...")
    trainer.train()
    eval_result = trainer.evaluate()
    print("Transformer Evaluation:\n", eval_result)

    os.makedirs('./sentiment_model', exist_ok=True)
    model.save_pretrained('./sentiment_model')
    tokenizer.save_pretrained('./sentiment_model')
    print("Model saved to ./sentiment_model")


In [None]:
def inference(text: str, model_dir: str = './sentiment_model'):
    """Classify *text* with a saved model and print the prediction.

    Args:
        text: Document to classify.
        model_dir: Directory from which the model and tokenizer are loaded.

    Returns:
        The pipeline output for *text* (the same object that is printed).
    """
    print(f"\n[Inference] \"{text}\"")
    sentiment_pipeline = pipeline(
        'sentiment-analysis',
        model=model_dir,
        tokenizer=model_dir
    )
    # Truncate over-long inputs to the tokenizer's maximum length rather than
    # letting them fail inside the model's forward pass.
    result = sentiment_pipeline(text, truncation=True)
    print(result)
    return result


In [None]:
if __name__ == "__main__":
    # End-to-end run: seed, load the data, train both models, then try the
    # saved transformer on one example sentence.
    set_seed()
    train_texts, train_labels, test_texts, test_labels = load_data()
    for stage in (baseline_model, fine_tune_transformer):
        stage(train_texts, train_labels, test_texts, test_labels)
    inference("This movie was fantastic! I loved it.")