<a href="https://colab.research.google.com/github/HatemMoushir/smart-ai-assistant/blob/main/AG_News_IMDb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ✅ Notebooks: AG News & IMDb training with DistilBERT

# --------- Part 1: AG News Training ---------

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate

# Load the "ag_news" dataset through the `datasets` library.
ag_dataset = load_dataset("ag_news")

# Load the tokenizer that matches the "distilbert-base-uncased" checkpoint.
# The same `tokenizer` object is reused by `tokenize()` and by both Trainers below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(example):
    """Tokenize the ``"text"`` field of a (batched) dataset example.

    Args:
        example: Mapping with a ``"text"`` entry, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The encoding produced by the module-level ``tokenizer`` with
        truncation enabled.

    Padding is intentionally NOT applied here. Padding every example to the
    model's maximum length at map time makes each batch as long as the
    longest possible input, which wastes memory and compute on short texts.
    A ``Trainer`` that is given ``tokenizer=`` pads each batch to its own
    longest sequence at collation time instead.
    """
    return tokenizer(example["text"], truncation=True)

# Run the tokenizer over AG News, one batch of examples per call.
ag_encoded = ag_dataset.map(
    tokenize,
    batched=True,
)

# DistilBERT with a classification head sized for 4 output labels.
ag_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=4,
)

# Accuracy metric object; `compute_metrics()` calls it after each evaluation.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max predictions
        compared against ``labels``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted = np.argmax(scores, axis=-1)
    return accuracy.compute(predictions=predicted, references=gold)

# Training arguments
import inspect

# transformers renamed `evaluation_strategy` to `eval_strategy`; newer releases
# reject the old keyword while older ones only know the old one. Use whichever
# name the installed TrainingArguments actually accepts.
_ag_eval_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

ag_args = TrainingArguments(
    output_dir="ag_news_model",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_steps=100,
    # Evaluate and save on the same schedule: `load_best_model_at_end`
    # needs a checkpoint for every evaluation it compares.
    save_strategy="epoch",
    load_best_model_at_end=True,
    **{_ag_eval_kwarg: "epoch"},
)

ag_trainer = Trainer(
    model=ag_model,
    args=ag_args,
    # Train/evaluate on fixed-size subsets to keep the run short.
    train_dataset=ag_encoded["train"].select(range(10000)),
    eval_dataset=ag_encoded["test"].select(range(1000)),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train AG News model
ag_trainer.train()

# --------- Part 2: IMDb Training (Manual Load) ---------

# Download IMDb dataset
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz

import os
import pandas as pd
from datasets import Dataset

def load_imdb_data(path, limit=3000):
    """Read raw review files under *path* into a DataFrame.

    *path* must contain a ``pos`` and a ``neg`` sub-folder holding one UTF-8
    text file per review.

    Args:
        path: Directory containing the ``pos`` and ``neg`` folders.
        limit: Maximum number of files read per class (default 3000, for
            speed). Pass ``None`` to read every file.

    Returns:
        A ``pandas.DataFrame`` with columns ``text`` (file contents) and
        ``label`` (1 for ``pos``, 0 for ``neg``). Rows are grouped by class:
        all ``pos`` rows come first, then all ``neg`` rows, so shuffle before
        taking a head-slice of the result.

    Raises:
        FileNotFoundError: If ``path/pos`` or ``path/neg`` does not exist.
    """
    texts = []
    labels = []
    for label_type, label in (("pos", 1), ("neg", 0)):
        folder = os.path.join(path, label_type)
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # selected subset identical on every run and every filesystem.
        for name in sorted(os.listdir(folder))[:limit]:
            with open(os.path.join(folder, name), "r", encoding="utf-8") as f:
                texts.append(f.read())
            labels.append(label)
    return pd.DataFrame({"text": texts, "label": labels})

# Read the extracted review files for both splits into DataFrames.
train_df, test_df = (
    load_imdb_data(split_dir)
    for split_dir in ("/content/aclImdb/train", "/content/aclImdb/test")
)

# Wrap each DataFrame in a `datasets.Dataset`.
imdb_train, imdb_test = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

# Tokenize both splits with the same batched `tokenize` function used above.
imdb_encoded, imdb_eval = (
    split.map(tokenize, batched=True) for split in (imdb_train, imdb_test)
)

# DistilBERT with a classification head sized for 2 output labels.
imdb_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

# Training arguments
import inspect

# transformers renamed `evaluation_strategy` to `eval_strategy`; newer releases
# reject the old keyword while older ones only know the old one. Use whichever
# name the installed TrainingArguments actually accepts.
_imdb_eval_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

imdb_args = TrainingArguments(
    output_dir="imdb_model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    logging_steps=100,
    # Evaluate and save on the same schedule: `load_best_model_at_end`
    # needs a checkpoint for every evaluation it compares.
    save_strategy="epoch",
    load_best_model_at_end=True,
    **{_imdb_eval_kwarg: "epoch"},
)

imdb_trainer = Trainer(
    model=imdb_model,
    args=imdb_args,
    train_dataset=imdb_encoded,
    # load_imdb_data() appends every 'pos' review before any 'neg' review, so
    # the first 1000 rows are a single class. Shuffle (with a fixed seed for
    # reproducibility) before taking the subset so accuracy is meaningful.
    eval_dataset=imdb_eval.shuffle(seed=42).select(range(1000)),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train IMDb model
imdb_trainer.train()