!pip install datasets evaluate

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from datasets import load_dataset
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# TODO: Import other classifiers you want to test
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [2]:
# TODO: Load a text classification dataset
# The Hub repository id is named once so the cell reads as "load this dataset".
IMDB_REPO_ID = "stanfordnlp/imdb"
ds = load_dataset(IMDB_REPO_ID)
ds  # bare last expression so the notebook shows the splits summary

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [3]:
# TODO: Convert the dataset splits into pandas DataFrames
# `Dataset.to_pandas()` is the datasets library's own conversion; it avoids
# handing the Dataset to `pd.DataFrame(...)`, which has to consume it through
# generic Python iteration. Columns stay the same: 'text' and 'label'.
df_train = ds['train'].to_pandas()
df_test = ds['test'].to_pandas()

In [5]:
# TODO: Assign the relevant dataset columns to feature and label variables
# was already done above

# TODO: Transform the text data into numerical features using TF-IDF
# The vectorizer is fitted on the training reviews only; the test reviews are
# then transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer()
train_texts, test_texts = df_train["text"], df_test["text"]
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Labels as NumPy arrays, taken from the 'label' column of each split.
y_train, y_test = (np.array(frame["label"]) for frame in (df_train, df_test))

# TODO: Convert the transformed data into a format suitable for classifiers

# TODO: Define a set of classifiers to evaluate
# One shared seed for every estimator that accepts `random_state`.
random_state = 42
classifiers = {
    # `probability=True` is deliberately not set: the evaluation loop in this
    # notebook only calls fit() and predict(), and enabling probability
    # estimates makes SVC.fit run an additional internal 5-fold
    # cross-validation without changing what predict() returns.
    'SVC' : SVC(kernel='linear'),
    'LogisticRegression' : LogisticRegression(random_state=random_state),
    'MultinomialNB' : MultinomialNB(),
    'RandomForestClassifier' : RandomForestClassifier(random_state=random_state)
}

# TODO: Loop through classifiers, train them, and evaluate their performance
class_names = ['Negative', 'Positive']
# Maps model name -> classification report as a nested dict, so the scores of
# all models can be compared after the loop has finished.
results = {}
for model_name, model in tqdm(classifiers.items(), desc="Training Models", unit="model"):
    try:
        print(f"--- Running: {model_name} ---")
        # TODO: Train the model using training data
        model.fit(X_train, y_train)
        # TODO: Generate predictions on test data
        y_hat = model.predict(X_test)
        # TODO: Generate and print a classification report (scroll all the way down for some sample code)
        print(f'Results for {model_name}')
        # Print the formatted text table (the default output); printing the
        # `output_dict=True` form would dump a raw nested dict instead.
        print(classification_report(y_test, y_hat, target_names=class_names))
        # Keep the machine-readable form of the same report for later use.
        results[model_name] = classification_report(
            y_test, y_hat, target_names=class_names, output_dict=True
        )
        # TODO, IF NEEDED: Clean up memory after each classifier runs

    except Exception as e:
        # Best effort on purpose: report the failure and move on to the next
        # classifier. A failed model simply has no entry in `results`.
        print(f"Error with {model_name}: {e}")

              precision    recall  f1-score   support

     Class 0       0.88      0.89      0.88     12500
     Class 1       0.89      0.87      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

              precision    recall  f1-score   support

     Class 0       0.88      0.88      0.88     12500
     Class 1       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

              precision    recall  f1-score   support

     Class 0       0.79      0.89      0.84     12500
     Class 1       0.87      0.77      0.82     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000

              preci

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Use the same namespaced Hub repository id as the TF-IDF baseline cell
# ("stanfordnlp/imdb") so both experiments name the dataset identically.
dataset = load_dataset("stanfordnlp/imdb")
# Tokenizer for the checkpoint that is fine-tuned further down in this cell.
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")

def tokenize_function(examples):
    """Tokenize a batch of reviews into fixed-length model inputs.

    Args:
        examples: Batch mapping whose "text" entry holds the raw review strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that batch when
        asked to truncate and pad every sequence to ``max_length=256``.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 256}
    return tokenizer(examples["text"], **encode_options)

# Tokenize every split in batches, then reshape the columns: the raw "text"
# column is dropped and "label" is renamed to "labels".
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")  # switches the output format in place

# Training uses the "train" split; evaluation uses the "test" split.
train_dataset, eval_dataset = (tokenized_datasets[split] for split in ("train", "test"))

# Same checkpoint as the tokenizer, with a 2-label classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-large-uncased",
    num_labels=2,
)

# TODO: PLAY WITH THE HYPERPARAMETERS
import inspect

# The per-epoch evaluation switch is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones; passing the
# wrong spelling raises a TypeError. Use whichever name the installed
# TrainingArguments actually accepts.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    warmup_steps=500,
    lr_scheduler_type="cosine",
    per_device_train_batch_size=8,
    # With batch size 8 and 2 accumulation steps, 16 examples per device
    # contribute to each optimizer update.
    gradient_accumulation_steps=2,
    num_train_epochs=4,
    weight_decay=0.1,
    logging_dir="./logs",
    save_total_limit=2,
    # NOTE(review): fp16 presumably needs a GPU runtime - confirm before
    # running this cell on CPU.
    fp16=True,
    report_to="none",
    # Evaluate once per epoch, matching save_strategy="epoch".
    **{_eval_strategy_arg: "epoch"},
)

def compute_metrics(eval_pred):
    """Turn raw model outputs into accuracy and weighted precision/recall/F1.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 use ``average="weighted"``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)
    metrics = {"accuracy": accuracy_score(gold, predicted)}
    weighted = precision_recall_fscore_support(gold, predicted, average="weighted")
    # `weighted` is (precision, recall, f1, support); zip drops the trailing support.
    metrics.update(zip(("precision", "recall", "f1"), weighted))
    return metrics

# Everything the Trainer needs, gathered in one place: the model, the run
# configuration, both dataset splits and the metric callback.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Fine-tune, then run one more evaluation pass on the evaluation split.
trainer.train()
# NOTE: this rebinds `results`, the same name the classical-classifier cell uses.
results = trainer.evaluate(eval_dataset=eval_dataset)
print("Evaluation Results:", results)

In [None]:
# TODO: Adapt to Sentence BERT
# `AutoModel` is not among the names imported from transformers in the
# fine-tuning cell, so it is imported here; without it this cell fails with a
# NameError.
from transformers import AutoModel, AutoTokenizer

# NOTE: these assignments rebind `tokenizer` and `model`. `tokenize_function`
# looks up `tokenizer` at call time, so calling it after this cell has run
# uses the MiniLM tokenizer rather than the BERT one.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

In [None]:
from sklearn.metrics import classification_report

# Run the fine-tuned model over the evaluation split and take the arg-max
# class for every example.
predictions = trainer.predict(eval_dataset)
logits = predictions.predictions
y_pred = np.argmax(logits, axis=1)

# The gold labels live in either a "labels" or a "label" column; "labels" is
# checked first.
for label_column in ("labels", "label"):
    if label_column in eval_dataset.column_names:
        y_true = eval_dataset[label_column]
        break
else:
    raise ValueError("Neither 'labels' nor 'label' found in dataset!")

# Adjust class names as needed
report = classification_report(y_true, y_pred, target_names=["Negative", "Positive"])
print("Classification Report:")
print(report)