In [1]:
# read https://www.geeksforgeeks.org/how-to-use-hugging-face-pretrained-model/?ref=ml_lbp

import pandas as pd
import torch
import numpy as np

In [4]:
impressions_df = pd.read_csv("Final_Impressions.csv")
labels_df = pd.read_csv("Final_Impressions_labels.csv")

In [5]:
merged_df = pd.merge(impressions_df, labels_df, on= "impression_id")

In [24]:
target_labels = ['pe_acute', 'pe_subsegmentalonly', 'pe_positive']
filtered_df = merged_df[['impressions'] + target_labels]

filtered_df[target_labels] = filtered_df[target_labels].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[target_labels] = filtered_df[target_labels].astype(float)


In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    filtered_df['impressions'].tolist(),
    filtered_df[target_labels].values,
)

In [26]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
# choosing a model

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

train_encodings = tokenizer(X_train, padding=True, truncation=True, max_length=512, return_tensors="pt")
test_encodings = tokenizer(X_test, padding=True, truncation=True, max_length=512, return_tensors="pt")

# converting to a huggingface dataset
train_dataset = Dataset.from_dict({"text": X_train, "labels": list(y_train)})
test_dataset = Dataset.from_dict({"text": X_test, "labels": list(y_test)})

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1356 [00:00<?, ? examples/s]

Map:   0%|          | 0/453 [00:00<?, ? examples/s]

In [27]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(target_labels), problem_type="multi_label_classification")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# setting up traning args

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [29]:
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = torch.sigmoid(torch.tensor(predictions))  # Convert logits to probabilities
    predictions = (predictions > 0.5).int().numpy()  # Convert probabilities to binary values

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

# google colab died try again later lol

Epoch,Training Loss,Validation Loss


In [20]:
predictions = trainer.predict(test_dataset)
preds = torch.sigmoid(torch.tensor(predictions.predictions)) > 0.5

print("Classification Report:\n")
print(classification_report(y_test, preds.numpy(), target_names=target_labels))

model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

print("Model training complete and saved!")