In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer

# Load your dataset
df = pd.read_csv("reddit_bookmark_data.csv")

# Ensure 'title' and 'label' columns exist
if 'title' not in df.columns or 'label' not in df.columns:
    raise ValueError("CSV must contain 'title' and 'label' columns.")

# Handle potential missing titles (though PRAW usually returns them)
df.dropna(subset=['title'], inplace=True)

# Encode labels to numerical IDs
label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['label'])

# Get the mapping from numerical ID back to original label
id_to_label = {id: label for id, label in enumerate(label_encoder.classes_)}
num_labels = len(label_encoder.classes_)

print(f"Original Labels: {label_encoder.classes_}")
print(f"Number of Labels: {num_labels}")

# Split data into training and validation sets
# A validation set is crucial for monitoring performance and preventing overfitting.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['title'].tolist(),
    df['encoded_label'].tolist(),
    test_size=0.2, # 20% for validation
    random_state=42,
    stratify=df['encoded_label'] # Stratify to ensure equal distribution of labels in splits
)

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")

Original Labels: ['cooking' 'fashion' 'fitness' 'programming']
Number of Labels: 4
Training samples: 316
Validation samples: 79


In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Choose your pre-trained model checkpoint
# 'distilbert-base-uncased' is recommended for a personal project due to size/speed
model_name = "albert/albert-base-v1"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load model for sequence classification
# We specify the number of labels and the mapping for better logging/understanding
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id={label: id for id, label in id_to_label.items()}
)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cpu


In [3]:
def tokenize_data(texts, labels):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=tokenizer.model_max_length, # Often 512 for BERT/DistilBERT
        return_tensors="pt" # Return PyTorch tensors
    )
    return {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': torch.tensor(labels)
    }

train_encodings = tokenize_data(train_texts, train_labels)
val_encodings = tokenize_data(val_texts, val_labels)

# Create PyTorch Datasets
class BookmarkDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = BookmarkDataset(train_encodings)
val_dataset = BookmarkDataset(val_encodings)

In [4]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Define compute_metrics function for evaluation during training
def compute_metrics(p):
    predictions = np.argmax(p.predictions, axis=1)
    return {
        'accuracy': accuracy_score(p.label_ids, predictions),
        'f1_weighted': f1_score(p.label_ids, predictions, average='weighted')
    }

# Define training arguments
# These parameters can be tuned.
training_args = TrainingArguments(
    output_dir="./results", # Directory for checkpoints and predictions
    num_train_epochs=5, # Number of epochs, start with 3-5
    per_device_train_batch_size=16, # Batch size for training
    per_device_eval_batch_size=64, # Batch size for evaluation
    warmup_steps=500, # Number of warmup steps for learning rate scheduler
    weight_decay=0.01, # Strength of weight decay
    logging_dir="./logs", # Directory for logs
    logging_steps=10, # Log training progress every N steps
    eval_strategy="epoch", # Evaluate every epoch
    save_strategy="epoch", # Save model checkpoint every epoch
    load_best_model_at_end=True, # Load the best model at the end of training
    metric_for_best_model="f1_weighted", # Metric to use for early stopping/best model selection
    greater_is_better=True,
    report_to="none", # You can set this to "wandb" or "tensorboard" for better tracking
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [5]:
print("Starting training...")
trainer.train()
print("Training complete!")

# Evaluate on the validation set after training
results = trainer.evaluate()
print("Validation Results:", results)

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,1.4533,1.481894,0.253165,0.102289
2,1.3327,1.305997,0.367089,0.273172
3,1.2274,1.150341,0.531646,0.470285
4,1.0585,1.011547,0.620253,0.608734
5,0.8816,0.882638,0.708861,0.705372


Training complete!


Validation Results: {'eval_loss': 0.8826382756233215, 'eval_accuracy': 0.7088607594936709, 'eval_f1_weighted': 0.7053716080543725, 'eval_runtime': 2.942, 'eval_samples_per_second': 26.853, 'eval_steps_per_second': 0.68, 'epoch': 5.0}


In [6]:
# Save the model and tokenizer
output_model_dir = "./fine_tuned_distilbert_bookmark_classifier3"
trainer.save_model(output_model_dir)
tokenizer.save_pretrained(output_model_dir)

print(f"Model and tokenizer saved to {output_model_dir}")

# Save the label encoder as well, so you can map predictions back to text labels
import joblib
joblib.dump(label_encoder, f"{output_model_dir}/label_encoder.pkl")
print(f"Label encoder saved to {output_model_dir}/label_encoder.pkl")

Model and tokenizer saved to ./fine_tuned_distilbert_bookmark_classifier3
Label encoder saved to ./fine_tuned_distilbert_bookmark_classifier3/label_encoder.pkl


In [7]:
from transformers import pipeline
import joblib

# Load the saved model and tokenizer
model_path = "./fine_tuned_distilbert_bookmark_classifier3"
loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_path)
loaded_model.to(device) # Ensure it's on the correct device

# Load the label encoder
loaded_label_encoder = joblib.load(f"{model_path}/label_encoder.pkl")

# Create a Hugging Face pipeline for easy inference
classifier = pipeline(
    "text-classification",
    model=loaded_model,
    tokenizer=loaded_tokenizer,
    device=0 if torch.cuda.is_available() else -1 # 0 for GPU, -1 for CPU
)

# Example predictions
new_texts = [
    "Delicious recipe for carbonara pasta",
    "Best tips for learning Python in 2024",
    "Stylish autumn outfits for women",
    "Beginner's guide to weightlifting at home",
    "Breaking news from around the world" # This might be misclassified if not in training
]

print("\n--- Making predictions on new text ---")
for text in new_texts:
    prediction = classifier(text)[0]
    predicted_label_id = prediction['label'] # This will be the numerical ID string, e.g., "LABEL_0"
    score = prediction['score']

    # Convert back to original label using the label encoder
    # Note: HuggingFace's `pipeline` often returns labels like "LABEL_0", "LABEL_1" if not
    # explicitly provided `id2label` during pipeline creation.
    # We can also get the ID from the `id2label` attribute of the model if correctly set.
    # Let's use the loaded_label_encoder for robustness.
    # Assuming 'LABEL_X' where X is the numerical ID.
    try:
        numerical_id = int(predicted_label_id.replace("LABEL_", ""))
        predicted_original_label = loaded_label_encoder.inverse_transform([numerical_id])[0]
    except ValueError:
        # Fallback if label is directly the string (e.g., 'cooking') if pipeline is smarter
        predicted_original_label = predicted_label_id


    print(f"Text: '{text}'")
    print(f"  Predicted Label: {predicted_original_label} (Confidence: {score:.4f})")

Device set to use cpu



--- Making predictions on new text ---
Text: 'Delicious recipe for carbonara pasta'
  Predicted Label: cooking (Confidence: 0.8512)
Text: 'Best tips for learning Python in 2024'
  Predicted Label: cooking (Confidence: 0.4817)
Text: 'Stylish autumn outfits for women'
  Predicted Label: cooking (Confidence: 0.5931)
Text: 'Beginner's guide to weightlifting at home'
  Predicted Label: fitness (Confidence: 0.4150)
Text: 'Breaking news from around the world'
  Predicted Label: fitness (Confidence: 0.3716)
