In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer

# Load Dataset
def load_data(dataset_path=r"C:\projects\lauzhack-2024\liar_dataset-master\train.tsv"):
    """Read a LIAR-style TSV file and return statements with integer labels.

    Args:
        dataset_path: Location of the tab-separated, header-less file. The
            default is the path this notebook was originally written against,
            so existing ``load_data()`` calls keep working.

    Returns:
        A ``(statements, labels)`` tuple of equal-length lists. Statements are
        ``str``; labels are ``int`` class ids taken from ``label_map`` below.

    Rows with a missing statement, or with a label that is not a key of
    ``label_map``, are dropped (with a warning). Without this, such rows
    would surface as NaN labels and turn the whole label list into floats.
    """
    import warnings

    columns = [
        'label', 'statement', 'subject', 'speaker', 'job', 'state', 'party',
        'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire',
        'context',
    ]
    # NOTE(review): 13 names are supplied; if the file carries an extra
    # leading ID column, pandas is expected to use it as the index - confirm
    # against the actual file.
    df = pd.read_csv(dataset_path, delimiter='\t', header=None, names=columns)

    # Map labels to integers
    label_map = {"true": 0, "false": 1, "barely-true": 2, "half-true": 3, "mostly-true": 4, "pants-fire": 5}
    df['label'] = df['label'].map(label_map)

    # .map() yields NaN for any label outside label_map; empty statement
    # fields are read as NaN as well. Neither can be used for training.
    usable = df['label'].notna() & df['statement'].notna()
    n_dropped = int((~usable).sum())
    if n_dropped:
        warnings.warn(
            f"load_data: dropped {n_dropped} row(s) with a missing statement or unrecognised label"
        )
    df = df[usable]

    # Cast explicitly so callers always receive plain str / int values.
    return df['statement'].astype(str).tolist(), df['label'].astype(int).tolist()

# Tokenize Inputs
def tokenize_data(tokenizer, texts, labels, max_length=128):
    """Tokenize ``texts`` and return ``(input_ids, attention_mask, labels)``.

    The tokenizer is invoked once on the whole list with padding and
    truncation enabled, ``max_length`` as the length cap, and
    ``return_tensors='pt'``. ``labels`` is handed back unchanged so the three
    values can be unpacked together by the caller.
    """
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'max_length': max_length,
        'return_tensors': 'pt',
    }
    encoding = tokenizer(texts, **tokenizer_options)

    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    return input_ids, attention_mask, labels


In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertForSequenceClassification, AdamW

# Create a PyTorch Dataset
class LIARDataset(Dataset):
    """Map-style dataset pairing pre-tokenized inputs with class labels.

    Indexing returns a dict with the keys 'input_ids', 'attention_mask' and
    'labels'; the label is wrapped in a ``torch.long`` tensor on access.
    """

    def __init__(self, input_ids, attention_masks, labels):
        # The three containers are stored as given; nothing is copied here.
        self.input_ids, self.attention_masks, self.labels = input_ids, attention_masks, labels

    def __len__(self):
        # The number of samples is taken from the label container.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {'input_ids': self.input_ids[idx]}
        sample['attention_mask'] = self.attention_masks[idx]
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Fine-tune DistilBERT
def train_distilbert_model(train_loader, val_loader, num_labels=6, epochs=3, lr=5e-5):
    """Fine-tune ``distilbert-base-uncased`` for sequence classification.

    Args:
        train_loader: Iterable of training batches. Each batch is a dict with
            'input_ids', 'attention_mask' and 'labels' tensors.
        val_loader: Batches in the same format, evaluated after every epoch
            to report validation loss and accuracy. Pass ``None`` (or an
            empty loader) to skip validation.
        num_labels: Number of output classes for the classification head.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate handed to the optimizer.

    Returns:
        The fine-tuned model, on the device used for training. If a
        validation pass ran, the model is left in eval mode.
    """
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)
    # NOTE(review): the AdamW exported by transformers is reported as
    # deprecated upstream in favour of torch.optim.AdamW; switching would
    # also require updating the import - confirm against the installed version.
    optimizer = AdamW(model.parameters(), lr=lr)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    def _forward(batch):
        """Move one batch to ``device`` and run the model; returns (outputs, labels)."""
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Passing labels makes the model compute the loss itself
        # (outputs.loss), so no separate criterion is needed.
        return model(input_ids, attention_mask=attention_mask, labels=labels), labels

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            outputs, _ = _forward(batch)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        # max(..., 1) keeps an empty train_loader from raising ZeroDivisionError.
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / max(len(train_loader), 1)}")

        if val_loader is None:
            continue

        # Validation pass: no gradient tracking, dropout disabled via eval().
        model.eval()
        val_loss, val_batches, n_correct, n_seen = 0.0, 0, 0, 0
        with torch.no_grad():
            for batch in val_loader:
                outputs, labels = _forward(batch)
                val_loss += outputs.loss.item()
                val_batches += 1
                n_correct += (outputs.logits.argmax(dim=-1) == labels).sum().item()
                n_seen += labels.size(0)

        if val_batches:
            print(
                f"Epoch {epoch + 1}/{epochs}, Val Loss: {val_loss / val_batches}, "
                f"Val Accuracy: {n_correct / n_seen}"
            )

    return model


In [None]:
def main():
    """Run the pipeline end to end: load, split, tokenize, train, save."""
    # Load the statements/labels and hold out 20% of them for validation.
    statements, targets = load_data()
    split = train_test_split(statements, targets, test_size=0.2, random_state=42)
    train_texts, val_texts, train_labels, val_labels = split

    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

    # tokenize_data returns (input_ids, attention_mask, labels), which is
    # exactly the positional order LIARDataset expects.
    train_dataset = LIARDataset(*tokenize_data(tokenizer, train_texts, train_labels))
    val_dataset = LIARDataset(*tokenize_data(tokenizer, val_texts, val_labels))

    # Only the training batches are shuffled.
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)

    # Train the model
    model = train_distilbert_model(train_loader, val_loader)

    # Save the model and tokenizer side by side in the same directory.
    output_dir = "distilbert-fake-news"
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

# Entry-point guard: call main() only when this code runs as the top-level
# program, not when it is imported as a module.
if __name__ == "__main__":
    main()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
