In [15]:
import pandas as pd
import numpy as np
import re
import nltk
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score
from torch.cuda.amp import autocast, GradScaler

print("Downloading necessary NLTK data...")
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

print("Loading dataset...")
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
print(df_test.columns)
print(df.columns)

Downloading necessary NLTK data...
Loading dataset...


[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Index(['id', 'title', 'author', 'text'], dtype='object')
Index(['id', 'title', 'author', 'text', 'label'], dtype='object')


In [16]:
from torch.utils.data import Dataset
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class FakeNewsDataset(Dataset):
    def __init__(self, texts, labels=None):
        self.texts = texts
        self.labels = labels  # Labels can be None for test data

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = tokenizer(self.texts[idx], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
        item = {key: val.squeeze(0) for key, val in encoding.items()}

        if self.labels is not None:  # Include labels only if provided
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        
        return item


In [17]:
import torch
from transformers import BertForSequenceClassification

# Check if CUDA (GPU) is available, else use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained BERT model for binary classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)  # Move model to device (GPU/CPU)

# Now, you can safely call model.eval()
model.eval()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
df_test = pd.read_csv("test.csv").dropna()

X_test = df_test['text'].tolist()
test_dataset = FakeNewsDataset(X_test, [0] * len(X_test))  # Dummy labels
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

model.eval()
predictions = []

with torch.no_grad():
    for batch_idx, batch in enumerate(test_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask).logits
        pred_labels = torch.argmax(outputs, axis=1).cpu().numpy()
        predictions.extend(pred_labels)

        if batch_idx % 10 == 0:
            print(f"Processed {batch_idx}/{len(test_loader)} batches.")

print("Predictions completed.")
# Save results
df_test['predicted_label'] = predictions
df_test[['id', 'predicted_label']].to_csv("submission.csv", index=False)


Processed 0/572 batches.
Processed 10/572 batches.
