In [None]:
import torch
# Sanity check: report whether PyTorch can see a CUDA device and, if so, its name.
has_cuda = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if has_cuda else "NO GPU"
print("CUDA available:", has_cuda)
print("GPU:", gpu_name)


In [None]:
!pip install transformers datasets scikit-learn pandas torch


In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_scheduler
)
from torch.optim import AdamW # Corrected import path for AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report



In [None]:
# Colab-only: open the browser file picker and load the uploaded CSV.
from google.colab import files

uploaded = files.upload()

# Cancelling the picker leaves `uploaded` empty; fail with a clear message
# instead of an opaque IndexError.
if not uploaded:
    raise ValueError("No file was uploaded; upload the dataset CSV and re-run this cell.")

# `uploaded` is keyed by file name; if several files were selected, the first one is used.
csv_name = next(iter(uploaded))
df = pd.read_csv(csv_name)
df.head()


In [None]:
# Clean the raw frame:
#  - drop rows only when one of the two columns we actually use is missing
#    (a bare dropna() would also discard rows for NaNs in unrelated columns);
#  - force text to str so the tokenizer always receives strings;
#  - force labels to int so that the label tensor built later is an integer
#    tensor (a label column that contained NaN is read by pandas as float).
# .assign() builds a new frame, avoiding SettingWithCopyWarning.
df = df.dropna(subset=["text", "label"]).assign(
    text=lambda frame: frame["text"].astype(str),
    label=lambda frame: frame["label"].astype(int),
)

print("Dataset size:", len(df))
print(df["label"].value_counts())


In [None]:
# Subsample to keep training time manageable. DataFrame.sample raises ValueError
# when asked for more rows than exist (without replacement), so cap the request
# at the number of rows available.
SAMPLE_SIZE = 30000
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)


In [None]:
# Features and targets as plain Python lists.
X = df["text"].to_list()
y = df["label"].to_list()

# Stage 1: hold out 20% of the data, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Stage 2: cut the hold-out in half -> validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Report the size of each split.
for split_label, split_texts in (
    ("Train:", X_train),
    ("Validation:", X_val),
    ("Test:", X_test),
):
    print(split_label, len(split_texts))


In [None]:
# Upper bound on tokens per example; longer inputs are truncated when encoded.
MAX_LEN = 128
# Checkpoint name used for both the tokenizer and the classifier.
MODEL_NAME = "distilbert-base-uncased"

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


In [None]:
def tokenize(texts):
    """Encode `texts` with the module-level tokenizer.

    Sequences are padded, truncated to at most MAX_LEN tokens, and returned
    as PyTorch tensors.
    """
    encode_options = {
        "padding": True,
        "truncation": True,
        "max_length": MAX_LEN,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **encode_options)

# Encode the three splits in train / validation / test order.
train_enc, val_enc, test_enc = (
    tokenize(split_texts) for split_texts in (X_train, X_val, X_test)
)


In [None]:
def _as_tensor_dataset(encodings, targets):
    """Bundle token ids, attention mask and labels into one TensorDataset."""
    return TensorDataset(
        encodings["input_ids"],
        encodings["attention_mask"],
        torch.tensor(targets),
    )


train_dataset = _as_tensor_dataset(train_enc, y_train)
val_dataset = _as_tensor_dataset(val_enc, y_val)
test_dataset = _as_tensor_dataset(test_enc, y_test)


In [None]:
BATCH_SIZE = 32

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=BATCH_SIZE)
    for eval_dataset in (val_dataset, test_dataset)
)


In [None]:
# Seed PyTorch's global RNG before anything random happens, so that weight
# initialisation and the shuffled training order are repeatable on a fresh
# "Restart & Run All".
torch.manual_seed(42)

# Prefer the GPU when one is visible; otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained encoder with a 2-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)

model.to(device)


In [None]:
EPOCHS = 2

optimizer = AdamW(model.parameters(), lr=2e-5)

# The scheduler is stepped once per batch, so its horizon is batches * epochs.
total_steps = len(train_loader) * EPOCHS

# Linear schedule with no warm-up phase.
schedule_options = {
    "optimizer": optimizer,
    "num_warmup_steps": 0,
    "num_training_steps": total_steps,
}
scheduler = get_scheduler("linear", **schedule_options)


In [None]:
# Gradient scaler for mixed-precision training. Loss scaling only applies on
# CUDA, so enable it only when a GPU is present; when disabled, the scaler's
# scale()/step()/update() calls simply pass through, which keeps the CPU
# fallback path clean.
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())


In [None]:
# Fine-tune for EPOCHS epochs, reporting mean training loss and validation
# accuracy after each one.

# Mixed precision is only used on CUDA; on the CPU fallback run in full precision.
use_amp = device.type == "cuda"

for epoch in range(EPOCHS):
    # ---- Training pass ----
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        optimizer.zero_grad()

        with torch.cuda.amp.autocast(enabled=use_amp):
            # Passing labels makes the model compute the loss itself.
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss

        scaler.scale(loss).backward()

        # GradScaler skips optimizer.step() when it finds inf/NaN gradients and
        # then lowers its scale in update(). Only advance the LR schedule when
        # the optimizer really stepped, otherwise the schedule runs ahead of
        # the weights.
        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        if scaler.get_scale() >= scale_before:
            scheduler.step()

        total_loss += loss.item()

    # max(..., 1) guards against division by zero on an empty loader.
    avg_loss = total_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {avg_loss:.4f}")

    # ---- Validation pass ----
    model.eval()
    preds, labels_all = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest logit.
            preds.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
            labels_all.extend(labels.cpu().tolist())

    acc = accuracy_score(labels_all, preds)
    print(f"Validation Accuracy: {acc:.4f}\n")


In [None]:
# Final evaluation on the held-out test split.
model.eval()
preds, labels_all = [], []

with torch.no_grad():
    for ids, mask, targets in test_loader:
        logits = model(
            input_ids=ids.to(device),
            attention_mask=mask.to(device),
        ).logits
        # Predicted class = index of the largest logit.
        preds += logits.argmax(dim=1).cpu().tolist()
        labels_all += targets.cpu().tolist()

report = classification_report(labels_all, preds, target_names=["REAL", "FAKE"])
print(report)


In [None]:
# Write the fine-tuned model and its tokenizer to the same directory.
SAVE_DIR = "fake_news_model"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)


In [None]:
def predict_news(text):
    """Classify a piece of news text with the fine-tuned model.

    Args:
        text: The text to classify. It is tokenized with the module-level
            tokenizer and truncated to at most MAX_LEN tokens.

    Returns:
        "FAKE NEWS 🚨" when the highest-scoring class index is 1,
        otherwise "REAL NEWS ✅".
    """
    model.eval()
    enc = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=MAX_LEN
    ).to(device)  # put the encoded inputs on the same device as the model

    with torch.no_grad():
        outputs = model(**enc)
        pred = torch.argmax(outputs.logits, dim=1).item()

    # The labels previously contained mojibake (UTF-8 emoji decoded as cp1252);
    # the intended emoji are restored here.
    return "FAKE NEWS 🚨" if pred == 1 else "REAL NEWS ✅"

# Smoke test: classify a single made-up headline.
sample_headline = "Breaking news: Aliens have landed in New York City"
predict_news(sample_headline)


In [None]:
# Run the `nvidia-smi` shell command and show its output in the notebook.
!nvidia-smi
