In [6]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

In [7]:

# Directory that holds the saved DistilBERT classifier and its tokenizer files
model_path = "./distilbert_model4"

# Restore the sequence-classification model, then the tokenizer saved alongside it
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizer.from_pretrained(model_path)

# Example: Classify a news article
def classify_news(text, tokenizer, model, max_length=512,
                  labels=("fake", "authentic"), verbose=True):
    """
    Classify news content as "authentic" or "fake".

    Parameters:
        text (str): The news content to classify.
        tokenizer: The DistilBERT tokenizer.
        model: The fine-tuned DistilBERT model.
        max_length (int): Maximum length of the input sequence; longer
            inputs are truncated by the tokenizer.
        labels (sequence of str): Label name for each class index, in
            class-index order. Must match the label encoding used when the
            model was trained.
        verbose (bool): When True (the default), print the raw predicted
            class index before returning.

    Returns:
        str: The entry of `labels` at the predicted class index
        ("authentic" or "fake" with the default labels).

    Raises:
        IndexError: If the predicted class index has no entry in `labels`.
    """
    # Preprocess the input text
    inputs = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt"  # Return PyTorch tensors
    )

    # The model may have been moved to another device (e.g. a GPU); feeding
    # it tensors that live elsewhere raises a device-mismatch error, so place
    # the encoded inputs wherever the model reports it lives.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Make prediction (inference only, so gradient tracking is disabled)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        # .item() requires exactly one prediction, i.e. a single input text
        predicted_class = torch.argmax(logits, dim=-1).item()

    if verbose:
        print("-----")
        print("This is the value: ")
        print(predicted_class)

    # Map predicted class index to label
    if predicted_class >= len(labels):
        raise IndexError(
            f"Predicted class index {predicted_class} has no entry in labels {list(labels)}"
        )
    return labels[predicted_class]



In [8]:
# Disabled example: classify a single hard-coded article (uncomment to run).
# if __name__ == "__main__":
#     news_content = """
# Secretary Robert F. Kennedy Jr. has instructed the US Centers for Disease Control and Prevention to begin a “scientific process” to look for treatments other than vaccines for measles.

# """
#     # Classify the news
#     result = classify_news(news_content, tokenizer, model)
#     print(f"The news is classified as: {result}")

In [9]:
import pandas as pd

# Make sure `classify_news`, `tokenizer`, and `model` are defined/imported earlier

if __name__ == "__main__":
    # Read the annotated articles to run through the classifier
    df = pd.read_csv(r"annotated_dataset4.csv")

    # Classify every row and print the prediction beside its annotation
    for index, row in df.iterrows():
        news_content = row['content']
        # Falls back to 'N/A' when the row has no 'annotation' entry
        expected_annotation = row.get('annotation', 'N/A')

        try:
            result = classify_news(news_content, tokenizer, model)

            # One report per row; the article text is cut to its first 200 characters
            print("\n".join([
                f"Row {index}",
                f"News Content:\n{news_content[:200]}...",
                f"Predicted: {result}",
                f"Annotation: {expected_annotation}",
            ]))
        except Exception as exc:
            # Best-effort batch run: report the failing row and keep going
            print(f"Error processing row {index}: {exc}")


-----
This is the value: 
1
Row 0
News Content:
Easily one of the top newsmakers during the last week of September 2024 is the University of Santo Tomas (UST) Singers, who won the Grand Prize at the 2024 Grieg International Choir Festival held in B...
Predicted: authentic
Annotation: 1
-----
This is the value: 
0
Row 1
News Content:
For Sagayan composer Nilo Alcala, a composer-singer writing for the voice truly understands how the vocal mechanism works and would know what would work and what would not.Alcala, a former member of T...
Predicted: fake
Annotation: 1
-----
This is the value: 
0
Row 2
News Content:
It’s a super October at SM Supermalls because it’s officially Super Kids Month! Dive into exciting activities designed to celebrate our young heroes—from engaging games to creative adventures, SM Supe...
Predicted: fake
Annotation: 1
-----
This is the value: 
1
Row 3
News Content:
SM Supermalls and the Estée Lauder Companies (ELC) stand united in their mission to educate, empower,