In [1]:
from transformers import TextClassificationPipeline, DistilBertTokenizer, DistilBertForSequenceClassification,AutoTokenizer, RobertaTokenizer, RobertaForSequenceClassification, BertForSequenceClassification, BertTokenizer, AlbertForSequenceClassification, AlbertTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [47]:
# Load the cleaned "fake or real news" CSV that the evaluation cells work on.
df = pd.read_csv(filepath_or_buffer='datasets/fake_or_real_news_cleaned.csv')


In [26]:
# Load the cleaned WELFake CSV into its own frame, separate from `df`.
original_df = pd.read_csv(filepath_or_buffer='datasets/WELFake_Dataset_cleaned.csv')

In [56]:
# Preview the first ten rows of the WELFake frame (bare expression -> rich display).
original_df.head(10)

Unnamed: 0,text,label
0,unbelievable obama s attorney general says mos...,1
1,bobby jindal raised hindu uses story of christ...,0
2,satan russia unvelis an image of its terrifyin...,1
3,about time christian group sues amazon and spl...,1
4,dr ben carson targeted by the irs i never had ...,1
5,house intel chair on trump russia fake story n...,1
6,sports bar owner bans nfl games will show only...,1
7,latest pipeline leak underscores dangers of da...,1
8,gop senator just smacked down the most punchab...,1
9,may brexit offer would hurt cost eu citizens e...,0


In [40]:
# Print the column names, dtypes and non-null counts of the evaluation frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    6335 non-null   object
 1   label   6335 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 99.1+ KB


In [27]:
# Print the column names, dtypes and non-null counts of the WELFake frame.
original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71536 entries, 0 to 71535
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    71527 non-null  object
 1   label   71536 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ MB


In [18]:
# Local directories the three sequence classifiers are loaded from.
# Forward slashes are used on purpose: they resolve correctly on Windows as
# well as on Linux/macOS, whereas a backslash is only a path separator on
# Windows. This also matches the style of the dataset paths in this notebook.
roberta_path = 'models/fake-news-roberta'
bert_path = 'models/fake-news-bert'
albert_path = 'models/fake-news-albert'

In [19]:
# Load each two-label classifier together with the tokenizer it is paired with.

# RoBERTa: model and tokenizer both come from the local checkpoint directory.
roberta_model = RobertaForSequenceClassification.from_pretrained(roberta_path, num_labels=2)
roberta_tokenizer = RobertaTokenizer.from_pretrained(roberta_path)

# BERT: model from the local checkpoint, tokenizer from the hub base checkpoint.
# NOTE(review): unlike RoBERTa, the tokenizer is not loaded from `bert_path`;
# confirm 'bert-base-uncased' is the vocabulary the checkpoint was trained with.
bert_model = BertForSequenceClassification.from_pretrained(bert_path, num_labels=2)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# ALBERT: model from the local checkpoint, tokenizer from the hub base checkpoint.
# NOTE(review): same question as for BERT; confirm 'albert-base-v2' matches.
albert_model = AlbertForSequenceClassification.from_pretrained(albert_path, num_labels=2)
albert_tokenizer = AutoTokenizer.from_pretrained('albert-base-v2')

In [48]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [49]:
# Extract the held-out inputs and their gold labels as plain Python lists.
texts, true_labels = (test_df[column].tolist() for column in ('text', 'label'))

In [50]:
# Keep only the first 1500 characters of every text.
texts = list(map(lambda article: article[:1500], texts))

In [51]:
# Predict the test texts with the RoBERTa classifier, a few texts per batch.
batch_size = 2
predicted_labels = []

# Idempotent safeguard: evaluation must run with dropout disabled, even if the
# model was switched to training mode earlier in this kernel session.
roberta_model.eval()

# Inference only: no_grad() keeps autograd from recording a graph for each
# forward pass, which would otherwise hold activations in memory needlessly.
with torch.no_grad():
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        # Pad to the longest text in the batch and truncate over-long inputs so
        # the batch can be returned as rectangular PyTorch tensors.
        inputs = roberta_tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt')
        outputs = roberta_model(**inputs)

        logits = outputs.logits  # [batch_size, num_labels]
        batch_preds = torch.argmax(logits, dim=1)  # [batch_size]
        predicted_labels.extend(batch_preds.tolist())  # class ids, e.g. [0, 1]



In [52]:
# Bind the gold labels and the latest predictions to the names used for scoring.
y_true, y_pred = true_labels, predicted_labels

In [53]:
# Print per-class precision/recall/F1 and overall accuracy for the predictions.
print(classification_report(y_true=y_true, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.99      0.96      0.98       639
           1       0.96      0.99      0.98       628

    accuracy                           0.98      1267
   macro avg       0.98      0.98      0.98      1267
weighted avg       0.98      0.98      0.98      1267



In [54]:
# Predict the test texts with the BERT classifier, a few texts per batch.
batch_size = 2
predicted_labels = []

# Idempotent safeguard: evaluation must run with dropout disabled, even if the
# model was switched to training mode earlier in this kernel session.
bert_model.eval()

# Inference only: no_grad() keeps autograd from recording a graph for each
# forward pass, which would otherwise hold activations in memory needlessly.
with torch.no_grad():
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        # Pad to the longest text in the batch and truncate over-long inputs so
        # the batch can be returned as rectangular PyTorch tensors.
        inputs = bert_tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")
        outputs = bert_model(**inputs)

        logits = outputs.logits  # [batch_size, num_labels]
        batch_preds = torch.argmax(logits, dim=1)  # [batch_size]
        predicted_labels.extend(batch_preds.tolist())  # class ids, e.g. [0, 1]

In [55]:
# Score the BERT predictions against the gold labels and print the report.
y_true, y_pred = true_labels, predicted_labels
print(classification_report(y_true=y_true, y_pred=y_pred))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99       639
           1       0.99      0.99      0.99       628

    accuracy                           0.99      1267
   macro avg       0.99      0.99      0.99      1267
weighted avg       0.99      0.99      0.99      1267



In [10]:
# Predict the test texts with the ALBERT classifier, a few texts per batch.
batch_size = 2
predicted_labels = []

# Idempotent safeguard: evaluation must run with dropout disabled, even if the
# model was switched to training mode earlier in this kernel session.
albert_model.eval()

# Inference only: no_grad() keeps autograd from recording a graph for each
# forward pass, which would otherwise hold activations in memory needlessly.
with torch.no_grad():
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        # Pad to the longest text in the batch and truncate over-long inputs so
        # the batch can be returned as rectangular PyTorch tensors.
        inputs = albert_tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")
        outputs = albert_model(**inputs)

        logits = outputs.logits  # [batch_size, num_labels]
        batch_preds = torch.argmax(logits, dim=1)  # [batch_size]
        predicted_labels.extend(batch_preds.tolist())  # class ids, e.g. [0, 1]

In [11]:
# Score the ALBERT predictions against the gold labels and print the report.
y_true, y_pred = true_labels, predicted_labels
print(classification_report(y_true=y_true, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       639
           1       0.98      1.00      0.99       628

    accuracy                           0.99      1267
   macro avg       0.99      0.99      0.99      1267
weighted avg       0.99      0.99      0.99      1267

