In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.stats import kruskal, mannwhitneyu
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split
from skmultiflow.trees import HoeffdingTree
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
# Locations of the two source CSV files.
# NOTE: this is an absolute, machine-specific directory; change DATASET_DIR
# when running the notebook anywhere else.
DATASET_DIR = "C:/Users/Kartik Gounder/Desktop/Projects/Fake News Origin Detector/Fake News Detection/Dataset/archive (2)"
fake_news = f"{DATASET_DIR}/Fake.csv"
true_news = f"{DATASET_DIR}/True.csv"

# Read each frame from its matching file. The two paths were previously
# swapped, which made every label assigned to these frames later inverted.
true_df = pd.read_csv(true_news)
fake_df = pd.read_csv(fake_news)

In [4]:
# Tag every article with its class before merging: 1 = true news, 0 = fake news.
for frame, class_id in ((true_df, 1), (fake_df, 0)):
    frame["label"] = class_id

# Stack both sources into one frame, then shuffle all rows with a fixed seed
# and renumber the index so the two classes are interleaved reproducibly.
stacked_df = pd.concat([true_df, fake_df], ignore_index=True)
combined_df = stacked_df.sample(frac=1, random_state=43).reset_index(drop=True)


In [5]:
# Hold out 20% of the articles for evaluation; the seed keeps the split repeatable.
article_texts = combined_df['text']
article_labels = combined_df['label']
X_train, X_test, y_train, y_test = train_test_split(
    article_texts,
    article_labels,
    test_size=0.2,
    random_state=43,
)

# TF-IDF features capped at 5000 terms with English stop words removed.
# The vocabulary and IDF weights are learned from the training texts only and
# then applied unchanged to the test texts.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [6]:
# Load the tokenizer and the two-class sequence-classification model from the
# same pretrained checkpoint so the vocabulary and the encoder weights match.
roberta_checkpoint = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(roberta_checkpoint)
model = TFRobertaForSequenceClassification.from_pretrained(
    roberta_checkpoint,
    num_labels=2,
)




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

In [7]:
# Encode the raw article texts for RoBERTa. Both splits share the same
# settings: TensorFlow tensors, padding enabled, truncation to 512 tokens.
encode_options = dict(return_tensors='tf', padding=True, truncation=True, max_length=512)
X_train_tokens = tokenizer(X_train.tolist(), **encode_options)
X_test_tokens = tokenizer(X_test.tolist(), **encode_options)

In [None]:
# Fine-tune RoBERTa for binary classification. The model outputs raw logits,
# hence from_logits=True on the sparse cross-entropy loss.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Pass the attention mask together with the token ids. The inputs were padded,
# and without the mask the model treats every padding position as real text.
# NOTE: feed the model the same input keys (input_ids + attention_mask) at
# inference time so training and prediction see consistent inputs.
train_inputs = {
    'input_ids': X_train_tokens['input_ids'],
    'attention_mask': X_train_tokens['attention_mask'],
}
model.fit(train_inputs, y_train, epochs=3, batch_size=16)


Epoch 1/3


   6/2245 [..............................] - ETA: 104:40:42 - loss: 0.7092 - accuracy: 0.4896

In [None]:
# Score the held-out articles with RoBERTa. The attention mask is supplied
# together with the token ids so padding positions are ignored; with ids alone
# the model attends to the padding tokens as if they were text.
# NOTE: the model should be trained with the same input keys
# (input_ids + attention_mask) that are used here.
test_inputs = {
    'input_ids': X_test_tokens['input_ids'],
    'attention_mask': X_test_tokens['attention_mask'],
}
roberta_pred_logits = model.predict(test_inputs).logits
# The predicted class is the index of the largest logit in each row.
roberta_pred = np.argmax(roberta_pred_logits, axis=1)

# Linear baseline: logistic-regression loss optimised with SGD on the training
# TF-IDF matrix. fit() returns the estimator itself, so it can be bound directly.
sgd = SGDClassifier(loss='log_loss', random_state=42).fit(X_train_tfidf, y_train)

In [None]:
# Incrementally train the Hoeffding tree on the TRAINING split only.
# It was previously fed every row of combined_df, which includes the articles
# later used for evaluation, so its test metrics were inflated by leakage.
ht = HoeffdingTree()

# Reuse the TF-IDF matrix already computed for the training texts instead of
# re-vectorizing one article at a time. Only a small slice is densified per
# step so the full dense matrix never has to sit in memory at once.
HT_CHUNK_ROWS = 256
y_train_values = np.asarray(y_train, dtype=int)  # positional labels aligned with X_train_tfidf rows
for chunk_start in range(0, X_train_tfidf.shape[0], HT_CHUNK_ROWS):
    chunk_stop = chunk_start + HT_CHUNK_ROWS
    ht.partial_fit(
        X_train_tfidf[chunk_start:chunk_stop].toarray(),
        y_train_values[chunk_start:chunk_stop],
    )


In [None]:
# Class predictions of the two TF-IDF based models on the held-out split.
# The Hoeffding tree is given a dense array, so the sparse matrix is converted first.
X_test_dense = X_test_tfidf.toarray()
sgd_pred = sgd.predict(X_test_tfidf)
ht_pred = ht.predict(X_test_dense)

# One column of predictions per model, collected as a feature table
# (per the original note, meant as input for a Gradient Boosting Classifier).
per_model_predictions = {'RoBERTa': roberta_pred, 'SGD': sgd_pred, 'HT': ht_pred}
X_test_features = pd.DataFrame(per_model_predictions)


In [None]:
# For each model, build a list [accuracy, precision, recall] on the test split.
metric_functions = (accuracy_score, precision_score, recall_score)
roberta_metrics, sgd_metrics, ht_metrics = (
    [metric(y_test, predictions) for metric in metric_functions]
    for predictions in (roberta_pred, sgd_pred, ht_pred)
)


In [None]:
# Convert the per-model metric lists to numpy arrays for the statistical tests.
# Requires `mannwhitneyu` and `kruskal` from scipy.stats to be imported.
roberta_metrics = np.array(roberta_metrics)
sgd_metrics = np.array(sgd_metrics)
ht_metrics = np.array(ht_metrics)

# NOTE(review): each sample below holds only three numbers (accuracy,
# precision, recall) that measure different things, so these tests have very
# little power and should be interpreted with caution.

# Mann-Whitney U Test (comparing RoBERTa and SGD).
# The alternative is stated explicitly because the default differs across
# SciPy versions.
u_stat, u_p_value = mannwhitneyu(roberta_metrics, sgd_metrics, alternative='two-sided')
print(f"Mann-Whitney U Test between RoBERTa and SGD: U-statistic={u_stat}, p-value={u_p_value}")

# Kruskal-Wallis Test (comparing RoBERTa, SGD, and HT)
kruskal_stat, kruskal_p_value = kruskal(roberta_metrics, sgd_metrics, ht_metrics)
print(f"Kruskal-Wallis Test for RoBERTa, SGD, HT: K-statistic={kruskal_stat}, p-value={kruskal_p_value}")