In [7]:
# Import necessary libraries
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, f1_score
import torch.nn.functional as F

# Set random seed for reproducibility
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x18f3bb1bdd0>

In [8]:
# Load your fake news data
df_fake = pd.read_csv('arson_emergency_with_fake_news_tag.csv')

# Assuming 'fake_news_tag' column exists and all labels are 1
df_fake = df_fake[['cleaned_text', 'fake_news_tag']].dropna()
df_fake.rename(columns={'cleaned_text': 'text'}, inplace=True)
#df_fake['label'] = 1  # Fake news label
df_fake.rename(columns={'fake_news_tag': 'label'}, inplace=True)
df_fake['label'] = df_fake['label'].astype(int)

# Load real news data (you need to have a dataset of real news)
# For demonstration, let's assume you have 'real_news.csv'
#df_real = pd.read_csv('real_news.csv')  # Replace with your real news dataset
#df_real = df_real[['cleaned_text']].dropna()
#df_real.rename(columns={'cleaned_text': 'text'}, inplace=True)
#df_real['label'] = 0  # Real news label

# Combine fake and real news data
#df_labeled = pd.concat([df_fake, df_real], ignore_index=True)

# Shuffle the dataset
df_labeled = df_fake.sample(frac=1, random_state=seed).reset_index(drop=True)

# Display class distribution
print("Class Distribution:")
print(df_labeled['label'].value_counts())


Class Distribution:
1    498
Name: label, dtype: int64


In [9]:
# Split into features and labels
texts = df_labeled['text'].tolist()
labels = df_labeled['label'].tolist()

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=seed, stratify=labels)


In [10]:
# Choose a pre-trained model
model_name = "bert-base-uncased"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize training data
train_encodings = tokenizer(
    X_train,
    truncation=True,
    padding=True,
    max_length=128
)

# Tokenize testing data
test_encodings = tokenizer(
    X_test,
    truncation=True,
    padding=True,
    max_length=128
)




In [11]:
class FakeNewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

# Create datasets
train_dataset = FakeNewsDataset(train_encodings, y_train)
test_dataset = FakeNewsDataset(test_encodings, y_test)


In [16]:
import torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import os

os.environ["CUDA_VISIBLE_DEVICES"] = ""

device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    no_cuda=True
)

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    return {
        'accuracy': (preds == labels).mean()
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0113,0.006619,1.0
2,0.0008,0.000458,1.0


TrainOutput(global_step=200, training_loss=0.13219502328895033, metrics={'train_runtime': 374.7049, 'train_samples_per_second': 2.124, 'train_steps_per_second': 0.534, 'total_flos': 31088215634880.0, 'train_loss': 0.13219502328895033, 'epoch': 2.0})

In [17]:
# Get predictions on the test set
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids

# Classification report
print("Model Performance on Test Set:")
print(classification_report(labels, preds))
print("Confusion Matrix:")
print(confusion_matrix(labels, preds))


Model Performance on Test Set:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       100

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100

Confusion Matrix:
[[100]]


In [18]:
# Get probabilities
probs = F.softmax(torch.tensor(predictions.predictions), dim=-1).numpy()
prob_fake = probs[:, 1]

# Compute precision, recall, and thresholds
precision, recall, thresholds = precision_recall_curve(labels, prob_fake)
f1_scores = 2 * (precision * recall) / (precision + recall)
best_threshold = thresholds[np.argmax(f1_scores)]
best_f1 = np.max(f1_scores)
print(f"Best Threshold: {best_threshold:.4f}, Best F1 Score: {best_f1:.4f}")

# Adjust predictions based on the best threshold
adjusted_preds = (prob_fake >= best_threshold).astype(int)

# Evaluate adjusted predictions
print("\nAdjusted Model Performance:")
print(classification_report(labels, adjusted_preds))
print("Confusion Matrix:")
print(confusion_matrix(labels, adjusted_preds))


Best Threshold: 0.9994, Best F1 Score: 1.0000

Adjusted Model Performance:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       100

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100

Confusion Matrix:
[[100]]


In [19]:
# Load the full dataset
df_full = pd.read_csv('../../data/processed/tweets_with_sentiment_vader.csv')

# Ensure necessary columns are present
df_full = df_full[['cleaned_text']].dropna()
df_full.rename(columns={'cleaned_text': 'text'}, inplace=True)

# Prepare data for prediction
full_texts = df_full['text'].tolist()

# Determine the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the device
model.to(device)

# Batch prediction
batch_size = 32
all_preds = []
all_probs = []

num_batches = int(np.ceil(len(full_texts) / batch_size))

for i in range(num_batches):
    batch_texts = full_texts[i*batch_size : (i+1)*batch_size]
    inputs = tokenizer(
        batch_texts,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=128
    )
    # Move input tensors to the device
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = F.softmax(logits, dim=1)
        prob_fake = probabilities[:, 1].cpu().numpy()
        adjusted_preds = (prob_fake >= best_threshold).astype(int)
        all_preds.extend(adjusted_preds)
        all_probs.extend(prob_fake)

# Add predictions to the dataframe
df_full['fake_news_pred'] = all_preds
df_full['fake_news_prob'] = all_probs

# Save the results
df_full.to_csv('tweets_with_fake_news_predictions.csv', index=False)
print("Predictions saved to 'tweets_with_fake_news_predictions.csv'")


Predictions saved to 'tweets_with_fake_news_predictions.csv'


In [20]:
# Load the predictions
df_full = pd.read_csv('tweets_with_fake_news_predictions.csv')

# Calculate fake news statistics
fake_news_count = df_full['fake_news_pred'].sum()
fake_news_percentage = (fake_news_count / len(df_full)) * 100

fake_news_stats = {
    "Total Entries": len(df_full),
    "Fake News Count": fake_news_count,
    "Fake News Percentage": fake_news_percentage
}

print(pd.DataFrame([fake_news_stats]))


   Total Entries  Fake News Count  Fake News Percentage
0         158902           105510             66.399416
