# BERT + LightGBM


### Proposed Model

Used BERT embeddings for LightGBM classification.

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm

# Load Dataset
true_data = pd.read_csv('gossipcop_real.csv')
fake_data = pd.read_csv('gossipcop_fake.csv')

# Preprocess the text data
def preprocess_text(text):
    text = text.lower()
    text = text.replace('\n', ' ').replace('\r', '').replace('\t', ' ')
    text = ''.join([char for char in text if char.isalnum() or char in [' ', "'"]])
    return text

true_data['title'] = true_data['title'].apply(preprocess_text)
fake_data['title'] = fake_data['title'].apply(preprocess_text)

# Generate labels True/Fake under new Target Column in 'true_data' and 'fake_data'
true_data['Target'] = ['True'] * len(true_data)
fake_data['Target'] = ['Fake'] * len(fake_data)

# Merge 'true_data' and 'fake_data', by random mixing into a single df called 'data'
fake_news_data = pd.concat([true_data, fake_data]).sample(frac=1).reset_index(drop=True)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize input text and pad/truncate sequences
max_length = 128  # Maximum sequence length
tokenized_texts = [tokenizer.encode(text, add_special_tokens=True, max_length=max_length, truncation=True) for text in fake_news_data['title']]

# Pad sequences
input_ids = torch.nn.utils.rnn.pad_sequence([torch.tensor(tokenized_text) for tokenized_text in tokenized_texts], batch_first=True)

# Create attention masks
attention_masks = [[int(token_id > 0) for token_id in input_id] for input_id in input_ids]

# Prepare Labels
labels = [1 if label == 'Fake' else 0 for label in fake_news_data['Target']]

# Convert data to PyTorch tensors
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
labels = torch.tensor(labels)

# Split data into training and testing sets
train_inputs, test_inputs, train_masks, test_masks, train_labels, test_labels = train_test_split(input_ids, attention_masks, labels, test_size=0.2, random_state=42)

# Define BERT model for sequence classification with fewer layers
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,  
    output_attentions=False,
    output_hidden_states=False,
    num_hidden_layers=6  # Set the number of layers (default is 12)
)

# Fine-tuning BERT model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 3
total_steps = len(train_inputs) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Data loaders
batch_size = 32
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, masks, labels = tuple(t.to(device) for t in batch)
        model.zero_grad()
        outputs = model(input_ids, token_type_ids=None, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Clip gradients to avoid exploding gradients
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions, true_labels = [], []

    for batch in test_dataloader:
        input_ids, masks, labels = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model(input_ids, token_type_ids=None, attention_mask=masks)
            logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()

        predictions.append(logits)
        true_labels.append(label_ids)

    predictions = np.concatenate(predictions, axis=0)
    true_labels = np.concatenate(true_labels, axis=0)
    preds_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = true_labels.flatten()
    accuracy = accuracy_score(labels_flat, preds_flat)
    precision = precision_score(labels_flat, preds_flat)
    recall = recall_score(labels_flat, preds_flat)
    f1 = f1_score(labels_flat, preds_flat)
    print(f'Epoch {epoch + 1}/{epochs}')
    print(f'Training loss: {avg_train_loss}')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1-score: {f1}')
    print("\n")



  input_ids = torch.tensor(input_ids)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Training loss: 0.4095031751272696
Accuracy: 0.8525293586269196
Precision: 0.7860635696821516
Recall: 0.5735950044603033
F1-score: 0.6632284682826199


Epoch 2/3
Training loss: 0.3005922428443221
Accuracy: 0.8604336043360433
Precision: 0.7880870561282932
Recall: 0.6137377341659233
F1-score: 0.6900702106318957


Epoch 3/3
Training loss: 0.21826962339910358
Accuracy: 0.8556910569105691
Precision: 0.7510416666666667
Recall: 0.6431757359500446
F1-score: 0.6929360884190293




### Another Proposed Model

BERT and LightGBM works seperately of each other and then we combine their two predictions

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load Dataset
true_data = pd.read_csv('gossipcop_real.csv')
fake_data = pd.read_csv('gossipcop_fake.csv')

# Generate labels True/Fake under new Target Column in 'true_data' and 'fake_data'
true_data['Target'] = ['True']*len(true_data)
fake_data['Target'] = ['Fake']*len(fake_data)

# Merge 'true_data' and 'fake_data', by random mixing into a single df called 'data'
fake_news_data = pd.concat([true_data, fake_data]).sample(frac=1).reset_index(drop=True)

# Preprocess the text data using TF-IDF vectorization for LightGBM
vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer.fit_transform(fake_news_data['title']).toarray()
y_tfidf = fake_news_data['Target']

# Split the TF-IDF dataset into training and testing sets
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y_tfidf, test_size=0.2, random_state=42)

# Train the LightGBM Classifier
lgbm_classifier = LGBMClassifier()
lgbm_classifier.fit(X_train_tfidf, y_train_tfidf)

# Tokenize and encode the training data for BERT
X_train, X_test, y_train, y_test = train_test_split(fake_news_data['title'], fake_news_data['Target'], test_size=0.2, random_state=42)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
X_train_encoded = tokenizer(X_train.tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')
y_train_encoded = torch.tensor([1 if label == 'True' else 0 for label in y_train.tolist()])

# Fine-tune BERT on the training data
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
epochs = 3
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(**X_train_encoded, labels=y_train_encoded)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

# Make predictions using LightGBM
y_pred_lgbm = lgbm_classifier.predict(X_test_tfidf)

# Make predictions using BERT
model.eval()
with torch.no_grad():
    X_test_encoded = tokenizer(X_test.tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')
    outputs = model(**X_test_encoded)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1)
y_pred_bert = ['True' if pred == 1 else 'Fake' for pred in predictions.tolist()]

# Combine predictions
combined_predictions = []
for pred_lgbm, pred_bert in zip(y_pred_lgbm, y_pred_bert):
    if pred_lgbm == pred_bert:
        combined_predictions.append(pred_lgbm)
    else:
        combined_predictions.append(pred_lgbm)  # You may want to use a different strategy for combining predictions

# Evaluate the combined predictions
accuracy_combined = accuracy_score(y_test_tfidf, combined_predictions)
precision_combined = precision_score(y_test_tfidf, combined_predictions, average='weighted')
recall_combined = recall_score(y_test_tfidf, combined_predictions, average='weighted')
f1_combined = f1_score(y_test_tfidf, combined_predictions, average='weighted')

# Print evaluation metrics for the combined predictions
print("Combined Classifier:")
print("Accuracy:", accuracy_combined)
print("Precision:", precision_combined)
print("Recall:", recall_combined)
print("F1 Score:", f1_combined)
