# BERT + LightGBM


### Proposed Model

A BERT model is fine-tuned on the news titles, and its [CLS] embeddings are then used as input features for a LightGBM classifier.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
import torch
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm

# Load Dataset: one CSV holds the real articles, the other the fake ones
real_csv_path = 'gossipcop_real.csv'
fake_csv_path = 'gossipcop_fake.csv'
true_data = pd.read_csv(real_csv_path)
fake_data = pd.read_csv(fake_csv_path)

# Preprocess the text data
def preprocess_text(text):
    """Normalise a title string for tokenisation.

    The text is lower-cased, newlines and tabs become single spaces,
    carriage returns are dropped, and every remaining character that is
    not alphanumeric, a space, or an apostrophe is removed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # One translate pass replaces the three chained single-character replaces.
    whitespace_map = str.maketrans({'\n': ' ', '\t': ' ', '\r': None})
    kept_punctuation = (' ', "'")

    flattened = text.lower().translate(whitespace_map)
    return ''.join(
        ch for ch in flattened
        if ch in kept_punctuation or ch.isalnum()
    )

# Clean the title column of both frames with the same normalisation
true_data['title'] = true_data['title'].apply(preprocess_text)
fake_data['title'] = fake_data['title'].apply(preprocess_text)

# Generate labels True/Fake under new Target Column in 'true_data' and 'fake_data'
# (a scalar is broadcast to every row, so no per-row list is needed)
true_data['Target'] = 'True'
fake_data['Target'] = 'Fake'

# Merge 'true_data' and 'fake_data' by random mixing into a single df called
# 'fake_news_data'. The shuffle is seeded: the later train/test split uses a
# fixed random_state, which only gives a reproducible split if the row order
# it is applied to is reproducible as well.
fake_news_data = (
    pd.concat([true_data, fake_data])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Load the BERT tokenizer and a two-label sequence-classification model
# from the same pretrained checkpoint
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertForSequenceClassification.from_pretrained(bert_checkpoint, num_labels=2)

# Tokenize and encode the dataset
def encode_data(text_list):
    """Tokenise a list of strings with the module-level ``tokenizer``.

    Sequences are padded, truncated to at most 256 tokens, and returned
    as PyTorch tensors.

    Args:
        text_list: List of strings to encode.

    Returns:
        The tokenizer's output for ``text_list``.
    """
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'max_length': 256,
        'return_tensors': 'pt',
    }
    return tokenizer(text_list, **tokenizer_options)

# Fine-tune the BERT model
def fine_tune_bert(model, data, labels, epochs=4, batch_size=8):
    """Fine-tune ``model`` in place on a labelled list of texts.

    Args:
        model: Sequence-classification model whose forward pass accepts
            ``input_ids``, ``attention_mask`` and ``labels`` and returns an
            object with a ``loss`` attribute.
        data: List of strings to train on.
        labels: Integer class label for each entry of ``data``.
        epochs: Number of passes over the data.
        batch_size: Number of examples per optimisation step.

    Raises:
        ValueError: If ``data`` and ``labels`` differ in length.
    """
    # Check before tokenising: encoding the whole corpus is the expensive part.
    if len(data) != len(labels):
        raise ValueError(
            "data and labels must have the same length "
            "(got {} and {})".format(len(data), len(labels))
        )

    inputs = encode_data(data)
    # The classification loss needs integer (long) class indices.
    label_tensor = torch.as_tensor(labels, dtype=torch.long)
    dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], label_tensor)
    dataloader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)

    # Train on whatever device the model already lives on, so the function
    # also works when the caller has moved the model to a GPU.
    device = next(model.parameters()).device

    # torch.optim.AdamW replaces the deprecated transformers.AdamW.
    # weight_decay=0.0 keeps the behaviour of the former default (PyTorch's
    # own default is 0.01).
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
    total_steps = len(dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    model.train()
    for epoch in range(epochs):
        for batch in tqdm(dataloader, desc="Training Epoch {}".format(epoch+1)):
            batch_input_ids, batch_attention_mask, batch_labels = (
                tensor.to(device) for tensor in batch
            )
            optimizer.zero_grad()
            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

# Prepare data for fine-tuning
labels = fake_news_data['Target'].apply(lambda x: 1 if x == 'True' else 0).values
titles = fake_news_data['title'].tolist()

# Split the row indices BEFORE fine-tuning. Fine-tuning BERT on every row and
# only splitting afterwards leaks the test labels into the embeddings and
# inflates the reported metrics. Splitting indices with the same test_size and
# random_state selects the same rows as splitting the feature matrix would.
train_idx, test_idx = train_test_split(np.arange(len(titles)), test_size=0.2, random_state=42)

# Fine-tune on the training rows only
fine_tune_bert(model, [titles[i] for i in train_idx], labels[train_idx])

# Extract BERT embeddings after fine-tuning
def get_bert_embeddings(data, batch_size=32):
    """Return the [CLS] embedding of every text in ``data``.

    Args:
        data: Iterable of strings to embed.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A 2-D numpy array with one row per input text, taken from position 0
        of the encoder's last hidden state.
    """
    data = list(data)  # allow any iterable while still slicing into batches
    model.eval()
    # Run on the model's device; results are moved back to CPU for numpy.
    device = next(model.parameters()).device
    embeddings = []
    with torch.no_grad():
        for start in tqdm(range(0, len(data), batch_size), desc="Extracting BERT embeddings"):
            encoded_inputs = encode_data(data[start:start + batch_size])
            encoded_inputs = {key: value.to(device) for key, value in encoded_inputs.items()}
            outputs = model.bert(**encoded_inputs)
            embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
    return np.concatenate(embeddings, axis=0)

# Get embeddings for the titles
embeddings = get_bert_embeddings(titles)

# Prepare data for LightGBM
X = embeddings
y = labels

# Split data into training and testing sets, reusing the indices chosen
# before fine-tuning so the test rows were never seen by BERT
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# LightGBM is referenced as `lgb` below but was never imported, so the
# original cell failed with a NameError at the first `lgb.` reference.
import lightgbm as lgb

# Define the parameter grid for GridSearchCV
param_grid = {
    'num_leaves': [31, 50, 70],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 500],
    'feature_fraction': [0.8, 0.9, 1.0]
}

# Initialize LightGBM model
lgbm = lgb.LGBMClassifier(objective='binary', metric='binary_logloss', boosting='gbdt')

# Setup GridSearchCV
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, cv=KFold(n_splits=3), scoring='accuracy', verbose=1, n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found by GridSearchCV:", best_params)

# Use the estimator GridSearchCV already refit on the full training set with
# the best parameters (refit=True is the default). Rebuilding a classifier
# from best_params alone would drop the objective/metric/boosting settings
# passed to the base estimator and repeat the training for no benefit.
best_lgbm = grid_search.best_estimator_

# Predict and evaluate on the held-out test set
y_pred = best_lgbm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

ModuleNotFoundError: No module named 'numpy'