In [23]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data import DataLoader
import torch
import warnings
from transformers import AutoModel, AutoTokenizer
import torch
import joblib
import base64
from huggingface_hub import login, HfApi
from gensim.models import Word2Vec
warnings.filterwarnings('ignore')

In [24]:
# Pick the torch device: use the GPU only when it is both requested
# (use_cuda) and actually available, otherwise fall back to the CPU.
use_cuda = True
print("CUDA is available:", torch.cuda.is_available())
if use_cuda and torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

CUDA is available: True


In [25]:
from datasets import load_dataset

# Load the WELFake dataset by its Hugging Face Hub identifier.
dataset = load_dataset("davanstrien/WELFake")
# Printing the returned object lists its splits, columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'label'],
        num_rows: 72134
    })
})


In [26]:
from collections import Counter

# Convert the "train" split to a pandas DataFrame; later cells filter and
# extend this frame.
data = dataset["train"].to_pandas()

# Sanity check: report the frame's dimensions and preview the first three rows.
print("\nData shape:", data.shape)
print("\nSample of training data:")
display(data.head(3))



Data shape: (72134, 3)

Sample of training data:


Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1


### Remove Empty Text Sections

In [27]:
# Keep only rows that have both a title and a body, and whose body is not
# empty once surrounding whitespace is stripped. The index is renumbered
# from 0 so that later positional slicing lines up with the row labels.
cleaned_data = (
    data
    .dropna(subset=['text', 'title'])
    .loc[lambda frame: frame['text'].str.strip().str.len() > 0]
    .reset_index(drop=True)
)

# Report how many rows the filter removed.
print(f"Original data shape: {data.shape}")
print(f"After removing NaNs and whitespace-only rows: {cleaned_data.shape}")


Original data shape: (72134, 3)
After removing NaNs and whitespace-only rows: (70793, 3)


### Generate Embeddings

In [28]:
def array_to_string(arr):
    """Serialise an array's raw bytes as a base64 text string.

    Only the bytes returned by ``arr.tobytes()`` are encoded; the array's
    dtype and shape are not stored, so the reader must know both to decode.

    Args:
        arr: Object exposing ``tobytes()`` (e.g. a NumPy array).

    Returns:
        str: Base64 encoding of the raw bytes, decoded as UTF-8 text.
    """
    raw_bytes = arr.tobytes()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

In [48]:
def sparse_to_string(sparse_mat):
    """Serialise a sparse matrix into one pipe-delimited text string.

    The matrix is converted to CSR and written as four fields joined by
    ``|``: the base64-encoded raw bytes of ``data``, ``indices`` and
    ``indptr``, followed by the shape as ``"rows,cols"``. The dtypes of the
    three buffers are not stored, so the reader must know them to decode.

    Args:
        sparse_mat: Sparse matrix exposing ``tocsr()``.

    Returns:
        str: ``"<data>|<indices>|<indptr>|<rows>,<cols>"``.
    """
    def _encode(buffer):
        # base64 of the buffer's raw bytes, as text
        return base64.b64encode(buffer.tobytes()).decode('utf-8')

    csr = sparse_mat.tocsr()
    n_rows, n_cols = csr.shape[0], csr.shape[1]
    fields = (
        _encode(csr.data),
        _encode(csr.indices),
        _encode(csr.indptr),
        f"{n_rows},{n_cols}",
    )
    return "|".join(fields)


##### RoBERTa

In [30]:
# Load the pretrained RoBERTa tokenizer and encoder used to embed the article text.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModel.from_pretrained("roberta-base")
# Reuse the `device` selected in the device-setup cell instead of re-deriving
# it here: the previous re-derivation ignored `use_cuda`, so setting
# use_cuda = False would still have moved the model to the GPU.
model.to(device)
# The model is only used for inference, so switch it to evaluation mode.
model.eval()

print(device)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


In [31]:
def get_roberta_embedding(text, max_length=512):
    """Embed text with RoBERTa by mean-pooling the last hidden state.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. Pooling is
    weighted by the attention mask so padding positions never contribute to
    the average; this matters as soon as several texts of different lengths
    are encoded together.

    Args:
        text: A single string, or a sequence of strings to encode as one batch.
        max_length: Inputs longer than this many tokens are truncated.

    Returns:
        numpy.ndarray: a 1-D vector of size ``hidden`` for a single string,
        or a 2-D array of shape ``(len(text), hidden)`` for a sequence.
    """
    is_single = isinstance(text, str)
    texts = [text] if is_single else list(text)

    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # (batch, seq, 1) mask: 1 for real tokens, 0 for padding.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    pooled = ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()

    # Index explicitly instead of a bare squeeze() so only the batch axis is dropped.
    return pooled[0] if is_single else pooled


In [32]:
# Embed every article body with RoBERTa, reporting progress once per batch.
batch_size = 32
num_rows = len(cleaned_data)
# Ceiling division: `num_rows // batch_size + 1` reports one batch too many
# whenever num_rows is an exact multiple of batch_size.
num_batches = (num_rows + batch_size - 1) // batch_size

roberta_embeddings = []
for batch_number, start in enumerate(range(0, num_rows, batch_size), start=1):
    # .iloc makes the positional slice explicit, independent of the index labels.
    batch_texts = cleaned_data['text'].iloc[start:start + batch_size].tolist()
    # Each text is encoded on its own; batches only group the progress output.
    batch_embeds = [get_roberta_embedding(text) for text in batch_texts]
    roberta_embeddings.extend(batch_embeds)
    print(f"Processed RoBERTa batch {batch_number}/{num_batches}")

roberta_embeddings = np.array(roberta_embeddings)
# Every row must have exactly one embedding before it is attached to the frame.
assert len(roberta_embeddings) == num_rows, "embedding count does not match row count"
cleaned_data['roberta_embedding'] = [array_to_string(arr) for arr in roberta_embeddings]
print("RoBERTa embeddings shape:", roberta_embeddings.shape)

Processed RoBERTa batch 1/2213
Processed RoBERTa batch 2/2213
Processed RoBERTa batch 3/2213
Processed RoBERTa batch 4/2213
Processed RoBERTa batch 5/2213
Processed RoBERTa batch 6/2213
Processed RoBERTa batch 7/2213
Processed RoBERTa batch 8/2213
Processed RoBERTa batch 9/2213
Processed RoBERTa batch 10/2213
Processed RoBERTa batch 11/2213
Processed RoBERTa batch 12/2213
Processed RoBERTa batch 13/2213
Processed RoBERTa batch 14/2213
Processed RoBERTa batch 15/2213
Processed RoBERTa batch 16/2213
Processed RoBERTa batch 17/2213
Processed RoBERTa batch 18/2213
Processed RoBERTa batch 19/2213
Processed RoBERTa batch 20/2213
Processed RoBERTa batch 21/2213
Processed RoBERTa batch 22/2213
Processed RoBERTa batch 23/2213
Processed RoBERTa batch 24/2213
Processed RoBERTa batch 25/2213
Processed RoBERTa batch 26/2213
Processed RoBERTa batch 27/2213
Processed RoBERTa batch 28/2213
Processed RoBERTa batch 29/2213
Processed RoBERTa batch 30/2213
Processed RoBERTa batch 31/2213
Processed RoBERTa

In [33]:
# Save to parquet (can handle the string-encoded embeddings)
# cleaned_data.to_parquet('cleaned_welfake_with_embeddings.parquet')


In [34]:
# To reload the saved data later (requires `import pandas as pd`):
# def load_data():
#     return pd.read_parquet('cleaned_welfake_with_embeddings.parquet')

##### BOW

In [44]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [45]:
# Bag-of-words counts: English stop words removed, vocabulary capped at 5000 features.
bow_vectorizer = CountVectorizer(stop_words='english', max_features=5000)
bow_embeddings = bow_vectorizer.fit_transform(cleaned_data['text'])

# Store each document's sparse row as a string so it fits in one DataFrame column.
cleaned_data['bow_embedding'] = [
    sparse_to_string(bow_embeddings[row_idx])
    for row_idx in range(bow_embeddings.shape[0])
]
print("BOW embeddings shape:", bow_embeddings.shape)

BOW embeddings shape: (70793, 5000)


In [46]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
display(bow_embeddings)

<70793x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 10509868 stored elements in Compressed Sparse Row format>

##### BOW TF-IDF

In [None]:
# TF-IDF weights: English stop words removed, vocabulary capped at 5000 features.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_embeddings = tfidf_vectorizer.fit_transform(cleaned_data['text'])

# Store each document's sparse row as a string so it fits in one DataFrame column.
cleaned_data['tfidf_embedding'] = [
    sparse_to_string(tfidf_embeddings[row_idx])
    for row_idx in range(tfidf_embeddings.shape[0])
]
print("TF-IDF embeddings shape:", tfidf_embeddings.shape)

<70793x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 10509868 stored elements in Compressed Sparse Row format>

TF-IDF embeddings shape: (70793, 5000)


 ##### CBOW-W2V

In [40]:
# Tokenise on whitespace in a copy so the helper 'tokens' column is not
# added to cleaned_data itself.
cleaned_data_copy = cleaned_data.copy()
cleaned_data_copy['tokens'] = cleaned_data_copy['text'].map(str.split)

# Train Word2Vec on the tokenised articles (sg=0, i.e. the CBOW variant
# named in this section's heading).
w2v_model = Word2Vec(
    sentences=cleaned_data_copy['tokens'],
    vector_size=100,
    window=5,
    min_count=5,
    sg=0,
    seed=42,
)
def get_w2v_embedding(tokens, model):
    """Average the word vectors of the tokens known to ``model``.

    Tokens missing from ``model.wv`` are skipped. When no token is known the
    result is an all-zero vector of length ``model.vector_size``.

    The zero fallback is created as float32 to match gensim's word vectors.
    A float64 fallback would upcast the whole stacked embedding matrix as
    soon as a single document had no known token, changing the byte layout
    of every row once the matrix is serialised with ``tobytes()``.

    Args:
        tokens: Iterable of token strings for one document.
        model: Trained model exposing ``wv`` (token -> vector lookup with
            ``in`` support) and ``vector_size``.

    Returns:
        numpy.ndarray: 1-D document vector of length ``model.vector_size``.
    """
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not vectors:
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# One averaged Word2Vec vector per document, stacked into a 2-D array.
w2v_embeddings = np.array([
    get_w2v_embedding(doc_tokens, w2v_model)
    for doc_tokens in cleaned_data_copy['tokens']
])
# Store each vector as a base64 string so it fits in one DataFrame column.
cleaned_data['w2v_embedding'] = list(map(array_to_string, w2v_embeddings))
print("Word2Vec embeddings shape:", w2v_embeddings.shape)

Word2Vec embeddings shape: (70793, 100)


##### Push to Hub

In [42]:
# Write the frame, including the string-encoded embedding columns, to a local Parquet file.
# NOTE(review): the file name ends in ".parquet_final" rather than ".parquet" — confirm this is intentional.
cleaned_data.to_parquet('cleaned_welfake_with_embeddings.parquet_final')

In [43]:
import os

from datasets import Dataset
import pandas as pd

# Convert the cleaned frame (with all embedding columns) to a Hugging Face Dataset.
output_data = Dataset.from_pandas(cleaned_data)

# SECURITY: never hardcode an access token in a notebook. The token that was
# previously committed in this cell must be treated as leaked and revoked in
# the Hugging Face account settings. Read the token from the HF_TOKEN
# environment variable; if it is not set, fall back to the interactive prompt.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    login()

# Upload the dataset to the Hub repository.
output_data.push_to_hub("Paulozs/WELFake_embeddings")

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/36 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/36 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Paulozs/WELFake_embeddings/commit/87edba4053edab11be0d13067840c37fa7337f1e', commit_message='Upload dataset', commit_description='', oid='87edba4053edab11be0d13067840c37fa7337f1e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Paulozs/WELFake_embeddings', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Paulozs/WELFake_embeddings'), pr_revision=None, pr_num=None)