In [42]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data import DataLoader
import torch
import warnings
from transformers import AutoModel, AutoTokenizer
import torch
import joblib
import base64
from huggingface_hub import login, HfApi
from gensim.models import Word2Vec
warnings.filterwarnings('ignore')

In [43]:
# Define device for torch
use_cuda = True
print("CUDA is available:", torch.cuda.is_available())
device = torch.device("cuda" if (use_cuda and torch.cuda.is_available()) else "cpu")

CUDA is available: True


In [44]:
from datasets import load_dataset

dataset = load_dataset("davanstrien/WELFake")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'label'],
        num_rows: 72134
    })
})


In [45]:
from collections import Counter

data = dataset["train"].to_pandas()

print("\nData shape:", data.shape)
print("\nSample of training data:")
display(data.head(3))



Data shape: (72134, 3)

Sample of training data:


Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1


### Remove Empty Text Sections

In [46]:
cleaned_data = data.dropna(subset=['text', 'title'])
cleaned_data.drop_duplicates(inplace=True)
# rmve all columns where body text is just whitespace
cleaned_data = cleaned_data[cleaned_data['text'].str.strip().str.len() > 0]
# #drop duplicates


# Print before and after to verify
print(f"Original data shape: {data.shape}")
print(f"After removing NaNs and whitespace-only rows: {cleaned_data.shape}")
cleaned_data = cleaned_data.reset_index(drop=True)


Original data shape: (72134, 3)
After removing NaNs and whitespace-only rows: (62592, 3)


### Generate Embeddings

In [47]:
def array_to_string(arr):
    return base64.b64encode(arr.tobytes()).decode('utf-8')

In [48]:
def sparse_to_string(sparse_mat):
    csr = sparse_mat.tocsr()
    data = base64.b64encode(csr.data.tobytes()).decode('utf-8')
    indices = base64.b64encode(csr.indices.tobytes()).decode('utf-8')
    indptr = base64.b64encode(csr.indptr.tobytes()).decode('utf-8')
    shape = f"{csr.shape[0]},{csr.shape[1]}"
    return f"{data}|{indices}|{indptr}|{shape}"


##### Roberta

In [49]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModel.from_pretrained("roberta-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

In [50]:
def get_roberta_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()


In [51]:
batch_size = 32
roberta_embeddings = []
for i in range(0, len(cleaned_data), batch_size):
    batch_texts = cleaned_data['text'][i:i+batch_size].tolist()
    batch_embeds = [get_roberta_embedding(text) for text in batch_texts]
    roberta_embeddings.extend(batch_embeds)
    print(f"Processed RoBERTa batch {i//batch_size + 1}/{len(cleaned_data)//batch_size + 1}")

Processed RoBERTa batch 1/1957
Processed RoBERTa batch 2/1957
Processed RoBERTa batch 3/1957
Processed RoBERTa batch 4/1957
Processed RoBERTa batch 5/1957
Processed RoBERTa batch 6/1957
Processed RoBERTa batch 7/1957
Processed RoBERTa batch 8/1957
Processed RoBERTa batch 9/1957
Processed RoBERTa batch 10/1957
Processed RoBERTa batch 11/1957
Processed RoBERTa batch 12/1957
Processed RoBERTa batch 13/1957
Processed RoBERTa batch 14/1957
Processed RoBERTa batch 15/1957
Processed RoBERTa batch 16/1957
Processed RoBERTa batch 17/1957
Processed RoBERTa batch 18/1957
Processed RoBERTa batch 19/1957
Processed RoBERTa batch 20/1957
Processed RoBERTa batch 21/1957
Processed RoBERTa batch 22/1957
Processed RoBERTa batch 23/1957
Processed RoBERTa batch 24/1957
Processed RoBERTa batch 25/1957
Processed RoBERTa batch 26/1957
Processed RoBERTa batch 27/1957
Processed RoBERTa batch 28/1957
Processed RoBERTa batch 29/1957
Processed RoBERTa batch 30/1957
Processed RoBERTa batch 31/1957
Processed RoBERTa

In [52]:
roberta_embeddings = np.array(roberta_embeddings)
print(roberta_embeddings.shape)
cleaned_data['roberta_embedding'] = [array_to_string(arr) for arr in roberta_embeddings]
print("RoBERTa embeddings shape:", roberta_embeddings.shape)

(62592, 768)
RoBERTa embeddings shape: (62592, 768)


In [53]:
#reduce dimensionality with PCA
from sklearn.decomposition import PCA

pca = PCA(n_components=300, random_state=45)  
roberta_reduced = pca.fit_transform(roberta_embeddings)
print(roberta_reduced.shape)


(62592, 300)


In [54]:
cleaned_data['roberta_embedding'] = list(roberta_reduced)
display(cleaned_data)

Unnamed: 0,title,text,label,roberta_embedding
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,"[0.77837825, -1.3829339, 0.21932077, -0.037815..."
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,"[-3.4394891, -1.1532453, -0.57524085, -0.29689..."
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"[0.61816597, 0.10020125, -0.627897, -0.1652965..."
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,"[0.6582862, 0.40051103, -0.35593724, 0.6198630..."
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,"[0.51393855, 0.6349008, 1.0849333, 0.33767414,..."
...,...,...,...,...
62587,WIKILEAKS EMAIL SHOWS CLINTON FOUNDATION FUNDS...,An email released by WikiLeaks on Sunday appea...,1,"[-0.74468505, 0.1229167, 1.3666742, 0.07904005..."
62588,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,"[0.5316571, 0.9854491, -0.6684599, -0.01448869..."
62589,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,"[0.6461022, -1.1085404, 0.13549614, -0.3637871..."
62590,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,"[0.5340885, -0.03433156, -0.3962021, 0.1179680..."


In [55]:
# Save to parquet (can handle the string-encoded embeddings)
# cleaned_data.to_parquet('cleaned_welfake_with_embeddings.parquet')


In [56]:
# If you need it To load later:
# def load_data():
#     return pd.read_parquet('cleaned_welfake_with_embeddings.parquet')

##### BOW

In [57]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [58]:
bow_vectorizer = CountVectorizer(min_df=0.05, max_df=0.95, stop_words='english', ngram_range=(1,3))
bow_embeddings = bow_vectorizer.fit_transform(cleaned_data['text'])
print("BOW embeddings shape:", bow_embeddings.shape)
print(bow_embeddings)

BOW embeddings shape: (62592, 759)
  (0, 129)	1
  (0, 219)	1
  (0, 68)	1
  (0, 453)	1
  (0, 409)	1
  (0, 87)	3
  (0, 736)	9
  (0, 474)	10
  (0, 690)	1
  (0, 444)	1
  (0, 691)	5
  (0, 411)	1
  (0, 78)	10
  (0, 43)	1
  (0, 664)	6
  (0, 694)	3
  (0, 608)	1
  (0, 647)	3
  (0, 145)	1
  (0, 1)	1
  (0, 673)	1
  (0, 120)	1
  (0, 287)	2
  (0, 374)	6
  (0, 540)	1
  :	:
  (62591, 373)	1
  (62591, 208)	1
  (62591, 234)	1
  (62591, 12)	1
  (62591, 704)	1
  (62591, 230)	1
  (62591, 247)	1
  (62591, 170)	1
  (62591, 76)	1
  (62591, 716)	3
  (62591, 458)	1
  (62591, 724)	2
  (62591, 336)	1
  (62591, 405)	2
  (62591, 63)	1
  (62591, 132)	1
  (62591, 224)	1
  (62591, 482)	1
  (62591, 338)	1
  (62591, 250)	1
  (62591, 237)	1
  (62591, 470)	1
  (62591, 688)	1
  (62591, 352)	1
  (62591, 82)	1


In [59]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=300, random_state=45)
bow_reduced = svd.fit_transform(bow_embeddings)
print(bow_reduced.shape)
print(bow_reduced)

(62592, 300)
[[ 1.27512282e+01 -8.76048498e+00 -2.46459563e+00 ... -2.40341146e-01
  -5.38164033e-01  9.53867908e-02]
 [ 1.54090634e-01 -6.48069395e-02 -1.32581682e-01 ... -2.61103861e-03
   7.50409455e-02 -1.80323212e-02]
 [ 1.11447250e+01 -6.60614566e+00 -7.93055526e-01 ...  3.73877185e-01
   5.65029930e-01 -1.15148480e+00]
 ...
 [ 1.64992115e+01  1.31108905e+01 -1.99710545e+00 ... -1.45536429e-01
  -3.82781577e-01  1.45375327e-01]
 [ 5.26452653e+00 -3.88533425e+00 -8.70070293e-01 ... -3.13548525e-01
   6.88490359e-01 -9.97100491e-02]
 [ 1.01114713e+01  3.59138092e+00 -4.21846264e-01 ... -1.04478263e+00
  -2.67524534e-01 -7.14664660e-01]]


In [60]:
cleaned_data['bow_embedding'] = list(bow_reduced)
display(cleaned_data)

Unnamed: 0,title,text,label,roberta_embedding,bow_embedding
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,"[0.77837825, -1.3829339, 0.21932077, -0.037815...","[12.751228183014527, -8.760484977091297, -2.46..."
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,"[-3.4394891, -1.1532453, -0.57524085, -0.29689...","[0.1540906344076637, -0.06480693954115654, -0...."
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"[0.61816597, 0.10020125, -0.627897, -0.1652965...","[11.144724982986332, -6.606145662614691, -0.79..."
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,"[0.6582862, 0.40051103, -0.35593724, 0.6198630...","[2.6159286882160537, -1.72675977752399, -0.519..."
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,"[0.51393855, 0.6349008, 1.0849333, 0.33767414,...","[1.9346475297557924, -1.1505992523835311, -0.7..."
...,...,...,...,...,...
62587,WIKILEAKS EMAIL SHOWS CLINTON FOUNDATION FUNDS...,An email released by WikiLeaks on Sunday appea...,1,"[-0.74468505, 0.1229167, 1.3666742, 0.07904005...","[3.622973638522903, -0.54198912841281, -1.4584..."
62588,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,"[0.5316571, 0.9854491, -0.6684599, -0.01448869...","[17.799926846744107, 1.1087862440581235, 1.350..."
62589,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,"[0.6461022, -1.1085404, 0.13549614, -0.3637871...","[16.499211522306364, 13.110890533481918, -1.99..."
62590,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,"[0.5340885, -0.03433156, -0.3962021, 0.1179680...","[5.264526534165657, -3.8853342501017916, -0.87..."


##### BOW TF-IDF

In [61]:
tfidf_vectorizer = TfidfVectorizer(min_df=0.05, max_df=0.95, stop_words='english', ngram_range=(1,3))
tfidf_embeddings = tfidf_vectorizer.fit_transform(cleaned_data['text'])
print(tfidf_embeddings)
print("TF-IDF embeddings shape:", tfidf_embeddings.shape)


  (0, 129)	0.043776892997407064
  (0, 219)	0.04355821109782633
  (0, 68)	0.04264258887658645
  (0, 453)	0.03279594325845806
  (0, 409)	0.038586898362493266
  (0, 87)	0.09441925332228658
  (0, 736)	0.2845383624214601
  (0, 474)	0.227589084485444
  (0, 690)	0.03690762869472127
  (0, 444)	0.041771788008004074
  (0, 691)	0.2346406766570602
  (0, 411)	0.04742089416427213
  (0, 78)	0.4472010844429815
  (0, 43)	0.03506439473743557
  (0, 664)	0.30615377652594616
  (0, 694)	0.10636320635940007
  (0, 608)	0.04809402251574631
  (0, 647)	0.1320968997702561
  (0, 145)	0.04220381021203303
  (0, 1)	0.03876578583431527
  (0, 673)	0.025868632274383907
  (0, 120)	0.049907229419462296
  (0, 287)	0.10050605184265582
  (0, 374)	0.1570416554970401
  (0, 540)	0.04721988149094425
  :	:
  (62591, 373)	0.06722572182360022
  (62591, 208)	0.06071248241715993
  (62591, 234)	0.07861192097670323
  (62591, 12)	0.07555794846128877
  (62591, 704)	0.06355231491962349
  (62591, 230)	0.08430353674689829
  (62591, 247)	0.0

In [62]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=300, random_state=45)
tfidf_reduced = svd.fit_transform(tfidf_embeddings)
print(tfidf_reduced.shape)
print(tfidf_reduced)

(62592, 300)
[[ 0.31860298 -0.10941072  0.10317675 ...  0.00278509 -0.00708589
  -0.00929653]
 [ 0.07578831 -0.01524089  0.02820455 ...  0.04122813 -0.00059605
   0.02045247]
 [ 0.39807791 -0.15109853  0.057081   ...  0.01464588  0.00635184
   0.03690883]
 ...
 [ 0.45380863  0.33718147 -0.04324545 ...  0.00891854  0.0281111
   0.00085979]
 [ 0.29040425 -0.21906294 -0.02015833 ... -0.05417055 -0.03030166
  -0.00403572]
 [ 0.46950877  0.07418171 -0.13248021 ...  0.00967072  0.01256809
   0.04918511]]


In [63]:
cleaned_data['tfidf_embedding'] = list(tfidf_reduced)
display(cleaned_data)

Unnamed: 0,title,text,label,roberta_embedding,bow_embedding,tfidf_embedding
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,"[0.77837825, -1.3829339, 0.21932077, -0.037815...","[12.751228183014527, -8.760484977091297, -2.46...","[0.3186029772902578, -0.1094107173745029, 0.10..."
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,"[-3.4394891, -1.1532453, -0.57524085, -0.29689...","[0.1540906344076637, -0.06480693954115654, -0....","[0.07578831287738759, -0.015240886953002832, 0..."
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"[0.61816597, 0.10020125, -0.627897, -0.1652965...","[11.144724982986332, -6.606145662614691, -0.79...","[0.3980779078413261, -0.15109853445516602, 0.0..."
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,"[0.6582862, 0.40051103, -0.35593724, 0.6198630...","[2.6159286882160537, -1.72675977752399, -0.519...","[0.21115011222058097, -0.14267633316979805, -0..."
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,"[0.51393855, 0.6349008, 1.0849333, 0.33767414,...","[1.9346475297557924, -1.1505992523835311, -0.7...","[0.1945931561404842, -0.1142489399938969, 0.00..."
...,...,...,...,...,...,...
62587,WIKILEAKS EMAIL SHOWS CLINTON FOUNDATION FUNDS...,An email released by WikiLeaks on Sunday appea...,1,"[-0.74468505, 0.1229167, 1.3666742, 0.07904005...","[3.622973638522903, -0.54198912841281, -1.4584...","[0.23084806769137872, 0.09422806198766091, 0.3..."
62588,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,"[0.5316571, 0.9854491, -0.6684599, -0.01448869...","[17.799926846744107, 1.1087862440581235, 1.350...","[0.4787758131172253, 0.003986352099122974, -0...."
62589,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,"[0.6461022, -1.1085404, 0.13549614, -0.3637871...","[16.499211522306364, 13.110890533481918, -1.99...","[0.45380863043849157, 0.3371814687589905, -0.0..."
62590,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,"[0.5340885, -0.03433156, -0.3962021, 0.1179680...","[5.264526534165657, -3.8853342501017916, -0.87...","[0.290404249417728, -0.21906293500944446, -0.0..."


 ##### CBOW-W2V

In [64]:
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

nltk.download('all')


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\paulo\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\paulo\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\paulo\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\paulo\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\paulo\AppData\Roaming\nltk_data...
[

True

In [65]:
texts = cleaned_data["text"].astype(str).tolist()

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [t for t in tokens if t.isalpha() and t not in ENGLISH_STOP_WORDS]
    return tokens

tokenized_texts = [preprocess(t) for t in texts]


In [66]:
from gensim.models import KeyedVectors
import gensim.downloader

#load pretrained model
w2v_vectors = gensim.downloader.load('word2vec-google-news-300')


In [67]:

embedding_dim = w2v_vectors.vector_size
num_docs = len(tokenized_texts)

w2v_embeddings = np.zeros((num_docs, embedding_dim), dtype=np.float32)

#include batch processing
batch_size = 32
for start_idx in range(0, num_docs, batch_size):
    end_idx = min(start_idx + batch_size, num_docs)
    batch = tokenized_texts[start_idx:end_idx]
    
    # Compute embeddings for the batch
    for i, doc in enumerate(batch):
        vectors = [w2v_vectors[word] for word in doc if word in w2v_vectors]
        if vectors:
            w2v_embeddings[start_idx + i] = np.mean(vectors, axis=0)
        else:
            w2v_embeddings[start_idx + i] = np.zeros(embedding_dim)

In [68]:
print(w2v_embeddings.shape)
cleaned_data['w2v_embedding'] = list(w2v_embeddings)
display(cleaned_data)

(62592, 300)


Unnamed: 0,title,text,label,roberta_embedding,bow_embedding,tfidf_embedding,w2v_embedding
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,"[0.77837825, -1.3829339, 0.21932077, -0.037815...","[12.751228183014527, -8.760484977091297, -2.46...","[0.3186029772902578, -0.1094107173745029, 0.10...","[0.005737772, 0.051631283, 0.039275542, 0.0679..."
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,"[-3.4394891, -1.1532453, -0.57524085, -0.29689...","[0.1540906344076637, -0.06480693954115654, -0....","[0.07578831287738759, -0.015240886953002832, 0...","[0.014999917, 0.057231702, 0.05448833, 0.10918..."
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"[0.61816597, 0.10020125, -0.627897, -0.1652965...","[11.144724982986332, -6.606145662614691, -0.79...","[0.3980779078413261, -0.15109853445516602, 0.0...","[0.04940245, 0.03809506, 0.030635785, 0.096523..."
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,"[0.6582862, 0.40051103, -0.35593724, 0.6198630...","[2.6159286882160537, -1.72675977752399, -0.519...","[0.21115011222058097, -0.14267633316979805, -0...","[-0.022517398, 0.08829837, 0.06998471, 0.05531..."
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,"[0.51393855, 0.6349008, 1.0849333, 0.33767414,...","[1.9346475297557924, -1.1505992523835311, -0.7...","[0.1945931561404842, -0.1142489399938969, 0.00...","[0.031655375, 0.018008867, 0.039424133, 0.0783..."
...,...,...,...,...,...,...,...
62587,WIKILEAKS EMAIL SHOWS CLINTON FOUNDATION FUNDS...,An email released by WikiLeaks on Sunday appea...,1,"[-0.74468505, 0.1229167, 1.3666742, 0.07904005...","[3.622973638522903, -0.54198912841281, -1.4584...","[0.23084806769137872, 0.09422806198766091, 0.3...","[-0.00035927512, 0.04827896, -0.027950248, 0.0..."
62588,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,"[0.5316571, 0.9854491, -0.6684599, -0.01448869...","[17.799926846744107, 1.1087862440581235, 1.350...","[0.4787758131172253, 0.003986352099122974, -0....","[-0.020597927, 0.025891328, 0.0079015875, 0.06..."
62589,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,"[0.6461022, -1.1085404, 0.13549614, -0.3637871...","[16.499211522306364, 13.110890533481918, -1.99...","[0.45380863043849157, 0.3371814687589905, -0.0...","[0.026168875, 0.059571818, 0.025066013, 0.0854..."
62590,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,"[0.5340885, -0.03433156, -0.3962021, 0.1179680...","[5.264526534165657, -3.8853342501017916, -0.87...","[0.290404249417728, -0.21906293500944446, -0.0...","[0.009088822, 0.061462272, 0.026323821, 0.0745..."


#### OLD version

In [69]:

# cleaned_data_copy = cleaned_data.copy()
# cleaned_data_copy['tokens'] = cleaned_data_copy['text'].apply(lambda x: x.split())
# w2v_model = Word2Vec(sentences=cleaned_data_copy['tokens'], vector_size=100, window=5, min_count=5, sg=0, seed=42)
# def get_w2v_embedding(tokens, model):
#     vectors = [model.wv[token] for token in tokens if token in model.wv]
#     return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# w2v_embeddings = np.array([get_w2v_embedding(tokens, w2v_model) for tokens in cleaned_data_copy['tokens']])
# cleaned_data['w2v_embedding'] = [array_to_string(arr) for arr in w2v_embeddings]
# print("Word2Vec embeddings shape:", w2v_embeddings.shape)

##### Push to Hub

In [70]:
cleaned_data.to_parquet('cleaned_welfake_with_embeddings.parquet')

In [71]:
from datasets import Dataset
import pandas as pd
import json

display(cleaned_data)

output_data = Dataset.from_pandas(cleaned_data)

# with open("config.json", "r") as f:
#     config=json.load(f)
# hf_token = config.get("hf_token")
# if(hf_token):
#     login(token=hf_token)
# else:
#     print("can't find token")



Unnamed: 0,title,text,label,roberta_embedding,bow_embedding,tfidf_embedding,w2v_embedding
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,"[0.77837825, -1.3829339, 0.21932077, -0.037815...","[12.751228183014527, -8.760484977091297, -2.46...","[0.3186029772902578, -0.1094107173745029, 0.10...","[0.005737772, 0.051631283, 0.039275542, 0.0679..."
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,"[-3.4394891, -1.1532453, -0.57524085, -0.29689...","[0.1540906344076637, -0.06480693954115654, -0....","[0.07578831287738759, -0.015240886953002832, 0...","[0.014999917, 0.057231702, 0.05448833, 0.10918..."
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"[0.61816597, 0.10020125, -0.627897, -0.1652965...","[11.144724982986332, -6.606145662614691, -0.79...","[0.3980779078413261, -0.15109853445516602, 0.0...","[0.04940245, 0.03809506, 0.030635785, 0.096523..."
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,"[0.6582862, 0.40051103, -0.35593724, 0.6198630...","[2.6159286882160537, -1.72675977752399, -0.519...","[0.21115011222058097, -0.14267633316979805, -0...","[-0.022517398, 0.08829837, 0.06998471, 0.05531..."
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,"[0.51393855, 0.6349008, 1.0849333, 0.33767414,...","[1.9346475297557924, -1.1505992523835311, -0.7...","[0.1945931561404842, -0.1142489399938969, 0.00...","[0.031655375, 0.018008867, 0.039424133, 0.0783..."
...,...,...,...,...,...,...,...
62587,WIKILEAKS EMAIL SHOWS CLINTON FOUNDATION FUNDS...,An email released by WikiLeaks on Sunday appea...,1,"[-0.74468505, 0.1229167, 1.3666742, 0.07904005...","[3.622973638522903, -0.54198912841281, -1.4584...","[0.23084806769137872, 0.09422806198766091, 0.3...","[-0.00035927512, 0.04827896, -0.027950248, 0.0..."
62588,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,"[0.5316571, 0.9854491, -0.6684599, -0.01448869...","[17.799926846744107, 1.1087862440581235, 1.350...","[0.4787758131172253, 0.003986352099122974, -0....","[-0.020597927, 0.025891328, 0.0079015875, 0.06..."
62589,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,"[0.6461022, -1.1085404, 0.13549614, -0.3637871...","[16.499211522306364, 13.110890533481918, -1.99...","[0.45380863043849157, 0.3371814687589905, -0.0...","[0.026168875, 0.059571818, 0.025066013, 0.0854..."
62590,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,"[0.5340885, -0.03433156, -0.3962021, 0.1179680...","[5.264526534165657, -3.8853342501017916, -0.87...","[0.290404249417728, -0.21906293500944446, -0.0...","[0.009088822, 0.061462272, 0.026323821, 0.0745..."


In [72]:
output_data.push_to_hub("Paulozs/WELFake_embeddings")

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/32 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/32 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/442 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Paulozs/WELFake_embeddings/commit/b50ce239b7e8fd68bf3f6910bbf6a85084a09ffd', commit_message='Upload dataset', commit_description='', oid='b50ce239b7e8fd68bf3f6910bbf6a85084a09ffd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Paulozs/WELFake_embeddings', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Paulozs/WELFake_embeddings'), pr_revision=None, pr_num=None)