### Push dataset to HuggingFace

### Features from paper used in dataset

'''
In feature set 3 (Table 2), the ten features
 with highest importance values are “has quoted content”,
 “has URL”, “% of uppercase letters”, “frequency of punc
tuation”, “frequency of words of length 15”, “% of
 whitespaces”, “frequency of words of length 14”, “aver
age sentence length in words”, “frequency of words of
 length 12” and “frequency of words of length 11”. In
 Fig. 3, it is observed that real news has a very high aver
age number of quotes compared to fake news.
'''

In [1]:
from datasets import load_dataset
import warnings
warnings.filterwarnings('ignore')

dataset = load_dataset("davanstrien/WELFake")
print(dataset)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'label'],
        num_rows: 72134
    })
})


In [2]:
from feat_eng import has_quotes, has_url, percent_uppercase, frequency_punctuation, percent_whitespace, frequency_words_length, avg_sentence_length

train_dataset = dataset['train']

def extract_stylometric_features(example):
    text = example['text']
    
    if text == None:
        return{
            "label": example.get("label"),
            "has_quotes": 0,
            "has_url": 0,
            "percent_uppercase": 0.0,
            "frequency_punctuation": 0,
            "percent_whitespace": 0.0,
            "frequency_words_length_15": 0,
            "frequency_words_length_14": 0,
            "frequency_words_length_12": 0,
            "frequency_words_length_11": 0,
            "avg_sentence_length": 0.0
        }
    
    return {
        "label": example.get("label"),
        "has_quotes" : has_quotes(text),
        "has_url": has_url(text),
        "percent_uppercase": percent_uppercase(text),
        "frequency_punctuation": frequency_punctuation(text),
        "percent_whitespace": percent_whitespace(text),
        "frequency_words_length_15": frequency_words_length(text, 15),
        "frequency_words_length_14": frequency_words_length(text, 14),
        "frequency_words_length_12": frequency_words_length(text, 12),
        "frequency_words_length_11": frequency_words_length(text, 11),
        "avg_sentence_length": avg_sentence_length(text)  
    }

0


In [3]:

stylo_feats = train_dataset.map(extract_stylometric_features)

stylo_feats = stylo_feats.remove_columns(['title', 'text'])
print(stylo_feats)


Dataset({
    features: ['label', 'has_quotes', 'has_url', 'percent_uppercase', 'frequency_punctuation', 'percent_whitespace', 'frequency_words_length_15', 'frequency_words_length_14', 'frequency_words_length_12', 'frequency_words_length_11', 'avg_sentence_length'],
    num_rows: 72134
})


In [10]:
from huggingface_hub import HfApi

api = HfApi()
print(api.whoami())


stylo_feats.push_to_hub("lelexuanzz/WELFake_stylo_feats")

{'type': 'user', 'id': '67ec36270b5ea9f4c4b1eeca', 'name': 'lelexuanzz', 'fullname': 'Lee Le Xuan', 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': '/avatars/79d15e4cf378459279aa40eb9f51791a.svg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'dataset_write', 'role': 'fineGrained', 'createdAt': '2025-04-01T19:07:36.424Z', 'fineGrained': {'canReadGatedRepos': True, 'global': ['discussion.write', 'post.write'], 'scoped': [{'entity': {'_id': '67ec371e0b5ea9f4c4b2342f', 'type': 'dataset', 'name': 'lelexuanzz/WELFake_stylo_feats'}, 'permissions': ['repo.content.read', 'discussion.write', 'repo.write']}, {'entity': {'_id': '67ec36270b5ea9f4c4b1eeca', 'type': 'user', 'name': 'lelexuanzz'}, 'permissions': ['repo.content.read', 'repo.write', 'inference.serverless.write', 'inference.endpoints.infer.write', 'inference.endpoints.write', 'user.webhooks.read', 'user.webhooks.write', 'collection.read', 'collection.write', 'discussion.write', 'user.billin

Creating parquet from Arrow format: 100%|██████████| 73/73 [00:00<00:00, 1184.39ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.49s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/lelexuanzz/WELFake_stylo_feats/commit/91f45bb73b0b1ee917fc5438d8bb83b0a1b529fb', commit_message='Upload dataset', commit_description='', oid='91f45bb73b0b1ee917fc5438d8bb83b0a1b529fb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/lelexuanzz/WELFake_stylo_feats', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lelexuanzz/WELFake_stylo_feats'), pr_revision=None, pr_num=None)