### Push dataset to HuggingFace

### Features from paper used in dataset

'''
In feature set 3 (Table 2), the ten features with highest importance
values are “has quoted content”, “has URL”, “% of uppercase letters”,
“frequency of punctuation”, “frequency of words of length 15”,
“% of whitespaces”, “frequency of words of length 14”,
“average sentence length in words”, “frequency of words of length 12”
and “frequency of words of length 11”. In Fig. 3, it is observed that
real news has a very high average number of quotes compared to fake news.
'''

In [1]:
from datasets import load_dataset
import warnings
# Suppress every Python warning raised after this point so cell outputs stay readable.
warnings.filterwarnings('ignore')

# Load the `Paulozs/WELFake_embeddings` dataset and print its summary
# (splits, column names and row counts) as a quick sanity check.
dataset = load_dataset("Paulozs/WELFake_embeddings")
print(dataset)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'label', 'roberta_embedding', 'bow_embedding', 'tfidf_embedding', 'w2v_embedding'],
        num_rows: 70793
    })
})


In [2]:
from feat_eng import has_quotes, has_url, percent_uppercase, frequency_punctuation, percent_whitespace, frequency_words_length, avg_sentence_length, has_1_to_3_urls, has_4_to_6_urls, has_more_than_6_urls, num_exclamations, num_questions, avg_word_length, lexical_diversity, proportion_stopwords

# Select the 'train' split of the loaded DatasetDict; later cells map over this split.
train_dataset = dataset['train']

def extract_stylometric_features(example):
    """Build the stylometric feature columns for a single dataset row.

    Written to be passed to ``Dataset.map``: it receives one example and
    returns a dict of columns to add to (or overwrite in) that row.

    Args:
        example: Mapping for one row. ``example['text']`` is read directly
            (its value may be ``None``); ``'label'`` is optional and is
            passed through unchanged via ``example.get``.

    Returns:
        dict: ``label`` plus 18 feature columns, always in the same key
        order. When the text is ``None`` every feature is a typed zero
        (``0`` for flag/count columns, ``0.0`` for ratio/average columns);
        otherwise each value comes from the matching ``feat_eng`` helper.

    Raises:
        KeyError: if ``example`` is a plain dict without a ``'text'`` key.
    """
    text = example['text']

    # Identity test is the idiomatic None check and cannot be fooled by a
    # custom __eq__ on the value.
    if text is None:
        # The int-vs-float choice of each zero is deliberate: it keeps the
        # column type the same as in the computed branch below.
        # NOTE(review): assumes the feat_eng helpers return matching types -- confirm.
        return {
            "label": example.get("label"),
            "has_quotes": 0,
            "has_url": 0,
            "percent_uppercase": 0.0,
            "frequency_punctuation": 0,
            "percent_whitespace": 0.0,
            "frequency_words_length_15": 0,
            "frequency_words_length_14": 0,
            "frequency_words_length_12": 0,
            "frequency_words_length_11": 0,
            "avg_sentence_length": 0.0,
            "has_1_to_3_urls": 0,
            "has_4_to_6_urls": 0,
            "has_more_than_6_urls": 0,
            "num_exclamations": 0,
            "num_questions": 0,
            "avg_word_length": 0.0,
            "lexical_diversity": 0.0,
            "proportion_stopwords": 0.0,
        }

    return {
        "label": example.get("label"),
        # The first ten features are the top-importance ones quoted from the paper.
        "has_quotes": has_quotes(text),
        "has_url": has_url(text),
        "percent_uppercase": percent_uppercase(text),
        "frequency_punctuation": frequency_punctuation(text),
        "percent_whitespace": percent_whitespace(text),
        "frequency_words_length_15": frequency_words_length(text, 15),
        "frequency_words_length_14": frequency_words_length(text, 14),
        "frequency_words_length_12": frequency_words_length(text, 12),
        "frequency_words_length_11": frequency_words_length(text, 11),
        "avg_sentence_length": avg_sentence_length(text),
        # Additional features beyond the paper's top ten.
        "has_1_to_3_urls": has_1_to_3_urls(text),
        "has_4_to_6_urls": has_4_to_6_urls(text),
        "has_more_than_6_urls": has_more_than_6_urls(text),
        "num_exclamations": num_exclamations(text),
        "num_questions": num_questions(text),
        "avg_word_length": avg_word_length(text),
        "lexical_diversity": lexical_diversity(text),
        "proportion_stopwords": proportion_stopwords(text),
    }

0.9166666666666666


In [3]:

# Compute the stylometric columns for every training row, then drop the title
# and the precomputed embedding columns so that only the text, the label and
# the new features are kept.
stylo_feats = (
    train_dataset
    .map(extract_stylometric_features)
    .remove_columns([
        'title',
        'roberta_embedding',
        'bow_embedding',
        'tfidf_embedding',
        'w2v_embedding',
    ])
)
print(stylo_feats)


Map:   0%|          | 0/70793 [00:00<?, ? examples/s]

Map: 100%|██████████| 70793/70793 [02:09<00:00, 545.77 examples/s]

Dataset({
    features: ['text', 'label', 'has_quotes', 'has_url', 'percent_uppercase', 'frequency_punctuation', 'percent_whitespace', 'frequency_words_length_15', 'frequency_words_length_14', 'frequency_words_length_12', 'frequency_words_length_11', 'avg_sentence_length', 'has_1_to_3_urls', 'has_4_to_6_urls', 'has_more_than_6_urls', 'num_exclamations', 'num_questions', 'avg_word_length', 'lexical_diversity', 'proportion_stopwords'],
    num_rows: 70793
})





In [4]:
# from huggingface_hub import HfApi

# api = HfApi()


# stylo_feats.push_to_hub("lelexuanzz/WELFake_stylo_feats")

In [7]:
# Upload the training-set stylometric features to the Hub dataset repo
# `lelexuanzz/WELFake_stylo_feats` as a new commit.
# NOTE(review): no login/token step is visible in this notebook -- presumably
# authentication is configured in the environment; confirm before re-running.
stylo_feats.push_to_hub("lelexuanzz/WELFake_stylo_feats", commit_message="Added more features")

Creating parquet from Arrow format: 100%|██████████| 71/71 [00:01<00:00, 44.63ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:18<00:00, 18.43s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/lelexuanzz/WELFake_stylo_feats/commit/0ac6d3534de6674691271a744aa8791c1012ccec', commit_message='Added more features', commit_description='', oid='0ac6d3534de6674691271a744aa8791c1012ccec', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/lelexuanzz/WELFake_stylo_feats', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lelexuanzz/WELFake_stylo_feats'), pr_revision=None, pr_num=None)

# Test Set (GossipCop_Politifact_Stylo)

In [6]:
# Load the previously published GossipCop/PolitiFact test set and print its
# summary; per that summary it already holds text, label and the first ten
# stylometric columns under a single 'train' split.
test_dataset = load_dataset("lelexuanzz/Gossipcop_Politifact_Test_Stylo")
print(test_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'has_quotes', 'has_url', 'percent_uppercase', 'frequency_punctuation', 'percent_whitespace', 'frequency_words_length_15', 'frequency_words_length_14', 'frequency_words_length_12', 'frequency_words_length_11', 'avg_sentence_length'],
        num_rows: 6900
    })
})


In [8]:
# Select the 'train' split only while `test_dataset` is still the DatasetDict
# returned by load_dataset (DatasetDict is a dict subclass). Without the guard
# this cell fails when run a second time, because the name would already be
# bound to the split itself.
if isinstance(test_dataset, dict):
    test_dataset = test_dataset["train"]

# Recompute every stylometric column for the test rows; columns that already
# exist are overwritten and the newer features are added.
test_stylo_feats = test_dataset.map(extract_stylometric_features)

# Unlike the training data, this dataset has no title/embedding columns, so
# there is nothing to remove before publishing.
print(test_stylo_feats)


Map: 100%|██████████| 6900/6900 [00:16<00:00, 414.78 examples/s]

Dataset({
    features: ['text', 'label', 'has_quotes', 'has_url', 'percent_uppercase', 'frequency_punctuation', 'percent_whitespace', 'frequency_words_length_15', 'frequency_words_length_14', 'frequency_words_length_12', 'frequency_words_length_11', 'avg_sentence_length', 'has_1_to_3_urls', 'has_4_to_6_urls', 'has_more_than_6_urls', 'num_exclamations', 'num_questions', 'avg_word_length', 'lexical_diversity', 'proportion_stopwords'],
    num_rows: 6900
})





In [9]:
# Upload the extended test-set features back to the same Hub repo they were
# loaded from, as a new commit on that dataset.
test_stylo_feats.push_to_hub("lelexuanzz/Gossipcop_Politifact_Test_Stylo", commit_message="Added more features")

Creating parquet from Arrow format: 100%|██████████| 7/7 [00:00<00:00, 38.14ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.06s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/lelexuanzz/Gossipcop_Politifact_Test_Stylo/commit/db8e5a594a41d651889cc336bd374478fd14d262', commit_message='Added more features', commit_description='', oid='db8e5a594a41d651889cc336bd374478fd14d262', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/lelexuanzz/Gossipcop_Politifact_Test_Stylo', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lelexuanzz/Gossipcop_Politifact_Test_Stylo'), pr_revision=None, pr_num=None)