In [None]:
from collections import Counter, defaultdict

import nltk
from datasets import load_dataset
from nltk import ne_chunk, pos_tag, word_tokenize

# NLTK data packages fetched for this notebook:
#   punkt / punkt_tab        -> tokenizer models that word_tokenize loads at call time
#                               (the package name depends on the installed NLTK version,
#                               so both are requested; an unknown id is reported by the
#                               downloader without raising)
#   maxent_ne_chunker, words -> kept from the original setup (presumably for ne_chunk)
for resource in ('punkt', 'punkt_tab', 'maxent_ne_chunker', 'words'):
    nltk.download(resource)

In [None]:
# Loads the 'GonzaloA/fake_news' dataset via datasets.load_dataset.
# The analysis below reads its 'train' split and the 'text' / 'label' fields of each row.
data = load_dataset('GonzaloA/fake_news')

In [None]:
# ------- splits up data by label and stores index --------
# Every record keeps the article text together with its position in
# data['train'], so a token can later be traced back to its source row.
fake, true = [], []

for row_idx, article in enumerate(data['train']):
    record = {'text': article['text'], 'index': row_idx}

    # label 0 is treated as fake; any other label value is treated as true
    bucket = fake if article['label'] == 0 else true
    bucket.append(record)


# --------- tokenizes the text of the fake news and counts occurrences -----------
# fake_counts maps token -> {'count': total occurrences over all fake articles,
#                            'articles': indices of the fake articles containing it}
fake_counts = defaultdict(lambda: {'count': 0, 'articles': []})

for info in fake:
    index = info['index']  # stores document index for future analysis

    # Count per article first so that each article index is recorded once per
    # token instead of once per occurrence; otherwise a frequent token fills
    # its 'articles' list with repeats of the same index.
    per_article = Counter(word_tokenize(info['text']))

    for token, occurrences in per_article.items():
        entry = fake_counts[token]
        entry['count'] += occurrences
        entry['articles'].append(index)


# -------- tokenizes the text of non-fake news and counts occurrences -----------
# Articles are tokenized one at a time, exactly like the fake articles, so both
# sides of the ratio computed below use the same tokenization (joining articles
# into one string lets sentence splitting run across article boundaries).
# It also avoids holding a corpus-sized string and token list in memory.
true_amounts = Counter()

for info in true:
    true_amounts.update(word_tokenize(info['text']))


# --------- computes token frequency in fake news / frequency in non-fake news --------
# 'freq' is the ratio of raw occurrence counts (fake count / true count).
# A token never seen in the true articles is treated as if it appeared there
# once, which avoids a division by zero and leaves its fake count as the score.
fake_tf = {
    token: {
        'freq': (stats['count'] / true_amounts[token]
                 if token in true_amounts
                 else stats['count']),
        'articles': stats['articles'],
    }
    for token, stats in fake_counts.items()
}


# -------- sorts and prints tokens with highest value = most particular to fake news --------
# Set PRINT_LIMIT to an int to show only that many top-ranked tokens.
# None (the default) prints one line per distinct token, which can be very long.
PRINT_LIMIT = None

# sorted_ holds the full ranking as (token, {'freq': ..., 'articles': [...]}) pairs,
# highest 'freq' first; ties keep their original relative order.
sorted_ = sorted(fake_tf.items(), key=lambda item: item[1]['freq'], reverse=True)

for token, info in sorted_[:PRINT_LIMIT]:
    # show the score and at most the first 10 recorded article indices
    print(token, info['freq'], info['articles'][:10])

In [None]:
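# Optional export of the ranking to a text file (uncomment to run).
# Each entry of sorted_ is (token, {'freq': ..., 'articles': [...]}), so the
# fields are read by key. NOTE: mode 'a' appends, so re-running adds duplicate lines.
#with open('fake_tokens.txt', 'a') as f:
#    for token, info in sorted_:
#        f.write(f"{token}, {info['freq']}, {info['articles'][:10]}\n")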