In [3]:
import pandas as pd

# https://www.kaggle.com/datasets/snapcrack/all-the-news
true_articles = pd.read_csv('../datasets/fake-news/articles1.csv')

In [4]:
import re

data = true_articles['content'].values.tolist()

# Remove Emails
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove single quotes
data = [re.sub(r"\'", "", sent) for sent in data]

#### Tokenizacija

In [None]:
import gensim

def tokenize(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(tokenize(data))
print(data_words[:1])

#### Stemming i lematizacija

In [6]:
import spacy

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [7]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): #'NOUN', 'ADJ', 'VERB', 'ADV'
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'VERB'])

KeyboardInterrupt: 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# max_df=.1 oznacava da rec moze maksimalno da se pojavljuje 10 puta na svakih 100 reci, 
# odnosno u 10% teksta. Ovo sprecava da se u obzir uzimaju preceste reci koje nisu relevantne, poput 'the', 'and', 'or' i slicno.
# stopwords se takodje izbacuju u ovom koraku
count = CountVectorizer(
    stop_words='english',
    max_df=.1,
    max_features=5000,
    lowercase=True,
    token_pattern='[a-zA-Z0-9]{3,}'
    )
X = count.fit_transform(data_lemmatized)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Hiperparametri su odredjeni na osnovu clanka: 
# https://yanlinc.medium.com/how-to-build-a-lda-topic-model-using-from-text-601cdcbfd3a6
lda = LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
    evaluate_every=-1, learning_decay=0.9,
    learning_method='online', learning_offset=10.0,
    max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
    n_components=10, n_jobs=-1, perp_tol=0.1,
    random_state=100, topic_word_prior=None,
    total_samples=1000000.0, verbose=0)

X_topics = lda.fit_transform(X)

In [None]:
n_top_words = 1
feature_names = count.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    print(" ".join([feature_names[i]
    for i in topic.argsort()\
            [:-n_top_words - 1:-1]]))

Topic 1:
film
Topic 2:
health
Topic 3:
game
Topic 4:
gun
Topic 5:
immigration
Topic 6:
market
Topic 7:
street
Topic 8:
student
Topic 9:
car
Topic 10:
sander


In [None]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda.score(X))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda.perplexity(X))

# See model parameters
print(lda.get_params())

Log Likelihood:  -42149426.053007945
Perplexity:  1967.0092059711287
{'batch_size': 128, 'doc_topic_prior': None, 'evaluate_every': -1, 'learning_decay': 0.9, 'learning_method': 'online', 'learning_offset': 10.0, 'max_doc_update_iter': 100, 'max_iter': 10, 'mean_change_tol': 0.001, 'n_components': 10, 'n_jobs': -1, 'perp_tol': 0.1, 'random_state': 100, 'topic_word_prior': None, 'total_samples': 1000000.0, 'verbose': 0}


In [None]:
true_articles['subject'] = true_articles['content']
true_articles = true_articles.sample(n=1)

true_articles['subject'].apply(lambda x: list(tokenize(x)))
true_articles['subject'].apply(lambda x: lemmatization(x, allowed_postags=['NOUN', 'VERB']))
true_articles['subject'].apply(lambda x: count.fit_transform(x))
true_articles['subject'].apply(lambda x: lda.transform(x)[1])

ValueError: Iterable over raw text documents expected, string object received.