# Keyword extraction

In [15]:
# Import essential libraries
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
from nltk.stem import WordNetLemmatizer
import yake
import nltk
# nltk.download('omw-1.4')

def yake_extractor(data_list, lemmatizer, stopwords, language = "en", max_ngram_size = 3, deduplication_thresold = 0.9, 
    deduplication_algo = 'seqm', windowSize = 1, numOfKeywords =100):
    # preprocess data
    lower_data = [data.lower() for data in data_list]
    lem_data = [lemmatizer.lemmatize(data) for data in lower_data]
    # text = []
    # for data in lem_data:
    #     text.append(' '.join([w for w in data.split() if w not in stopwords]))
    text = ' '.join(lem_data)
    # extract keywords
    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size,  dedupLim=deduplication_thresold,
                            dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords, features=None, stopwords=stopwords)
    keywords = custom_kw_extractor.extract_keywords(text)
    return keywords

if __name__ == '__main__':
    stopwords = set(STOPWORDS)
    lemmatizer = WordNetLemmatizer()

    data_path = r"C:\Users\hn0139\OneDrive - UNT System\A_PhD_PATH\PROJECTS\Misinformation\Misinformation_literature_review\metadata\merged_all_data\journal+doi+abstract+year+citation+fieldofstudy_dropnull.csv"
    out_path = r'C:\Users\hn0139\Documents\GitHub\Misinformation\Data analysis\Content analysis\keyword.csv'
    
    with open(data_path, 'r', encoding = 'utf-8') as f:    
        data = pd.read_csv(f)
    print(data.columns)

    join_fn = lambda x : ' '.join([x.title, x.abstract])
    join_text = data.apply(join_fn, axis=1)
    keywords = yake_extractor(data_list = join_text.tolist(), lemmatizer = lemmatizer, stopwords = stopwords)
    
    # write output to file
    f = open(out_path, 'w')
    f.writelines('keyword,score\n')
    for (w, s) in keywords: 
        f = open(out_path, 'a+')
        f.write('%s,%f\n' %(w,s))   
    f.close()


Index(['Unnamed: 0', 'title', 'authors', 'venue', 'year', 'citationCount',
       'fieldsOfStudy', 'abstract', 'doi', 'query', 'database',
       'publication_type'],
      dtype='object')
[('social media', 9.924303411959038e-07), ('fake news', 1.3047250484093075e-06), ('social media platforms', 2.5632053191541136e-06), ('fake news detection', 2.640716614146843e-06), ('online social networks', 3.395765128235662e-06), ('social media use', 4.5968016736445965e-06), ('social media users', 5.411398006995505e-06), ('social networks', 6.61669595737636e-06), ('social media news', 8.450279803841577e-06), ('social', 8.499241242615078e-06), ('information', 8.783742531554572e-06), ('social media information', 9.109590952449133e-06), ('online social media', 9.394470102162478e-06), ('using social media', 9.861601659799538e-06), ('news', 1.0067561475624971e-05), ('media', 1.251527732559419e-05), ('social media networks', 1.2851504092180756e-05), ('misinformation', 1.4061149016683143e-05), ('social me

# Topic modeling

In [33]:
# Import essential libraries
import pandas as pd
from nltk.tokenize import RegexpTokenizer
# from stop_words import get_stop_words
from wordcloud import WordCloud, STOPWORDS

from nltk.stem.porter import PorterStemmer
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel


'''# Create a list of tokenized and stemmed documents
texts = []
for i in abstract_set:
    # Lowercase tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    # Remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop + stop_plus]
    # Stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens if len(i)>3]
    texts.append(stemmed_tokens)'''

def preprocessing (tokenizer, text):
    """
    text: str
    """
    stopwords = set(STOPWORDS)
    text = text.lower()
    tokens = tokenizer.tokenize(text)
     # Remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in stopwords]
    # Stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens if len(i)>3]
    return stemmed_tokens

def get_lda_topic (texts, n_topics =[20, 25, 30, 35, 40], n_words = 12):
    """
    texts: a list of token lists
    """
    # Turn our tokenized documents into a id - term dictionary
    dictionary = corpora.Dictionary(texts)
    # Convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Find the best n_topics
    models = {}
    topics = {} 
    coherence = {}
    for i in n_topics:
        # Generate LDA model
        print('--------- %s ---------'%str(i))
        ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=i, id2word = dictionary, passes=60, alpha='auto',
                            random_state=42)
        shown_topics = ldamodel.show_topics(num_topics=i, 
                                             num_words=n_words,
                                             formatted=False)
        topics[i] = [[word[0] for word in topic[1]] for topic in shown_topics]
        # Compute Coherence Score
        coherence_ldamodel = CoherenceModel(model=ldamodel, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_lda = coherence_ldamodel.get_coherence()
        coherence[i] = coherence_lda
        print('Coherence Score: ', coherence_lda)
    
    best_cor_score = max(list(coherence.values()))
    best_cor_idx = list(coherence.values()).index(best_cor_score)

    # Get topics from results
    # topics = []
    # for topic in ldamodel.print_topics(num_topics=n_topics, num_words=n_words):
    #     topics.append(topic)
    # print(topics)
    # return topics
    print(coherence)
    print(best_cor_idx, best_cor_score)

if __name__ == '__main__':
    # Create Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # Create PorterStemmer
    p_stemmer = PorterStemmer()

    # Save topics to CSV
    data_path = r"C:\Users\hn0139\OneDrive - UNT System\A_PhD_PATH\PROJECTS\Misinformation\Misinformation_literature_review\metadata\merged_all_data\journal+doi+abstract+year+citation+fieldofstudy_dropnull.csv"
    out_path = r'C:\Users\hn0139\Documents\GitHub\Misinformation\Data analysis\Content analysis\topics.csv'
    
    with open(data_path, 'r', encoding = 'utf-8') as f:    
        data = pd.read_csv(f)

    join_fn = lambda x : ' '.join([x.title, x.abstract])
    join_text = data.apply(join_fn, axis=1)

    clean_text = [preprocessing(tokenizer, text) for text in join_text]
    
    topics = get_lda_topic (clean_text, n_topics = [20, 25, 30, 35, 40], n_words = 10)

    # df_topics = pd.DataFrame(topics)
    # print(df_topics)
    # with open (out_path, 'w',  newline="", encoding='utf-8') as file:
    #     df_topics.to_csv(file)


--------- 20 ---------
Coherence Score:  0.4645722276139618
--------- 25 ---------
Coherence Score:  0.470628677985547
--------- 30 ---------
Coherence Score:  0.4849451488002597
--------- 35 ---------
Coherence Score:  0.46380498878700577
--------- 40 ---------
Coherence Score:  0.47095674013627525


AttributeError: 'dict_values' object has no attribute 'index'

# Topic evolution