In [1]:
import pandas as pd
import nltk
import spacy
import pickle

import pprint

from spacy.lang.en import English
from nltk.stem.wordnet import WordNetLemmatizer

from collections import defaultdict

import gensim
from gensim.models import LdaModel

from sklearn.feature_extraction.text import TfidfVectorizer

## Term-frequency Inverse Document Frequency Analysis
### Aim: extract keywords per article

In [2]:
# configuration of pre-processing tools

nlp = spacy.load('en_core_web_sm')
parser = English()

en_stop = set(nltk.corpus.stopwords.words('english'))
en_stop.add('pron') # append spacy abbreviation for pronoun to stopwords
en_stop.add('')

In [3]:
# load data (output from TR API)

api_output_path = '../Data/TR_API_files/TR_API_results.tsv'
api_output = pd.read_csv(api_output_path, sep='\t')
api_output.head()

Unnamed: 0,id,guid,dateCreated,geography,firstcreated,headline,language,slug,caption,urgency,...,usn,version,versioncreated,versionedguid,wordcount,body_xhtml,dateline,parsed_text,countries_long,countries_long_newversion
0,"tag:reuters.com,2019:newsml_L3N27I00W:391219080","tag:reuters.com,2019:newsml_L3N27I00W",1572659257000,"['VN', 'BE', 'GB', 'CN', 'BG']",2019-11-02T01:47:37.000Z,Vietnam says human traffickers must be strictl...,en,BRITAIN-BODIES/VIETNAM,BRITAIN-BODIES/VIETNAM:Vietnam says human traf...,3,...,L3N27I00W,391219080,2019-11-02T01:47:37.000Z,"tag:reuters.com,2019:newsml_L3N27I00W:391219080",226,<p>Nov 2 (Reuters) - Vietnam sai...,,Nov 2 (Reuters) - Vietnam said on Saturday tha...,"Vietnam,China","Vietnam,China"
1,"tag:reuters.com,2019:newsml_L8N27H5Q0:447177668","tag:reuters.com,2019:newsml_L8N27H5Q0",1572646886000,"['BR', 'BD', 'MX', 'AF', 'IN', 'US']",2019-11-01T22:21:26.000Z,Brazil police arrest man said to be one of wor...,en,BRAZIL-HUMAN TRAFFICKING/ (TV),BRAZIL-HUMAN TRAFFICKING/ (TV):Brazil police a...,3,...,L8N27H5Q0,447177668,2019-11-01T22:21:26.000Z,"tag:reuters.com,2019:newsml_L8N27H5Q0:447177668",346,"<p>SAO PAULO, Nov 1 (Reuters) - ...",,"SAO PAULO, Nov 1 (Reuters) - Brazilian federal...","Brazil,Bangladesh,Mexico,Afghanistan,India","Brazil,Bangladesh,Mexico,Afghanistan,India"
2,"tag:reuters.com,2019:newsml_KBN1XB3XS:6","tag:reuters.com,2019:newsml_KBN1XB3XS",1572640591000,"['VN', 'GB', 'CN', 'IE', 'BG', 'IND']",2019-11-01T11:05:29.000Z,"Second man charged over UK truck deaths, victi...",en-GB,UK-BRITAIN-BODIES,UK-BRITAIN-BODIES:Second man charged over UK t...,4,...,KBN1XB3XS,6,2019-11-01T20:36:31.000Z,"tag:reuters.com,2019:newsml_KBN1XB3XS:6",450,<p>By Amanda Ferguson</p>\n ...,2019-11-01 20:36:31 GMT+00:00,By Amanda Ferguson\nBELFAST (Reuters) - A seco...,"Vietnam,China","Vietnam,China"
3,"tag:reuters.com,2019:newsml_L8N27H2WN:1498332260","tag:reuters.com,2019:newsml_L8N27H2WN",1572640534000,"['VN', 'GB', 'CN', 'IE', 'BG']",2019-11-01T11:45:27.000Z,UPDATE 3-Second man charged over UK truck deat...,en,"BRITAIN-BODIES/ (UPDATE 3, PIX, TV)","BRITAIN-BODIES/ (UPDATE 3, PIX, TV):UPDATE 3-S...",3,...,L8N27H2WN,1498332260,2019-11-01T20:35:34.000Z,"tag:reuters.com,2019:newsml_L8N27H2WN:1498332260",499,<p>* Thirty-nine bodies were fou...,,* Thirty-nine bodies were found in a truck on ...,"Vietnam,China","Vietnam,China"
4,"tag:reuters.com,2019:newsml_L8N27H0YP:739041794","tag:reuters.com,2019:newsml_L8N27H0YP",1572627800000,"['LB', 'IL']",2019-11-01T07:17:26.000Z,UPDATE 6-Hezbollah: Lebanon's next government ...,en,"LEBANON-PROTESTS/ (UPDATE 6, PIX, TV)","LEBANON-PROTESTS/ (UPDATE 6, PIX, TV):UPDATE 6...",3,...,L8N27H0YP,739041794,2019-11-01T17:03:20.000Z,"tag:reuters.com,2019:newsml_L8N27H0YP:739041794",722,<p>* Hezbollah urges forming new...,,* Hezbollah urges forming new cabinet quickly\...,Lebanon,Lebanon


In [4]:
text_collection = api_output[['guid', 'parsed_text']].copy()

In [5]:
# get list of documents

text_collection['document'] = text_collection.parsed_text.apply(lambda x: ' '.join(x.split('\n')))

text_collection = text_collection.drop('parsed_text', axis=1)

text_collection

Unnamed: 0,guid,document
0,"tag:reuters.com,2019:newsml_L3N27I00W",Nov 2 (Reuters) - Vietnam said on Saturday tha...
1,"tag:reuters.com,2019:newsml_L8N27H5Q0","SAO PAULO, Nov 1 (Reuters) - Brazilian federal..."
2,"tag:reuters.com,2019:newsml_KBN1XB3XS",By Amanda Ferguson BELFAST (Reuters) - A secon...
3,"tag:reuters.com,2019:newsml_L8N27H2WN",* Thirty-nine bodies were found in a truck on ...
4,"tag:reuters.com,2019:newsml_L8N27H0YP",* Hezbollah urges forming new cabinet quickly ...
...,...,...
272,"tag:reuters.com,2019:newsml_KCN1WK0HB",By John Davison BAGHDAD (Reuters) - Dozens of ...
273,"tag:reuters.com,2019:newsml_L5N26P39K",(Updates with new death toll) By John Davison ...
274,"tag:reuters.com,2019:newsml_L5N26370I",(Clarifies description of software tool) By Ka...
275,"tag:reuters.com,2019:newsml_L5N26P2ZA","(Updates with German interior minister, Greek ..."


In [None]:
# pre-process with spacy

text_collection['nlp'] = text_collection.document.apply(lambda x: nlp(x))

In [None]:
# add lemmatized version

text_collection['lemmas'] = text_collection.nlp.apply(lambda x: [t.lemma_.lower() for t in x])

In [None]:
# count lemma frequencies and only keep that appear more than once
# remove stopwords

frequency = defaultdict(int)
for text in text_collection.lemmas:
    for lemma in text:
        frequency[lemma] += 1
        

text_collection['processed_lemmas'] = text_collection.lemmas.apply(lambda x: [lemma for lemma in x if frequency[lemma] > 1 and not lemma in en_stop])
text_collection.processed_lemmas

In [None]:
# add bigrams and trigrams to docs (only ones that appear 20 times or more)

docs = text_collection.processed_lemmas.tolist()

bigram = gensim.models.Phrases(docs, min_count=5)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # token is a bigram, add to document.
            docs[idx].append(token)
            
            
text_collection['docs_ngrams'] = docs

In [None]:
# calculate tf-idf

corpus = text_collection['docs_ngrams'].apply(lambda x: ' '.join(x)).tolist()

vectorizer = TfidfVectorizer()

tfidf_matrix = vectorizer.fit_transform(corpus)

feature_names = vectorizer.get_feature_names()

In [None]:
# top number of terms identified through tf-idf

n_max = 10

tf_idf_docscores = list()

for doc_idx in range(len(corpus)):
    print('\n')
    
    feature_index = tfidf_matrix[doc_idx,:].nonzero()[1]
    tfidf_scores = zip(feature_index, [tfidf_matrix[doc_idx, x] for x in feature_index])
    
    tfidf_per_lemma = sorted([(feature_names[i], s) for (i, s) in tfidf_scores], key=lambda x: x[1], reverse=True)
    
    
    for lemma, score in tfidf_per_lemma[:n_max]:
        print(lemma, score)
        
        
    tf_idf_docscores.append(tfidf_per_lemma[:n_max])        

In [None]:
text_collection['keywords'] = tf_idf_docscores

text_collection.head()

In [None]:
keyword_output = text_collection[['guid', 'document', 'keywords']].copy()

keyword_output['keywords'] = keyword_output.keywords.apply(lambda x: '; '.join([f'{i[0]}: {i[1]}' for i in x]))

keyword_output.head()

In [None]:
keyword_output.to_csv('../Data/TR_API_files/keywords_per_article.tsv', sep='\t', index=None)