In [2]:
import pandas as pd
import numpy as np
import requests
import json

In [3]:
apikey = "ad88ad93-b2db-485a-a0c2-82da759612b4" 
baseURL = "https://content.guardianapis.com/search?"
section = "science" 
page = 1

#page through all results using for loop
url_list = []
for page in range(1,566):
    url = baseURL+'section='+section+"&page="+str(page)+"&api-key="+apikey
    url_list += [url]

In [4]:
def getArticleData(url):
    response = requests.get(url)
    data = json.loads(response.content)
    result = data['response']['results']
    return result

In [5]:
result = [] #list that contains all results
for i in url_list:
    result = result+[getArticleData(i)]  

In [7]:
title = []
date = []
for k in result:
    for r in k:
        date += [r['webPublicationDate']]
        title += [r['webTitle']]  

In [8]:
science_df = pd.DataFrame({'Date':date,'Title':title})
science_df

Unnamed: 0,Date,Title
0,2021-06-16T05:00:22Z,Plantwatch: staghorn ferns – the plants that f...
1,2021-06-15T17:30:06Z,Ultra-thin film could one day turn regular gla...
2,2021-06-15T14:11:10Z,Blood glaciers: why is Alpine snow turning pink?
3,2021-06-14T16:04:02Z,Did you solve it? Ace of spades
4,2021-06-14T11:32:49Z,Ailsa Land obituary
...,...,...
5645,2015-10-05T17:08:10Z,What happened to wildlife when Chernobyl drove...
5646,2015-10-05T16:27:32Z,"William C Campbell, Satoshi Ōmura and Tu Youyo..."
5647,2015-10-05T16:20:11Z,One small click: thousands of Apollo astronaut...
5648,2015-10-05T14:05:21Z,Tu Youyou: how Mao’s challenge to malaria pion...


In [14]:
import warnings
warnings.filterwarnings("ignore")

from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.utils import tokenize
from gensim.utils import simple_preprocess
from gensim.corpora.textcorpus import remove_stopwords
#from gensim.summarization import keywords
from gensim.models.ldamodel import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models

In [15]:
tokens = list(tokenize(science_df['Title'][0], lowercase = True))

In [16]:
science_df['terms'] = [remove_stopwords(simple_preprocess(title)) for title in science_df['Title']]

In [17]:
science_df

Unnamed: 0,Date,Title,terms
0,2021-06-16T05:00:22Z,Plantwatch: staghorn ferns – the plants that f...,"[plantwatch, staghorn, ferns, plants, form, co..."
1,2021-06-15T17:30:06Z,Ultra-thin film could one day turn regular gla...,"[ultra, film, day, turn, regular, glasses, nig..."
2,2021-06-15T14:11:10Z,Blood glaciers: why is Alpine snow turning pink?,"[blood, glaciers, alpine, snow, turning, pink]"
3,2021-06-14T16:04:02Z,Did you solve it? Ace of spades,"[solve, ace, spades]"
4,2021-06-14T11:32:49Z,Ailsa Land obituary,"[ailsa, land, obituary]"
...,...,...,...
5645,2015-10-05T17:08:10Z,What happened to wildlife when Chernobyl drove...,"[happened, wildlife, chernobyl, drove, humans,..."
5646,2015-10-05T16:27:32Z,"William C Campbell, Satoshi Ōmura and Tu Youyo...","[william, campbell, satoshi, ōmura, tu, youyou..."
5647,2015-10-05T16:20:11Z,One small click: thousands of Apollo astronaut...,"[small, click, thousands, apollo, astronaut, p..."
5648,2015-10-05T14:05:21Z,Tu Youyou: how Mao’s challenge to malaria pion...,"[tu, youyou, mao, challenge, malaria, pioneer,..."


In [18]:
vocab = Dictionary(science_df['terms'])
print(vocab.token2id) # this function dorectly goves out the frequency of the vocabs



__Use TF-IDF Model on Titles and Find Most Relevant terms__

In [19]:
# convert corpus to BoW format
corpus = [vocab.doc2bow(terms) for terms in science_df['terms']]  
model = TfidfModel(corpus)# fit a tf-idf model to the corpus
tfidf_doc = model[corpus] # apply model to the first corpus

In [20]:
def get_tfidf (index):
    term_values = [(vocab[e[0]],e[1]) for e in model[corpus[index]] if e[1]>0]
    srt =  sorted(term_values, key=lambda x: x[1],reverse=True)
    return list(map(lambda x: x[0],srt[:5]))

__LDA Model of our Corpus__

In [21]:
# create LDA model witg corpus and vocab, and define the topic numbers
lda_model = LdaModel(corpus = corpus, id2word = vocab, num_topics = 20)

In [22]:
# show_topic function returns a list with format of [topic number, topic content]
for topic in lda_model.show_topics(num_topics = 3, num_words = 15):
    print("Topic "+str(topic[0])+"\n"+topic[1]+"\n")

Topic 4
0.028*"study" + 0.013*"space" + 0.013*"says" + 0.012*"brain" + 0.012*"claims" + 0.010*"nhs" + 0.010*"brains" + 0.010*"obituary" + 0.008*"head" + 0.008*"live" + 0.008*"genes" + 0.008*"people" + 0.008*"research" + 0.008*"trump" + 0.007*"finds"

Topic 9
0.019*"way" + 0.014*"scientist" + 0.011*"space" + 0.011*"astronaut" + 0.010*"obituary" + 0.010*"uk" + 0.009*"chief" + 0.009*"milky" + 0.007*"deaths" + 0.007*"maths" + 0.007*"life" + 0.007*"gm" + 0.007*"stephen" + 0.007*"prehistoric" + 0.007*"hawking"

Topic 18
0.026*"week" + 0.019*"study" + 0.015*"lab" + 0.013*"great" + 0.012*"rise" + 0.010*"save" + 0.009*"dna" + 0.009*"stephen" + 0.008*"ai" + 0.008*"hawking" + 0.008*"challenge" + 0.008*"david" + 0.007*"blood" + 0.007*"winter" + 0.007*"led"



__Get the probability that a certain document belongs to a certain topic__

In [23]:
doc = science_df['Title'][1]
print("doc:\n",doc)
doc_topics = lda_model.get_document_topics(corpus[2] ,minimum_probability=0.3)
print("doc_topics:\n",doc_topics)

for topic in doc_topics:
    terms = [term for term, prob in lda_model.show_topic(topic[0])]
    print(terms)

doc:
 Ultra-thin film could one day turn regular glasses into night vision goggles, researchers say
doc_topics:
 [(10, 0.64882183)]
['gene', 'research', 'scientists', 'british', 'editing', 'human', 'breakthrough', 'heart', 'therapy', 'cell']


__Visualize the Spread of Topics__

In [24]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, vocab)
vis

__Overall Topic Analysis:__


- One of the most distinctive topic is Topic 1, which contains keywords such as DNA, baby, gender-related terms, medicine, humans. Thus we can understand that this topic may be highly relevant to study in the biological field.


- The second distinctive topic (Topic No.2) contains terms like nasa, mars, mission, space, life...etc. Which indicates that this topic is quite releveant to explorations into outerspace, especially Mars and moon (whci are the terms, too). 


- Some topcs are highly overlapped with each other. Topic 5, 7, 8, 19, 20 gather a cluster. And from the terms thay include, we can observe that their shared traits lies in more general scientific terms such as scientists, science, experiements... etc. So their value are rather low  when distincting topics from articles.


- Overall, in most topics recognized by the model, though we can connect some terms with human knowledge, other terms are compiled by patterns that are out of common senses. This is both a benefit and downside. This means that after calculating the corpus, the machine has found patterns that is difficult for human to find. By utilizing this features, we can efficeintly classify different articles. However, sice we don't know how the model actually differs them, it will be hard for us human to reveal when the algorithm is making a mistake.