# Finding key words in articles using TF-IDF
<b>Term Frequency</b> = (# of times term appears) / (total # of terms in article) \
<b>Inverse Document Frequency</b> = log(# of sentences / # of sentences with the term), where each sentence of the article is treated as a "document" \
<b>TF-IDF</b> = term frequency * inverse document frequency

Higher TF-IDF score means the term is more important. I will find the TF-IDF score of each term and keep the top 10.

In [1]:
import pandas as pd
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import json
import math

In [2]:
# read the database connection string from the local secrets file
with open('../secrets.json') as file:
    secrets = json.load(file)
connection_string = secrets['connection_string']

# create the engine and pull every row of the news_article table into a DataFrame
db = create_engine(connection_string)
df = pd.read_sql('select * from news_article', con=db)

In [3]:
# work with the text of a single article: the row at positional index 1
content = df['content'].iloc[1]
content

'LONDON (Reuters) - One in three COVID-19 survivors in a study of more than 230,000 mostly American patients were diagnosed with a brain or psychiatric disorder within six months, suggesting the pandemic could lead to a wave of mental and neurological problems, scientists said on Tuesday. FILE PHOTO: Nurses react as they treat a COVID-19 patient in the ICU (Intensive Care Unit) at Milton Keynes University Hospital, amid the spread of the coronavirus disease (COVID-19) pandemic, Milton Keynes, Britain, January 20, 2021. REUTERS/Toby Melville/File Photo Researchers who conducted the analysis said it was not clear how the virus was linked to psychiatric conditions such as anxiety and depression, but that these were the most common diagnoses among the 14 disorders they looked at. Post-COVID cases of stroke, dementia and other neurological disorders were rarer, the researchers said, but were still significant, especially in those who had severe COVID-19. “Although the individual risks for m

# Find the term frequency
(# of times term appears) / (total # of terms in article)

In [6]:
# Build the exclusion set once: the English stop words plus 'said', which
# shows up a lot in news copy. A set gives O(1) lookups and avoids re-reading
# the stop word list from the corpus for every token.
excluded_words = set(stopwords.words('english')) | {'said'}

# tokenize the content, then drop excluded words (compared in lowercase) and
# any token shorter than 3 characters, which also removes single-character
# punctuation
# NOTE(review): kept tokens retain their original case, so e.g. 'Researchers'
# and 'researchers' are treated as different terms downstream.
tokens = [
    t for t in word_tokenize(content)
    if len(t) >= 3 and t.lower() not in excluded_words
]

tokens

['LONDON',
 'Reuters',
 'One',
 'three',
 'COVID-19',
 'survivors',
 'study',
 '230,000',
 'mostly',
 'American',
 'patients',
 'diagnosed',
 'brain',
 'psychiatric',
 'disorder',
 'within',
 'six',
 'months',
 'suggesting',
 'pandemic',
 'could',
 'lead',
 'wave',
 'mental',
 'neurological',
 'problems',
 'scientists',
 'Tuesday',
 'FILE',
 'PHOTO',
 'Nurses',
 'react',
 'treat',
 'COVID-19',
 'patient',
 'ICU',
 'Intensive',
 'Care',
 'Unit',
 'Milton',
 'Keynes',
 'University',
 'Hospital',
 'amid',
 'spread',
 'coronavirus',
 'disease',
 'COVID-19',
 'pandemic',
 'Milton',
 'Keynes',
 'Britain',
 'January',
 '2021',
 'REUTERS/Toby',
 'Melville/File',
 'Photo',
 'Researchers',
 'conducted',
 'analysis',
 'clear',
 'virus',
 'linked',
 'psychiatric',
 'conditions',
 'anxiety',
 'depression',
 'common',
 'diagnoses',
 'among',
 'disorders',
 'looked',
 'Post-COVID',
 'cases',
 'stroke',
 'dementia',
 'neurological',
 'disorders',
 'rarer',
 'researchers',
 'still',
 'significant',
 'e

In [20]:
# count how many times each term appears in a single pass over the tokens
# (dicts preserve insertion order, so the keys end up in first-seen order)
term_counts = {}
for token in tokens:
    term_counts[token] = term_counts.get(token, 0) + 1

# the unique terms, in the order they first appear in the article
unique_terms = list(term_counts)

# find term frequencies by dividing the # of times each term appears
# by the total # of (filtered) terms
num_terms = len(tokens)
term_freqs = {term: count / num_terms for term, count in term_counts.items()}

term_freqs

{'LONDON': 0.0035587188612099642,
 'Reuters': 0.0035587188612099642,
 'One': 0.0035587188612099642,
 'three': 0.0071174377224199285,
 'COVID-19': 0.046263345195729534,
 'survivors': 0.010676156583629894,
 'study': 0.010676156583629894,
 '230,000': 0.0035587188612099642,
 'mostly': 0.0071174377224199285,
 'American': 0.0035587188612099642,
 'patients': 0.010676156583629894,
 'diagnosed': 0.014234875444839857,
 'brain': 0.010676156583629894,
 'psychiatric': 0.014234875444839857,
 'disorder': 0.0071174377224199285,
 'within': 0.014234875444839857,
 'six': 0.010676156583629894,
 'months': 0.014234875444839857,
 'suggesting': 0.0071174377224199285,
 'pandemic': 0.0071174377224199285,
 'could': 0.0035587188612099642,
 'lead': 0.0035587188612099642,
 'wave': 0.0035587188612099642,
 'mental': 0.014234875444839857,
 'neurological': 0.010676156583629894,
 'problems': 0.0035587188612099642,
 'scientists': 0.0071174377224199285,
 'said': 0.03202846975088968,
 'Tuesday': 0.0035587188612099642,
 'FI

# Find inverse document frequency
log(# of sentences / # of sentences with the term)

In [21]:
# split the content into sentences and word-tokenize each one in a single step;
# stop words are intentionally left in here
sentences = [word_tokenize(sentence_text) for sentence_text in sent_tokenize(content)]

# one token list per sentence, so the length is the sentence count
num_sentences = len(sentences)

sentences

[['LONDON',
  '(',
  'Reuters',
  ')',
  '-',
  'One',
  'in',
  'three',
  'COVID-19',
  'survivors',
  'in',
  'a',
  'study',
  'of',
  'more',
  'than',
  '230,000',
  'mostly',
  'American',
  'patients',
  'were',
  'diagnosed',
  'with',
  'a',
  'brain',
  'or',
  'psychiatric',
  'disorder',
  'within',
  'six',
  'months',
  ',',
  'suggesting',
  'the',
  'pandemic',
  'could',
  'lead',
  'to',
  'a',
  'wave',
  'of',
  'mental',
  'and',
  'neurological',
  'problems',
  ',',
  'scientists',
  'said',
  'on',
  'Tuesday',
  '.'],
 ['FILE',
  'PHOTO',
  ':',
  'Nurses',
  'react',
  'as',
  'they',
  'treat',
  'a',
  'COVID-19',
  'patient',
  'in',
  'the',
  'ICU',
  '(',
  'Intensive',
  'Care',
  'Unit',
  ')',
  'at',
  'Milton',
  'Keynes',
  'University',
  'Hospital',
  ',',
  'amid',
  'the',
  'spread',
  'of',
  'the',
  'coronavirus',
  'disease',
  '(',
  'COVID-19',
  ')',
  'pandemic',
  ',',
  'Milton',
  'Keynes',
  ',',
  'Britain',
  ',',
  'January',
 

In [25]:
# turn each tokenized sentence into a set once, so checking whether a term
# occurs in it is a constant-time lookup instead of a scan of the token list
sentence_vocabs = [set(sent) for sent in sentences]

# find number of sentences containing each term (exact, case-sensitive match)
sentence_freqs = {
    term: sum(term in vocab for vocab in sentence_vocabs)
    for term in unique_terms
}

# compute inverse document frequency for each term:
# log(# of sentences / # of sentences with the term)
# a term found in no sentence gets 0, which avoids a division by zero
idf = {
    term: math.log(num_sentences / freq) if freq else 0
    for term, freq in sentence_freqs.items()
}

idf

{'LONDON': 2.833213344056216,
 'Reuters': 2.833213344056216,
 'One': 2.833213344056216,
 'three': 2.1400661634962708,
 'COVID-19': 0.4353180712578455,
 'survivors': 1.7346010553881064,
 'study': 1.7346010553881064,
 '230,000': 2.833213344056216,
 'mostly': 2.1400661634962708,
 'American': 2.833213344056216,
 'patients': 1.7346010553881064,
 'diagnosed': 1.4469189829363254,
 'brain': 1.7346010553881064,
 'psychiatric': 1.4469189829363254,
 'disorder': 2.1400661634962708,
 'within': 1.4469189829363254,
 'six': 1.7346010553881064,
 'months': 1.4469189829363254,
 'suggesting': 2.1400661634962708,
 'pandemic': 2.1400661634962708,
 'could': 2.833213344056216,
 'lead': 2.833213344056216,
 'wave': 2.833213344056216,
 'mental': 1.4469189829363254,
 'neurological': 1.7346010553881064,
 'problems': 2.833213344056216,
 'scientists': 2.1400661634962708,
 'said': 0.6359887667199967,
 'Tuesday': 2.833213344056216,
 'FILE': 2.833213344056216,
 'PHOTO': 2.833213344056216,
 'Nurses': 2.833213344056216,


# Find TF-IDF
(term frequency) * (inverse document frequency)

In [26]:
# TF-IDF of a term is its term frequency multiplied by its inverse document frequency
tfidf = {term: term_freqs[term] * idf[term] for term in unique_terms}

tfidf

{'LONDON': 0.010082609765324612,
 'Reuters': 0.010082609765324612,
 'One': 0.010082609765324612,
 'three': 0.015231787640542852,
 'COVID-19': 0.020139270200540894,
 'survivors': 0.018518872477453093,
 'study': 0.018518872477453093,
 '230,000': 0.010082609765324612,
 'mostly': 0.015231787640542852,
 'American': 0.010082609765324612,
 'patients': 0.018518872477453093,
 'diagnosed': 0.020596711500872958,
 'brain': 0.018518872477453093,
 'psychiatric': 0.020596711500872958,
 'disorder': 0.015231787640542852,
 'within': 0.020596711500872958,
 'six': 0.018518872477453093,
 'months': 0.020596711500872958,
 'suggesting': 0.015231787640542852,
 'pandemic': 0.015231787640542852,
 'could': 0.010082609765324612,
 'lead': 0.010082609765324612,
 'wave': 0.010082609765324612,
 'mental': 0.020596711500872958,
 'neurological': 0.018518872477453093,
 'problems': 0.010082609765324612,
 'scientists': 0.015231787640542852,
 'said': 0.020369746976797047,
 'Tuesday': 0.010082609765324612,
 'FILE': 0.01008260

In [35]:
# pick the 10 terms with the highest TF-IDF score
# build (score, term) pairs so that plain tuple ordering sorts by score first
# (ties fall back to comparing the terms themselves)
swapped_key_and_vals = [(score, term) for term, score in tfidf.items()]

# sort from highest to lowest and keep the first ten pairs
top_ten = sorted(swapped_key_and_vals, reverse=True)[:10]

# shape the result for the response: one dict per term
response = [{'term': term, 'score': score} for score, term in top_ten]

response

[{'term': 'disorders', 'score': 0.022237449284587072},
 {'term': 'within', 'score': 0.020596711500872958},
 {'term': 'severe', 'score': 0.020596711500872958},
 {'term': 'psychiatric', 'score': 0.020596711500872958},
 {'term': 'months', 'score': 0.020596711500872958},
 {'term': 'mental', 'score': 0.020596711500872958},
 {'term': 'diagnosed', 'score': 0.020596711500872958},
 {'term': 'said', 'score': 0.020369746976797047},
 {'term': 'Milton', 'score': 0.020165219530649224},
 {'term': 'Keynes', 'score': 0.020165219530649224}]

In [39]:
# keep only the term from each (score, term) pair and join them with ';'
top_ten_terms = [term for _score, term in top_ten]
';'.join(top_ten_terms)

'disorders;within;severe;psychiatric;months;mental;diagnosed;said;Milton;Keynes'