# Headline similarity using cosine similarity formula

In [1]:
import nltk
import pandas as pd

In [4]:
# Load the articles CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../data/articles.csv')
df.head(n=5)

Unnamed: 0,post_id,post_title,url,score,publisher,headline,date_published,content
0,fra8wl,Top US general resists Trump administration?s ...,https://mondoweiss.net/2020/03/top-u-s-general...,10285,mondoweiss,Top U.S. general resists Trump administration?...,2020-03-28 15:44:00,A brave U.S. army lieutenant general may be ri...
1,frcvgj,Experts believe the explosion of coronavirus c...,https://www.si.com/soccer/2020/03/25/atalanta-...,2854,si,Atalanta vs Valencia linked to accelerating co...,2020-03-25 00:00:00,ROME (AP) ? It was the biggest soccer game in ...
2,fr5uqd,Boris Johnson's government is reportedly furio...,https://www.businessinsider.com/coronavirus-bo...,79397,businessinsider,Boris Johnson's government is reportedly furio...,2020-03-29 00:00:00,"UK government officials say there'll be ""recko..."
3,fr7uzc,Toyota Gearing Up To Build Ventilators And Fac...,https://www.carscoops.com/2020/03/toyota-geari...,4988,carscoops,Toyota Gearing Up To Build Ventilators And Fac...,2020-03-27 22:36:00,"The United States will soon have over 100,000 ..."
4,frbkqr,Prime Minister Justin Trudeau says Health Cana...,https://www.ctvnews.ca/health/coronavirus/trud...,2341,ctvnews,Trudeau vows 'no corners cut' in accepting mas...,2020-03-29 13:04:00,TORONTO -- Prime Minister Justin Trudeau says ...


### Collect all headlines and word-tokenize them
TODO: stem and/or lemmatize these words at some point too.

In [8]:
# Tokenize every headline and keep one token list per article, in row order.
# Iterating the 'headline' column directly avoids materialising a full row
# Series for each article (which df.iloc[i] does) just to read one field.
headline_words = []

for headline in df['headline']:
    tokens = nltk.word_tokenize(headline)
    # Only keep tokens longer than 3 characters; this discards punctuation
    # tokens and very short words.
    headline_words.append([token for token in tokens if len(token) > 3])

headline_words[:5]

[['U.S.',
  'general',
  'resists',
  'Trump',
  'administration',
  'efforts',
  'provoke',
  'with',
  'Iran',
  'Mondoweiss'],
 ['Atalanta', 'Valencia', 'linked', 'accelerating', 'coronavirus', 'spread'],
 ['Boris',
  'Johnson',
  'government',
  'reportedly',
  'furious',
  'with',
  'China',
  'believes',
  'could',
  'have',
  'times',
  'more',
  'coronavirus',
  'cases',
  'than',
  'claims'],
 ['Toyota',
  'Gearing',
  'Build',
  'Ventilators',
  'Face',
  'Shields',
  'Mercedes',
  'Offers',
  'Printers'],
 ['Trudeau',
  'vows',
  'corners',
  'accepting',
  'masks',
  'other',
  'supplies',
  'from',
  'China']]

### find all unique words

In [10]:
# Build the vocabulary: every distinct token across all headlines, in
# first-seen order. dict.fromkeys de-duplicates with a hash lookup per token
# while preserving insertion order, replacing the quadratic
# `word not in unique_words` list scan.
# The result stays a list because create_bow addresses words by position.
unique_words = list(
    dict.fromkeys(word for words in headline_words for word in words)
)

unique_words[:5]

['U.S.', 'general', 'resists', 'Trump', 'administration']

### create a bag of words
This turns each headline into a vector where each index represents a word in the dictionary of known words. The value at each index is the number of times that word shows up in the headline.

In [16]:
def create_bow(headline):
    """Return the bag-of-words count vector for a single headline.

    The vector has one slot per entry of the module-level ``unique_words``
    list; slot ``i`` holds how many times ``unique_words[i]`` occurs among
    the headline's tokens. Tokens that are not in ``unique_words`` are
    ignored.

    Args:
        headline: Headline text, tokenized with ``nltk.word_tokenize``.

    Returns:
        A list of ints with ``len(unique_words)`` elements.
    """
    # Map each word to its first position once per call, so every token is
    # resolved with a single hash lookup instead of two linear scans of the
    # vocabulary (`in` followed by `list.index`). setdefault keeps the first
    # position, matching what list.index would return.
    word_to_index = {}
    for position, word in enumerate(unique_words):
        word_to_index.setdefault(word, position)

    bow = [0] * len(unique_words)
    for token in nltk.word_tokenize(headline):
        position = word_to_index.get(token)
        if position is not None:
            bow[position] += 1

    return bow

In [19]:
# Sanity check: build the bag of words for the headline at row position 4
# and display both the headline text and its count vector.

headline = df['headline'].iloc[4]
print(headline)
bow = create_bow(headline)
bow

Trudeau vows 'no corners cut' in accepting masks, other supplies from China


[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


### find cosine similarity between two vectors