In [22]:
import nltk
import json
import numpy as np
import heapq

In [3]:
news = []
with open('news.json', 'r') as file:
    for line in file:
        news.append(json.loads(line))

In [4]:
print(f"No of news headlines: {len(news)}")

No of news headlines: 209527


In [40]:
headlines = []
for new in news:
    temp = new['headline'] + ". " + new['short_description']
    headlines.append(temp)
print(headlines[0])

Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters. Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.


### Preprocessing

In [12]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kabirsprakash/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [46]:
def get_lemm_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
def preprocess_sentence(sentence):
    words = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words)
    lemmatised_words = []
    for word, tag in pos_tags:
        lemm_tag = get_lemm_pos(tag)
        lemmatised_words.append(lemmatiser.lemmatize(word.lower(), lemm_tag))
    return lemmatised_words

lemmatiser = WordNetLemmatizer()
pre_headlines = []
for headline in headlines:
    Words = preprocess_sentence(headline)
    pre_headlines.append(Words)

In [48]:
print(headlines[1])
print(pre_headlines[1])

American Airlines Flyer Charged, Banned For Life After Punching Flight Attendant On Video. He was subdued by passengers and crew when he fled to the back of the aircraft after the confrontation, according to the U.S. attorney's office in Los Angeles.
['american', 'airline', 'flyer', 'charged', ',', 'banned', 'for', 'life', 'after', 'punch', 'flight', 'attendant', 'on', 'video', '.', 'he', 'be', 'subdue', 'by', 'passenger', 'and', 'crew', 'when', 'he', 'flee', 'to', 'the', 'back', 'of', 'the', 'aircraft', 'after', 'the', 'confrontation', ',', 'accord', 'to', 'the', 'u.s.', 'attorney', "'s", 'office', 'in', 'los', 'angeles', '.']


### Embedding headlines into vectors

In [49]:
from gensim.models import Word2Vec

model = Word2Vec(pre_headlines,vector_size=100,min_count=1,window=5,workers=4)

In [20]:
print(model.wv["american"])

[-0.33915922 -0.13276395  1.0542692  -0.34620583  0.5063199  -1.5144683
  0.8800564  -1.0230283  -1.2816181   0.6434435  -0.5533166   0.1745081
 -0.7408204   2.4152102   0.42065004 -1.5542356   0.06946942 -1.2881159
 -1.008451    0.9541815   2.017711    0.44355062 -0.7874748   0.4828279
  0.626906   -0.16335349 -0.82482284  1.446604   -1.6163267  -2.7904549
  1.2714406  -0.19972034 -0.74721265 -0.47680017  0.3045603  -0.29945716
  1.9881146  -1.8192551  -0.38309824 -0.98787725 -0.46971914 -0.13682006
 -1.1677041   0.8814744   0.3259528  -0.28379503  0.5834679  -0.6184034
 -0.3693009   0.38733783 -0.49844733  0.3887702   0.19816263 -0.26782888
 -2.3215046   1.6271516   0.72744    -1.220142   -2.848284   -0.2339624
 -0.16797523 -1.5536301  -0.21997717 -0.42705718 -1.2278932   0.02035571
 -1.1293062   1.7771329   0.11400793 -0.36091107 -1.0919218   1.3435435
 -0.9473884  -1.6329008  -0.41245696  0.6570451   0.36958787 -0.9710406
  1.5779631  -0.21171144 -0.910774    1.7714694   0.3899484 

Finding closest headlines match from dataset based on cosine similarity of vectors

In [58]:
from numpy.linalg import norm

def cosine_similarity(lst1,lst2):
    return np.dot(lst1,lst2) / (norm(lst1)*norm(lst2))

def avgWord2Vec(sen):
    vectors = []
    for word in sen:
        if word in model.wv:
            vectors.append(model.wv[word])
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors,axis=0)

def closest_sentences(sen):
    p = preprocess_sentence(sen)
    v_in = avgWord2Vec(p)
    pq = []
    heapq.heapify(pq)
    for i in range(len(pre_headlines)):
        v = avgWord2Vec(pre_headlines[i])
        dist = cosine_similarity(v_in,v)
        heapq.heappush(pq,(dist,i))
        if len(pq) > 5:
            heapq.heappop(pq)
    res = []
    for _,idx in pq:
        res.append((news[idx]['headline'],news[idx]['short_description']))
    return res

In [57]:
# checking accuracy for one of sentence from dataset
print(f"Closest match of: {headlines[0]}")
print(closest_sentences(" ".join(pre_headlines[0])))

Closest match of: Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters. Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.
["IMF May Be Right in Suggesting Countries Raise Fuel Taxes. It's tough to find any drivers who relish digging into their wallets to fill up at the pump. According to the International Monetary Fund, though, not only should fuel taxes jump by more than 50 percent, the increase should have Canadians whistling a happy tune. Now, here's the real kicker: The IMF is right.", "Computer Science in K-12 Classrooms Needs to Catch Up. It's estimated that in the next decade the number of computer science jobs in the U.S. will outnumber qualified people by 1 million. That's 1 million jobs for the taking that Americans will miss out on because of inadequate skill sets.", "Sunday Roundup. With Friday's numbers showing the addition of 217,000 jobs, the U

In [61]:
input = "Atrocities, alleged war crimes and civilians caught in the crossfire of Africa's largest war"
print(f"Input sentence: {input}")
print("Top 5 closest matches from dataset :-")
matches = closest_sentences(input)
num = 1
for sen in matches:
    print(f"{num}. Headline - {sen[0]}")
    print(f"---> Description - {sen[1]}")
    num += 1

Input sentence: Atrocities, alleged war crimes and civilians caught in the crossfire of Africa's largest war
Top 5 closest matches from dataset :-
1. Headline - Syrian Army In 'Final Stages' Of Aleppo Offensive
---> Description - It was the country's largest city before the war.
2. Headline - Iraqi Military Says It's Taken Most Of ISIS Stronghold In Country's Northwest
---> Description - Retaking Tal Afar was the latest objective in the U.S.-backed war on ISIS following the recapture in July of Mosul.
3. Headline - Al Qaeda Militants Seize Part Of Yemen's Main Port City Of Aden
---> Description - The war in Yemen has led to dangerous instability.
4. Headline - Bangladesh Executes 2 Opposition Leaders For War Crimes
---> Description - The pair were convicted of war crimes committed during the country's 1971 war of independence.
5. Headline - Both Sides In The Battle Of Aleppo Committed War Crimes: UN
---> Description - The report also revealed Syrian government jets deliberately bombed 