In [2]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import nltk

In [3]:
# Make sure the NLTK stop-word corpus is present locally before reading it.
nltk.download('stopwords')

# Hold the English stop words in a set so the per-token membership test in
# the preprocessing step is a cheap lookup.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/koushik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load and preprocess headlines
# lines=True: the file is read as one JSON record per line.
df = pd.read_json("News_Category_Dataset_v3.json", lines=True)

# Coerce every headline to str before turning the column into a plain list.
headline_column = df["headline"]
headlines = headline_column.astype(str).tolist()

def preprocess(text):
    """Tokenize ``text`` and remove English stop words.

    Tokenization is delegated to gensim's ``simple_preprocess``; any token
    found in the module-level ``stop_words`` set is then discarded.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    tokens = simple_preprocess(text)
    return [token for token in tokens if token not in stop_words]

# Tokenize every headline; position i corresponds to headlines[i].
tokenized_headlines = list(map(preprocess, headlines))

In [8]:
# Display the first raw headline as a quick check of the loaded data.
headlines[0]

'Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters'

In [7]:
# Display the tokens produced for the first headline to inspect what
# preprocessing kept.
tokenized_headlines[0]

['million',
 'americans',
 'roll',
 'sleeves',
 'omicron',
 'targeted',
 'covid',
 'boosters']

In [9]:
# Train Word2Vec embeddings
# Reproducibility: gensim's documentation states that a deterministic run
# needs an explicit seed AND a single worker thread (multi-threaded training
# introduces ordering jitter from OS thread scheduling). Reproducibility across
# interpreter launches additionally requires a fixed PYTHONHASHSEED, which has
# to be set before the kernel starts.
W2V_SEED = 42
w2v_model = Word2Vec(
    sentences=tokenized_headlines,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=2,      # ignore words that occur fewer than 2 times
    workers=1,        # single thread so repeated runs give the same vectors
    seed=W2V_SEED,
)

In [10]:
# Sentence Embedding (average of word vectors)
def get_sentence_vector(tokens):
    """Embed a token list as the mean of its in-vocabulary word vectors.

    Tokens that are missing from ``w2v_model.wv`` are skipped.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        The element-wise mean of the vectors that were found, or an all-zero
        vector of length ``w2v_model.vector_size`` when none were found.
    """
    keyed_vectors = w2v_model.wv
    known_vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]

    # Nothing to average: fall back to a zero vector of the model's width.
    if not known_vectors:
        return np.zeros(w2v_model.vector_size)

    return np.mean(known_vectors, axis=0)

# Embed every tokenized headline; entry i corresponds to headlines[i].
headline_vectors = np.array(list(map(get_sentence_vector, tokenized_headlines)))

In [None]:
# Similarity function
def find_top5_similar_headlines(query, top_n=5):
    """Print the headlines whose sentence vectors are closest to ``query``.

    The query is tokenized with ``preprocess``, embedded with
    ``get_sentence_vector`` and compared against every row of
    ``headline_vectors`` using cosine similarity. Matches are printed from
    most to least similar; nothing is returned.

    Args:
        query: Free-text query string.
        top_n: Number of headlines to print. Defaults to 5, which reproduces
            the original fixed-size output.

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")

    query_tokens = preprocess(query)
    query_vector = get_sentence_vector(query_tokens).reshape(1, -1)

    # An all-zero query vector means no query token was found in the
    # vocabulary. Its cosine similarity to every headline is 0, so any
    # "top" list would be an arbitrary pick; report that instead.
    if not np.any(query_vector):
        print(f"\nNo words from the query are in the model vocabulary: {query!r}\n")
        return

    sim = cosine_similarity(query_vector, headline_vectors).flatten()

    # argsort is ascending, so reverse it and keep the first top_n indices.
    top_indices = sim.argsort()[::-1][:top_n]

    print(f"\nTop {top_n} Similar Headlines:\n")
    for i, idx in enumerate(top_indices, 1):
        print(f"{i}. {headlines[idx]}")


In [13]:
# Free-text query that is not taken from the dataset.
query = "NASA announces new mission to Mars"
find_top5_similar_headlines(query=query)



Top 5 Similar Headlines:

1. New Orleans Launches Its Loyola Streetcar, With Another Leg Awaited
2. 'Utter Devastation' After Major Quake, Aftershocks Hit New Zealand
3. Pharrell's 'G I R L' Album Reaches New Heights In The U.K.
4. Eagles Land, Then Soar in New Documentary
5. Elle Starts Filling The Holes In Its Masthead, Announces New Hires


In [15]:
# Query with a headline taken from the dataset itself and echo it before
# listing its nearest neighbours.
query = headlines[0]
print(query)
find_top5_similar_headlines(query=query)

Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters

Top 5 Similar Headlines:

1. Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters
2. CDC Estimates More Than 140 Million Americans Have Had COVID So Far
3. Shrinking Majority Of Americans Supports Marijuana Legalization
4. Wealth of Forbes 400 Billionaires Equals Wealth of All 41 Million African-Americans
5. Medicaid Matters: The GOP Attack On 74 Million Americans
