# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset (JSON Lines format: one record per line)
data = pd.read_json("https://raw.githubusercontent.com/Kuna1Chauhan/EDA/main/News_Category_Dataset_v3.json", lines=True)

# Preprocess the data: merge headline and description into one text field.
# Missing values are replaced with '' first, because str + NaN yields NaN
# and TfidfVectorizer refuses NaN documents.
data['text'] = data['headline'].fillna('') + ' ' + data['short_description'].fillna('')

# Vectorize the text data using TF-IDF (the result is a sparse matrix with
# one row per record in `data`)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['text'])

# NOTE: the full pairwise matrix `cosine_similarity(tfidf_matrix)` is
# intentionally NOT computed here. By default it is returned as a dense
# N x N array, so its memory grows quadratically with the number of rows,
# and nothing in this script reads it. Queries are instead scored on demand
# against `tfidf_matrix`, which only needs a 1 x N result per query.

# Function to find the most similar data to a given data
def find_most_similar_data(input_data: str, top_k: int = 5) -> pd.DataFrame:
    """Return the rows of ``data`` whose text is most similar to *input_data*.

    The query is projected into the TF-IDF space of the module-level
    ``vectorizer`` and compared with every row of ``tfidf_matrix`` using
    cosine similarity.

    Args:
        input_data: Free-text query to compare against the dataset.
        top_k: Maximum number of rows to return. If it exceeds the number
            of rows available, every row is returned.

    Returns:
        A slice of ``data`` ordered from most to least similar. It is empty
        when ``top_k`` is 0.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")
    if top_k == 0:
        # Guard explicitly: a `[-0:]` slice is `[0:]` and would select
        # every row rather than none.
        return data.iloc[:0]

    # Vectorize the input data
    input_vector = vectorizer.transform([input_data])

    # Similarity between the query and every data point; the result has a
    # single row (one query), so take it as a flat score vector.
    similarity_scores = cosine_similarity(input_vector, tfidf_matrix)[0]

    # argsort is ascending, so reverse it and keep the first top_k indices
    top_indices = similarity_scores.argsort()[::-1][:top_k]

    # Return the most similar data points
    return data.iloc[top_indices]

# Example usage: look up the records closest to a sample headline and show
# only the human-readable columns.
input_data = "New study finds link between coffee consumption and improved brain function"
most_similar_data = find_most_similar_data(input_data)
columns_to_show = ['headline', 'short_description']
print(most_similar_data[columns_to_show])
