# ML_Question_3

#### Importing libraries

In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.metrics.distance import jaccard_distance
import Levenshtein

#### Reading dataset

In [83]:
df = pd.read_json("News_Category_Dataset_v3.json", lines = True)

#### Top Rows of dataset

In [84]:
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


#### Finding number of rows and column in dataset

In [85]:
df.shape

(209527, 6)

#### Name of All Features

In [86]:
df.columns

Index(['link', 'headline', 'category', 'short_description', 'authors', 'date'], dtype='object')

#### Dropping irrelavant column

In [87]:
df = df.drop(['link','date'], axis=1)


#### Creating new column for news description

In [89]:
df['News_description'] = df['headline'] + ' : '+ df['short_description']

## Model Evaluation

In [90]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.metrics.distance import jaccard_distance
import Levenshtein

# Assuming you have a DataFrame called 'df' with your dataset

# Create an instance of TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the text data into a numerical representation
tfidf_matrix = vectorizer.fit_transform(df['News_description'])

# Define the given data point
given_data = "news on health and wellness"

# Transform the given data point using the vectorizer
given_data_tfidf = vectorizer.transform([given_data])

# Calculate cosine similarities between the given data point and all other data points
cosine_similarities = cosine_similarity(given_data_tfidf, tfidf_matrix).flatten()

# Calculate Jaccard similarities between the given data point and all other data points
jaccard_similarities = df['News_description'].apply(lambda x: 1 - jaccard_distance(set(given_data.split()), set(x.split()))).values

# Calculate Levenshtein distances between the given data point and all other data points
levenshtein_distances = df['News_description'].apply(lambda x: Levenshtein.distance(given_data, x)).values

# Combine the similarity metrics into a DataFrame
similarity_df = pd.DataFrame({
    'cosine_similarity': cosine_similarities,
    'jaccard_similarity': jaccard_similarities,
    'levenshtein_distance': levenshtein_distances,
    'original_index': df.index
})

# Sort the DataFrame by similarity scores in descending order
similarity_df.sort_values(by=['cosine_similarity', 'jaccard_similarity', 'levenshtein_distance'], ascending=False, inplace=True)

# Get the most similar data point
most_similar_index = similarity_df.iloc[1]['original_index']
most_similar_data_point = df.loc[most_similar_index]

# Print the most similar data point
print("Given Data Point:", given_data)
print("Most Similar Data Point:")
print(most_similar_data_point)


Given Data Point: news on health and wellness
Most Similar Data Point:
headline                                    What the Hell Is Wellness?
category                                                HEALTHY LIVING
short_description                                                     
authors              Susanna Barkataki, Contributor Susanna shares ...
News_description                         What the Hell Is Wellness? : 
Name: 105412, dtype: object
