In [1]:
import pandas as pd
data = pd.read_json('News_Category_Dataset_V3.json', lines=True)
df = data.copy()

In [2]:
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [3]:
df.shape

(209527, 6)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   link               209527 non-null  object        
 1   headline           209527 non-null  object        
 2   category           209527 non-null  object        
 3   short_description  209527 non-null  object        
 4   authors            209527 non-null  object        
 5   date               209527 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.6+ MB


In [5]:
#check for null values
df.isnull().sum()

link                 0
headline             0
category             0
short_description    0
authors              0
date                 0
dtype: int64

In [6]:
#there are no null values

In [7]:
#check for duplicates
print(f'there are around {df.duplicated().sum()} duplicate rows' )
df[df.duplicated()]

there are around 13 duplicate rows


Unnamed: 0,link,headline,category,short_description,authors,date
67677,https://www.huffingtonpost.comhttp://www.mothe...,"On Facebook, Trump's Longtime Butler Calls For...",POLITICS,"Anthony Senecal, who worked as Donald Trump's ...",,2016-05-12
67923,https://www.huffingtonpost.comhttp://gizmodo.c...,Former Facebook Workers: We Routinely Suppress...,TECH,Facebook workers routinely suppressed news sto...,,2016-05-09
70239,https://www.huffingtonpost.comhttp://www.cnbc....,"On Equal Pay Day, The Gap Is Still Too Wide",WOMEN,Equal Pay Day falls on April 12 in 2016. It's ...,,2016-04-12
139830,https://www.huffingtonpost.comhttp://www.cnn.c...,The World's Most Dangerous Workout?,WELLNESS,"Is the ""sport of fitness"" the world's most dan...",,2014-02-10
144409,https://www.huffingtonpost.comhttp://www.upwor...,Some People Call It 'The Best Anti-Smoking Ad ...,WELLNESS,Almost all smokers know cigarettes are bad for...,,2013-12-22
145142,https://www.huffingtonpost.comhttp://www.weath...,10 Cities That Could Run Out Of Water - Weathe...,ENVIRONMENT,"Securing access to plentiful, renewable source...",,2013-12-15
178155,https://www.huffingtonpost.comhttp://www.busin...,Google Is Attacking Apple From The Inside Out ...,TECH,After years of hammering away at Apple's share...,,2013-01-01
187329,https://www.huffingtonpost.comhttp://www.nytim...,"Eating For Health, Not Weight",WELLNESS,Almost half of Americans are on a diet -- not ...,,2012-09-23
194596,https://www.huffingtonpost.comhttp://blogs.wsj...,Apple Removes Green EPEAT Electronics Certific...,TECH,Apple has pulled its products off the U.S. gov...,,2012-07-07
194598,https://www.huffingtonpost.comhttp://www.theda...,Microsoft's $6.2 Billion Writedown Shows It's ...,TECH,Fighting for online advertising dominance with...,,2012-07-07


In [8]:
#lets drop the duplicates
df = df.drop_duplicates()

In [9]:
#Link, date and authours are unique therefore drop column
# df = df.drop(labels=['link','headline' ,'category','authors', 'date'], axis=1)

df = df[['short_description']]
df.head()

Unnamed: 0,short_description
0,Health experts said it is too early to predict...
1,He was subdued by passengers and crew when he ...
2,"""Until you have a dog you don't understand wha..."
3,"""Accidentally put grown-up toothpaste on my to..."
4,Amy Cooper accused investment firm Franklin Te...


In [10]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# nltk.download('omw-1.4')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def clean_strings(string:str) -> str:
    string = string.strip()
    string = string.lower()

    pattern = re.compile('[\W_]+')  
    string = re.sub(pattern, ' ', string)

    string_list  = string.split(' ')

    string = [lemmatizer.lemmatize(i) for i in string_list if i not in stop_words]
    string = ' '.join(string)
    return string


df['modified_description'] = df['short_description'].apply(lambda x:clean_strings(x))

df.head()

Unnamed: 0,short_description,modified_description
0,Health experts said it is too early to predict...,health expert said early predict whether deman...
1,He was subdued by passengers and crew when he ...,subdued passenger crew fled back aircraft conf...
2,"""Until you have a dog you don't understand wha...",dog understand could eaten
3,"""Accidentally put grown-up toothpaste on my to...",accidentally put grown toothpaste toddler too...
4,Amy Cooper accused investment firm Franklin Te...,amy cooper accused investment firm franklin te...


In [11]:
from nltk.translate.bleu_score import sentence_bleu

In [24]:
max_bleu_score = 0
closest_row = ""

input_string = 'experts said it is early to predict whether demand would match up with the 171 million doses of the new boosters the U.S.'


cleaned_str = clean_strings(input_string)

for sentence in df['modified_description']:
    
    bleu_score = nltk.translate.bleu_score.sentence_bleu([sentence], cleaned_str)
    
    if bleu_score > max_bleu_score:
        max_bleu_score = bleu_score
        closest_row = sentence
print("input string", input_string)
print("Closest Row:", closest_row)
print('bleu score', max_bleu_score)

input string experts said it is early to predict whether demand would match up with the 171 million doses of the new boosters the U.S.
Closest Row: health expert said early predict whether demand would match 171 million dos new booster u ordered fall 
bleu score 0.7858700308158821


In [None]:
# df.to_csv('modified_description.csv')