In [1]:
import json
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import requests
from string import punctuation
from collections import defaultdict
stopwords_list = requests.get("https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt").content
stop_words = set(stopwords_list.decode().splitlines()) 
stop_words = stop_words.union(set(punctuation))
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/kang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
df = pd.read_json('News_Category_Dataset_v3.json', lines=True)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   link               209527 non-null  object        
 1   headline           209527 non-null  object        
 2   category           209527 non-null  object        
 3   short_description  209527 non-null  object        
 4   authors            209527 non-null  object        
 5   date               209527 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.6+ MB


In [4]:
missing_values = df.isnull().sum()
print(missing_values)

link                 0
headline             0
category             0
short_description    0
authors              0
date                 0
dtype: int64


In [5]:
df.drop_duplicates(inplace=True)
df.shape

(209514, 6)

In [6]:
df['headline'] = df['headline'].str.lower()
df['headline_tokenized'] = df['headline'].apply(word_tokenize)

In [7]:
filtered_sentences = []

for sentence in df['headline_tokenized']:
    filtered_sentence = []
    for word in sentence:
        if word not in stop_words:
            word = word.strip(punctuation+' ')
            filtered_sentence.append({word:10})
    filtered_sentences.append(filtered_sentence)

# Assign the filtered sentences back to a new column
df['filtered_headline'] = filtered_sentences

print(df)

                                                     link  \
0       https://www.huffpost.com/entry/covid-boosters-...   
1       https://www.huffpost.com/entry/american-airlin...   
2       https://www.huffpost.com/entry/funniest-tweets...   
3       https://www.huffpost.com/entry/funniest-parent...   
4       https://www.huffpost.com/entry/amy-cooper-lose...   
...                                                   ...   
209522  https://www.huffingtonpost.com/entry/rim-ceo-t...   
209523  https://www.huffingtonpost.com/entry/maria-sha...   
209524  https://www.huffingtonpost.com/entry/super-bow...   
209525  https://www.huffingtonpost.com/entry/aldon-smi...   
209526  https://www.huffingtonpost.com/entry/dwight-ho...   

                                                 headline   category  \
0       over 4 million americans roll up sleeves for o...  U.S. NEWS   
1       american airlines flyer charged, banned for li...  U.S. NEWS   
2       23 of the funniest tweets about cats and do

In [8]:
df['short_description'] = df['short_description'].str.lower()
df['short_description_tokenized'] = df['short_description'].apply(word_tokenize)

In [9]:
filtered_sentences = []

for sentence in df['short_description_tokenized']:
    filtered_sentence = []
    for word in sentence:
        if word not in stop_words:
            word = word.strip(punctuation+' ')
            filtered_sentence.append({word:5})
    filtered_sentences.append(filtered_sentence)

# Assign the filtered sentences back to a new column
df['short_description_filtered'] = filtered_sentences

print(df)

                                                     link  \
0       https://www.huffpost.com/entry/covid-boosters-...   
1       https://www.huffpost.com/entry/american-airlin...   
2       https://www.huffpost.com/entry/funniest-tweets...   
3       https://www.huffpost.com/entry/funniest-parent...   
4       https://www.huffpost.com/entry/amy-cooper-lose...   
...                                                   ...   
209522  https://www.huffingtonpost.com/entry/rim-ceo-t...   
209523  https://www.huffingtonpost.com/entry/maria-sha...   
209524  https://www.huffingtonpost.com/entry/super-bow...   
209525  https://www.huffingtonpost.com/entry/aldon-smi...   
209526  https://www.huffingtonpost.com/entry/dwight-ho...   

                                                 headline   category  \
0       over 4 million americans roll up sleeves for o...  U.S. NEWS   
1       american airlines flyer charged, banned for li...  U.S. NEWS   
2       23 of the funniest tweets about cats and do

In [10]:
# def combine_dict_lists(list1, list2):
#     combined_dict = defaultdict(int)  # Use defaultdict to automatically sum values
    
#     # Add values from the first list
#     for d in list1:
#         for dictionary in d:
#             for key, value in dictionary:
#                 combined_dict[key] += value
    
#     # Add values from the second list
#     for d in list2:
#         for dictionary in d:
#             for key, value in dictionary:
#                 combined_dict[key] += value
    
#     # Convert the defaultdict back to a list of dictionaries
#     return [{key: value} for key, value in combined_dict.items()]

# # Apply the function to each row
# df['combined_column'] = df.apply(lambda row: combine_dict_lists(df['filtered_headline'], df['short_description_filtered']), axis=1)

# print(df)

In [11]:
# from collections import defaultdict
# import pandas as pd

# Sample DataFrame setup (replace with actual data as needed)
# df = pd.DataFrame({
#     'filtered_headline': [[{'rim': 10}, {'ceo': 10}, {'thorsten': 10}]],
#     'short_description_filtered': [[{'health': 5}, {'experts': 5}, {'ceo': 5}]]
# })

# Function to combine lists of dictionaries with matching keys
def combine_dict_lists(list1, list2):
    combined_dict = defaultdict(int)  # Use defaultdict to automatically sum values
    
    # Add values from the first list
    for d in list1:
        for key, value in d.items():  # Corrected to use .items()
            combined_dict[key] += value
    
    # Add values from the second list
    for d in list2:
        for key, value in d.items():  # Corrected to use .items()
            combined_dict[key] += value
    
    # Convert the defaultdict back to a list of dictionaries
    return [{key: value} for key, value in combined_dict.items()]

# Apply the function to each row
df['combined_column'] = df.apply(lambda row: combine_dict_lists(row['filtered_headline'], row['short_description_filtered']), axis=1)

print(df)

                                                     link  \
0       https://www.huffpost.com/entry/covid-boosters-...   
1       https://www.huffpost.com/entry/american-airlin...   
2       https://www.huffpost.com/entry/funniest-tweets...   
3       https://www.huffpost.com/entry/funniest-parent...   
4       https://www.huffpost.com/entry/amy-cooper-lose...   
...                                                   ...   
209522  https://www.huffingtonpost.com/entry/rim-ceo-t...   
209523  https://www.huffingtonpost.com/entry/maria-sha...   
209524  https://www.huffingtonpost.com/entry/super-bow...   
209525  https://www.huffingtonpost.com/entry/aldon-smi...   
209526  https://www.huffingtonpost.com/entry/dwight-ho...   

                                                 headline   category  \
0       over 4 million americans roll up sleeves for o...  U.S. NEWS   
1       american airlines flyer charged, banned for li...  U.S. NEWS   
2       23 of the funniest tweets about cats and do

In [13]:
df.to_csv('output.csv', index=False)