In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
dataset_path = r"D:\Data_and_AI\Datasets\News Category Dataset\News_Category_Dataset_v3.json"

#the file is in JSON Line format, so we set lines=True
df = pd.read_json(dataset_path, lines=True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [3]:
df.shape

(209527, 6)

In [4]:
print(df.category.value_counts())

category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATI

In [5]:
#Dropping full-row duplicates
df=df.drop_duplicates()
df.shape

(209514, 6)

In [6]:
for column in df.columns:
    print(f"No. of duplicates in {column} column: {df[column].duplicated().sum()}")

No. of duplicates in link column: 28
No. of duplicates in headline column: 1518
No. of duplicates in category column: 209472
No. of duplicates in short_description column: 22492
No. of duplicates in authors column: 180345
No. of duplicates in date column: 205624


__Duplicates in category, authors, and date columns are fine. Other columns must be addressed__

In [7]:
df[df.link.duplicated(keep=False)]

Unnamed: 0,link,headline,category,short_description,authors,date
63573,https://www.huffingtonpost.comhttp://www.polit...,Hardly anybody wants to speak at Trump's conve...,POLITICS,POLITICO reached out to more than 50 prominent...,,2016-06-27
63586,https://www.huffingtonpost.comhttp://www.polit...,Hardly anybody wants to speak at Trump's conve...,POLITICS,A slot at the GOP convention used to be a care...,,2016-06-27
65188,https://www.huffingtonpost.comhttp://www.polit...,Women For Trump – His Female Fans Speak Out,POLITICS,"In March, days after Mitt Romney excoriated Do...",,2016-06-09
65202,https://www.huffingtonpost.comhttp://www.polit...,Women Supporting Trump Speak Out,POLITICS,"In March, days after Mitt Romney excoriated Do...",,2016-06-09
65759,https://www.huffingtonpost.comhttp://elections...,POLLS: Tight Race For Clinton And Sanders In C...,POLITICS,This chart combines the latest opinion polls o...,,2016-06-03
65801,https://www.huffingtonpost.comhttp://elections...,POLL: California Narrows,POLITICS,Polls and chart for 2016 California Democratic...,,2016-06-02
65907,https://www.huffingtonpost.comhttp://www.theat...,"The Fierceness Of ‘Femme, Fat And Asian’",WOMEN,Few RuPaul’s Drag Race fans could have been su...,,2016-06-01
65989,https://www.huffingtonpost.comhttp://nymag.com...,Hillary Clinton vs. Herself,WOMEN,In a locker room at the University of Bridgepo...,,2016-05-31
65991,https://www.huffingtonpost.comhttp://nymag.com...,Hillary Clinton vs. Herself,POLITICS,In a locker room at the University of Bridgepo...,,2016-05-31
66841,https://www.huffingtonpost.comhttp://www.theat...,"The Fierceness of ‘Femme, Fat, and Asian'",QUEER VOICES,Few RuPaul’s Drag Race fans could have been su...,,2016-05-21


__Link column duplicates have different short descriptions. We should keep the entries that have the longest one.__

In [8]:
indices = df.groupby("link")["short_description"].apply(lambda x: x.str.len().idxmax())
df = df.loc[indices].reset_index(drop=True)
df.shape

(209486, 6)

Dropped "link" column duplicates (28)

In [9]:
for column in df.columns:
    print(f"No. of duplicates in {column} column: {df[column].duplicated().sum()}")

No. of duplicates in link column: 0
No. of duplicates in headline column: 1509
No. of duplicates in category column: 209444
No. of duplicates in short_description column: 22481
No. of duplicates in authors column: 180317
No. of duplicates in date column: 205596


In [10]:
df.duplicated(subset=['headline', 'short_description']).sum()

472

In [11]:
df.drop_duplicates(subset=['headline', 'short_description'], inplace=True)

In [12]:
for column in df.columns:
    print(f"No. of duplicates in {column} column: {df[column].duplicated().sum()}")

No. of duplicates in link column: 0
No. of duplicates in headline column: 1037
No. of duplicates in category column: 208972
No. of duplicates in short_description column: 22009
No. of duplicates in authors column: 179849
No. of duplicates in date column: 205124


In [13]:
for column_name in df.columns:
    # Count NaN values
    nan_count = df[column_name].isna().sum()

    # Count empty strings (after converting to string and stripping whitespace)
    empty_string_count = (df[column_name].astype(str).str.strip() == '').sum()

    # Total empty entries (NaN + empty strings)
    total_empty = nan_count + empty_string_count

    # Print results
    print(f"Column: {column_name}")
    print(f"NaN count: {nan_count}")
    print(f"Empty string count: {empty_string_count}")
    print(f"Total empty entries: {total_empty}\n\n")

Column: link
NaN count: 0
Empty string count: 0
Total empty entries: 0


Column: headline
NaN count: 0
Empty string count: 2
Total empty entries: 2


Column: category
NaN count: 0
Empty string count: 0
Total empty entries: 0


Column: short_description
NaN count: 0
Empty string count: 19611
Total empty entries: 19611


Column: authors
NaN count: 0
Empty string count: 37288
Total empty entries: 37288


Column: date
NaN count: 0
Empty string count: 0
Total empty entries: 0




Drop Empty entries in the headline column (2 rows)

In [14]:
column_name = 'headline'
# Create a mask for non-empty entries
# Keep rows where value is not NaN and not an empty string (after stripping)
mask = ~(df[column_name].isna() | (df[column_name].astype(str).str.strip() == ''))

# Count entries to be dropped
entries_to_drop = len(df) - mask.sum()

# Drop rows where column has empty entries
df = df[mask].copy()

__Now we have duplicate headlines with different short descriptions, and duplicate short descriptions with different headlines, which is not a problem for what we're doing, as we've already made sure we have unique (headline, short description) pairs.__

## Text Preprocessing

In [15]:
df['train_text'] = df['headline'].astype(str) + " " + df["short_description"].astype(str)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date,train_text
0,https://www.huffingtonpost.com/entry/%C3%A9dga...,Édgar Ramírez Stands In Solidarity With Venezu...,LATINO VOICES,“The recall vote (revocatory referendum) is cl...,Tanisha Love Ramirez,2016-09-01,Édgar Ramírez Stands In Solidarity With Venezu...
1,https://www.huffingtonpost.com/entry/-20-reaso...,20 Reasons to Drop Everything and Go to Spain,TRAVEL,See more photos of natural parks in Spain 20.)...,"minube, Contributor\nCommunity of over 2 milli...",2014-01-15,20 Reasons to Drop Everything and Go to Spain ...
2,https://www.huffingtonpost.com/entry/-2014-bet...,WATCH: 2014 BET Awards Performances,BLACK VOICES,,Brennan Williams,2014-06-30,WATCH: 2014 BET Awards Performances
3,https://www.huffingtonpost.com/entry/-4-not-ou...,#4 Not Our Finest Hour: Why Is Liberal America...,POLITICS,"In countless ways, the political force that ha...","Andy Schmookler, ContributorAward-winning auth...",2014-09-30,#4 Not Our Finest Hour: Why Is Liberal America...
4,https://www.huffingtonpost.com/entry/-5-looks-...,Flat Out Adorable: 5 Looks to Mimic This Season,STYLE,Whether you're out exploring a new city or hea...,"Alisa Gould-Simon, ContributorCo-Founder, Pose",2014-06-19,Flat Out Adorable: 5 Looks to Mimic This Seaso...


In [16]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [17]:
stop_words = set(stopwords.words('english'))
lem = WordNetLemmatizer()

def preprocess_and_tokenize(text):
    """
    Cleans text, tokenizes, removes stopwords and short tokens, and lemmatizes.
    Returns a list of tokens.
    """
    # Convert to string type to handle potential non-string entries
    text = str(text)
    # Remove non-alphabetic characters and convert to lowercase
    text = re.sub(r'[^a-z\s]', ' ', text.lower())

    # Tokenize the cleaned text
    tokens = word_tokenize(text)

    # Remove stopwords and very short tokens (length less than 3)
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]

    # Lemmatize the tokens to reduce words to their base form
    tokens = [lem.lemmatize(t) for t in tokens]

    return tokens # Returns a list of tokens, not a single string

In [18]:
df['train_text_tokens'] = df['train_text'].apply(preprocess_and_tokenize)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date,train_text,train_text_tokens
0,https://www.huffingtonpost.com/entry/%C3%A9dga...,Édgar Ramírez Stands In Solidarity With Venezu...,LATINO VOICES,“The recall vote (revocatory referendum) is cl...,Tanisha Love Ramirez,2016-09-01,Édgar Ramírez Stands In Solidarity With Venezu...,"[dgar, ram, rez, stand, solidarity, venezuelan..."
1,https://www.huffingtonpost.com/entry/-20-reaso...,20 Reasons to Drop Everything and Go to Spain,TRAVEL,See more photos of natural parks in Spain 20.)...,"minube, Contributor\nCommunity of over 2 milli...",2014-01-15,20 Reasons to Drop Everything and Go to Spain ...,"[reason, drop, everything, spain, see, photo, ..."
2,https://www.huffingtonpost.com/entry/-2014-bet...,WATCH: 2014 BET Awards Performances,BLACK VOICES,,Brennan Williams,2014-06-30,WATCH: 2014 BET Awards Performances,"[watch, bet, award, performance]"
3,https://www.huffingtonpost.com/entry/-4-not-ou...,#4 Not Our Finest Hour: Why Is Liberal America...,POLITICS,"In countless ways, the political force that ha...","Andy Schmookler, ContributorAward-winning auth...",2014-09-30,#4 Not Our Finest Hour: Why Is Liberal America...,"[finest, hour, liberal, america, falling, far,..."
4,https://www.huffingtonpost.com/entry/-5-looks-...,Flat Out Adorable: 5 Looks to Mimic This Season,STYLE,Whether you're out exploring a new city or hea...,"Alisa Gould-Simon, ContributorCo-Founder, Pose",2014-06-19,Flat Out Adorable: 5 Looks to Mimic This Seaso...,"[flat, adorable, look, mimic, season, whether,..."


## Topic Modeling

### LDA: Latent Dirichlet Allocation

In [19]:
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.phrases import Phrases, Phraser 

In [20]:
# Step 1: Build the Bigram model using the full dataset's tokenized text.
# The Phrases model learns frequent contiguous sequences of words across all documents.
# min_count: Ignore all phrases with total frequency lower than this.
# threshold: A measure to form phrases; higher value means fewer phrases are created.
bigram = Phrases(df['train_text_tokens'], min_count=5, threshold=100)
# Phraser creates an optimized model for faster transformation.
bigram_mod = Phraser(bigram)


# Step 2: Build the Trigram model on top of the Bigram model, using the full dataset.
# This allows for the detection of three-word phrases, like "new_york_city", from the entire corpus.
trigram = Phrases(bigram_mod[df['train_text_tokens']], min_count=5, threshold=100)
trigram_mod = Phraser(trigram)


# Step 3: Apply Bigram and Trigram transformation to all documents in the DataFrame.
# This iterates through each document's list of tokens and replaces individual words
# with detected multi-word phrases (e.g., 'new', 'york' -> 'new_york').
processed_df = [trigram_mod[bigram_mod[doc]] for doc in df['train_text_tokens']]

In [21]:
# Step 4: Create a Gensim Dictionary from all processed tokens with n-grams.
# This dictionary will map all unique n-grams (words, bigrams, trigrams) from your
# entire dataset to integer IDs. It is a crucial input for the LDA model.
dictionary = corpora.Dictionary(processed_df)

# Step 5: Filter out very rare or very common tokens based on statistics from the full dataset.
# This step helps to improve topic quality by removing noise (very rare words)
# and overly generic terms (very common words that don't differentiate topics).
# no_below: Minimum number of documents a token must appear in to be kept.
# no_above: Maximum proportion of documents a token can appear in to be kept (e.g., 0.5 means 50%).
# keep_n: Maximum number of tokens to keep in the dictionary (top N most frequent).
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

In [22]:
# Step 6: Create a Gensim Corpus (Bag-of-Words representation) from all processed tokens.
# Each document in the entire dataset is converted into a list of (word_id, word_count) tuples,
# representing its frequency of occurrence for each unique token in the dictionary.
corpus = [dictionary.doc2bow(doc) for doc in processed_df]

print(f"Number of unique tokens in dictionary (after n-grams and filtering): {len(dictionary)}")
print(f"Number of documents in corpus: {len(corpus)}")
print(f"Example of first document in corpus (word_id, count): {corpus[0][:5]}")

Number of unique tokens in dictionary (after n-grams and filtering): 33806
Number of documents in corpus: 209012
Example of first document in corpus (word_id, count): [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]


After trying different values for num_topics, 7 seems like a good compromise between coherence and interpretability 

In [23]:
num_topics = 7 

# id2word: Provides the mapping from word IDs back to words for topic interpretation.
# passes: The number of training iterations. More passes can lead to better convergence.
# random_state: Ensures reproducibility of the results.
# chunksize: Determines how many documents are processed in each training chunk.
# alpha and eta: Dirichlet priors for topic-document and word-topic distributions.
#                'auto' allows the model to learn these parameters from the data.
# per_word_topics: Set to True to enable per-word topic distribution tracking.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
    passes=15,
    chunksize=2000,
    alpha='auto',
    eta='auto',
    per_word_topics=True
)

print("\nLDA Model Training Complete.")

# Inspect the Topics.
# Print the top 10 most probable words/n-grams for each learned topic.
print(f"\nTop {num_topics} Topics from LDA Model:")
for idx, topic in lda_model.print_topics(num_words=10):
    print(f"Topic {idx}: {topic}")

# Calculate Topic Coherence (c_v)
# C_V coherence measures the semantic similarity between high-scoring words in a topic
# and is considered a good indicator of human interpretability.

coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=processed_df, 
    dictionary=dictionary,
    coherence='c_v' 
)
coherence_lda = coherence_model_lda.get_coherence()
print(f"\nCoherence Score (C_V) for LDA model: {coherence_lda:.4f}")


LDA Model Training Complete.

Top 7 Topics from LDA Model:
Topic 0: 0.020*"make" + 0.020*"life" + 0.018*"way" + 0.016*"child" + 0.014*"need" + 0.014*"take" + 0.013*"kid" + 0.012*"work" + 0.011*"help" + 0.009*"school"
Topic 1: 0.026*"day" + 0.020*"world" + 0.013*"home" + 0.012*"best" + 0.012*"wedding" + 0.010*"city" + 0.009*"top" + 0.008*"white_house" + 0.007*"around" + 0.006*"host"
Topic 2: 0.015*"gop" + 0.014*"police" + 0.010*"claim" + 0.009*"killed" + 0.009*"dead" + 0.008*"california" + 0.008*"fire" + 0.008*"shooting" + 0.007*"florida" + 0.007*"administration"
Topic 3: 0.018*"food" + 0.012*"vote" + 0.011*"medium" + 0.011*"ukraine" + 0.011*"russia" + 0.010*"russian" + 0.009*"protest" + 0.008*"lawmaker" + 0.008*"voter" + 0.008*"art"
Topic 4: 0.019*"year" + 0.016*"one" + 0.016*"new" + 0.013*"woman" + 0.013*"people" + 0.013*"time" + 0.010*"get" + 0.010*"like" + 0.009*"first" + 0.007*"want"
Topic 5: 0.031*"week" + 0.028*"show" + 0.025*"photo" + 0.020*"video" + 0.019*"look" + 0.017*"star"

Possible Interpretations:
* Topic 0: __Family__ (life, child, kid, work, school)
* Topic 1: __Travel / Lifestyle__ (white house, city, wedding, world, host)
* Topic 2: __Crime / U.S. Local News__ (GOP, police, california, florida, killed, dead)
* Topic 3: __Politics / World News__ ( Ukraine, Russia, protest, lawmaker, vote)
* Topic 4: __General Interests__ (want, like, people, new)
* Topic 5: __Entertainment / Media__ (actor, show, start, photo, video)
* Topic 6: __Donald Trump / U.S. Republican Party__ (Donald, Trump, state, american, republican, president)

In [24]:
# Assign dominant topic to each document in the DataFrame
# Iterate through the corpus to get the topic distribution for each document.
# The dominant topic for each document is the one with the highest probability.
df_document_topics = []
for i, row in enumerate(lda_model[corpus]):
    # Each 'row' contains a list of (topic_id, probability) tuples for a document.
    # Sort these tuples in descending order by probability to find the dominant topic.
    row = sorted(row[0], key=lambda x: x[1], reverse=True)

    # Extract the dominant topic ID and its probability.
    # Handle cases where a document might not have any topic assigned (though rare with LDA).
    dominant_topic_id = row[0][0] if row else -1
    dominant_topic_prob = row[0][1] if row else 0.0

    # Store the results for the current document.
    df_document_topics.append({
        'dominant_topic': dominant_topic_id,
        'dominant_topic_prob': dominant_topic_prob,
        #'topic_distribution': row # keeping the full topic distribution for reference
    })

# Add the dominant topic information as new columns to the main DataFrame 'df'.
df['dominant_topic_id_lda'] = [d['dominant_topic'] for d in df_document_topics]
df['dominant_topic_prob_lda'] = [d['dominant_topic_prob'] for d in df_document_topics]

print("\nDominant LDA topics assigned to the DataFrame.")
print("First 5 rows with dominant topic and its probability:")
df[['headline', 'dominant_topic_id_lda', 'dominant_topic_prob_lda']].head()


Dominant LDA topics assigned to the DataFrame.
First 5 rows with dominant topic and its probability:


Unnamed: 0,headline,dominant_topic_id_lda,dominant_topic_prob_lda
0,Édgar Ramírez Stands In Solidarity With Venezu...,4,0.340438
1,20 Reasons to Drop Everything and Go to Spain,4,0.315553
2,WATCH: 2014 BET Awards Performances,4,0.318119
3,#4 Not Our Finest Hour: Why Is Liberal America...,4,0.279397
4,Flat Out Adorable: 5 Looks to Mimic This Season,1,0.26395


In [25]:
df.dominant_topic_prob_lda.describe()

count    209012.000000
mean          0.336888
std           0.069568
min           0.165971
25%           0.285484
50%           0.326343
75%           0.377958
max           0.718266
Name: dominant_topic_prob_lda, dtype: float64

Saving the LDA Model

In [27]:
# Save LDA model
lda_model.save("Model\lda_model_gensim")

# Save dictionary (used to create BoW for new text)
dictionary.save("Model\lda_dictionary.dict")

## Sentiment Analysis

In [29]:
df.to_csv("pre_sentiment_data_2.csv", index=False)