### Knowledge graph

#### Worked by: Jaamie Maarsh Joy Martin
#### Assisted by: Prof. Arasu Narayan

##### Task: To identify the underlying relationships inherent in the articles based on their 'short_description' and 'headline' attributes.

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag, ne_chunk
from nltk.tree import Tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import networkx as nx
import matplotlib.pyplot as plt

# Fetch the NLTK resources the rest of the notebook relies on (no-op when already present).
nltk.download('punkt')  # tokenizer models used by sent_tokenize / word_tokenize
nltk.download('stopwords')  # word lists read through stopwords.words('english')
nltk.download('averaged_perceptron_tagger')  # model behind pos_tag
nltk.download('maxent_ne_chunker')  # model behind ne_chunk
nltk.download('words')  # word list the NE chunker depends on

[nltk_data] Downloading package punkt to
[nltk_data]     /home/joymartin.j/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/joymartin.j/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/joymartin.j/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/joymartin.j/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /home/joymartin.j/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

#### Loading the dataset

In [2]:
# Parse the line-delimited JSON file and keep only the two text fields
# the task works with.
df_news = pd.read_json('News_Category_Dataset_v3.json', lines=True)[
    ['headline', 'short_description']
]

# Render the frame to confirm the load.
display(df_news)


Unnamed: 0,headline,short_description
0,Over 4 Million Americans Roll Up Sleeves For O...,Health experts said it is too early to predict...
1,"American Airlines Flyer Charged, Banned For Li...",He was subdued by passengers and crew when he ...
2,23 Of The Funniest Tweets About Cats And Dogs ...,"""Until you have a dog you don't understand wha..."
3,The Funniest Tweets From Parents This Week (Se...,"""Accidentally put grown-up toothpaste on my to..."
4,Woman Who Called Cops On Black Bird-Watcher Lo...,Amy Cooper accused investment firm Franklin Te...
...,...,...
209522,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,Verizon Wireless and AT&T are already promotin...
209523,Maria Sharapova Stunned By Victoria Azarenka I...,"Afterward, Azarenka, more effusive with the pr..."
209524,"Giants Over Patriots, Jets Over Colts Among M...","Leading up to Super Bowl XLVI, the most talked..."
209525,Aldon Smith Arrested: 49ers Linebacker Busted ...,CORRECTION: An earlier version of this story i...


In [3]:
# Join headline and description (separated by one space) into a single text
# field per article.
combined_text = df_news['headline'] + " " + df_news['short_description']

# Keep only the merged text: the frame now has 'news' as its sole column,
# so the two source columns are no longer carried around.
df_news = combined_text.to_frame(name='news')

display(df_news.head())

Unnamed: 0,news
0,Over 4 Million Americans Roll Up Sleeves For O...
1,"American Airlines Flyer Charged, Banned For Li..."
2,23 Of The Funniest Tweets About Cats And Dogs ...
3,The Funniest Tweets From Parents This Week (Se...
4,Woman Who Called Cops On Black Bird-Watcher Lo...


In [4]:
# Print the index range, column dtypes and non-null counts of the merged frame.
df_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   news    209527 non-null  object
dtypes: object(1)
memory usage: 1.6+ MB


#### Text Preprocessing

In [5]:
# Display the first two news entries from the dataset, one per line
for position in range(2):
    print(df_news['news'].iloc[position])



Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.
American Airlines Flyer Charged, Banned For Life After Punching Flight Attendant On Video He was subdued by passengers and crew when he fled to the back of the aircraft after the confrontation, according to the U.S. attorney's office in Los Angeles.


In [6]:
# Tokenization
def _tokenize_article(article):
    """Split ``article`` into sentences and return one list of word tokens per sentence."""
    return [word_tokenize(sentence) for sentence in sent_tokenize(article)]


df_news['tokenized_news'] = df_news['news'].apply(_tokenize_article)


In [7]:
# Sample tokenisation: the first article as a list of per-sentence token lists
first_article_tokens = df_news['tokenized_news'].iloc[0]
print(first_article_tokens)

[['Over', '4', 'Million', 'Americans', 'Roll', 'Up', 'Sleeves', 'For', 'Omicron-Targeted', 'COVID', 'Boosters', 'Health', 'experts', 'said', 'it', 'is', 'too', 'early', 'to', 'predict', 'whether', 'demand', 'would', 'match', 'up', 'with', 'the', '171', 'million', 'doses', 'of', 'the', 'new', 'boosters', 'the', 'U.S.', 'ordered', 'for', 'the', 'fall', '.']]


In [8]:
# Parts of speech
# POS tagging
def _tag_article(tokenized):
    """Tag each sentence's token list, keeping the per-sentence nesting of ``tokenized``."""
    return [pos_tag(sentence_tokens) for sentence_tokens in tokenized]


df_news['pos_news'] = df_news['tokenized_news'].apply(_tag_article)


In [9]:
# Show the tagged column: each entry is a list of sentences, each sentence a list of (token, tag) pairs.
display(df_news['pos_news'])

0         [[(Over, IN), (4, CD), (Million, NNP), (Americ...
1         [[(American, NNP), (Airlines, NNPS), (Flyer, N...
2         [[(23, CD), (Of, IN), (The, DT), (Funniest, NN...
3         [[(The, DT), (Funniest, NNP), (Tweets, NNPS), ...
4         [[(Woman, NNP), (Who, WP), (Called, VBD), (Cop...
                                ...                        
209522    [[(RIM, NNP), (CEO, NNP), (Thorsten, NNP), (He...
209523    [[(Maria, NNP), (Sharapova, NNP), (Stunned, VB...
209524    [[(Giants, NNS), (Over, IN), (Patriots, NNP), ...
209525    [[(Aldon, NNP), (Smith, NNP), (Arrested, NNP),...
209526    [[(Dwight, NNP), (Howard, NNP), (Rips, NNP), (...
Name: pos_news, Length: 209527, dtype: object

In [None]:
# Named Entity Recognition (NER)
def extract_entities(pos_news):
    """Collect the named-entity strings found in one article.

    Parameters
    ----------
    pos_news : iterable
        POS-tagged sentences; each sentence is passed to ``ne_chunk`` as-is.

    Returns
    -------
    list of str
        One entry per entity subtree, formed by joining the subtree's words
        with single spaces. Entries appear in encounter order and repeated
        entities are kept.
    """
    return [
        " ".join(token for token, _ in chunk.leaves())
        for tagged_sentence in pos_news
        for chunk in ne_chunk(tagged_sentence)
        # Chunked entities come back as Tree nodes; everything else in the
        # chunk result is skipped.
        if isinstance(chunk, Tree)
    ]


df_news['News_entities'] = df_news['pos_news'].apply(extract_entities)


In [None]:
# English stop words, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Normalise ``text`` for vectorisation.

    The text is lower-cased and word-tokenised; a token is kept only when it
    is purely alphanumeric and is not an English stop word. Tokens containing
    punctuation (for example ones with a '.' or '-') fail the alphanumeric
    check and are dropped.

    Returns the surviving tokens joined by single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum():
            continue
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)


df_news['processed_text'] = df_news['news'].apply(preprocess_text)


In [None]:
# Cap the number of articles before building pairwise similarities.
# cosine_similarity returns a dense n x n float64 matrix: for the full
# 209,527-article frame that is roughly 350 GB, which cannot be allocated.
MAX_ARTICLES = 5000  # 5000 x 5000 float64 similarities is about 200 MB
if len(df_news) > MAX_ARTICLES:
    # A fixed seed keeps the subset reproducible. Resetting the index keeps
    # row labels 0..n-1 aligned with the row/column positions of cosine_sim,
    # which later cells use as node ids.
    df_news = df_news.sample(n=MAX_ARTICLES, random_state=42).reset_index(drop=True)

# TF-IDF vectorization (vocabulary limited to the 1000 most frequent terms)
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(df_news['processed_text'])

# Cosine similarity between every pair of (sampled) articles
cosine_sim = cosine_similarity(tfidf_matrix)


In [None]:
# Clustering using K-Means
num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(tfidf_matrix)
df_news['cluster'] = clusters

# Create a graph based on similarity
G = nx.Graph()

# Add one node per article, keyed by its row label and carrying the article
# text and its cluster id. (The frame is `df_news` and the text column is
# 'news'; the previous `df` / 'text' names do not exist in this notebook.)
for idx, row in df_news.iterrows():
    G.add_node(idx, label=row['news'], cluster=row['cluster'])

# Add edges if similarity exceeds a threshold
threshold = 0.3
# Find all qualifying pairs in one vectorised pass instead of a Python double
# loop. Keeping only row < col visits each unordered pair once and skips the
# diagonal (an article compared with itself).
above_rows, above_cols = np.nonzero(cosine_sim > threshold)
upper_triangle = above_rows < above_cols
above_rows, above_cols = above_rows[upper_triangle], above_cols[upper_triangle]
# Translate matrix positions to row labels so edges always attach to the
# nodes created above, whatever the frame's index looks like.
G.add_weighted_edges_from(
    zip(
        df_news.index[above_rows].tolist(),
        df_news.index[above_cols].tolist(),
        cosine_sim[above_rows, above_cols].tolist(),
    )
)

# Visualize the graph
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(G, seed=42)  # seeded so the layout is reproducible
# Read the cluster straight from the node attributes, in the same order the
# nodes are drawn.
colors = [G.nodes[node]['cluster'] for node in G.nodes]
nx.draw_networkx_nodes(G, pos, node_size=50, node_color=colors, cmap=plt.cm.tab10)
nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.title("Article Relationship Graph")
plt.show()
