In [6]:
import pandas as pd
import spacy

from utilities import getSupabaseClient

In [7]:
# Obtain a Supabase client through the project helper in `utilities`.
supabase = getSupabaseClient()

# Request every column of the 'Data' table in a single query.
# NOTE(review): the backend may cap the number of rows returned by one
# request -- confirm that the whole table actually arrives.
data_table = supabase.table('Data')
response = data_table.select("*").execute()
print("Data read from supabase")


establishing client
Data read from supabase


In [8]:
# Skip rows that are missing entirely or that carry no articleData payload.
usable_articles = [
    entry for entry in response.data
    if entry is not None and entry["articleData"] is not None
]

# Flatten each remaining article into one [id, title, description, topic] row.
article_rows = [
    [entry['id'], entry['articleData']['title'], entry['articleData']['description'], entry['topic']]
    for entry in usable_articles
]

# deal with missing data: every null field is replaced by an empty string
data = pd.DataFrame(article_rows, columns=["id", "title", "description", "topic"]).fillna("")

data

Unnamed: 0,id,title,description,topic
0,5939,DeSantis Says He Would Cancel Student Visas of...,"At a G.O.P. candidate showcase in Iowa, Gov. R...",98
1,5938,"DeSantis-Haley Rivalry Heats Up, With Attacks ...",As they vie to be the race’s Trump alternative...,11
2,5944,Trump Co-Defendant Says He Wants to Keep Lawye...,A hearing brought to an apparent end the back ...,92
3,5391,Republican Support of Jordan for House Speaker...,The feud between the former speaker and the No...,0
4,6443,"If Trump Trial Isn’t Broadcast Live, a Plea to...",A request to broadcast one of Donald Trump’s f...,24
...,...,...,...,...
690,5434,Michigan US Rep. Shri Thanedar renounces DSA m...,"U.S. Rep. Shri Thanedar, a Michigan Democrat, ...",37
691,6034,Haley threatens to overtake DeSantis as Trump ...,Escalating frictions between the two candidate...,56
692,6387,Son of Tennessee police chief wanted for shoot...,A Tennessee man wanted for allegedly shooting ...,48
693,6388,Christian homeschooling parents sue California...,A group of California Christian homeschooling ...,9


In [9]:
# Merge title and description into a single "text" field ("title; description")
# and carry the article id and topic label along unchanged.
combined = pd.DataFrame({
    "text": data["title"] + "; " + data["description"],
    "id": data["id"],
    "topic": data["topic"],
})

combined

Unnamed: 0,text,id,topic
0,DeSantis Says He Would Cancel Student Visas of...,5939,98
1,"DeSantis-Haley Rivalry Heats Up, With Attacks ...",5938,11
2,Trump Co-Defendant Says He Wants to Keep Lawye...,5944,92
3,Republican Support of Jordan for House Speaker...,5391,0
4,"If Trump Trial Isn’t Broadcast Live, a Plea to...",6443,24
...,...,...,...
690,Michigan US Rep. Shri Thanedar renounces DSA m...,5434,37
691,Haley threatens to overtake DeSantis as Trump ...,6034,56
692,Son of Tennessee police chief wanted for shoot...,6387,48
693,Christian homeschooling parents sue California...,6388,9


In [10]:
# Bucket the articles by topic label; as_index=False keeps 'topic' as a column.
articles_by_topic = combined.groupby('topic', as_index=False)

# Collapse each bucket into one document: its texts joined by single spaces.
grouped_data = articles_by_topic.agg({"text": ' '.join})
grouped_data

Unnamed: 0,topic,text
0,,"To Build Momentum, Scott Tackles Race and Raci..."
1,0,Republican Support of Jordan for House Speaker...
2,1,Early Intelligence Shows Hamas Attack Surprise...
3,10,1st section of massive NYC landfill-to-park tr...
4,100,"In Israel, U.S. lawmakers witness war up close..."
...,...,...
113,95,"1 dead, 3 hurt after historic schooner's mast ..."
114,96,"Hollywood Actors, Studios to Restart Contract ..."
115,97,"Social Media Changed How Brands Talk to Us, bu..."
116,98,DeSantis Says He Would Cancel Student Visas of...


In [11]:
# Use the concatenated text of the second grouped row (position 1) as the NER sample.
test_text = grouped_data["text"].iloc[1]

In [12]:
# Pipeline components that are switched off when the model is loaded.
disabled_pipes = ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]
nlp = spacy.load("en_core_web_lg", disable=disabled_pipes)

# Run the pipeline over the sample topic text.
doc = nlp(test_text)

# One (surface text, label) pair per entity mention found in the document.
entities = [(span.text, span.label_) for span in doc.ents]
ent_df = pd.DataFrame(entities, columns=["entity", "label"])

# How often each distinct entity string occurs, most frequent first.
ent_df.entity.value_counts()

entity
House              71
Republicans        39
Jim Jordan         26
Kevin McCarthy     17
Ohio               16
                   ..
2024                1
last week           1
one                 1
nearly 16 years     1
2009                1
Name: count, Length: 84, dtype: int64