In [1]:
import pandas as pd
import spacy

from utilities import getSupabaseClient

In [2]:
# Open a Supabase client through the project helper.
supabase = getSupabaseClient()

# Fetch every column of the 'Data' table in a single request.
# NOTE(review): hosted Supabase/PostgREST setups often cap the number of rows
# returned per request — confirm the table is not being silently truncated.
data_table = supabase.table('Data')
response = data_table.select("*").execute()
print("Data read from supabase")


establishing client
Data read from supabase


In [6]:
# Flatten each fetched row into [id, title, description, topic].
# Rows that are None, or whose articleData payload is None, are skipped.
# title/description are read with .get() so a payload that lacks one of those
# keys yields None (blanked by fillna below) instead of aborting the whole
# cell with a KeyError.
rows = [
    [
        article['id'],
        article['articleData'].get('title'),
        article['articleData'].get('description'),
        article['topic'],
    ]
    for article in response.data
    if article is not None and article["articleData"] is not None
]
data = pd.DataFrame(rows, columns=["id", "title", "description", "topic"])

# deal with missing data: every missing value (in any column) becomes an empty
# string, so concatenating title and description never produces NaN
data = data.fillna("")

data

Unnamed: 0,id,title,description,topic
0,4309,Jimmy Carter está por cumplir 99 años,Carter ya era el presidente más longevo de la ...,105
1,4314,Biden Officials Focus on African Crises at Uni...,Secretary of State Antony J. Blinken met on Fr...,66
2,4381,Inside the Unfounded Claim That DeSantis Abuse...,A former prisoner’s story of mistreatment at t...,18
3,5458,Scalise Bid for Speaker Meets Resistance From ...,The holdouts who refuse to back the No. 2 Repu...,58
4,4590,Copyright board delivers blow to 'Terminator' ...,An award-winning image was denied U.S. copyrig...,22
...,...,...,...,...
995,4811,"As Jimmy Carter turns 99, he’s still full of s...","Seven months after entering hospice, Jimmy Car...",105
996,4817,MSNBC anchor Alicia Menendez won’t cover senat...,The journalist daughter of Sen. Bob Menendez s...,97
997,4818,"Squabbles, missteps and a struggle to govern: ...",Republicans last week offered a kaleidoscopic ...,65
998,4808,How the right’s elevation of Robert F. Kennedy...,"Kennedy might run with a third party, which co...",16


In [9]:
# Merge title and description into a single "text" column (joined by "; ")
# and carry the article id and topic alongside it.
combined = pd.DataFrame(
    {
        "text": data["title"] + "; " + data["description"],
        'id': data['id'],
        'topic': data['topic'],
    }
)

combined

Unnamed: 0,text,id,topic
0,Jimmy Carter está por cumplir 99 años; Carter ...,4309,105
1,Biden Officials Focus on African Crises at Uni...,4314,66
2,Inside the Unfounded Claim That DeSantis Abuse...,4381,18
3,Scalise Bid for Speaker Meets Resistance From ...,5458,58
4,Copyright board delivers blow to 'Terminator' ...,4590,22
...,...,...,...
995,"As Jimmy Carter turns 99, he’s still full of s...",4811,105
996,MSNBC anchor Alicia Menendez won’t cover senat...,4817,97
997,"Squabbles, missteps and a struggle to govern: ...",4818,65
998,How the right’s elevation of Robert F. Kennedy...,4808,16


In [10]:
# Collapse to one row per topic: the texts of all articles sharing a topic are
# concatenated, separated by single spaces. `topic` stays a regular column.
grouped_data = (
    combined
    .groupby('topic', as_index=False)
    .agg(text=("text", ' '.join))
)
grouped_data

Unnamed: 0,topic,text
0,0,Hunter Biden Set to Be Arraigned Over Federal ...
1,1,PG&E's $6 Billion Plan to Prevent Wildfires Is...
2,10,Newsom facing competing pressures as he decide...
3,100,"As His Fraud Trial Begins, Trump Looks to Capi..."
4,101,"Gaetz Moves to Oust McCarthy, Threatening His ..."
...,...,...
114,95,"Tennessee deputy seriously injured, ‘armed ind..."
115,96,The police chief who led a raid of a small Kan...
116,97,N.J. governor calls on Menendez to resign from...
117,98,UNESCO adds Ohio’s Hopewell Ceremonial Earthwo...


In [25]:
# Sample document for the NER experiment: the concatenated text held in the
# second row (position 1) of grouped_data.
test_text = grouped_data['text'].iloc[1]

In [26]:
# Load the large English pipeline with the listed stages disabled; this cell
# only reads doc.ents, so those stages are not run.
unused_pipes = ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]
nlp = spacy.load("en_core_web_lg", disable=unused_pipes)

doc = nlp(test_text)

# Collect one (surface text, label) pair per entity mention found in the document.
entities = []
for span in doc.ents:
    entities.append((span.text, span.label_))

ent_df = pd.DataFrame.from_records(entities, columns=["entity", "label"])

# How often each distinct entity string was recognised, most frequent first.
ent_df['entity'].value_counts()

entity
California                                         15
2020                                                2
Orange County                                       2
Southern California Edison                          2
China                                               2
52-year-old                                         2
PG&E                                                1
2                                                   1
$7 million                                          1
Oakland                                             1
Southern California                                 1
over 3 million                                      1
Family of                                           1
Point La Jolla                                      1
34-year-old                                         1
Karan Singh                                         1
liqour                                              1
West Covina                                         1
ACLU                 