In [2]:
%run 00_utils.ipynb

In [13]:
from tqdm.notebook import tqdm
tqdm.pandas(desc="Processing DataFrame")

In [144]:
import spacy
nlp=spacy.load("en_core_web_sm")

In [240]:
from openai import OpenAI
oa_client = OpenAI()

In [171]:
import litellm
from litellm import completion
import instructor
from tenacity import retry, stop_after_attempt, wait_exponential
from langsmith import traceable
from pydantic import BaseModel, Field
from instructor.utils import disable_pydantic_error_url
from textwrap import dedent

In [172]:
#litellm._turn_on_debug()
litellm.drop_params = True
disable_pydantic_error_url()

# Initialize the instructor client
client = instructor.from_litellm(completion)

In [None]:
import re
from collections import Counter
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from typing import Tuple

In [4]:
# Get data from Sheets
df_classify = sheets_to_df("data_classify", SHEET_URL)
df_classify['published'] = pd.to_datetime(df_classify['published'], utc=True, errors='coerce')
df_classify.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307 entries, 0 to 306
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   url          307 non-null    object             
 1   source       307 non-null    object             
 2   title        307 non-null    object             
 3   description  307 non-null    object             
 4   author       307 non-null    object             
 5   published    307 non-null    datetime64[ns, UTC]
 6   summary      307 non-null    object             
 7   category     307 non-null    object             
dtypes: datetime64[ns, UTC](1), object(7)
memory usage: 19.3+ KB


In [5]:
#Filter out rows with 'Other' category
df= df_classify[df_classify['category'] != 'Other'].reset_index(drop=True)
df.head()

Unnamed: 0,url,source,title,description,author,published,summary,category
0,https://www.espn.com.au/tennis/story/_/id/4767...,ESPN,ðŸŽ¾AO live: Wawrinka stuns in five-set marathon,Novak Djokovic and Jannik Sinner headline a bu...,ESPN,2026-01-23 04:25:42+00:00,ðŸŽ¾AO live: Wawrinka stuns in five-set marathon....,Sports
1,https://www.espn.com.au/tennis/story/_/id/4769...,ESPN,What was that for?' Osaka asks of terse Cirstea,Naomi Osaka received a cool response from Sora...,ESPN,2026-01-23 04:25:42+00:00,What was that for?' Osaka asks of terse Cirste...,Sports
2,https://www.espn.com.au/golf/story/_/id/476898...,ESPN,"McIlroy to Rahm, Hatton: Pay fines, play Ryder...",Rory McIlroy wants Ryder Cup team-mates Jon Ra...,PA,2026-01-23 04:25:42+00:00,"McIlroy to Rahm, Hatton: Pay fines, play Ryder...",Sports
3,https://www.espn.com.au/nba/story/_/id/4768975...,ESPN,"Giannis cites chemistry, selfish play after routs",Giannis Antetokounmpo says chemistry issues mi...,ESPN,2026-01-23 04:25:42+00:00,"Giannis cites chemistry, selfish play after ro...",Sports
4,https://www.espn.com.au/afl/story/_/id/4768933...,ESPN,Hawks break tradition and appoint co-captains,Midfielder Jai Newcombe has been elevated to s...,ESPN,2026-01-23 04:25:42+00:00,Hawks break tradition and appoint co-captains....,Sports


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   url          201 non-null    object             
 1   source       201 non-null    object             
 2   title        201 non-null    object             
 3   description  201 non-null    object             
 4   author       201 non-null    object             
 5   published    201 non-null    datetime64[ns, UTC]
 6   summary      201 non-null    object             
 7   category     201 non-null    object             
dtypes: datetime64[ns, UTC](1), object(7)
memory usage: 12.7+ KB


In [7]:
df.category.value_counts()

category
Lifestyle    84
Sports       66
Finance      33
Music        18
Name: count, dtype: int64

In [9]:
df.groupby('category')['source'].value_counts()

category   source        
Finance    SMH               16
           The Guardian      13
           SBS                3
           Canberra Times     1
Lifestyle  The Guardian      54
           SMH               20
           ABC                5
           Canberra Times     3
           SBS                2
Music      The Guardian      11
           SMH                5
           ABC                2
Sports     ESPN              38
           SMH               22
           ABC                4
           Canberra Times     1
           The Guardian       1
Name: count, dtype: int64

## Generate embeddings for article summaries

In [239]:
def get_embedding(text, model="text-embedding-3-large"):
    """Return the embedding vector for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: Text to embed; newlines are replaced by single spaces first.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first item in the API response.
    """
    single_line = " ".join(text.split("\n"))
    response = oa_client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [16]:
df['embedding'] = df['summary'].progress_apply(lambda x: get_embedding(x, model='text-embedding-3-small'))

Processing DataFrame:   0%|          | 0/201 [00:00<?, ?it/s]

In [242]:
## Export to Sheets
df_to_sheets(df, "data_embeddings", SHEET_URL)

"Data uploaded successfully to sheet: 'data_embeddings'"

In [129]:
len(df['embedding'].iloc[0])

1536

## Deduplicate articles within each category

In [202]:
# ----------------------------
# Union-Find (connected components)
# ----------------------------
def connected_components(n: int, edges: list[Tuple[int, int]]) -> list[list[int]]:
    """Partition the nodes ``0..n-1`` into connected components (union-find).

    Args:
        n: Number of nodes; nodes are identified by the integers ``0..n-1``.
        edges: Undirected ``(i, j)`` pairs linking two node ids.

    Returns:
        One list of node ids per component, singletons included. Ids inside a
        component are ascending and components are ordered by their smallest id.
    """
    parent = list(range(n))
    rank = [0] * n

    def find(node: int) -> int:
        # First pass: walk up to the representative of the set.
        root = node
        while parent[root] != root:
            root = parent[root]
        # Second pass: point every node on the path straight at the root.
        while parent[node] != root:
            parent[node], node = root, parent[node]
        return root

    def union(a: int, b: int) -> None:
        root_a, root_b = find(a), find(b)
        if root_a == root_b:
            return
        # Always hang the lower-ranked tree under the higher-ranked one.
        if rank[root_a] < rank[root_b]:
            root_a, root_b = root_b, root_a
        parent[root_b] = root_a
        if rank[root_a] == rank[root_b]:
            rank[root_a] += 1

    for left, right in edges:
        union(left, right)

    # Dict insertion order gives components ordered by their first (smallest) node.
    groups: dict[int, list[int]] = {}
    for node in range(n):
        representative = find(node)
        if representative not in groups:
            groups[representative] = []
        groups[representative].append(node)

    return list(groups.values())


# ----------------------------
# Graph dedupe for one category
# ----------------------------
def graph_dedupe_category(
    X: np.ndarray,
    threshold: float,
    k: int,
) -> list[list[int]]:
    """Cluster near-duplicate rows of ``X`` via a k-nearest-neighbour similarity graph.

    Each row is linked to those of its ``k`` nearest neighbours (cosine metric)
    whose cosine similarity is at least ``threshold``; the connected components
    of that graph are the clusters.

    Args:
        X: 2-D array with one embedding per row.
        threshold: Minimum cosine similarity (``1 - cosine distance``) for an edge.
        k: Maximum number of neighbours inspected per row (capped at ``n - 1``).

    Returns:
        Clusters as lists of row positions in ``X``; only clusters with at least
        two members are returned. Empty when there are fewer than two rows or
        ``k`` is less than 1.
    """
    n = X.shape[0]
    if n < 2 or k < 1:
        return []

    nn = NearestNeighbors(
        n_neighbors=min(k, n - 1),
        metric="cosine",
    )
    nn.fit(X)

    # Calling kneighbors() without a query makes scikit-learn exclude each point
    # from its own neighbour list. Dropping column 0 of kneighbors(X) is not
    # reliable: with exact-duplicate embeddings the first hit may be the
    # duplicate rather than the point itself.
    dists, nbrs = nn.kneighbors()

    edges = set()
    for i in range(n):
        for dist, j in zip(dists[i], nbrs[i]):
            if 1.0 - float(dist) >= threshold:
                j = int(j)
                # Store each undirected edge once, smaller index first.
                edges.add((i, j) if i < j else (j, i))

    clusters = connected_components(n, sorted(edges))
    return [c for c in clusters if len(c) >= 2]


# ----------------------------
# Full pipeline over all categories
# ----------------------------
def dedupe_all_categories(
    df: pd.DataFrame,
    threshold: float = 0.7,
    k: int = 15,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Find clusters of near-duplicate articles independently within each category.

    Args:
        df: Articles with at least the columns ``category``, ``url``,
            ``summary`` and ``embedding`` (one vector per row).
        threshold: Minimum cosine similarity for two articles to be linked.
        k: Maximum number of nearest neighbours inspected per article.

    Returns:
        A tuple ``(cluster_df, clustered_articles_df)``:

        1. ``cluster_df`` with columns
           ``[cluster_id, category, num_articles, url, article_summary]``,
           one row per article that belongs to a cluster. ``cluster_id`` is
           ``"<category>_<n>"`` with ``n`` counting from 0 inside each category.
        2. ``clustered_articles_df``: the rows of ``df`` that belong to a
           cluster, with ``cluster_id`` attached.

        Both frames are empty, with the same columns as above, when no
        cluster is found.
    """
    cluster_columns = ["cluster_id", "category", "num_articles", "url", "article_summary"]

    # Tag every row with its position so cluster ids can be joined back later.
    # A positional id works regardless of the incoming index (named, duplicated
    # or non-integer labels).
    df = df.reset_index(drop=True)
    df = df.assign(_row_index=np.arange(len(df)))

    cluster_rows = []
    for category, df_cat in df.groupby("category", sort=False):
        df_cat = df_cat.reset_index(drop=True)

        X = np.vstack(df_cat["embedding"].values)
        clusters = graph_dedupe_category(
            X=X,
            threshold=threshold,
            k=min(k, len(df_cat) - 1),
        )

        # Each category is visited once, so enumerate() is the per-category counter.
        for cid, cluster in enumerate(clusters):
            cluster_id = f"{category}_{cid}"
            cluster_size = len(cluster)

            for local_i in cluster:
                row = df_cat.iloc[local_i]
                cluster_rows.append({
                    "cluster_id": cluster_id,
                    "category": category,
                    "num_articles": cluster_size,
                    "url": row["url"],
                    "article_summary": row["summary"],
                    "_row_index": int(row["_row_index"]),
                })

    if not cluster_rows:
        # Return empty frames with the same schema as the non-empty result.
        empty_cluster_df = pd.DataFrame(columns=cluster_columns)
        empty_clustered_articles_df = df.iloc[0:0].drop(columns=["_row_index"]).copy()
        empty_clustered_articles_df["cluster_id"] = None
        return empty_cluster_df, empty_clustered_articles_df

    cluster_df = pd.DataFrame(cluster_rows, columns=cluster_columns + ["_row_index"])

    # clustered_articles_df = original rows that are in a cluster, with cluster_id
    clustered_articles_df = (
        df.merge(
            cluster_df[["_row_index", "cluster_id"]],
            on="_row_index",
            how="inner",
        )
        .drop(columns=["_row_index"])
        .reset_index(drop=True)
    )

    # The helper column is internal only; drop it from the returned frame.
    cluster_df = cluster_df.drop(columns=["_row_index"]).reset_index(drop=True)

    return cluster_df, clustered_articles_df


In [203]:
res1, res2 = dedupe_all_categories(df)

In [204]:
res1['cluster_id'].value_counts()

cluster_id
Lifestyle_0    6
Sports_0       5
Music_1        3
Sports_1       2
Sports_2       2
Finance_0      2
Finance_2      2
Finance_1      2
Lifestyle_1    2
Music_0        2
Name: count, dtype: int64

In [205]:
res1.groupby('category')['cluster_id'].value_counts()

category   cluster_id 
Finance    Finance_0      2
           Finance_1      2
           Finance_2      2
Lifestyle  Lifestyle_0    6
           Lifestyle_1    2
Music      Music_1        3
           Music_0        2
Sports     Sports_0       5
           Sports_1       2
           Sports_2       2
Name: count, dtype: int64

In [135]:
res1

Unnamed: 0,cluster_id,category,url,author,published,source,title,summary,row_index
0,Sports_0,Sports,https://www.espn.com.au/tennis/story/_/id/4769...,ESPN,2026-01-23 04:25:42+00:00,ESPN,What was that for?' Osaka asks of terse Cirstea,What was that for?' Osaka asks of terse Cirste...,1
1,Sports_0,Sports,https://www.abc.net.au/news/2026-01-22/tennis-...,Chris De Silva,2026-01-22 12:21:46+00:00,ABC,Osaka apologises for 'disrespectful' interview...,Osaka apologises for 'disrespectful' interview...,28
2,Sports_0,Sports,https://www.smh.com.au/sport/tennis/osaka-apol...,Billie Eder,2026-01-22 12:15:20+00:00,SMH,Osaka apologises for â€˜disrespectfulâ€™ on-court ...,Osaka apologises for â€˜disrespectfulâ€™ on-court ...,29
3,Sports_0,Sports,https://www.smh.com.au/sport/tennis/osaka-expl...,SMH,2026-01-22 11:21:20+00:00,SMH,Osaka explains handshake incident,Osaka explains handshake incident. Naomi Osaka...,31
4,Sports_0,Sports,https://www.smh.com.au/sport/tennis/osaka-clas...,SMH,2026-01-22 11:19:06+00:00,SMH,Osaka clashes with rival in icy handshake moment,Osaka clashes with rival in icy handshake mome...,32
5,Sports_1,Sports,https://www.abc.net.au/news/2026-01-22/austral...,Luke Pentony,2026-01-22 09:52:20+00:00,ABC,I wanted it so bad': How Maddison Inglis claim...,I wanted it so bad': How Maddison Inglis claim...,43
6,Sports_1,Sports,https://www.smh.com.au/sport/tennis/her-oppone...,"Billie Eder, Hannah Kennelly",2026-01-22 09:43:57+00:00,SMH,Her opponent was serving for the match. This i...,Her opponent was serving for the match. This i...,44
7,Sports_2,Sports,https://www.smh.com.au/sport/tennis/frenchman-...,SMH,2026-01-22 08:49:00+00:00,SMH,Frenchman loses his cool,Frenchman loses his cool. Stan Wawrinka gets u...,45
8,Sports_2,Sports,https://www.smh.com.au/sport/tennis/wawrinka-f...,SMH,2026-01-22 08:33:41+00:00,SMH,Wawrinka forces fifth set,Wawrinka forces fifth set. Stan Wawrinka uncor...,46
9,Finance_3,Finance,https://www.sbs.com.au/news/podcast-episode/su...,SBS,2026-01-22 07:37:40+00:00,SBS,Surprise jobs report boosts chance of February...,Surprise jobs report boosts chance of February...,54


In [136]:
res2

Unnamed: 0,url,source,title,description,author,published,summary,category,embedding,embedding_numpy,cluster_id
0,https://www.espn.com.au/tennis/story/_/id/4769...,ESPN,What was that for?' Osaka asks of terse Cirstea,Naomi Osaka received a cool response from Sora...,ESPN,2026-01-23 04:25:42+00:00,What was that for?' Osaka asks of terse Cirste...,Sports,"[-0.0162181556224823, 0.021036049351096153, -0...","[-0.0162181556224823, 0.021036049351096153, -0...",Sports_0
1,https://www.abc.net.au/news/2026-01-22/tennis-...,ABC,Osaka apologises for 'disrespectful' interview...,Naomi Osaka walks back her aggressive retort t...,Chris De Silva,2026-01-22 12:21:46+00:00,Osaka apologises for 'disrespectful' interview...,Sports,"[0.011207101866602898, 0.011254043318331242, 0...","[0.011207101866602898, 0.011254043318331242, 0...",Sports_0
2,https://www.smh.com.au/sport/tennis/osaka-apol...,SMH,Osaka apologises for â€˜disrespectfulâ€™ on-court ...,Naomi Osaka and Sorana Cirstea had a heated ex...,Billie Eder,2026-01-22 12:15:20+00:00,Osaka apologises for â€˜disrespectfulâ€™ on-court ...,Sports,"[-0.004477877169847488, 0.017543857917189598, ...","[-0.004477877169847488, 0.017543857917189598, ...",Sports_0
3,https://www.smh.com.au/sport/tennis/osaka-expl...,SMH,Osaka explains handshake incident,Naomi Osaka speaks after her win against Soran...,SMH,2026-01-22 11:21:20+00:00,Osaka explains handshake incident. Naomi Osaka...,Sports,"[-0.002710364991798997, -0.006483675912022591,...","[-0.002710364991798997, -0.006483675912022591,...",Sports_0
4,https://www.smh.com.au/sport/tennis/osaka-clas...,SMH,Osaka clashes with rival in icy handshake moment,Naomi Osaka and Sorana Cirstea share words aft...,SMH,2026-01-22 11:19:06+00:00,Osaka clashes with rival in icy handshake mome...,Sports,"[-0.01812024414539337, -0.04119137302041054, 0...","[-0.01812024414539337, -0.04119137302041054, 0...",Sports_0
5,https://www.abc.net.au/news/2026-01-22/austral...,ABC,I wanted it so bad': How Maddison Inglis claim...,With a 10-point tiebreaker to decide her Austr...,Luke Pentony,2026-01-22 09:52:20+00:00,I wanted it so bad': How Maddison Inglis claim...,Sports,"[0.004808810539543629, 0.02937224879860878, 0....","[0.004808810539543629, 0.02937224879860878, 0....",Sports_1
6,https://www.smh.com.au/sport/tennis/her-oppone...,SMH,Her opponent was serving for the match. This i...,These are the moments that defined Maddison In...,"Billie Eder, Hannah Kennelly",2026-01-22 09:43:57+00:00,Her opponent was serving for the match. This i...,Sports,"[0.000546822149772197, 0.014806261286139488, 0...","[0.000546822149772197, 0.014806261286139488, 0...",Sports_1
7,https://www.smh.com.au/sport/tennis/frenchman-...,SMH,Frenchman loses his cool,Stan Wawrinka gets up in a fiery rally and Art...,SMH,2026-01-22 08:49:00+00:00,Frenchman loses his cool. Stan Wawrinka gets u...,Sports,"[-0.011354419402778149, 0.0429031141102314, -0...","[-0.011354419402778149, 0.0429031141102314, -0...",Sports_2
8,https://www.smh.com.au/sport/tennis/wawrinka-f...,SMH,Wawrinka forces fifth set,Stan Wawrinka uncorks a ridiculous backhand wi...,SMH,2026-01-22 08:33:41+00:00,Wawrinka forces fifth set. Stan Wawrinka uncor...,Sports,"[-0.04509558156132698, 0.030436096712946892, -...","[-0.04509558156132698, 0.030436096712946892, -...",Sports_2
9,https://www.sbs.com.au/news/podcast-episode/su...,SBS,Surprise jobs report boosts chance of February...,SBS Finance Editor Ricardo Goncalves speaks wi...,SBS,2026-01-22 07:37:40+00:00,Surprise jobs report boosts chance of February...,Finance,"[-0.023292867466807365, 0.022158056497573853, ...","[-0.023292867466807365, 0.022158056497573853, ...",Finance_3


## Extract keywords from article summaries

In [167]:
# Part-of-speech tags (compared against spaCy's token.pos_) whose lemmas are
# kept as keyword candidates.
ALLOWED_POS = {"NOUN", "ADJ"}
# Named-entity labels (compared against spaCy's ent.label_) that are kept as
# keyword candidates; entities with any other label are ignored.
ALLOWED_ENTS = {"ORG", "PERSON", "GPE", "EVENT", "LOC"}

def normalize_entities(text: str) -> str:
    """Turn an entity string into a lowercase, underscore-separated key.

    Every run of characters other than ``a-z`` and ``0-9`` becomes a single
    underscore, and underscores at either end are removed, so ``"U.S."``
    yields ``"u_s"`` and punctuation-only input yields ``""``.

    Args:
        text: Raw entity text.

    Returns:
        The normalised key; an empty string when nothing alphanumeric remains.
    """
    slug = re.sub(r"[^a-z0-9]+", "_", text.strip().lower())
    # Strip separators left behind by leading/trailing punctuation so callers'
    # "is it empty?" checks are not defeated by a bare "_".
    return slug.strip("_")


def split_possessive_entity(text: str) -> list[str]:
    """
    Splits a possessive entity expression into its normalized components.

    The text is split once at the first possessive marker, so
    "Trump's Greenland" becomes ["trump", "greenland"]. The marker may be
    written with a straight apostrophe, a typographic apostrophe (U+2019), or
    the mis-encoded form of the typographic apostrophe that appears in some
    scraped text. Text without a possessive marker is returned as a single
    normalized component.

    Args:
        text (str): The input text, possibly containing a possessive marker.

    Returns:
        A list of normalized components, excluding empty strings.
    """
    # Split once: "Trump's Greenland" -> ["Trump", " Greenland"]
    parts = re.split(r"(?:'|\u2019|â€™)s\b", text, maxsplit=1)
    normalized = (normalize_entities(part) for part in parts)
    return [term for term in normalized if term]


def extract_keywords(text: str, top_n: int = 10) -> list[str]:
    """Extract up to ``top_n`` keywords from ``text``.

    Named entities with a label in ``ALLOWED_ENTS`` come first, in document
    order, followed by lemmas of tokens with a part of speech in
    ``ALLOWED_POS``, ordered by frequency. Duplicates are removed, keeping the
    first occurrence.

    Args:
        text: Text to analyse. Anything that is not a non-blank string
            (``None``, NaN, ``""``) yields an empty list.
        top_n: Maximum number of keywords to return.

    Returns:
        The keyword list, at most ``top_n`` items long.
    """
    # Missing values in a DataFrame column arrive as None/NaN, not as strings.
    if not isinstance(text, str) or not text.strip():
        return []

    doc = nlp(text)

    # Entities: split_possessive_entity also handles text without a possessive
    # (it then returns the single normalised entity), so no pre-check is needed.
    entities: list[str] = []
    for ent in doc.ents:
        if ent.label_ in ALLOWED_ENTS:
            entities.extend(split_possessive_entity(ent.text))

    # Lemmas of content words, most frequent first.
    lemma_counts = Counter(
        t.lemma_.lower()
        for t in doc
        if t.pos_ in ALLOWED_POS
        and not t.is_stop
        and not t.like_num
        and t.is_alpha
    )
    sorted_lemmas = [w for w, _ in lemma_counts.most_common()]

    # Merge entities and lemmas; dict keys keep first-seen order and drop repeats.
    combined = list(dict.fromkeys(term for term in entities + sorted_lemmas if term))

    return combined[:top_n]

In [207]:
res1['keywords'] = res1['article_summary'].progress_apply(lambda x: extract_keywords(x))

Processing DataFrame:   0%|          | 0/28 [00:00<?, ?it/s]

In [231]:
res1['keywords']

0     [osaka, cirstea, naomi_osaka, sorana_cirstea, ...
1     [osaka, naomi_osaka, sorana_cirstea, opponent,...
2     [osaka, cirstea, naomi_osaka, sorana_cirstea, ...
3     [osaka, naomi_osaka, sorana_cirstea, handshake...
4     [osaka, naomi_osaka, sorana_cirstea, clash, ri...
5     [aus_open, maddison_inglis, emotional, open, w...
6     [australia, maddison_inglis, laura_siegemund, ...
7     [stan_wawrinka, arthur_gea, frenchman, cool, f...
8     [stan_wawrinka, arthur_gea, set, force, ridicu...
9     [ricardo_goncalves, johnathan_mcmenamin, barre...
10    [rba, australia, the_reserve_bank, rate, stron...
11    [trump, australia, donald_trump, greenland, s_...
12    [asx, trump, greenland, australia, slide, sink...
13    [litchfield, litchfield_minerals, silver_valle...
14    [litchfield, nt, litchfield_minerals, ip, copp...
15    [david, victoria_beckham, emma_brockes, brookl...
16    [david_beckham, brooklyn, child, mistake, soci...
17    [beckham, brooklyn, brooklyn_peltz_beckham

## Generate cluster title and summary using LLM

In [173]:
def sanitise_inputs(inputs: dict) -> dict:
    """Return the traced inputs without the ``response_model`` entry.

    Used as ``process_inputs`` for the traced LLM call. A new dict is returned
    so the caller's mapping is left untouched, and a missing
    ``response_model`` key is tolerated.

    Args:
        inputs: Mapping of argument names to values.

    Returns:
        A shallow copy of ``inputs`` with ``response_model`` removed.
    """
    return {key: value for key, value in inputs.items() if key != 'response_model'}

@traceable(name='LLMRun', run_type='llm', process_inputs=sanitise_inputs)
@retry(stop=stop_after_attempt(2), wait=wait_exponential(multiplier=10, max=60))
def get_llm_response(
    messages: list[dict[str, str]],
    ls_provider="openai",
    ls_model_name="gpt-4.1",
    temperature=0,
    seed=None,
    response_model=None,
    max_retries=2,
):
    """
    Get a chat completion through litellm, optionally parsed into a model by instructor.

    The call is traced with langsmith and, via tenacity, attempted at most
    twice with an exponential wait between attempts.

    Args:
        messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
        ls_provider: Provider prefix of the litellm model string.
        ls_model_name: Model name; the model string is ``"<provider>/<name>"``.
        temperature: Sampling temperature.
        seed: Optional seed; only sent when not ``None``.
        response_model: Optional response model for structured output. When
            given, the request goes through the instructor client.
        max_retries: Instructor retries when the output does not match
            ``response_model``; ignored when ``response_model`` is ``None``.

    Returns:
        The object returned by the instructor client when ``response_model``
        is given, otherwise the raw litellm completion response.
    """

    model = f"{ls_provider}/{ls_model_name}"

    params = {
        "messages": messages,
        "model": model,
        "temperature": temperature,
    }

    if seed is not None:
        params["seed"] = seed

    if response_model is not None:
        # Instructor structured outputs
        params["response_model"] = response_model

        # Set number of retries in case output does not match the response_model
        params["max_retries"] = max_retries
        return client.chat.completions.create(**params)

    # `completion` is litellm's completion function itself (it has no `.create`
    # attribute), so call it directly.
    return completion(**params)

In [191]:
# Define the story-cluster summarisation prompt and response model.
# Plain (non-f) string: the prompt has no placeholders, so literal braces added
# later cannot be misread as format fields.
story_prompt = dedent("""
You will be presented with a list of news articles belonging to the same news story cluster.
Your task is to extract <title> and <summary> of the cluster that accurately represents the story.

Guidelines:
- You must only use the information and facts provided in the articles.
- <title> should be a concise headline (less than 8 words) for the news story cluster in Australian Spelling. Use sentence case.
- <summary> should be an accurate summary including relevant information and entities in Australian Spelling. Length should be between 30-50 words.
""")

# Structured output for one story cluster; passed as `response_model` when
# requesting the cluster title and summary from the LLM.
# (Kept as comments rather than a class docstring so the model's schema
# description is not changed.)
class StoryResponse(BaseModel):
    # Required headline for the cluster; length and style constraints are given
    # to the LLM through the field description.
    title: str = Field(
        ...,
        description="Short title of the given news cluster in Australian Spelling. Use sentence case. Must be less than 8 words.",
    )
    # Required summary of the cluster; the word range is likewise only stated in
    # the description, it is not validated here.
    summary: str = Field(
        ...,
        description="Summary of the given news cluster in Australian Spelling. Length should be between 30-50 words.",
    )

In [188]:
# Create a tool to get content for story cluster
@traceable(name='ClusterContent', run_type='tool')
def get_cluster_data(articles: str) -> "StoryResponse":
    """Ask the LLM for a title and summary describing one story cluster.

    Args:
        articles: The cluster's article texts, already joined into one string.

    Returns:
        The parsed ``StoryResponse`` (``title`` and ``summary``) produced by
        ``get_llm_response``.
    """
    messages = [
        {"role": "system", "content": story_prompt},
        {"role": "user", "content": f"Articles:\n----\n{articles}"}
    ]
    model_params = {
        "ls_provider": "openai",
        "ls_model_name": "gpt-4.1"
    }
    response = get_llm_response(
        messages=messages,
        **model_params,
        seed=42,
        response_model=StoryResponse,
        # Forwarded to the tracing decorator so the run records provider and model.
        langsmith_extra={
            'metadata': {
                'ls_provider': model_params['ls_provider'],
                'ls_model_name': model_params['ls_model_name']
            }
        }
    )
    return response

## Aggregate cluster data

In [224]:
def mode_or_none(s: pd.Series):
    """Return the most frequent value of ``s``, or ``None`` if it has no mode.

    When several values tie, the first entry of ``Series.mode()`` is returned.
    """
    modes = s.mode()
    if modes.empty:
        return None
    return modes.iloc[0]

def generate_cluster_content(summaries: pd.Series) -> "StoryResponse":
    """Generate the LLM title and summary for the articles of one cluster.

    Args:
        summaries: Article summaries belonging to a single cluster. Missing
            values are skipped and remaining values are converted to strings.

    Returns:
        The ``StoryResponse`` returned by ``get_cluster_data`` (an object with
        ``title`` and ``summary`` attributes, not a dict).
    """
    # str.join raises on NaN/None, so drop missing summaries before joining.
    texts = "\n----\n".join(summaries.dropna().astype(str).tolist())

    # Generate using llm
    return get_cluster_data(texts)


def top_keywords_tf(keywords: pd.Series, top_n: int = 10) -> list[str]:
    """Return the ``top_n`` most frequent keywords across a Series of keyword lists.

    Missing entries and entries that are not lists are ignored. Ties keep the
    order in which the keywords were first seen.
    """
    counts = Counter(
        keyword
        for entry in keywords.dropna()
        if isinstance(entry, list)
        for keyword in entry
    )
    ranked = counts.most_common(top_n)
    return [keyword for keyword, _ in ranked]


def aggregate_cluster_df(cluster_df: pd.DataFrame, top_n_keywords: int = 10) -> pd.DataFrame:
    """Collapse article-level cluster rows into one row per cluster.

    Args:
        cluster_df: One row per clustered article with the columns
            ``cluster_id``, ``category``, ``num_articles``,
            ``article_summary`` and ``keywords``.
        top_n_keywords: Maximum number of keywords kept per cluster.

    Returns:
        A DataFrame sorted by ``cluster_id`` with the columns
        ``[cluster_id, category, num_articles, keywords, title, summary]``.
        ``title`` and ``summary`` come from the LLM response for the cluster.
    """
    output_columns = ["cluster_id", "category", "num_articles", "keywords", "title", "summary"]

    # Iterate over the groups explicitly instead of passing the LLM call to
    # groupby.agg: the request is expensive and has side effects, so it should
    # run exactly once per cluster rather than depend on how pandas invokes
    # aggregation functions.
    records = []
    for cluster_id, group in cluster_df.groupby("cluster_id"):
        # `content` is a response object with .title/.summary attributes.
        content = generate_cluster_content(group["article_summary"])
        records.append({
            "cluster_id": cluster_id,
            "category": mode_or_none(group["category"]),
            "num_articles": mode_or_none(group["num_articles"]),
            "keywords": top_keywords_tf(group["keywords"], top_n=top_n_keywords),
            "title": content.title,
            "summary": content.summary,
        })

    return pd.DataFrame(records, columns=output_columns)


In [225]:
clusters = aggregate_cluster_df(res1)

In [241]:
clusters['embedding'] = clusters['summary'].progress_apply(lambda x: get_embedding(x, model='text-embedding-3-small'))

Processing DataFrame:   0%|          | 0/10 [00:00<?, ?it/s]

In [235]:
clusters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   cluster_id    10 non-null     object
 1   category      10 non-null     object
 2   num_articles  10 non-null     int64 
 3   keywords      10 non-null     object
 4   title         10 non-null     object
 5   summary       10 non-null     object
 6   embedding     10 non-null     object
dtypes: int64(1), object(6)
memory usage: 692.0+ bytes


In [247]:
# ## Export to Sheets
# df_to_sheets(clusters, "clusters_db", SHEET_URL)

In [248]:
# Article-level table: the clustered articles without the `description` column.
# DataFrame.drop returns a new frame, so `res2` itself is left unchanged.
articles = res2.drop(columns=["description"])
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   url         28 non-null     object             
 1   source      28 non-null     object             
 2   title       28 non-null     object             
 3   author      28 non-null     object             
 4   published   28 non-null     datetime64[ns, UTC]
 5   summary     28 non-null     object             
 6   category    28 non-null     object             
 7   embedding   28 non-null     object             
 8   cluster_id  28 non-null     object             
dtypes: datetime64[ns, UTC](1), object(8)
memory usage: 2.1+ KB


In [250]:
# ## Export to Sheets
# df_to_sheets(articles, "articles_db", SHEET_URL)