In [162]:
import ollama
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import xgboost

from tqdm import tqdm

In [91]:
huffpo = pd.read_parquet("../../data/raw/news_categories.parquet")
uci = pd.read_parquet("../../data/raw/uci_categories.parquet")
pt = pd.read_parquet("../../data/raw/recognasumm.parquet")
pt = pt[pt["URL"].notna() & pt["URL"].str.contains("http")]

# Distant labeling + Naive Bayes

## Generate per-dataset list of relevant sections

In [8]:
uci["CATEGORY"].unique()

array(['b', 't', 'e', 'm'], dtype=object)

In [40]:
uci_all = uci["URL"].apply(lambda x: x.replace("http://", "").split("/")[1]).value_counts()

In [43]:
uci_filtered = uci_all.sort_values(ascending=False).head(250)

In [50]:
sdf = pd.DataFrame({"slug": uci_filtered.index})

In [51]:
sdf["category"] = ""

In [52]:
sdf.to_csv("uci_categories_attributed.csv", index=False)

In [54]:
pt["Categoria"].unique()

array(['Entretenimento', 'Internacional', 'Economia', 'Política',
       'Ciência e Tecnologia', 'Turismo e Gastronomia', 'Saúde', 'Brasil',
       'Podcast', 'Bem-Estar', 'Mundo', 'saúde', 'Governo Lula',
       'entretenimento', 'política', 'Esportes', 'Esporte', 'Educação',
       'Ciência e Saúde', 'Jornais e Programas', 'Pop e Arte',
       'Turismo e Viagem', 'Meio-Ambiente', 'VIVA BEM'], dtype=object)

In [66]:
pt_all = pt.loc[pt["URL"].str.contains("http"), "URL"].apply(lambda x: x.split("://")[1].split("/")[1]).value_counts()

In [70]:
pt_filtered = pt_all.sort_values(ascending=False).head(250)
sdf = pd.DataFrame({"slug": pt_filtered.index})
sdf["category"] = ""
sdf.to_csv("recognasumm_categories_attributed.csv", index=False)

## Load in mappings and apply

In [92]:
uci["derived_section"] = uci["URL"].apply(lambda x: x.split("://")[1].split("/")[1])
pt["derived_section"] = pt["URL"].apply(lambda x: x.split("://")[1].split("/")[1])

In [93]:
uci_mapping = pd.read_csv("../../data/uci_categories_attributed.csv")
pt_mapping = pd.read_csv("../../data/recognasumm_categories_attributed.csv")

In [94]:
uci = pd.merge(uci_mapping[uci_mapping.category.notna()], uci, left_on="slug", right_on="derived_section", how="right")
uci = uci.drop(columns=["slug", "derived_section"])
uci = uci.rename(columns={"category": "derived_label"})

In [96]:
pt = pd.merge(pt_mapping[pt_mapping.category.notna()], pt, left_on="slug", right_on="derived_section", how="right")
pt = pt.drop(columns=["slug", "derived_section"])
pt = pt.rename(columns={"category": "derived_label"})

## Apply count vectorizer and NB

In [103]:
uci_train = uci[uci.derived_label.notna()]

In [108]:
X_train, X_test, y_train, y_test = train_test_split(uci_train["TITLE"], uci_train["derived_label"], test_size=0.2, random_state=20240819)

In [110]:
# count vectorize X's
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

In [111]:
# train model
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

In [112]:
# test model
clf.score(X_test_counts, y_test)

0.8401466027628982

In [121]:
clf.score(X_test_counts, uci.iloc[X_test.index]["CATEGORY"])

0.8581900197349873

# Embeddings + XGBoost

In [125]:
embedder = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

<All keys matched successfully>


In [139]:
uci_sample = uci.sample(10000).reset_index()

In [140]:
uci_embeddings = embedder.encode(uci_sample["TITLE"])

In [146]:
# encode "CATEGORY" as integers
uci_sample["CATEGORY"] = uci_sample["CATEGORY"].astype("category")
uci_sample["CATEGORY"] = uci_sample["CATEGORY"].cat.codes

In [147]:
X_train, X_test, y_train, y_test = train_test_split(uci_embeddings, uci_sample["CATEGORY"], test_size=0.2, random_state=20240819)

In [148]:
# train xgboost model
xgb = xgboost.XGBClassifier()
xgb.fit(X_train, y_train)

In [149]:
# test xgboost model
xgb.score(X_test, y_test)

0.8825

# Llama 3.1

In [155]:
prompt = """You are an editor at a news organization. You have been tasked with categorizing news articles into one of the following categories:
- Business
- Entertainment
- Health
- Technology

You will be given the title of a news article and you must categorize it into one of the above categories.

Return the category of the news article, and a brief explanation of why you chose that category.

EXAMPLE:
Title: 5 SECONDS OF SUMMER Announce 'Rock Out With Your Socks Out' 2015 

Category: Entertainment
Explanation: The title mentions a band and a concert tour, which are typically associated with the entertainment category.
"""

In [161]:
mapping = {
    "Entertainment": 1,
    "Technology": 3,
    "Business": 0,
    "Health": 2
}

In [169]:
results = []

for i, row in tqdm(uci_sample.head(100).iterrows()):
    title = row["TITLE"]
    category = row["CATEGORY"]

    llm_resp = ollama.chat(
        "llama3.1",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": f"Title: {title}"},
        ]
    )

    cat = llm_resp["message"]["content"].split("\n")[0].split("Category: ")[1]
    try:
        cat_code = mapping[cat]
    except KeyError:
        cat_code = None

    results.append(
        {
            "title": title,
            "category": category,
            "predicted_category": cat_code
        }
    )

100it [05:36,  3.36s/it]


In [171]:
rdf = pd.DataFrame(results)

In [173]:
nnrdf = rdf[rdf.predicted_category.notna()]

In [174]:
# measure accuracy
nnrdf["category"] = nnrdf["category"].astype(int)
nnrdf["predicted_category"] = nnrdf["predicted_category"].astype(int)

nnrdf["correct"] = nnrdf["category"] == nnrdf["predicted_category"]

nnrdf["correct"].mean()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nnrdf["category"] = nnrdf["category"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nnrdf["predicted_category"] = nnrdf["predicted_category"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nnrdf["correct"] = nnrdf["category"] == nnrdf["predicted_category"]


0.717391304347826