In [222]:
from tensorflow.keras.datasets import reuters
import pandas as pd
import numpy as np

In [9]:
(train_data, train_labels), (test_data, test_labels) = reuters.load_data(
            num_words=10000)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters.npz
[1m2110848/2110848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1us/step


In [12]:
len(test_data)

2246

## Decoding news wires back to text

In [13]:
word_index = reuters.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters_word_index.json
[1m550378/550378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1us/step


In [66]:
label_names = reuters.get_label_names()

In [14]:
reverse_word_index = dict(
[(value, key) for (key, value) in word_index.items()]) 

In [30]:
def decode_text(text, reverse_word_index):
    decoded_text = " ".join(
                [reverse_word_index.get(i - 3, "?") for i in text])
    return decoded_text

In [41]:
decode_text(train_data[2], reverse_word_index)

'? shr 3 28 dlrs vs 22 cts shr diluted 2 99 dlrs vs 22 cts net 46 0 mln vs 3 328 000 avg shrs 14 0 mln vs 15 2 mln year shr 5 41 dlrs vs 1 56 dlrs shr diluted 4 94 dlrs vs 1 50 dlrs net 78 2 mln vs 25 9 mln avg shrs 14 5 mln vs 15 1 mln note earnings per share reflect the two for one split effective january 6 1987 per share amounts are calculated after preferred stock dividends loss continuing operations for the qtr 1986 includes gains of sale of investments in ? corp of 14 mln dlrs and associated companies of 4 189 000 less writedowns of investments in national ? inc of 11 8 mln and ? corp of 15 6 mln reuter 3'

## Sample texts

In [55]:
import random


In [185]:
random.seed(42)
sample_index  = random.sample(range(1, len(train_data)), 150)

In [186]:
text_sample = train_data[sample]
labels_sample = train_labels[sample]

In [187]:
label_names_sample = [label_names[label] for label in labels_sample]

In [188]:
decoded_text_sample = [decode_text(text, reverse_word_index) for text in text_sample]

In [189]:
df = pd.DataFrame({"text": decoded_text_sample, "label_code": labels_sample, "label_name": label_names_sample})

In [191]:
#df.to_csv("newsweek_sample.csv", index=False)

In [192]:
df = pd.read_csv("newsweek_sample.csv")

## Extracting topics with an LLM

In [193]:
#!pip install ollama
#ref: https://github.com/ollama/ollama
#run in terminal: ollama run llama3.2:1b
import ollama

Usar el texto 2 de la muestra para mostrar la inestabilidad del resultado.

In [194]:
prompt_v1 = "You are an expert news anchor. What news category does this article belong to?: {text}"
prompt_v2 = "You are an expert news anchor. What news category does this article belong to? Provide your best guess: {text}"
prompt_v3 = "You are an expert news anchor. What news category does this article belong to? \
    Provide your best guess in a sentence with no more than 4 words: {text}"
prompt_v4 = "You are an expert news anchor. What news category does the following article belong to? \
    Please answer only with the category. If you are unsure, your best guess is ok: {text}"

prompt_v5 = "You are an expert news anchor. What news category does the following article belong to? \
    Please answer only with one category.\
    If you are unsure, your best guess is ok. Do not include question marks: {text}"

In [195]:
def ollama_query(text, prompt):
    prompt = prompt
    

    response = ollama.chat(
        model = "llama3.2:1b",
        messages = [{"role":"user", "content":prompt.format(text=text)}]    
    )

    return response["message"]["content"]

In [221]:
results = []

#run llm 10 times to solve for topic stability
for i in range(10):
    df_c = df.copy()

    topics = []
    for row in range(df_c.shape[0]):
        temp_text = df_c.iloc[row].text
        topic = ollama_query(temp_text, prompt_v5)
        topics.append(topic)
    df_c.loc[:, "llm_topic"] = topics
    results.append(df_c)

NameError: name 'np' is not defined

In [224]:
df_c = df.copy()

In [225]:
df_c["topics"] = topics

In [229]:
topics_iter1 = df_c["topics"].value_counts().reset_index()

In [280]:
pareto = (topics_iter1["count"].cumsum() / topics_iter1["count"].sum()) < .81

In [287]:
topic_list = topics_iter1.to_dict(orient="records")

In [295]:
for topic in topic_list[:2]:
    print(topic["topics"])

Business
Economy


In [291]:
topic_mapping = {}

In [306]:
"Business" in "Business/Financial News"

True

In [305]:
topics_iter1

Unnamed: 0,topics,count
0,Business,35
1,Economy,21
2,Economics,15
3,Financial News,14
4,Energy,8
5,Business/Finance,4
6,Business/Financial News,4
7,Investing,3
8,Engineering,2
9,Business News,2


In [303]:
for topic in topic_list:
    a = topic["topics"]
    for topic2 in topic_list:
        b = topic2["topics"]
        
        pair = (a,b)
        
        tokens = tokenizer(pair, return_tensors='tf',padding=True)
        with torch.no_grad():
            outputs = model(**tokens)
        
        # Extract embeddings for the [CLS] token
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        
        similarity = tf.keras.losses.cosine_similarity(cls_embedding[0], cls_embedding[1], axis=-1)
        #similarity = np.abs(similarity)
        print(pair, similarity) 

('Business', 'Business') tf.Tensor(-1.0, shape=(), dtype=float32)
('Business', 'Economy') tf.Tensor(-0.8753977, shape=(), dtype=float32)
('Business', 'Economics') tf.Tensor(-0.82352716, shape=(), dtype=float32)
('Business', 'Financial News') tf.Tensor(-0.89197177, shape=(), dtype=float32)
('Business', 'Energy') tf.Tensor(-0.96130544, shape=(), dtype=float32)
('Business', 'Business/Finance') tf.Tensor(-0.8023946, shape=(), dtype=float32)
('Business', 'Business/Financial News') tf.Tensor(-0.82624143, shape=(), dtype=float32)
('Business', 'Investing') tf.Tensor(-0.9683739, shape=(), dtype=float32)
('Business', 'Engineering') tf.Tensor(-0.81622, shape=(), dtype=float32)
('Business', 'Business News') tf.Tensor(-0.8636549, shape=(), dtype=float32)
('Business', 'Financial Markets') tf.Tensor(-0.864365, shape=(), dtype=float32)
('Business', 'Financials') tf.Tensor(-0.8486435, shape=(), dtype=float32)
('Business', 'Agriculture') tf.Tensor(-0.785066, shape=(), dtype=float32)
('Business', 'Invest

KeyboardInterrupt: 

## Reducing topics with embeddings

In [242]:
#!pip install transformers
#!pip install torch
#!pip install tf-keras

In [None]:
#adapted from https://www.geeksforgeeks.org/word-embeddings-in-nlp/

In [243]:
from transformers import BertTokenizer, TFBertModel
import torch

In [244]:
# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertModel.from_pretrained(model_name)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [245]:
word_pairs = [('learn', 'learning'), ('india', 'indian'), ('fame', 'famous')]

In [250]:
import tensorflow as tf

In [254]:
# Compute similarity for each pair of words
for pair in word_pairs:
	tokens = tokenizer(pair, return_tensors='tf')
	with torch.no_grad():
		outputs = model(**tokens)
	
	# Extract embeddings for the [CLS] token
	cls_embedding = outputs.last_hidden_state[:, 0, :]

	similarity = tf.keras.losses.cosine_similarity(cls_embedding[0], cls_embedding[1], axis=-1)
	
	print(f"Similarity between '{pair[0]}' and '{pair[1]}' using BERT: {similarity:.3f}")


Similarity between 'learn' and 'learning' using BERT: -0.930
Similarity between 'india' and 'indian' using BERT: -0.957
Similarity between 'fame' and 'famous' using BERT: -0.956
