<a href="https://colab.research.google.com/github/Mauriciocr207/voice-assistant-ai/blob/main/colab/voice_assistant_ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Visualización de Embeddings en 2D y 3D de BETO
## Modelo
Utilizaremos el modelo BETO (`dccuchile/bert-base-spanish-wwm-cased`) para obtener los embeddings de una lista de palabras y visualizarlos.

In [18]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.decomposition import PCA
import plotly.express as px
import pandas as pd

# Load the pretrained BETO tokenizer and model by their shared checkpoint name
# (tokenizer first, then model — same order as before).
model_name = "dccuchile/bert-base-spanish-wwm-cased"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name),
)

# Read the word list from the "word" column of the CSV file
csv_path = "./words.csv"
df_words = pd.read_csv(csv_path)
words = df_words["word"].tolist()

# Compute one embedding vector per word
def get_word_embeddings(words):
    """Return a list with one NumPy embedding per entry of ``words``.

    Each word is tokenized on its own with the module-level ``tokenizer``
    and run through the module-level ``model`` with gradient tracking
    disabled. The embedding is the mean of ``last_hidden_state`` along
    dim 1 (presumably the token axis, so the average would include any
    special tokens the tokenizer adds — confirm), squeezed and converted
    to a NumPy array.

    Args:
        words: iterable of strings to embed.

    Returns:
        A list of NumPy arrays, in the same order as ``words``; an empty
        list when ``words`` is empty.
    """

    def _embed(word):
        # Encode a single word as PyTorch tensors
        encoded = tokenizer(word, return_tensors="pt")
        # Inference only: no autograd graph is needed
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        # Average over dim 1 and drop the singleton dimensions
        pooled = hidden_states.mean(dim=1)
        return pooled.squeeze().numpy()

    return [_embed(word) for word in words]

# Embed every word loaded from the CSV
embeddings = get_word_embeddings(words)

# Project the embeddings onto three principal components
pca = PCA(n_components=3)
reduced_embeddings = pca.fit_transform(embeddings)

# Build the Plotly input: one row per word with its three PCA coordinates
# and the word itself in a "word" column.
df_embeddings = pd.DataFrame(
    reduced_embeddings,
    columns=["PC1", "PC2", "PC3"],
).assign(word=words)

# Interactive 3D scatter plot of the reduced embeddings, with each point
# labelled by its word.
fig = px.scatter_3d(
    df_embeddings,
    x="PC1",
    y="PC2",
    z="PC3",
    text="word",
    title="Embeddings de palabras en 3D (PCA)",
    # Axis display names. PC3 is plotted on the z axis, so it is labelled "z"
    # (it was previously labelled "y", duplicating the PC2 label).
    labels={"PC1": "x", "PC2": "y", "PC3": "z"},
)

# Turn off the spike lines on the x and y axes of the 3D scene.
# NOTE(review): the z axis is left at its default — confirm whether
# zaxis_showspikes=False was also intended.
fig.update_layout(
    scene=dict(xaxis_showspikes=False,
               yaxis_showspikes=False)
)

# Render the interactive figure
fig.show()


Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
