# 📊 Vectorización de Clientes Telco con ChromaDB (Versión Estable)
Este notebook genera embeddings a partir del dataset enriquecido y guarda los resultados en una base vectorial persistente usando ChromaDB.

In [1]:
!pip install -q chromadb sentence-transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.5/19.5 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.6/101.6 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m79.3 MB/s[0m eta [36m0:00:

In [2]:
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
import os

In [3]:
# Load the enriched CSV that carries the loyalty_index and loyalty_class columns
DATASET_PATH = "Dataset_with_Loyalty_Fields(in).csv"  # Upload this file to the Colab session first
df = pd.read_csv(filepath_or_buffer=DATASET_PATH)
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,loyalty_index,loyalty_class
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,11,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,No,One year,No,Mailed check,56.95,1889.5,No,44,1
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,2,0
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,55,1
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,2,0


In [4]:
# Turn each row into a textual representation
def fila_a_texto(row):
    """Render one customer record as a single English sentence.

    Args:
        row: Record indexable by string key (for example a DataFrame row or a
            dict) that provides 'customerID', 'gender', 'InternetService',
            'tenure', 'Churn', 'loyalty_index' and 'loyalty_class'.

    Returns:
        str: The sentence describing the customer.

    Raises:
        KeyError: If any of the keys listed above is missing from ``row``.
    """
    plantilla = (
        "Customer {customerID} is a {gender} with {InternetService} internet, "
        "{tenure} months of tenure, churn: {Churn}, "
        "loyalty index: {loyalty_index}, loyalty class: {loyalty_class}."
    )
    # Fields are looked up in the same order in which they appear in the sentence.
    campos = (
        "customerID",
        "gender",
        "InternetService",
        "tenure",
        "Churn",
        "loyalty_index",
        "loyalty_class",
    )
    return plantilla.format(**{campo: row[campo] for campo in campos})

# One text document per DataFrame row, plus a positional id ("id_0", "id_1", ...) for each
documentos = df.apply(func=fila_a_texto, axis="columns").to_list()
ids = ["id_{}".format(posicion) for posicion, _ in enumerate(documentos)]

In [5]:
# Generate embeddings with Sentence-Transformers
modelo = SentenceTransformer("all-MiniLM-L6-v2")
# Convert the encoder output row by row into plain Python lists.
embeddings = [vector.tolist() for vector in modelo.encode(documentos)]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
# Create and persist the vector database with ChromaDB (modern client API)
DB_DIR = "chroma_loyalty_db"
os.makedirs(DB_DIR, exist_ok=True)

chroma_client = chromadb.PersistentClient(path=DB_DIR)
collection = chroma_client.get_or_create_collection("customers_loyalty")

# Insert in batches (avoids the batch-size limit error)
batch_size = 5000

# Use upsert instead of add so this cell can be re-run safely: the collection
# is persisted on disk and reopened by get_or_create_collection, so on a second
# run every id ("id_0", "id_1", ...) already exists. upsert overwrites those
# records with the current documents/embeddings and inserts any new ones.
for i in range(0, len(documentos), batch_size):
    collection.upsert(
        documents=documentos[i:i + batch_size],
        embeddings=embeddings[i:i + batch_size],
        ids=ids[i:i + batch_size]
    )

print(f"Base vectorial guardada correctamente en '{DB_DIR}' con {len(documentos)} documentos.")

Base vectorial guardada correctamente en 'chroma_loyalty_db' con 7032 documentos.


In [8]:
# Query smoke test: embed a free-text question and fetch the 5 nearest documents
query = "Customer with high loyalty index and no churn"
query_embedding = modelo.encode([query]).tolist()

resultados = collection.query(n_results=5, query_embeddings=query_embedding)
print("\n Resultados más relevantes para la consulta:")
# Only one query was sent, so its matches are the first entry of 'documents'.
documentos_encontrados = resultados['documents'][0]
for texto in documentos_encontrados:
    print(f"- {texto}")


 Resultados más relevantes para la consulta:
- Customer 9975-SKRNR is a Male with No internet, 1 months of tenure, churn: No, loyalty index: 11, loyalty class: 0.
- Customer 5510-BOIUJ is a Male with No internet, 1 months of tenure, churn: Yes, loyalty index: 1, loyalty class: 0.
- Customer 3675-YDUPJ is a Male with No internet, 10 months of tenure, churn: No, loyalty index: 20, loyalty class: 0.
- Customer 5624-BQSSA is a Female with No internet, 1 months of tenure, churn: Yes, loyalty index: 1, loyalty class: 0.
- Customer 5028-HTLJB is a Male with No internet, 1 months of tenure, churn: Yes, loyalty index: 1, loyalty class: 0.
