# Librerías y carga de datos

In [1]:
import pandas as pd
from google.colab import files
from transformers import pipeline
from tqdm import tqdm

# Install the notebook's dependencies through an IPython shell escape
# (-q keeps pip's output quiet).
# NOTE(review): this line runs after the imports at the top of the cell, so on
# a runtime where these packages are missing the imports would fail before the
# install is reached -- consider moving it above them.
!pip install -q transformers datasets accelerate tqdm


# Load /content/df.csv into a DataFrame.
# NOTE(review): `df` is reassigned from "/content/df_traducido.csv" in the
# model cell below before any use visible in this notebook -- confirm whether
# this load is still needed.
df = pd.read_csv("/content/df.csv")

# Modelo MoritzLaurer/deberta-v3-large-zeroshot-v2.0

In [None]:
# Candidate categories offered to the zero-shot classifier. The order of this
# list also fixes the order of the per-label "prob_<label>" columns that are
# built from it further down.
LABELS = [
    "comedy",
    "dance",
    "sports",
    "gaming",
    "educational",
    "vehicles",
    "beauty",
    "food",
    "pets",
    "technology",
    "music",
    "unidentified",
    "other",
]


# Load the translated dataset; this replaces any `df` loaded earlier.
df = pd.read_csv("/content/df_traducido.csv")

# One text per row, taken from the "descripcion_en" column. Missing values are
# replaced with a single space so every entry handed to the classifier is a
# non-empty string.
texts = (
    df["descripcion_en"]
    .fillna(" ")
    .tolist()
)


import warnings

import torch

# Pick the inference device: the first CUDA GPU when the runtime has one,
# otherwise the CPU (device=-1 in the transformers pipeline API). The original
# hard-coded device=0, which fails outright on a runtime without a GPU.
if torch.cuda.is_available():
    _device = 0
else:
    _device = -1
    # Make the fallback visible: a large model on CPU is expected to be slow.
    warnings.warn(
        "No CUDA device available; running the zero-shot classifier on CPU.",
        RuntimeWarning,
    )

# Zero-shot classification pipeline shared by the rest of the notebook.
classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0",
    device=_device,
)


def zero_shot_pipeline_batch(texts, labels, batch_size=32, threshold=0.35):
    """Classify ``texts`` against ``labels`` with the module-level ``classifier``.

    The texts are processed in chunks of ``batch_size``. Each chunk is handed
    to the pipeline in a single call so inference is actually batched, rather
    than invoking the pipeline once per text.

    Args:
        texts: List of strings to classify.
        labels: Candidate label names passed to the zero-shot pipeline.
        batch_size: Number of texts per chunk; also forwarded to the pipeline
            as its inference batch size.
        threshold: Minimum score a label needs to be kept for a text.

    Returns:
        A tuple ``(results_labels, results_probs_selected)`` of two lists with
        one entry per input text: the labels whose score is ``>= threshold``
        and the matching scores, in the order the pipeline returned them.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    results_labels = []
    results_probs_selected = []

    for start in tqdm(range(0, len(texts), batch_size), desc="Procesando textos"):
        batch = texts[start:start + batch_size]

        # One pipeline call per chunk; `batch_size` lets the pipeline group
        # the underlying model inputs into batched forward passes.
        outputs = classifier(
            batch,
            labels,
            multi_label=True,
            batch_size=batch_size,
        )
        # Defensive: normalise to a list in case the pipeline hands back a
        # bare dict for a single-element chunk.
        if isinstance(outputs, dict):
            outputs = [outputs]

        for output in outputs:
            kept = [
                (lbl, score)
                for lbl, score in zip(output["labels"], output["scores"])
                if score >= threshold
            ]
            results_labels.append([lbl for lbl, _ in kept])
            results_probs_selected.append([score for _, score in kept])

    return results_labels, results_probs_selected


# Minimum score a label needs to be listed in the "categories" column.
CATEGORY_THRESHOLD = 0.35

# Single pass over the corpus. A threshold of 0.0 keeps every (label, score)
# pair, so both the thresholded category lists and the full per-label
# probabilities can be derived from the same model outputs instead of running
# the classifier over every text a second time.
all_labels, all_scores = zero_shot_pipeline_batch(
    texts,
    LABELS,
    batch_size=64,
    threshold=0.0,
)

pred_labels = []
pred_probs_selected = []
pred_probs_all = []
for row_labels, row_scores in zip(all_labels, all_scores):
    pairs = list(zip(row_labels, row_scores))

    # Labels (and their scores) that reach the threshold, in pipeline order.
    pred_labels.append(
        [lbl for lbl, score in pairs if score >= CATEGORY_THRESHOLD]
    )
    pred_probs_selected.append(
        [score for _, score in pairs if score >= CATEGORY_THRESHOLD]
    )

    # The pipeline returns its labels/scores sorted by likelihood, NOT in the
    # order of LABELS, so scores must be looked up by label name. Indexing the
    # raw score list by LABELS position would put the wrong label's score in
    # each prob_* column.
    scores_by_label = dict(pairs)
    pred_probs_all.append(
        [float(scores_by_label.get(lbl, float("nan"))) for lbl in LABELS]
    )


df["categories"] = pred_labels
df["categories_probs"] = pred_probs_selected

# One "prob_<label>" column per candidate label; pred_probs_all rows are
# aligned with LABELS by construction above.
for idx, label in enumerate(LABELS):
    df[f"prob_{label}"] = [p[idx] for p in pred_probs_all]


# Descarga archivo CSV

In [None]:

# Write the classified DataFrame to disk without the index column.
# NOTE: the list-valued columns ("categories", "categories_probs") are stored
# in the CSV as their string representation.
output_path = "/content/df_with_categories_preciso.csv"
df.to_csv(output_path, index=False)

# Trigger the browser download this section is named for, using the
# google.colab `files` helper imported at the top of the notebook (it was
# imported but never called).
files.download(output_path)

# Keep the path as the cell's displayed value.
output_path
