In [None]:
!pip install lambeq[extras]

Collecting lambeq[extras]
  Downloading lambeq-0.4.3-py3-none-any.whl.metadata (5.4 kB)
Collecting pytket>=1.31.0 (from lambeq[extras])
  Downloading pytket-1.40.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.7 kB)
Collecting tensornetwork (from lambeq[extras])
  Downloading tensornetwork-0.4.6-py3-none-any.whl.metadata (6.8 kB)
Collecting discopy>=1.1.7 (from lambeq[extras])
  Downloading discopy-1.2.0-py3-none-any.whl.metadata (21 kB)
Collecting pennylane>=0.29.1 (from lambeq[extras])
  Downloading PennyLane-0.40.0-py3-none-any.whl.metadata (10 kB)
Collecting pennylane-honeywell (from lambeq[extras])
  Downloading PennyLane_Honeywell-0.34.1-py3-none-any.whl.metadata (8.0 kB)
Collecting pennylane-qiskit (from lambeq[extras])
  Downloading PennyLane_qiskit-0.40.0-py3-none-any.whl.metadata (6.4 kB)
Collecting pytket-qiskit>=0.21.0 (from lambeq[extras])
  Downloading pytket_qiskit-0.63.0-py3-none-any.whl.metadata (4.8 kB)
Collecting rustworkx>=0.14.0 (from penn

In [None]:
import os
import re
import random
import pandas as pd
import spacy
import gc
from IPython.display import display

# Location of the BBC news dump, relative to the working directory.
csv_path = 'bbc-news-data.csv'

# Fail fast: every later step needs `df`, so a missing or unreadable file must
# stop the cell here instead of surfacing further down as a NameError on `df`.
if not os.path.exists(csv_path):
    print("El archivo no existe en la ruta especificada:", csv_path)
    raise FileNotFoundError(csv_path)

print("El archivo existe. Procediendo a cargarlo...")
try:
    # The file is read as tab-separated values.
    df = pd.read_csv(csv_path, sep='\t', engine='python')
except pd.errors.ParserError as e:
    print("Error al parsear el archivo CSV:", e)
    raise
except Exception as e:
    # Report the problem in the notebook output, then propagate it unchanged.
    print("Ocurrió un error al cargar el archivo:", e)
    raise

print("Archivo cargado exitosamente.")
display(df.head())

from sklearn.model_selection import train_test_split

# Inputs are the headline strings; targets are their category labels.
X = df['title'].values
y = df['category'].values

# Hold out 20% of the rows, stratified on the category, with a fixed seed of 42.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

def word_count(sentence):
    """Return how many whitespace-separated tokens ``sentence`` contains."""
    tokens = sentence.split()
    return len(tokens)

# Keep the training headlines of at most 10 words, paired with their category.
filtered = [
    (text, cat)
    for text, cat in zip(X_train, y_train)
    if word_count(text) <= 10
]
print("Número de titulares cortos (antes de submuestrear):", len(filtered))

# Keep only the first MAX_HEADLINES short headlines to limit RAM usage.
# (An earlier comment here said 300, but the slice keeps 37, which is also what
# the recorded output of this cell reports.)
MAX_HEADLINES = 37
filtered = filtered[:MAX_HEADLINES]
print("Número de titulares cortos utilizados:", len(filtered))

def clean_sentence(sentence):
    """Remove punctuation from ``sentence``, folding possessive ``'s`` into the word.

    Each ``'s`` is first replaced by ``s``; afterwards every character that is
    neither a word character nor whitespace is deleted.
    """
    return re.sub(r'[^\w\s]', '', sentence.replace("'s", "s"))

def add_period(sentence):
    """Trim surrounding whitespace and make sure the result ends with a period."""
    trimmed = sentence.strip()
    return trimmed if trimmed.endswith('.') else trimmed + '.'

# Work on a separate list holding the same (headline, category) pairs.
train_sents = list(filtered)

from lambeq import BobcatParser
from lambeq.text2diagram.bobcat_parser import BobcatParseError

errors = 0
parser = BobcatParser(root_cats=['S'])
# NOTE(review): `nlp` is loaded but not used in the cells visible here —
# confirm whether it is still needed.
nlp = spacy.load('en_core_web_sm')
real_diagrams = []
real_texts = []
for text, _ in train_sents:
    print(f"Analizando la oración: {text}")
    # Strip punctuation first, then terminate the headline with a period.
    text_clean = add_period(clean_sentence(text))
    try:
        parsed = parser.sentence2diagram(text_clean)
    except BobcatParseError:
        # Skip headlines the parser rejects, but keep count of them.
        print(f"Error al analizar la oración: {text_clean}")
        errors += 1
    else:
        # Diagram and cleaned text are appended together so the lists stay aligned.
        real_diagrams.append(parsed)
        real_texts.append(text_clean)
print(f"{errors} errores al analizar oraciones (reales)")

# Drop the raw frame and the split arrays, which the later visible cells do not
# use, then run a garbage collection pass to lower RAM usage.
del df
del X, y
del X_train, y_train
gc.collect()


El archivo existe. Procediendo a cargarlo...
Archivo cargado exitosamente.


Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


Número de titulares cortos (antes de submuestrear): 1780
Número de titulares cortos utilizados: 37
Analizando la oración: Asian banks halt dollar's slide
Analizando la oración: GB quartet get cross country call
Analizando la oración: Spider-Man creator wins profits
Analizando la oración: Howard unveils election platform
Analizando la oración: Police probe BNP mosque leaflet
Analizando la oración: Holmes urged to compete at Worlds
Analizando la oración: Kenyan school turns to handhelds
Analizando la oración: Philippoussis doubt over Open bid
Analizando la oración: Mourinho plots impressive course
Analizando la oración: Lloyd's of London head chides FSA
Analizando la oración: Cole faces lengthy injury lay-off
Analizando la oración: US box office set for record high
Analizando la oración: Player burn-out worries Robinson
Analizando la oración: UKIP's secret weapon?
Error al analizar la oración: UKIPs secret weapon.
Analizando la oración: BMW reveals new models pipeline
Analizando la oraci

7488

In [None]:
# Build "fake" headlines from the real ones by scrambling their word order.
def generate_fake_sentence(sentence):
    """Return a word-shuffled, punctuation-free copy of ``sentence`` ending in '.'.

    The sentence is cleaned *before* shuffling, so the trailing period of the
    source headline does not travel with its last word into the middle of the
    fake. The period is appended *after* cleaning, so ``clean_sentence`` cannot
    strip it again. Fake headlines therefore end with a period exactly like the
    real ones and differ from them only in word order.

    Args:
        sentence: Headline text; it may already end with a period.

    Returns:
        The shuffled headline. When the headline contains at least two distinct
        words, the order is re-drawn a bounded number of times until it differs
        from the original, so a "fake" is not accidentally identical to its
        source headline.
    """
    words = clean_sentence(sentence).split()
    shuffled = list(words)
    random.shuffle(shuffled)
    if len(set(words)) > 1:
        # A different ordering exists; retry a few times if the shuffle happened
        # to reproduce the original order. The bound guarantees termination.
        for _ in range(10):
            if shuffled != words:
                break
            random.shuffle(shuffled)
    return add_period(" ".join(shuffled))

# Parse each shuffled headline and keep only the ones the parser accepts.
fake_diagrams = []
for real_text in real_texts:
    fake_text = generate_fake_sentence(real_text)
    try:
        fake_diagrams.append(parser.sentence2diagram(fake_text))
        print(f"Agregado correctamente: {fake_text}")
    except (BobcatParseError, ValueError) as e:
        # Unparseable fakes are reported and skipped.
        print(f"Error al analizar el fake: {fake_text}. Se omitirá. Error: {e}")

print(f"Generados {len(fake_diagrams)} diagramas falsos válidos.")

from lambeq import StronglyEntanglingAnsatz, AtomicType

# Every listed atomic type is mapped to the same size, 1.
ob_map = dict.fromkeys(
    (
        AtomicType.NOUN,
        AtomicType.CONJUNCTION,
        AtomicType.PREPOSITIONAL_PHRASE,
        AtomicType.SENTENCE,
        AtomicType.PUNCTUATION,
        AtomicType.NOUN_PHRASE,
    ),
    1,
)
# Two layers, three parameters per single-qubit rotation.
ansatz = StronglyEntanglingAnsatz(ob_map=ob_map, n_layers=2, n_single_qubit_params=3)
# Apply the ansatz to every real-headline diagram.
real_circuits = [ansatz(diagram) for diagram in real_diagrams]

# Apply the same ansatz to every fake-headline diagram.
fake_circuits = [ansatz(diagram) for diagram in fake_diagrams]

print("Número de circuitos reales:", len(real_circuits))
print("Número de circuitos falsos:", len(fake_circuits))



Agregado correctamente: dollars slide Asian halt banks
Agregado correctamente: get GB country quartet call cross
Agregado correctamente: SpiderMan profits wins creator
Agregado correctamente: Howard unveils election platform
Agregado correctamente: leaflet mosque Police probe BNP
Agregado correctamente: urged to compete at Worlds Holmes
Agregado correctamente: handhelds turns school Kenyan to
Agregado correctamente: Philippoussis doubt over Open bid
Agregado correctamente: plots course impressive Mourinho
Agregado correctamente: Lloyds of head London chides FSA
Agregado correctamente: injury layoff faces lengthy Cole
Agregado correctamente: high record for box office US set
Agregado correctamente: burnout Player Robinson worries
Agregado correctamente: pipeline models BMW new reveals
Error al analizar el fake: in Sport betting rules spotlight. Se omitirá. Error: Bobcat failed to parse 'in Sport betting rules spotlight'.
Agregado correctamente: Exel shares rumour lifts Takeover
Agregado

In [None]:
# Label convention: 1 = real headline, 0 = fake (shuffled) headline.
n_real, n_fake = len(real_circuits), len(fake_circuits)

# Balance the two classes by truncating both to the smaller count.
n_samples = min(n_real, n_fake)
X_real, X_fake = real_circuits[:n_samples], fake_circuits[:n_samples]
y_real, y_fake = [1] * n_samples, [0] * n_samples

# Real samples first, then fake ones; labels follow the same order.
X_all = [*X_real, *X_fake]
y_all = [*y_real, *y_fake]

import torch
from lambeq import PennyLaneModel, Dataset, PytorchTrainer
# Backend selection passed to the model: PennyLane's 'default.qubit'.
backend_config = {'backend': 'default.qubit'}
# Build the model from every circuit (real and fake) in the training data.
model = PennyLaneModel.from_diagrams(
    X_all,
    probabilities=True,
    normalize=True,
    backend_config=backend_config
)
model.output_dim = 2  # two classes: real and fake
# NOTE(review): the weights are initialised here, before torch.manual_seed(SEED)
# is called further down. If initialisation draws from torch's global RNG, the
# starting weights are not reproducible — confirm and seed earlier if so.
model.initialise_weights()

BATCH_SIZE = 1  # small batch size to save memory
train_dataset = Dataset(X_all, y_all, batch_size=BATCH_SIZE)

def acc(y_hat, y):
    """Return the fraction of rows in ``y_hat`` whose arg-max equals the label.

    Args:
        y_hat: 2-D tensor of per-class scores, one row per sample.
        y: Labels, either a tensor or a plain sequence of ints.

    Returns:
        The accuracy as a Python float.
    """
    # as_tensor reuses an existing tensor instead of copy-constructing it, which
    # avoids the UserWarning that torch.tensor(<tensor>) emits on every call.
    labels = torch.as_tensor(y)
    return (torch.argmax(y_hat, dim=1) == labels).sum().item() / len(y)

def loss(y_hat, y):
    """Return the cross-entropy between model outputs and integer class labels.

    Args:
        y_hat: 2-D tensor of per-class scores, one row per sample.
        y: Class indices, either a tensor or a plain sequence of ints.

    Returns:
        A scalar tensor holding the mean cross-entropy over the batch.
    """
    # The call was previously split in the middle of `cross_entropy`, which made
    # the function return a non-existent attribute instead of the loss.
    # as_tensor avoids the copy-construct UserWarning when `y` is already a tensor.
    # NOTE(review): F.cross_entropy treats `y_hat` as unnormalised logits, while
    # the model in this notebook is built with probabilities=True — confirm that
    # feeding probabilities here is intended.
    targets = torch.as_tensor(y).long()
    return torch.nn.functional.cross_entropy(y_hat, targets)

# Training hyper-parameters.
EPOCHS, LEARNING_RATE, SEED = 10, 0.1, 42

torch.manual_seed(SEED)

# Collect the trainer arguments in one place, then build the trainer from them.
trainer_options = {
    'model': model,
    'loss_function': loss,
    'optimizer': torch.optim.Adam,
    'learning_rate': LEARNING_RATE,
    'epochs': EPOCHS,
    'evaluate_functions': {'acc': acc},
    'evaluate_on_train': True,
    'use_tensorboard': False,
    'verbose': 'text',
    'seed': SEED,
}
trainer = PytorchTrainer(**trainer_options)

# Train on the training dataset only; no validation set is supplied.
trainer.fit(train_dataset)

# Run the trained model over the whole training set without tracking gradients.
with torch.no_grad():
    predictions = model(X_all)

# Accuracy is computed on the same data the model was trained on.
accuracy = acc(predictions, y_all)
print(f'Precisión final en el conjunto de entrenamiento: {accuracy:.4f}')

  return torch.nn.functional.cross_entropy(y_hat, torch.tensor(y).clone().detach().long())
  return (torch.argmax(y_hat, dim=1) == torch.tensor(y)).sum().item() / len(y)
Epoch 1:   train/loss: 0.8873   valid/loss: -----   train/time: 1m13s   valid/time: -----   train/acc: 0.5185   valid/acc: -----
Epoch 2:   train/loss: 0.4163   valid/loss: -----   train/time: 1m14s   valid/time: -----   train/acc: 0.5556   valid/acc: -----
Epoch 3:   train/loss: 0.8659   valid/loss: -----   train/time: 1m14s   valid/time: -----   train/acc: 0.6296   valid/acc: -----
Epoch 4:   train/loss: 0.3203   valid/loss: -----   train/time: 1m12s   valid/time: -----   train/acc: 0.7778   valid/acc: -----
Epoch 5:   train/loss: 0.8263   valid/loss: -----   train/time: 1m12s   valid/time: -----   train/acc: 0.6667   valid/acc: -----
Epoch 6:   train/loss: 0.4554   valid/loss: -----   train/time: 1m52s   valid/time: -----   train/acc: 0.7778   valid/acc: -----
Epoch 7:   train/loss: 0.3894   valid/loss: -----   trai

Precisión final en el conjunto de entrenamiento: 0.9444


In [None]:
# Save the trained model to MODEL_SAVE_PATH (a path relative to the working
# directory) and print where it was written.
MODEL_SAVE_PATH = "reduced_quantum_discriminator.lt"
model.save(MODEL_SAVE_PATH)
print(f"Modelo guardado en: {MODEL_SAVE_PATH}")

Modelo guardado en: reduced_quantum_discriminator.lt
