In [2]:
#load data
import pandas as pd
def load_data(csv_file):
    """Load labelled question pairs from a CSV file.

    The file must contain the columns ``question1``, ``question2`` and
    ``is_duplicate``. Rows in which any of the three is missing are dropped:
    pandas reads an empty cell as float NaN, and a NaN in a list of sentences
    breaks every string operation applied to it later.

    Args:
        csv_file (str): Path to the CSV file.

    Returns:
        tuple: ``(sentences1, sentences2, is_duplicate)`` - three lists of
        equal length, aligned row by row. On any error a message is printed
        and ``([], [], [])`` is returned.
    """
    required_columns = ['question1', 'question2', 'is_duplicate']

    try:
        df = pd.read_csv(csv_file, encoding='utf-8')
        # A single dropna over all three columns keeps the lists row-aligned.
        # A missing column raises KeyError here and is reported below.
        complete_rows = df.dropna(subset=required_columns)

        sentences1 = complete_rows['question1'].astype(str).tolist()
        sentences2 = complete_rows['question2'].astype(str).tolist()
        is_duplicate = complete_rows['is_duplicate'].tolist()

        print(f"Loaded {len(sentences1)} sentences.")
        return sentences1, sentences2, is_duplicate

    except FileNotFoundError:
        print("Wrong Path")
        return [], [], []

    except Exception as e:
        print(f"An {e} Error Occurred")
        return [], [], []

# NOTE(review): machine-specific absolute path with mixed '/' and '\' separators;
# it will only resolve on the original author's machine. Consider a path relative
# to a configurable data directory.
DATA_PATH = r'C:/Users/Jash\Documents/Research\Semantic Equivilance\SemanticEquivilance/question_pairs/questions.csv'
# load_data returns (question1 list, question2 list, is_duplicate list), so
# `value` holds the is_duplicate labels aligned with the two sentence lists.
sentences1, sentences2, value = load_data(DATA_PATH)


Loaded 404351 sentences.


In [5]:
from lambeq import BobcatParser, SpacyTokeniser, Rewriter, AtomicType, IQPAnsatz
from lambeq.backend.grammar import Diagram as grammatical_diagram
from lambeq.backend.quantum import Diagram as quantum_circuit
from typing import Optional
import os, time, multiprocessing
# Sets TOKENIZERS_PARALLELISM for this process and any child processes.
# NOTE(review): presumably read by the tokenizer backend used by the parser;
# confirm that "true" is the intended value when worker processes are also used.
os.environ["TOKENIZERS_PARALLELISM"] = "true" #environment variable for multithreading

# Worker count for the multiprocessing pool: one process per CPU reported by the OS.
num_processes = multiprocessing.cpu_count()
print(f"Using {num_processes} processes.")

# Per-process pipeline objects. They start as None and are filled in by
# _initializer(), which must run (in each process) before they are used.
_tokenizer = None
_parser = None
_rewriter = None
_ansatz = None

def _initializer():
    """Populate the module-level tokenizer, parser, rewriter and ansatz.

    Intended to run once per process (it is passed as the ``initializer`` of
    the multiprocessing pool) so that each process builds its own objects and
    stores them in the ``_tokenizer``, ``_parser``, ``_rewriter`` and
    ``_ansatz`` globals.
    """
    global _tokenizer, _parser, _rewriter, _ansatz

    rewrite_rules = ['prepositional_phrase', 'determiner']
    # One qubit for each noun wire and one for each sentence wire.
    qubits_per_type = {AtomicType.NOUN: 1, AtomicType.SENTENCE: 1}

    _tokenizer = SpacyTokeniser()
    _parser = BobcatParser(verbose="suppress")
    _rewriter = Rewriter(rewrite_rules)
    _ansatz = IQPAnsatz(qubits_per_type, n_layers=2, n_single_qubit_params=3)


Using 12 processes.


In [6]:
def process_data(sentence: str, tokeniser, parser, rewriter, ansatz) -> Optional[quantum_circuit]:
    """Convert a single sentence into a circuit.

    Steps: strip and lower-case the text, tokenise it, parse the tokens into a
    diagram, apply ``rewriter``, take the normal form, apply the ``curry``
    rewrite rule, and finally apply ``ansatz``.

    Args:
        sentence (str): Sentence to convert.
        tokeniser: Object providing ``tokenise_sentence(text)``.
        parser: Object providing ``sentence2diagram(tokens, tokenised=True)``.
        rewriter: Callable applied to the parsed diagram.
        ansatz: Callable that turns the curried diagram into a circuit.

    Returns:
        Optional[quantum_circuit]: The result of ``ansatz``, or None when the
        input is not a string, the parser returns no diagram, or any step
        raises. Failures are printed together with their cause.
    """
    if not isinstance(sentence, str):
        # Typically a float NaN coming from an empty CSV cell.
        print(f"Error processing sentence {sentence}: expected a str, got {type(sentence).__name__}")
        return None

    try:
        sentence = sentence.strip().lower()
        tokens = tokeniser.tokenise_sentence(sentence)
        diagram = parser.sentence2diagram(tokens, tokenised=True)
        if diagram is None:
            return None

        diagram = rewriter(diagram)
        normalised_diagram = diagram.normal_form()
        curry_functor = Rewriter(['curry'])
        curried_diagram = curry_functor(normalised_diagram)
        return ansatz(curried_diagram)
    except Exception as e:
        # Best-effort by design: one bad sentence must not stop a long run,
        # but the cause is reported so failures can be diagnosed.
        print(f"Error processing sentence {sentence}: {e!r}")
        return None
def _process_data_for_pool(sentence: str) -> Optional[quantum_circuit]:
    """Single-argument adapter around process_data for use with ``pool.map``.

    Forwards ``sentence`` together with the module-level ``_tokenizer``,
    ``_parser``, ``_rewriter`` and ``_ansatz``. Those globals are None until
    ``_initializer`` has run in the calling process.

    Args:
        sentence (str): Sentence to convert.

    Returns:
        Whatever ``process_data`` returns for this sentence (None on failure).
    """
    return process_data(sentence, _tokenizer, _parser, _rewriter, _ansatz)

def process_sentences(sentences: list[str], batch_size: int = 50) -> list[Optional[quantum_circuit]]:
    """Convert sentences to circuits in parallel with a multiprocessing pool.

    The input is fed to the pool in consecutive batches so progress can be
    reported; the results of every batch are collected, in input order.

    Args:
        sentences (list[str]): Sentences to process.
        batch_size (int): Number of sentences submitted to the pool at a time.

    Returns:
        list[Optional[quantum_circuit]]: One entry per input sentence, in the
        same order; None marks a sentence that could not be processed.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if not sentences:
        # Nothing to do; avoid starting worker processes for an empty input.
        return []

    start_time = time.time()
    results = []
    with multiprocessing.Pool(processes=num_processes, initializer=_initializer) as pool:
        for i in range(0, len(sentences), batch_size):
            batch = sentences[i:i + batch_size]
            print(f"Processing batch {i // batch_size + 1} with {len(batch)} sentences.")
            # Accumulate every batch; keeping only the last one would drop
            # all earlier results.
            results.extend(pool.map(_process_data_for_pool, batch))
    end_time = time.time()
    print(f"Processed {len(sentences)} sentences in {end_time - start_time:.4f} seconds.")
    return results

In [None]:
# Build the tokenizer/parser/rewriter/ansatz once; rebuilding them for every
# batch repeats the same expensive setup with no change in the result.
_initializer()
for i in range(0, len(sentences1), 50):
    batch1 = sentences1[i:i + 50]
    #batch2 = sentences2[i:i + 50]
    print(f"Processing batch {i // 50 + 1} with {len(batch1)} sentences.")
    # process_data handles ONE sentence, so map it over the current batch
    # (passing the whole sentences1 list makes every call fail).
    results1 = [
        process_data(sentence, _tokenizer, _parser, _rewriter, _ansatz)
        for sentence in batch1
    ]
    #results2 = process_sentences(batch2)

    # Here you can handle the results as needed
    # For example, you could save them to a file or process them further

Processing batch 1 with 50 sentences.


In [None]:
# Entry point for the parallel run: converts every question in sentences1 to a
# circuit via process_sentences. The equivalent steps for sentences2, and the
# filtering of failed (None) results, are currently disabled below.
if __name__ == "__main__":
    # Process sentences1 and sentences2 in parallel
    print("Processing sentences1...")
    diagrams1 = process_sentences(sentences1)
    # print("Processing sentences2...")
    # diagrams2 = process_sentences(sentences2)

    # # Filter out None values (errors)
    # diagrams1 = [d for d in diagrams1 if d is not None]
    # diagrams2 = [d for d in diagrams2 if d is not None]

    # print(f"Processed {len(diagrams1)} diagrams from sentences1 and {len(diagrams2)} from sentences2.")

Processing sentences1...
Processing batch 1 with 50 sentences.


In [None]:
# Build the shared tokenizer/parser/rewriter/ansatz before the serial loop.
_initializer()
for i in range(len(sentences1)):
    # Both questions of the pair are always converted, even if the first fails.
    circuit1 = process_data(sentences1[i], _tokenizer, _parser, _rewriter, _ansatz)
    circuit2 = process_data(sentences2[i], _tokenizer, _parser, _rewriter, _ansatz)

    if circuit1 is None or circuit2 is None:
        print(f"Error processing sentences at index {i}. Skipping comparison.")
        continue

    # NOTE(review): a truthy label in value[i] alone makes the pair count as
    # "equivalent", regardless of the circuits - confirm this is intended.
    if not (circuit1 == circuit2 or value[i]):
        print("The circuits are not equivalent.")
        continue

    print("The circuits are equivalent.")
    circuit1.draw(figsize=(10, 10))
    circuit2.draw(figsize=(10, 10))

In [None]:
import torch
import random
import numpy as np

# Seed every random number generator in use so runs are repeatable.
SEED = 12
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Inputs are the (question1, question2) pairs; targets are the is_duplicate
# labels held in `value`. The pairs are materialised with list() because a bare
# zip object can be iterated only once and would be empty after the first pass.
# TODO: portion this into train/test/validation splits.
# NOTE(review): these are still raw sentence strings; they presumably need to be
# converted to circuits before they can be fed to the model - confirm.
train_data, train_labels = list(zip(sentences1, sentences2)), value

BATCH_SIZE = 32
EPOCHS = 100
from torch import nn
from lambeq import PennyLaneModel
class XORSentenceModel(PennyLaneModel):
    """Circuit-pair classifier with a small feed-forward head.

    Each of the two inputs is evaluated with ``get_diagram_output``; the two
    outputs are concatenated along dim 1 and passed through ``xor_net``, which
    ends in a sigmoid and therefore emits one value in (0, 1) per pair.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The head takes 4 input features.
        # NOTE(review): presumably 2 outputs per circuit x 2 circuits - adjust
        # if the circuits produce a different output size.
        head_layers = [
            nn.Linear(4, 10),
            nn.ReLU(),
            nn.Linear(10, 1),
            nn.Sigmoid(),
        ]
        self.xor_net = nn.Sequential(*head_layers)

    def forward(self, circuit1, circuit2):
        """Score pairs of circuits.

        Args:
            circuit1: First members of the pairs, as accepted by ``get_diagram_output``.
            circuit2: Second members of the pairs, in the same order.

        Returns:
            The output of ``xor_net`` for the concatenated circuit outputs.
        """
        first_output = self.get_diagram_output(circuit1)
        second_output = self.get_diagram_output(circuit2)
        joined = torch.cat((first_output, second_output), dim=1)
        # Affine map x -> 2x - 1 (takes values in [0, 1] to [-1, 1]).
        rescaled = 2 * (joined - 0.5)
        return self.xor_net(rescaled)
    
# NOTE(review): `a` and `b` are not defined anywhere in the visible notebook, so
# the next line raises NameError (see the cell output). They presumably should be
# the sequences of first/second circuits of all pairs the model will see - define
# them from the processed circuits before building the model.
model = XORSentenceModel().from_diagrams(a+b, probabilities=True, normalize=True)
model.initialise_weights()
model = model.double()  # Convert model to double precision
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Use Adam optimizer
def accuracy(circs, labels):
    """Fraction of circuit pairs whose rounded model output equals the label.

    Args:
        circs: Sequence of ``(circuit1, circuit2)`` pairs.
        labels: Sequence of 0/1 targets, one per pair, in the same order.

    Returns:
        float: Share of correct predictions in [0, 1]; 0.0 for empty input.
    """
    if len(circs) == 0:
        # Avoid dividing by zero on an empty evaluation set.
        return 0.0

    # The model's forward takes the two sides of the pairs as separate arguments.
    first_circuits, second_circuits = zip(*circs)
    with torch.no_grad():  # evaluation only; no gradients needed
        predicted = model(first_circuits, second_circuits)

    targets = torch.as_tensor(labels, dtype=torch.double)
    hits = (torch.round(torch.flatten(predicted)) == targets).sum().item()
    return hits / len(circs)

best = {'accuracy': 0.0,
        'epoch': 0,}
EVAL_EVERY = 5  # report accuracy / checkpoint every this many epochs

# Materialise both sequences so they can be sliced into mini-batches and
# re-read on every epoch (a zip object would be exhausted after one pass).
# NOTE(review): assumes train_data holds (circuit1, circuit2) pairs and
# train_labels the matching 0/1 targets - confirm against the data cell.
train_pairs = list(train_data)
train_targets = list(train_labels)

for epoch in range(EPOCHS):
    epoch_loss = 0.0
    n_batches = 0
    n_correct = 0

    for start in range(0, len(train_pairs), BATCH_SIZE):
        batch_pairs = train_pairs[start:start + BATCH_SIZE]
        # forward() takes the two sides of the pairs as separate arguments.
        first_circuits, second_circuits = zip(*batch_pairs)
        # torch.tensor on a list; the legacy DoubleTensor(int) constructor
        # would allocate an uninitialised tensor of that size instead.
        targets = torch.tensor(train_targets[start:start + BATCH_SIZE], dtype=torch.double)

        optimizer.zero_grad()
        predicted = torch.flatten(model(first_circuits, second_circuits))
        # BCE loss for binary classification
        loss = torch.nn.functional.binary_cross_entropy(predicted, targets)
        epoch_loss += loss.item()
        n_batches += 1
        loss.backward()
        optimizer.step()

        # Running training accuracy, taken from this batch's predictions.
        n_correct += (torch.round(predicted.detach()) == targets).sum().item()

    # Evaluate once per epoch (every EVAL_EVERY epochs), not once per batch.
    # NOTE(review): this is accuracy on the training data; switch to a held-out
    # split once one exists.
    if epoch % EVAL_EVERY == 0 and train_pairs:
        acc = n_correct / len(train_pairs)
        print(f"Epoch {epoch}, Loss: {epoch_loss / n_batches}, Accuracy: {acc}")
        if acc > best['accuracy']:
            best['accuracy'] = acc
            best['epoch'] = epoch
            print(f"New best model found at epoch {epoch} with accuracy {acc}")
            model.save('best_model.pth')

NameError: name 'a' is not defined