In [None]:
#load data
import pandas as pd
def load_data(csv_file):
    """Load question pairs and their duplicate labels from a CSV file.

    The file is expected to provide the columns ``question1``, ``question2``
    and ``is_duplicate``.

    Args:
        csv_file (str): Path to the CSV file.

    Returns:
        tuple: ``(sentences1, sentences2, is_duplicate)`` - three lists of
        equal length whose elements at the same index belong to the same row.
        Every question is a ``str``; a question that is missing in the file
        is returned as the empty string so the row alignment is preserved.
        On any error a message is printed and ``([], [], [])`` is returned.
    """
    try:
        df = pd.read_csv(csv_file, encoding='utf-8')

        # Empty cells are read as float NaN. Replace them with "" and coerce to
        # str so callers can rely on string methods for every element.
        sentences1 = df['question1'].fillna('').astype(str).tolist()
        sentences2 = df['question2'].fillna('').astype(str).tolist()
        is_duplicate = df['is_duplicate'].tolist()

        # All three lists come from the same DataFrame, so they always have the
        # same length; no separate length check is needed.
        print(f"Loaded {len(sentences1)} sentences.")
        return sentences1, sentences2, is_duplicate

    except FileNotFoundError:
        print("Wrong Path")
        return [], [], []

    except Exception as e:
        # Best-effort loader: report the problem (e.g. a missing column) and
        # hand back empty lists instead of propagating the error.
        print(f"An {e} Error Occurred")
        return [], [], []

# NOTE(review): absolute, machine-specific path (mixed '/' and '\' separators inside a raw string);
# the notebook will only load data on a machine that has this exact location.
DATA_PATH = r'C:/Users/Jash\Documents/Research\Semantic Equivilance\SemanticEquivilance/question_pairs/questions.csv'
# load_data returns (question1 list, question2 list, is_duplicate list); `value` holds the is_duplicate labels.
sentences1, sentences2, value = load_data(DATA_PATH)


Column names: Index(['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate'], dtype='object')
Loaded 404351 sentences.


In [9]:
from lambeq import BobcatParser, SpacyTokeniser, Rewriter, AtomicType, IQPAnsatz
from lambeq.backend.quantum import Diagram as quantum_circuit
from typing import Optional
import os, time, multiprocessing
# Environment flag for tokenizer multithreading (the code that reads it is not visible in this notebook).
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Size of the worker pool: one process per CPU reported by the OS.
num_processes = multiprocessing.cpu_count()
print(f"Using {num_processes} processes.")

# Per-process pipeline objects; they remain None until _initializer() assigns them.
_tokenizer = _parser = _rewriter = _ansatz = None

def _initializer():
    """Create the sentence-to-circuit pipeline objects and store them in module globals.

    Used as the ``initializer`` of the ``multiprocessing.Pool`` in
    ``process_sentences``, so it runs once in each worker process.
    """
    global _tokenizer, _parser, _rewriter, _ansatz

    rewrite_rules = ['prepositional_phrase', 'determiner']
    # Qubit count assigned to each atomic type by the ansatz.
    qubits_per_type = {AtomicType.NOUN: 1, AtomicType.SENTENCE: 1}

    _tokenizer = SpacyTokeniser()
    _parser = BobcatParser(verbose="suppress")
    _rewriter = Rewriter(rewrite_rules)
    _ansatz = IQPAnsatz(qubits_per_type, n_layers=2, n_single_qubit_params=3)


Using 12 processes.


In [10]:
def process_data(sentence: str, tokeniser, parser, rewriter, ansatz) -> Optional[quantum_circuit]:
    """Turn one sentence into a circuit.

    Pipeline: strip and lower-case the text, tokenise it, parse the tokens into
    a diagram, apply ``rewriter``, take the normal form, apply the ``curry``
    rewrite rule and finally apply ``ansatz``.

    Args:
        sentence (str): Sentence to convert.
        tokeniser: Object providing ``tokenise_sentence``.
        parser: Object providing ``sentence2diagram``.
        rewriter: Callable applied to the parsed diagram.
        ansatz: Callable that maps the curried diagram to a circuit.

    Returns:
        Optional[quantum_circuit]: The circuit, or None when the parser yields
        no diagram or any step raises (a message is printed in that case).
    """
    try:
        # Rebind `sentence` on purpose: the error message below then reports
        # the normalised text whenever normalisation itself succeeded.
        sentence = sentence.strip().lower()
        parsed = parser.sentence2diagram(tokeniser.tokenise_sentence(sentence), tokenised=True)
        if parsed is None:
            return None

        simplified = rewriter(parsed).normal_form()
        curried = Rewriter(['curry'])(simplified)
        return ansatz(curried)
    except Exception:
        print(f"Error processing sentence {sentence}")
        return None
def _process_data_for_pool(sentence: str) -> Optional[quantum_circuit]:
    """Single-argument wrapper around ``process_data`` for use with ``pool.map``.

    Passes the module-level pipeline objects along with the sentence.
    """
    return process_data(
        sentence,
        tokeniser=_tokenizer,
        parser=_parser,
        rewriter=_rewriter,
        ansatz=_ansatz,
    )

def process_sentences(sentences: list[str]) -> list[Optional[quantum_circuit]]:
    """Process sentences in parallel using a multiprocessing pool.

    Each worker process runs ``_initializer`` once and then converts
    sentences with ``_process_data_for_pool``.

    Args:
        sentences (list[str]): Sentences to be processed.

    Returns:
        list[Optional[quantum_circuit]]: One entry per input sentence, in the
        same order as ``sentences`` (``pool.map`` preserves input order); an
        entry is None when that sentence could not be processed.
    """
    # NOTE(review): with the 'spawn' start method (the default on Windows) the
    # worker processes must be able to import the worker functions. Functions
    # defined interactively in a notebook may not be importable, in which case
    # this call can fail or never return; moving them into a .py module is the
    # usual remedy - confirm on the target platform.
    start_time = time.time()

    if len(sentences) == 0:
        # Nothing to do: skip starting the pool, which would otherwise run
        # _initializer in every worker before mapping over no items.
        print(f"Processed {len(sentences)} sentences in {time.time() - start_time:.4f} seconds.")
        return []

    with multiprocessing.Pool(processes=num_processes, initializer=_initializer) as pool:
        results = pool.map(_process_data_for_pool, sentences)
        end_time = time.time()
        print(f"Processed {len(sentences)} sentences in {end_time - start_time:.4f} seconds.")
    return results

In [None]:
if __name__ == "__main__":
    # Convert both question columns; each result list is index-aligned with its input list.
    print("Processing sentences1...")
    circuits1 = process_sentences(sentences1)
    print("Processing sentences2...")
    circuits2 = process_sentences(sentences2)

    # Keep a pair only when BOTH questions were converted. Filtering the two
    # lists independently would shift entries and break the correspondence
    # between question1, question2 and the is_duplicate label of each row.
    kept_pairs = [
        (circuit1, circuit2, label)
        for circuit1, circuit2, label in zip(circuits1, circuits2, value)
        if circuit1 is not None and circuit2 is not None
    ]
    diagrams1 = [circuit1 for circuit1, _, _ in kept_pairs]
    diagrams2 = [circuit2 for _, circuit2, _ in kept_pairs]
    # Labels for the surviving pairs, aligned with diagrams1 / diagrams2.
    labels = [label for _, _, label in kept_pairs]

    print(f"Processed {len(diagrams1)} diagrams from sentences1 and {len(diagrams2)} from sentences2.")

Processing sentences1...
