In [None]:
from lambeq import BobcatParser, SpacyTokeniser, Rewriter, AtomicType, IQPAnsatz
from lambeq.backend.grammar import Diagram as grammatical_diagram

from typing import List, Optional, Tuple, Dict

import os, time, multiprocessing

import nltk

try:
    # Probe for the French Punkt tables used by word_tokenize(..., language='french').
    nltk.data.find('tokenizers/punkt_tab/french')
except LookupError:
    # The downloader takes a package id, not a resource path: the per-language
    # tables (including French) are shipped together in the 'punkt_tab' package.
    nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

# Set before any worker process is created so that children inherit it.
# NOTE(review): presumably read by the HuggingFace `tokenizers` backend - confirm that
# "true" is the intended value when the work is also fanned out over a process pool.
os.environ["TOKENIZERS_PARALLELISM"] = "true" #environment variable for multithreading

# Shared, module-level pipeline objects. The functions below read these as globals,
# so this cell has to run before any of them is called.
parser = BobcatParser(verbose="suppress")  # sentence -> diagram parser, progress output suppressed
tokeniser = SpacyTokeniser()  # tokeniser used on the English branch
rewriter = Rewriter(['prepositional_phrase', 'determiner']) # rewrite rules applied to every parsed diagram; more rules may be added
N = AtomicType.NOUN
S = AtomicType.SENTENCE
# The mapping gives the noun and sentence types the value 1 each
# (presumably "qubits per wire" in lambeq's ansatz convention - TODO confirm).
ansatz = IQPAnsatz({N:1,S:1}, n_layers=2, n_single_qubit_params=3)

# One worker per CPU reported by the OS; used as the pool size by the batch functions.
num_processes = multiprocessing.cpu_count()
print(f"Using {num_processes} processes.")

def process_sentence(sentence: str, is_french: bool = False) -> Optional[grammatical_diagram]:
    """
    Process a single sentence; this is the unit of work run for every item of a batch.

    Args:
        sentence: Raw sentence text. It is lower-cased before any other step.
        is_french: When True the sentence is only tokenised with NLTK's French
            tokenizer and the token list is returned. No diagram is built on this
            branch yet, so the return annotation does not describe it.

    Returns:
        English branch: the diagram after the module-level rewriter, normal_form()
        and a final 'curry' rewrite, or None when the parser produces no diagram.
        French branch: the list of tokens.
        None whenever any step raises; the failure and its cause are printed.
    """
    try:
        sentence = sentence.lower() #make all lowercase
        if is_french:
            # TODO: build the French diagram here (rewrite + normal_form) once a
            # French parser is available; for now only the tokens are produced.
            return word_tokenize(sentence, language='french')

        # tokeniser.split_sentences(sentence) would be needed for multi-sentence
        # input, but it breaks single-sentence utterances, so it is not used.
        tokens = tokeniser.tokenise_sentence(sentence)
        diagram = parser.sentence2diagram(tokens, tokenised=True)
        if diagram is None:
            return None

        normalised_diagram = rewriter(diagram).normal_form()
        # Second rewrite pass: currying, used for map-state duality.
        return Rewriter(['curry'])(normalised_diagram)

    except Exception as e:
        # Report why the sentence failed instead of discarding the exception.
        print(f"Failed to parse: {sentence} ({type(e).__name__}: {e})")
        return None

def process_sentences_batch(sentences: List[str], is_french: bool = False)-> List[Optional[object]]:
    """
    Run process_sentence over one batch of sentences, preserving input order.

    Args:
        sentences: The sentences of this batch.
        is_french: Forwarded unchanged to process_sentence. Defaults to False,
            matching process_sentence and process_sentences.

    Returns:
        One entry per input sentence, in the same order: whatever process_sentence
        returned for it (None for sentences that could not be processed).
    """
    return [process_sentence(sentence, is_french) for sentence in sentences]

def process_sentences(sentences: List[str], batch_size: int = 200, is_french: bool = False) -> Dict[int,Optional[object]]:
    """
    Process sentences in parallel, one batch per task.

    Args:
        sentences: The sentences to process (any iterable of strings).
        batch_size: Number of sentences handed to a worker at a time (default 200).
        is_french: Flag forwarded to every process_sentence call.

    Returns:
        A dictionary mapping each sentence's position in ``sentences`` to the
        result of process_sentence for it (None when processing failed).

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    start_time = time.time()
    sentence_list = list(sentences)
    batches = [sentence_list[i:i + batch_size] for i in range(0, len(sentence_list), batch_size)]

    with multiprocessing.Pool(processes=num_processes) as pool:
        # The pool pickles the callable it sends to the workers and a lambda cannot
        # be pickled, so the flag is passed as a second starmap argument to the
        # module-level batch function instead of being captured in a closure.
        batch_results = pool.starmap(process_sentences_batch, [(batch, is_french) for batch in batches])

    # Results come back in batch order, so the original index is recovered from
    # the batch number and the position inside the batch.
    diagrams: Dict[int, Optional[object]] = {}
    for batch_number, batch_result in enumerate(batch_results):
        first_index = batch_number * batch_size
        for position, diagram in enumerate(batch_result):
            diagrams[first_index + position] = diagram

    end_time = time.time()
    print(f"Processed {len(sentence_list)} sentences in {end_time - start_time:.2f} seconds using a batch size of {batch_size}.")
    return diagrams

def read_sentences_from_file(filename: str) -> List[str]:
    """
    Load a UTF-8 text file as a list of lines.

    Args:
        filename: Path of the file to read.

    Returns:
        Every line of the file with surrounding whitespace stripped (blank lines
        are kept as empty strings). An empty list is returned, after printing a
        short message, when the file is missing, is not valid UTF-8, or any
        other error occurs while reading.
    """
    stripped_lines: List[str] = []
    try:
        with open(filename, "r", encoding="utf-8") as handle:
            for raw_line in handle:
                stripped_lines.append(raw_line.strip())
    except FileNotFoundError:
        print(f"Wrong file name")
        return []
    except UnicodeDecodeError:
        print(f"missed utf-8 encoding")
        return []
    except Exception as e:
        print(f"Unknown Error")
        return []
    return stripped_lines
    
def convert_diagram(diagram:Optional[grammatical_diagram]) -> Optional[object]:
    """Convert one diagram to a circuit by applying the module-level ansatz.

    Args:
        diagram: The diagram to convert, or None.

    Returns:
        The object returned by ``ansatz(diagram)``, or None when ``diagram`` is
        None or the ansatz raises (the failure and its cause are printed).
    """
    if diagram is None:
        return None
    try:
        return ansatz(diagram)
    except Exception as e:
        # Report the cause instead of discarding the exception.
        print(f"Failed to convert diagram to circuit ({type(e).__name__}: {e})")
        return None
    
def convert_diagrams_batch(diagrams: List[Optional[grammatical_diagram]]) -> List[Optional[object]]:
    """Apply convert_diagram to every diagram of a batch, preserving order.

    Args:
        diagrams: The diagrams of this batch; entries may be None.

    Returns:
        One result per input entry, in the same order.
    """
    converted: List[Optional[object]] = []
    for diagram in diagrams:
        converted.append(convert_diagram(diagram))
    return converted

def process_diagrams(diagrams: Dict[int, Optional[object]], batch_size: int = 200) -> Dict[int, Optional[object]]:
    """Convert diagrams to circuits in parallel while keeping the sentence indices.

    Args:
        diagrams: Dictionary mapping sentence indices to diagrams (or None).
        batch_size: Number of diagrams handed to a worker at a time.

    Returns:
        A dictionary with the same keys as ``diagrams`` whose values are the
        results of convert_diagram (None where no circuit was produced).
    """
    start_time = time.time()

    # Slice the keys into batches, then build the matching batches of diagrams.
    keys = list(diagrams.keys())
    key_batches = []
    for start in range(0, len(keys), batch_size):
        key_batches.append(keys[start:start + batch_size])
    diagram_batches = [[diagrams[key] for key in key_batch] for key_batch in key_batches]

    with multiprocessing.Pool(num_processes) as pool:
        circuit_batches = pool.map(convert_diagrams_batch, diagram_batches)

    # map() preserves batch order, so keys and circuits line up pairwise.
    circuits: Dict[int,Optional[object]] = {}
    for key_batch, circuit_batch in zip(key_batches, circuit_batches):
        circuits.update(zip(key_batch, circuit_batch))

    end_time = time.time()
    print(f"Converted {len(diagrams)} diagrams to circuits in {end_time - start_time:.2f} seconds using batch size of {batch_size}.")
    return circuits

def save_circuit(circuit: object, filename:str) -> None:
    """Draw one circuit to ``filename``.

    A None circuit is skipped with a warning. Any exception raised while drawing
    is caught and reported, so the function never raises to its caller.
    """
    try:
        if circuit is not None:
            circuit.draw(path=filename, figsize=(20,20), draw_type_labels=False)
            return
        print(f"Warning Object missing, Skipping {filename}.")
    except Exception as e:
        print(f"An error occurred while saving {filename}: {e}")
        
def create_circuit_pairs(english_circuits: Dict[int, Optional[object]], french_circuits: Dict[int, Optional[object]]) -> List[Tuple[Optional[object], Optional[object]]]:
    """
    Pair English and French circuits based on their original sentence indices.

    Args:
        english_circuits: Dictionary mapping English sentence indices to circuits.
        french_circuits: Dictionary mapping French sentence indices to circuits.

    Returns:
        A list of (English circuit, French circuit) tuples ordered by sentence
        index. An index is skipped when it is absent from either dictionary or
        when either dictionary holds None for it (i.e. no circuit was produced).
    """
    circuit_pairs: List[Tuple[Optional[object], Optional[object]]] = []

    for index in sorted(english_circuits.keys() & french_circuits.keys()):
        en_circuit = english_circuits[index]
        fr_circuit = french_circuits[index]
        # Failed conversions are stored as None under an existing key, so a key
        # being present does not guarantee a usable circuit.
        if en_circuit is None or fr_circuit is None:
            continue
        circuit_pairs.append((en_circuit, fr_circuit))

    return circuit_pairs

SyntaxError: invalid syntax (3449715923.py, line 2)

In [2]:
#load data
import pandas as pd
def load_translation_data(csv_file, english_col='English words/sentences', french_col='French words/sentences'):
    """Load parallel English and French sentences from a CSV file.

    Args:
        csv_file (str): Path to the CSV file.
        english_col (str): Name of the column holding the English sentences.
        french_col (str): Name of the column holding the French sentences.

    Returns:
        tuple: ``(english_sentences, french_sentences)``. On success both are the
        pandas columns selected from the file (row-aligned); on any error both
        are empty lists.
    """
    try:
        df = pd.read_csv(csv_file, encoding='utf-8')
        print("Column names:", df.columns)
        english_sentences = df[english_col]
        french_sentences = df[french_col]

        print(f"Loaded {len(english_sentences)} English sentences and {len(french_sentences)} French sentences.")
        return english_sentences,french_sentences

    except FileNotFoundError:
        print("Wrong Path")
        return [],[]

    except Exception as e:
        # Also reached when one of the requested columns does not exist.
        print(f"An {e} Error Occurred")
        return [],[]

# Location of the English/French sentence-pair CSV. The default is a
# machine-specific path; set the QNLP_DATA_PATH environment variable to point
# at the file on another machine without editing the notebook.
DATA_PATH = os.environ.get("QNLP_DATA_PATH", r'C:/Users/Jash/Documents/Research/QNLP/eng_-french.csv')
english_sentences, french_sentences = load_translation_data(DATA_PATH)

# if not english_sentences.empty and not french_sentences.empty: # Check that the lists are not empty
#     print("\nFirst 5 English sentences:")
#     print(english_sentences[:5])  # Print the first 5 English sentences

#     print("\nFirst 5 French sentences:")
#     print(french_sentences[:5])  # Print the first 5 French sentences
# else:
#     print("Failed to load the translation data.")

def write_fr_tokens(fr_sentences, output_path):
    """Tokenise French sentences and write them to a text file, one per line.

    Args:
        fr_sentences: Iterable of French sentences.
        output_path: Path of the UTF-8 text file to create or overwrite.

    Each sentence is passed through ``process_sentence(..., is_french=True)`` and
    its tokens are written space-separated on one line. A sentence that yields no
    tokens produces an empty line, so output line k always corresponds to input
    sentence k. Any exception is caught and printed.
    """
    try:
        with open(output_path, 'w', encoding='utf-8') as outfile:
            for sentence in fr_sentences:
                tokens = process_sentence(sentence,is_french=True)
                # Join the tokens back into sentence form; write exactly one
                # newline per sentence (no stray space leaking onto the next line).
                line = ' '.join(tokens) if tokens else ''
                outfile.write(line + '\n')
    except Exception as e:
        print(f"An {e} error occurred")


Column names: Index(['English words/sentences', 'French words/sentences'], dtype='object')
Loaded 175621 English sentences and 175621 French sentences.


In [3]:
# Tokenise every loaded French sentence and write the result to fr_tokens.txt
# (relative to the notebook's working directory), one sentence per line.
write_fr_tokens(french_sentences, output_path="fr_tokens.txt")

In [None]:
from lambeq import CCGTree
# Intended pipeline: CCG parse -> lambeq CCGTree -> lambeq diagram.
# CCG (combinatory categorial grammar) gives every word a category (type), so a sentence is built up like function application over a tree.
def create_lambeq_diagram(ccg_parse_string):
    """
    Attempt to turn one parse string into a lambeq diagram.

    NOTE(review): the recorded output of this cell shows every call failing with
    "type object 'CCGTree' has no attribute 'fromstring'", so as written this
    function always takes the except branch and returns None. The strings in that
    output are bracketed phrase-structure parses, e.g.
    "( (ROOT (SENT (NP (N salut)) (. !))) )", rather than CCG derivations -
    presumably a conversion step is still missing. TODO: confirm against the
    lambeq CCGTree API which constructor should be used here.

    Args:
        ccg_parse_string (str): One parse, as a single-line string.

    Returns:
        The value of ``tree.to_diagram()`` on success, or None when any step
        raises (the error is printed together with the offending string).
    """
    try:

        # Build the tree object from the string.
        # NOTE(review): this attribute lookup is what currently fails (see docstring).
        tree = CCGTree.fromstring(ccg_parse_string)

        # Convert the tree to a diagram (to_biclosed_diagram() was considered as an alternative)
        diagram = tree.to_diagram()  # Or tree.to_biclosed_diagram(), depending on your needs

        return diagram

    except Exception as e:
        # Broad catch: one message is printed per failing line and processing continues.
        print(f"Error processing parse '{ccg_parse_string}': {e}")
        return None

# Read one parse string per line from FR_CCG.txt and try to convert each one.
# NOTE(review): the value returned by create_lambeq_diagram is not stored, so this
# loop currently only exercises the conversion and prints any errors.
fr_ccg = read_sentences_from_file("FR_CCG.txt")
for sentence in fr_ccg:
    create_lambeq_diagram(sentence)


    # Intended per-line flow:
    #   1. take the line
    #   2. reconstruct the tree in memory
    #   3. turn each node into its class
    #   4. convert that tree to a diagram

    # Next stage (machine learning): classical, hybrid or pure quantum models
    # (LSTM, LLM, CNN) that take a diagram and output a circuit.

    # Status: English done, parsing done; French diagrams still to do.
    
    

Error processing parse '( (ROOT (SENT (NP (N salut)) (. !))) )': type object 'CCGTree' has no attribute 'fromstring'
Error processing parse '( (ROOT (SENT (NP (N cours)) (. !))) )': type object 'CCGTree' has no attribute 'fromstring'
Error processing parse '( (ROOT (SENT (ADV courez) (. !))) )': type object 'CCGTree' has no attribute 'fromstring'
Error processing parse '( (ROOT (SENT (NP (PRO qui)) (. ?))) )': type object 'CCGTree' has no attribute 'fromstring'
Error processing parse '( (ROOT (SENT (NP (PRO ça) (ADV alors)) (. !))) )': type object 'CCGTree' has no attribute 'fromstring'
Error processing parse '( (ROOT (SENT (PP (P au) (NP (N feu))) (. !))) )': type object 'CCGTree' has no attribute 'fromstring'
Error processing parse '( (ROOT (SENT (VPinf (P à) (VN (V l'aide))) (. !))) )': type object 'CCGTree' has no attribute 'fromstring'
Error processing parse '( (ROOT (SENT (NP (N saute)) (. .))) )': type object 'CCGTree' has no attribute 'fromstring'
Error processing parse '( (ROO

KeyboardInterrupt: 

In [None]:
def main():
    """Run the English pipeline: sentences -> diagrams -> circuits -> PNG files.

    Reads the module-level ``english_sentences``. Does nothing unless the module
    is running as ``__main__``. The French half of the pipeline is not enabled yet.
    """
    if __name__ != "__main__": #guard
        return

    multiprocessing.freeze_support() #windows support, not needed for linux line
    # The data is expected to be loaded already by the load cell:
    #DATA_PATH = r'C:/Users/Jash/Documents/Research/QNLP/eng_-french.csv'
    #english_sentences, french_sentences = load_translation_data(DATA_PATH)

    english_diagrams = process_sentences(english_sentences)
    english_circuits = process_diagrams(english_diagrams)

    # One PNG per circuit that was produced; file names are 1-based.
    for index, circuit in english_circuits.items():
        if not circuit:
            continue
        filename = f"english_circuit_{index + 1}.png"
        save_circuit(circuit, filename)

    print(f"Number of English circuits: {len(english_circuits)}")

    # French pipeline and pairing, to be enabled once French diagrams exist:
    #french_diagrams = process_sentences(french_sentences, is_french=True)
    #french_circuits = process_diagrams(french_diagrams)
    # for index, circuit in french_circuits.items():
    #     if circuit:
    #         filename = f"french_circuit_{index + 1}.png"
    #ml_data = create_circuit_pairs(english_circuits,french_circuits)
    #print(f"Number of french circuits: {len(french_circuits)}")
    #print(f"Number of matching circuit pairs: {len(ml_data)}")
    #print(f"Data for machine learning (first 10 pairs): {ml_data[:10]}")


#find correct batch size for time constraint

Using 128 processes.


Failed to parse: join us, won't you?
Failed to parse: join us, won't you?


In [12]:
#debug cell
# Smoke test: push one English sentence through parse -> circuit -> PNG.
# Both helpers return None on failure, and save_circuit skips a None circuit with a warning.
test_diagram = process_sentence("Hi, how are you doing today?")
#test_diagram.draw(figsize=(14,3), fontsize=12)
test_circuit = convert_diagram(test_diagram)
#test_circuit.draw(figsize=(14,3), fontsize=12)
#english_diagrams[15562].draw(figsize=(14,3), fontsize=12)
save_circuit(test_circuit, "test_save.png")

In [None]:
import random

import numpy as np
import torch

# Hyperparameters for training
BATCH_SIZE = 10
EPOCHS = 15
LEARNING_RATE = 0.1
SEED = 42

# Seed every random number generator in use so that runs are repeatable.
random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)


In [None]:
def acc(y_hat, y):
    """Fraction of rows whose arg-max along dim 1 agrees between ``y_hat`` and ``y``."""
    predicted = torch.argmax(y_hat, dim=1)
    expected = torch.argmax(y, dim=1)
    n_correct = (predicted == expected).sum().item()
    return n_correct / len(y)

def loss(y_hat, y):
    """Mean-squared error between ``y_hat`` and ``y`` (torch's default reduction)."""
    mse = torch.nn.functional.mse_loss
    return mse(y_hat, y)

In [None]:
from lambeq import PennyLaneModel


# Backend selection for the PennyLane model: the 'default.qubit' device.
backend_config = {'backend': 'default.qubit'}
# NOTE(review): `ml_data` is not assigned in any cell visible here - its only
# appearance is a commented-out line inside main() - so on a fresh kernel this
# line raises NameError. That commented-out line would build a list of
# (English, French) tuples; TODO confirm what from_diagrams expects to receive.
model = PennyLaneModel.from_diagrams(ml_data, probabilities=True, normalize=True,backend_config=backend_config)
#model.initialise_weights()

In [None]:
from lambeq import PytorchTrainer

# Trainer wiring: the model built above, the MSE `loss` and `acc` helpers defined
# in this notebook, and the hyperparameters from the configuration cell.
trainer = PytorchTrainer(
    model=model,
    loss_function=loss,
    optimizer=torch.optim.Adam,  # the optimizer class itself is passed, not an instance
    learning_rate=LEARNING_RATE,
    epochs=EPOCHS,
    evaluate_functions={'acc': acc},  # reported under the name 'acc'
    evaluate_on_train=True,
    use_tensorboard=False,
    verbose='text',
    seed=SEED)

In [None]:
import pennylane as qml
# Take the IBMQ token from the IBMQ_TOKEN environment variable so that a real
# credential never has to be written into the notebook; the placeholder is only
# used when the variable is not set.
# NOTE: the save() call below writes the token to the PennyLane config file on disk.
qml.default_config['qiskit.ibmq.ibmqx_token'] = os.environ.get('IBMQ_TOKEN', 'my_API_token')
qml.default_config.save(qml.default_config.path)
# NOTE: this rebinds `backend_config`, replacing the simulator configuration
# defined in the earlier model cell.
backend_config = {'backend': 'qiskit.ibmq',
                  'device': 'ibmq_manila',  # was misspelled 'ibmq_manilia'
                  'shots': 1000}
#q_model = PennyLaneModel.from_diagrams(ml_data, probabilities=True, normalize=True, backend_config=backend_config)
#q_model.initialise_weights()

In [None]:

#trainer.fit(train_dataset, val_dataset)