In [2]:
import spacy

# Load the English and Russian spaCy pipelines by package name; they are
# passed to process_sentences() below as the `nlp` argument.
nlp_en = spacy.load("en_core_web_sm")
nlp_ru = spacy.load("ru_core_news_sm")

def process_sentences(filename, nlp):
    """Read a UTF-8 text file and annotate each non-blank line with ``nlp``.

    Args:
        filename: Path of the text file to read; every line is handled on its own.
        nlp: Callable applied to each stripped line. Iterating its result must
            yield tokens that expose ``lemma_`` and ``pos_`` attributes.

    Returns:
        A list with one entry per non-blank line, in file order. Each entry is
        a list of ``(lemma_, pos_)`` tuples in token order.
    """
    with open(filename, "r", encoding="utf-8") as source:
        # Strip first so that whitespace-only lines are dropped as well.
        stripped_lines = (raw.strip() for raw in source)
        return [
            [(tok.lemma_, tok.pos_) for tok in nlp(text)]
            for text in stripped_lines
            if text
        ]

# Annotate every non-empty line of each input file with its language's pipeline.
english_sentences = process_sentences("english.txt", nlp_en)
russian_sentences = process_sentences("russian.txt", nlp_ru)

def save_processed_sentences(data, filename):
    """Write each item of ``data`` to ``filename`` as one line of text.

    Args:
        data: Iterable of sentences; each one is rendered with ``str()``.
        filename: Path of the output file. It is opened in write mode, so any
            existing content is replaced; text is encoded as UTF-8.
    """
    with open(filename, "w", encoding="utf-8") as sink:
        sink.writelines(str(entry) + "\n" for entry in data)

# Persist both annotated corpora, one str()-rendered sentence per line.
save_processed_sentences(english_sentences, "english_tagged.txt")
save_processed_sentences(russian_sentences, "russian_tagged.txt")

print("Tokenization, Lemmatization and POS tagging complete.")

Tokenization, Lemmatization and POS tagging complete.


In [2]:
import ast

from lambeq import BobcatParser, spiders_reader, NumpyModel
from lambeq import AtomicType, IQPAnsatz

def load_processed_sentences(filename):
    """Load sentences stored as one Python literal per line.

    Args:
        filename: Path of a UTF-8 text file to read.

    Returns:
        A list holding the result of ``ast.literal_eval`` for every non-blank
        (after stripping) line, in file order.

    Raises:
        Any exception ``ast.literal_eval`` raises for a line that is not a
        valid Python literal is propagated to the caller.
    """
    with open(filename, "r", encoding="utf-8") as source:
        stripped_lines = map(str.strip, source)
        return [ast.literal_eval(text) for text in stripped_lines if text]

# Reload the tagged sentences from the text files written by save_processed_sentences().
english_sentences = load_processed_sentences("english_tagged.txt")
russian_sentences = load_processed_sentences("russian_tagged.txt")

def create_word_types(data):
    """Map each distinct token to a short type label derived from its POS tag.

    Args:
        data: Iterable of sentences, each an iterable of ``(token, pos)`` pairs.

    Returns:
        Dict from token to label: ``"n"`` for ``NOUN``, ``"s"`` for ``VERB``,
        ``"adj"`` for ``ADJ``, ``"adv"`` for ``ADV``, ``"pron"`` for ``PRON``
        and ``"?"`` for any other tag. When a token appears more than once,
        the tag of its first occurrence decides the label.
    """
    label_for_pos = {
        "NOUN": "n",
        "VERB": "s",
        "ADJ": "adj",
        "ADV": "adv",
        "PRON": "pron",
    }
    labels = {}
    for sentence in data:
        for token, pos in sentence:
            # Later occurrences never overwrite the first label assigned.
            if token in labels:
                continue
            labels[token] = label_for_pos.get(pos, "?")
    return labels

# Build a token -> type-label lookup for each language.
english_word_types = create_word_types(english_sentences)
russian_word_types = create_word_types(russian_sentences)

# Shared parser instance; process_data() reads it as a module-level global.
parser = BobcatParser(verbose="suppress")

def process_data(data, word_types, ansatz=None):
    """Parse each sentence into a string diagram and convert it to a circuit.

    Args:
        data: Iterable of sentences, each an iterable of ``(token, pos)``
            pairs. Only the token part is used; tokens are joined with single
            spaces to rebuild the sentence text handed to the parser.
        word_types: Accepted for interface compatibility; not used here.
        ansatz: Optional callable that maps a diagram to a circuit. When
            omitted, a one-layer ``IQPAnsatz`` assigning one qubit to each
            atomic type is used.

    Returns:
        A list of circuits, one per input sentence, in input order.
    """
    if ansatz is None:
        # spiders_reader is a reader (sentence -> diagram) and has no
        # diagram_to_circuit method; in lambeq the diagram -> circuit step is
        # done by an ansatz, so build a default one here.
        ansatz = IQPAnsatz(
            {
                AtomicType.NOUN: 1,
                AtomicType.SENTENCE: 1,
                AtomicType.PREPOSITIONAL_PHRASE: 1,
                AtomicType.CONJUNCTION: 1,
                AtomicType.PUNCTUATION: 1,
            },
            n_layers=1,
        )

    circuits = []
    for sentence_tuples in data:
        sentence = " ".join(token for token, _ in sentence_tuples)
        # `parser` is the module-level BobcatParser instance.
        diagram = parser.sentence2diagram(sentence)
        circuits.append(ansatz(diagram))
    return circuits

# NOTE(review): BobcatParser is presumably an English-only parser; confirm it
# is appropriate for russian_sentences.
english_circuits = process_data(english_sentences, english_word_types)
russian_circuits = process_data(russian_sentences, russian_word_types)

# A NumpyModel() built directly has no symbols or weights, so calling it on
# circuits fails. Build the model from every circuit it will evaluate, then
# give the parameters starting values before running the forward pass.
model = NumpyModel.from_diagrams(english_circuits + russian_circuits)
model.initialise_weights()
# NOTE(review): the weights are untrained at this point, so these outputs only
# confirm that the circuits evaluate; they are not meaningful predictions yet.
english_results = model(english_circuits)
russian_results = model(russian_circuits)

print("Word diagrams and quantum circuits created.")

KeyboardInterrupt: 