In [1]:
import spacy
from spacy import displacy
from lambeq.backend.drawing import draw
from lambeq.backend.grammar import Cup, Id, Ty, Word
from lambeq import AtomicType, IQPAnsatz, NumpyModel, BinaryCrossEntropyLoss, CrossEntropyLoss, QuantumTrainer, SPSAOptimizer, Dataset
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import re
import os
# Set TOKENIZERS_PARALLELISM to "false" for this process.
os.environ.update(TOKENIZERS_PARALLELISM="false")

# Type definitions: the two atomic pregroup types used throughout the notebook.
n = Ty('n')
s = Ty('s')

# Load the language model (English).
# A Japanese pipeline was tried previously: spacy.load("ja_core_web_sm")
nlp = spacy.load("en_core_web_lg")

In [24]:
# ここからラベル付け
#df = pd.read_csv("/Users/horiuchiminori/Desktop/研究/datasets/travel_dataset/travel_dataset_large.csv")

# ユニークなクラスを確認
#classes = df["label"].unique()

# One-vs-Rest用のラベルを作成
#for cls in classes:
    #column_name = f"class_{cls}_vs_rest"
    #df[column_name] = (df["label"] == cls).astype(int)

# 保存（必要に応じて）
#df.to_csv("travel_dataset_large_ovr.csv", index=False)

In [2]:
# Load the dataset. Later cells read its 'text', 'label', 'flight', 'hotel'
# and 'restaurant' columns.
#
# The location can be overridden with the TRAVEL_DATASET_CSV environment
# variable so the notebook can run on machines other than the author's; when
# the variable is unset the original path is used unchanged.
#
# Another dataset that was used with this notebook:
#   /Users/horiuchiminori/Desktop/研究/datasets/nlp_multiclass_ovr_dataset_cleaned.csv
DATASET_CSV = os.environ.get(
    "TRAVEL_DATASET_CSV",
    "/Users/horiuchiminori/Desktop/研究/datasets/travel_dataset/three_class_dataset_ovr_with_label.csv",
)
df = pd.read_csv(DATASET_CSV)

In [3]:
# Type assignment
def assign_types(doc):
    """Assign pregroup types to the ROOT of a parsed sentence and the words attached to it.

    The ROOT token gets the sentence type ``s``. NOUN/PRON/ADJ/ADP/VERB tokens
    get the noun type ``n`` when they hang directly off the ROOT, or when they
    are a ``conj``/``pobj``/``dobj`` whose grandparent is the ROOT. Every head
    then receives one adjoint per ``n``-typed dependent: ``n.r`` on its left
    if the dependent precedes it in the sentence, ``n.l`` on its right
    otherwise.

    Args:
        doc: A parsed spaCy ``Doc``. A plain string is also accepted and is
            parsed with the module-level ``nlp`` pipeline.

    Returns:
        dict: token text -> pregroup type, in the order tokens were first
        typed. Keys are token texts, so a word that occurs more than once in
        the sentence shares a single entry.
    """
    # Use the document that was passed in. The previous version re-parsed a
    # *global* variable named `sentence` here, which only worked while the
    # calling cell happened to leak a loop variable of that name.
    if isinstance(doc, str):
        doc = nlp(doc)

    pregroup_types = {}
    # Tokenisation (texts only; used below to compare word positions)
    tokens = [token.text for token in doc]

    # Collect (dependent, head) pairs for the ROOT and for the words connected
    # to it (subjects, objects, prepositions, ...).
    dependencies = []
    for token in doc:
        if token.dep_ == 'ROOT':
            pregroup_types[token.text] = s
            dependencies.append((token.text, token.head.text))
        elif token.pos_ in ['NOUN', 'PRON', 'ADJ', 'ADP', 'VERB']:  # ADP: pick up prepositions too
            if token.head.dep_ == 'ROOT':
                pregroup_types[token.text] = n
                dependencies.append((token.text, token.head.text))
            elif token.dep_ in ('conj', 'pobj', 'dobj') and token.head.head.dep_ == 'ROOT':
                pregroup_types[token.text] = n
                if token.dep_ == 'conj':
                    # Coordination (only one level is handled): attach the
                    # conjunct directly to the ROOT.
                    attach_to = token.head.head
                else:
                    # Object of a preposition / direct object: attach to its
                    # own head.
                    attach_to = token.head
                dependencies.append((token.text, attach_to.text))

    # Extend each head's type according to the recorded pairs; the side on
    # which the adjoint is added depends on the word order of the pair.
    for token in doc:
        for dependent, head in dependencies:
            if token.text != head:
                continue
            # list.index returns the first occurrence of the text.
            dependent_pos = tokens.index(dependent)
            head_pos = tokens.index(head)
            if pregroup_types[dependent] == n:
                if dependent_pos < head_pos:
                    pregroup_types[head] = n.r @ pregroup_types[head]
                else:
                    pregroup_types[head] = pregroup_types[head] @ n.l

    return pregroup_types

In [4]:
# Diagram construction
def create_diagram(sentence):
    """Parse `sentence`, type its words and contract adjacent adjoint pairs.

    Returns the diagram made of one Word box per typed token followed by a
    layer of Cups for every adjacent pair that can be reduced.
    """
    parsed = nlp(sentence)
    typed_words = assign_types(parsed)

    # Initial layer: the word boxes side by side.
    boxes = [Word(text, ty) for text, ty in typed_words.items()]
    diagram = Id().tensor(*boxes)

    # Wires currently open at the bottom of the diagram.
    wires = Ty()
    for ty in typed_words.values():
        wires = wires @ ty

    # Adjacent (left, right) wire pairs that a Cup can contract.
    reducible = ((n, n.r), (n.l, n), (s, s.r), (s.l, s))

    # Apply cups, scanning left to right.
    pos = 0
    while pos < len(wires) - 1:
        window = wires[pos:pos + 2]
        for left, right in reducible:
            if window == left @ right:
                before, after = wires[:pos], wires[pos + 2:]
                diagram = diagram >> before @ Cup(left, right) @ after
                wires = before @ after
                # Step back one wire: the contraction may have exposed a new pair.
                pos = max(0, pos - 1)
                break
        else:
            pos += 1

    return diagram

In [7]:
# 80/20 split, stratified on the 'label' column, with a fixed random state.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=0,
)
train_sentences, test_sentences = train_df['text'], test_df['text']

In [8]:
# First task: the 'flight' column
train_y, test_y = train_df['flight'], test_df['flight']

# One-hot targets: a value of 1 becomes [1, 0], anything else becomes [0, 1].
POSITIVE, NEGATIVE = [1, 0], [0, 1]
train_labels = np.array([POSITIVE if flag == 1 else NEGATIVE for flag in train_y])
test_labels = np.array([POSITIVE if flag == 1 else NEGATIVE for flag in test_y])


In [11]:
# Convert every sentence of each split into a diagram
train_diagrams = []
test_diagrams = []

# NOTE(review): the loop variable below is a module-level name `sentence`;
# assign_types appears to read a global of that name — confirm before turning
# these loops into comprehensions (whose variable is not visible globally).
for sentence in train_sentences:
    train_diagrams.append(create_diagram(sentence))

for sentence in test_sentences:
    test_diagrams.append(create_diagram(sentence))

# For visual inspection
#for d in range(len(train_sentences)):
    #draw(train_diagrams[d])

In [8]:
# Quantum circuit design: one qubit for each noun wire and each sentence wire.
ansatz = IQPAnsatz(
    {AtomicType.NOUN: 1, AtomicType.SENTENCE: 1},
    n_layers=1,
    n_single_qubit_params=3,
)

train_circuits = list(map(ansatz, train_diagrams))
test_circuits = list(map(ansatz, test_diagrams))

# For inspection:
# train_circuits[0].draw(figsize=(4, 4))

In [9]:
# The model is initialised from all circuits (train and test together).
all_circuits = [*train_circuits, *test_circuits]
model = NumpyModel.from_diagrams(all_circuits, use_jit=True)

# Loss function and accuracy metric
bce = BinaryCrossEntropyLoss(use_jax=True)


def acc(y_hat, y):
    """Fraction of rows whose arg-max prediction equals the arg-max target."""
    predicted = np.argmax(y_hat, axis=1)
    expected = np.argmax(y, axis=1)
    return np.mean(predicted == expected)

In [None]:
# Training configuration
BATCH_SIZE = 30
EPOCHS = 400
SEED = 0
# LEARNING_RATE = 3e-2   (not used)

# Trainer definition; the SPSA hyper-parameter 'A' is tied to the epoch count.
trainer_options = dict(
    loss_function=bce,
    epochs=EPOCHS,
    optimizer=SPSAOptimizer,
    optim_hyperparams={'a': 0.1, 'c': 0.06, 'A': 0.01 * EPOCHS},
    evaluate_functions={'acc': acc},
    evaluate_on_train=True,
    verbose='text',
    seed=SEED,
)
trainer = QuantumTrainer(model, **trainer_options)

# Training data is batched; the test data is kept in its original order.
train_dataset = Dataset(
    train_circuits,
    train_labels,
    batch_size=BATCH_SIZE,
)
test_dataset = Dataset(
    test_circuits,
    test_labels,
    shuffle=False,
)

In [14]:
# Train on train_dataset with test_dataset as the validation set; a log line is emitted every 100 epochs.
trainer.fit(train_dataset, test_dataset, log_interval=100)

Epoch 100:  train/loss: 0.8884   valid/loss: 0.7424   train/time: 2.57s   valid/time: 0.28s   train/acc: 0.8594   valid/acc: 0.7500
Epoch 200:  train/loss: 0.3420   valid/loss: 0.7326   train/time: 2.52s   valid/time: 0.27s   train/acc: 0.8594   valid/acc: 0.6250
Epoch 300:  train/loss: 0.2041   valid/loss: 0.7299   train/time: 2.49s   valid/time: 0.27s   train/acc: 0.9219   valid/acc: 0.6250
Epoch 400:  train/loss: 0.2395   valid/loss: 0.7655   train/time: 2.50s   valid/time: 0.27s   train/acc: 0.9531   valid/acc: 0.6250

Training completed!
train/time: 10.07s   train/time_per_epoch: 0.03s   train/time_per_step: 0.01s   valid/time: 1.10s   valid/time_per_eval: 0.00s


In [15]:
# Probability of the positive class
y_hat = model.forward(test_circuits) # model outputs for the test circuits, laid out like the one-hot targets
prob_class_0 = y_hat[:, 0]  # column 0 is the positive class ([1, 0] in the targets)
prob_class_0

Array([6.2663120e-01, 5.1853932e-02, 2.4291335e-01, 2.9507074e-01,
       9.2937756e-01, 6.9530469e-01, 1.7896788e-01, 2.6176491e-01,
       5.5618834e-02, 4.1168304e-03, 9.2141396e-01, 1.3006808e-02,
       8.5464347e-04, 1.5265293e-01, 1.9680899e-01, 3.0761484e-02],      dtype=float32)

In [None]:
# Second task: the 'hotel' column
train_y = train_df['hotel']
test_y = test_df['hotel']

# One-hot targets: a value of 1 becomes [1, 0], anything else becomes [0, 1]
train_labels = np.array([[1, 0] if i == 1 else [0, 1] for i in train_y])
test_labels = np.array([[1, 0] if i == 1 else [0, 1] for i in test_y])

# Rebuild the diagrams for both splits (create_diagram takes only the sentence as input)
train_diagrams = []
test_diagrams = []

# NOTE(review): the loop variable is a module-level name `sentence`;
# assign_types appears to read a global of that name — confirm before
# refactoring these loops.
for sentence in train_sentences:
    train_diagrams.append(create_diagram(sentence))

for sentence in test_sentences:
    test_diagrams.append(create_diagram(sentence))

# Quantum circuit design
ansatz = IQPAnsatz({AtomicType.NOUN: 1, AtomicType.SENTENCE: 1}, n_layers=1, n_single_qubit_params=3)

train_circuits = [ansatz(diagram) for diagram in train_diagrams]
test_circuits = [ansatz(diagram) for diagram in test_diagrams]

all_circuits = train_circuits + test_circuits # all circuits are handed to the model when it is initialised
model = NumpyModel.from_diagrams(all_circuits, use_jit=True)

# Loss function and accuracy metric
bce = BinaryCrossEntropyLoss(use_jax=True)
acc = lambda y_hat, y: np.mean(np.argmax(y_hat, axis=1) == np.argmax(y, axis=1))

BATCH_SIZE = 30
# LEARNING_RATE = 3e-2
EPOCHS = 400
SEED = 0

# Trainer definition
trainer = QuantumTrainer(
    model,
    loss_function=bce,
    epochs=EPOCHS,
    optimizer=SPSAOptimizer,
    optim_hyperparams={'a': 0.1, 'c': 0.06, 'A':0.01*EPOCHS},
    evaluate_functions={'acc': acc},
    evaluate_on_train=True,
    verbose='text',
    seed=SEED
)

train_dataset = Dataset(train_circuits, train_labels, batch_size=BATCH_SIZE)
test_dataset = Dataset(test_circuits, test_labels, shuffle=False)

# Train, using the test split as the validation set
trainer.fit(train_dataset, test_dataset, log_interval=100)

# Positive-class probability (column 0) for every test circuit
y_hat = model.forward(test_circuits)
prob_class_1 = y_hat[:, 0]
prob_class_1

Epoch 100:  train/loss: 0.5858   valid/loss: 0.9882   train/time: 9.72s   valid/time: 1.53s   train/acc: 0.8438   valid/acc: 0.5625
Epoch 200:  train/loss: 0.4203   valid/loss: 1.0129   train/time: 2.88s   valid/time: 0.31s   train/acc: 0.9375   valid/acc: 0.5625
Epoch 300:  train/loss: 0.5864   valid/loss: 0.9782   train/time: 2.52s   valid/time: 0.28s   train/acc: 0.9531   valid/acc: 0.5625
Epoch 400:  train/loss: 0.1531   valid/loss: 0.9570   train/time: 2.68s   valid/time: 0.29s   train/acc: 0.9844   valid/acc: 0.5625

Training completed!
train/time: 17.80s   train/time_per_epoch: 0.04s   train/time_per_step: 0.01s   valid/time: 2.42s   valid/time_per_eval: 0.01s


Array([0.95553625, 0.02032341, 0.24291335, 0.05779175, 0.9254728 ,
       0.819933  , 0.55946594, 0.7720055 , 0.60184073, 0.654766  ,
       0.5435744 , 0.12189666, 0.88023776, 0.7604412 , 0.14741237,
       0.02161395], dtype=float32)

In [None]:
# Third task: the 'restaurant' column
train_y = train_df['restaurant']
test_y = test_df['restaurant']

# One-hot targets: a value of 1 becomes [1, 0], anything else becomes [0, 1]
train_labels = np.array([[1, 0] if i == 1 else [0, 1] for i in train_y])
test_labels = np.array([[1, 0] if i == 1 else [0, 1] for i in test_y])

# Rebuild the diagrams for both splits (create_diagram takes only the sentence as input)
train_diagrams = []
test_diagrams = []

# NOTE(review): the loop variable is a module-level name `sentence`;
# assign_types appears to read a global of that name — confirm before
# refactoring these loops.
for sentence in train_sentences:
    train_diagrams.append(create_diagram(sentence))

for sentence in test_sentences:
    test_diagrams.append(create_diagram(sentence))

# Quantum circuit design
ansatz = IQPAnsatz({AtomicType.NOUN: 1, AtomicType.SENTENCE: 1}, n_layers=1, n_single_qubit_params=3)

train_circuits = [ansatz(diagram) for diagram in train_diagrams]
test_circuits = [ansatz(diagram) for diagram in test_diagrams]

all_circuits = train_circuits + test_circuits # all circuits are handed to the model when it is initialised
model = NumpyModel.from_diagrams(all_circuits, use_jit=True)

# Loss function and accuracy metric
bce = BinaryCrossEntropyLoss(use_jax=True)
acc = lambda y_hat, y: np.mean(np.argmax(y_hat, axis=1) == np.argmax(y, axis=1))

BATCH_SIZE = 30
# LEARNING_RATE = 3e-2
# NOTE(review): this task trains for 100 epochs while the 'flight' and 'hotel'
# tasks use 400 (which also changes the SPSA 'A' value below) — confirm this
# difference is intentional.
EPOCHS = 100
SEED = 0

# Trainer definition
trainer = QuantumTrainer(
    model,
    loss_function=bce,
    epochs=EPOCHS,
    optimizer=SPSAOptimizer,
    optim_hyperparams={'a': 0.1, 'c': 0.06, 'A':0.01*EPOCHS},
    evaluate_functions={'acc': acc},
    evaluate_on_train=True,
    verbose='text',
    seed=SEED
)

train_dataset = Dataset(train_circuits, train_labels, batch_size=BATCH_SIZE)
test_dataset = Dataset(test_circuits, test_labels, shuffle=False)

# Train, using the test split as the validation set
trainer.fit(train_dataset, test_dataset, log_interval=100)

y_hat = model.forward(test_circuits) 
prob_class_2 = y_hat[:, 0]  # probability of the positive class
prob_class_2

Epoch 100:  train/loss: 0.3571   valid/loss: 2.7034   train/time: 10.21s   valid/time: 1.82s   train/acc: 0.8438   valid/acc: 0.2500

Training completed!
train/time: 10.21s   train/time_per_epoch: 0.10s   train/time_per_step: 0.03s   valid/time: 1.82s   valid/time_per_eval: 0.02s


Array([8.8579011e-01, 9.5026559e-01, 2.4291335e-01, 7.8096002e-01,
       1.3874125e-03, 9.8753023e-01, 1.9198383e-01, 5.4476327e-01,
       3.9679068e-01, 5.8757165e-04, 9.1705936e-01, 1.3945549e-03,
       1.6872494e-03, 1.2383179e-01, 3.2809169e-03, 6.5965044e-01],      dtype=float32)

In [18]:
# Gather the three positive-class probability vectors into [num_samples, 3];
# column k holds the output of the k-th one-vs-rest model.
prob_matrix = np.stack([prob_class_0, prob_class_1, prob_class_2], axis=1)

# Predicted class number = index of the largest probability in each row.
pred_labels = prob_matrix.argmax(axis=1)

true_labels = test_df['label'].values
print(pred_labels, true_labels, sep="\n")

# Accuracy
accuracy = (pred_labels == true_labels).mean()
print("Accuracy:", accuracy)

[1 2 0 2 0 2 1 1 1 1 0 1 1 1 0 2]
[1 1 0 0 2 0 2 0 1 1 0 2 2 1 2 0]
Accuracy: 0.375


In [19]:
# List every misclassified test sentence with its predicted and true label.
rows = zip(test_sentences, true_labels, pred_labels)
for idx, (sentence, true, pred) in enumerate(rows):
    if pred == true:
        continue
    print(
        f" 文: {sentence}",
        f" 予測ラベル: {pred}",
        f" 正解ラベル: {true}",
        sep="\n",
    )

 文: Are pets allowed on the train?
 予測ラベル: 2
 正解ラベル: 1
 文: Do you serve breakfast here?
 予測ラベル: 2
 正解ラベル: 0
 文: Is Wi-Fi free?
 予測ラベル: 0
 正解ラベル: 2
 文: Where can I buy snacks?
 予測ラベル: 2
 正解ラベル: 0
 文: Is there room service?
 予測ラベル: 1
 正解ラベル: 2
 文: I would like to order a pizza.
 予測ラベル: 1
 正解ラベル: 0
 文: I need an extra towel.
 予測ラベル: 1
 正解ラベル: 2
 文: Where’s the front desk?
 予測ラベル: 1
 正解ラベル: 2
 文: Is breakfast included?
 予測ラベル: 0
 正解ラベル: 2
 文: Is there a good sushi place nearby?
 予測ラベル: 2
 正解ラベル: 0
