In [None]:
!unzip -oq /home/aistudio/data/data52714/bq_corpus.zip -d data/
!unzip -oq /home/aistudio/data/data52714/paws-x-zh.zip -d data/

In [None]:
!pip install --upgrade paddlenlp -i https://mirror.baidu.com/pypi/simple  

Looking in indexes: https://mirror.baidu.com/pypi/simple
Collecting paddlenlp
[?25l  Downloading https://mirror.baidu.com/pypi/packages/da/77/364cd13f3488bc22297f5e07be2a1faf04f939844a3ea3fd84e3ab79489f/paddlenlp-2.1.1-py3-none-any.whl (735kB)
[K     |████████████████████████████████| 737kB 15.2MB/s eta 0:00:01
Collecting paddlefsl==1.0.0 (from paddlenlp)
[?25l  Downloading https://mirror.baidu.com/pypi/packages/5d/65/9970dd09309eb673303206befc9f2fdc9c2d29d31f002ae8d6c7b442f562/paddlefsl-1.0.0-py3-none-any.whl (55kB)
[K     |████████████████████████████████| 61kB 13.8MB/s eta 0:00:01
Collecting requests~=2.24.0 (from paddlefsl==1.0.0->paddlenlp)
[?25l  Downloading https://mirror.baidu.com/pypi/packages/45/1e/0c169c6a5381e241ba7404532c16a21d86ab872c9bed8bdcd4c423954103/requests-2.24.0-py2.py3-none-any.whl (61kB)
[K     |████████████████████████████████| 71kB 16.4MB/s eta 0:00:01
[?25hCollecting pillow==8.2.0 (from paddlefsl==1.0.0->paddlenlp)
[?25l  Downloading https://mirror.ba

In [None]:
import time
import os
import numpy as np
import paddle
import paddle.nn.functional as F
from paddlenlp.datasets import load_dataset
import paddlenlp
from data import convert_example, create_dataloader, load_my_dataset
from model import PointwiseMatching
from train import train_model
from utils import predict, write_tsv
from functools import partial

In [None]:
train_ds, dev_ds = load_dataset("lcqmc", splits=["train", "dev"])
print("train dataset length:", len(train_ds))
print("dev dataset length:", len(dev_ds))

100%|██████████| 6827/6827 [00:00<00:00, 60732.74it/s]


train dataset length: 238766
dev dataset length: 8802


#### 定义样本转换函数

In [None]:
# 因为是基于预训练模型 ERNIE-Gram 来进行，所以需要首先加载 ERNIE-Gram 的 tokenizer，
# 后续样本转换函数基于 tokenizer 对文本进行切分

tokenizer = paddlenlp.transformers.ErnieGramTokenizer.from_pretrained('ernie-gram-zh')

[2021-11-17 08:50:14,254] [    INFO] - Downloading https://paddlenlp.bj.bcebos.com/models/transformers/ernie_gram_zh/vocab.txt and saved to /home/aistudio/.paddlenlp/models/ernie-gram-zh
[2021-11-17 08:50:14,257] [    INFO] - Downloading vocab.txt from https://paddlenlp.bj.bcebos.com/models/transformers/ernie_gram_zh/vocab.txt
100%|██████████| 78/78 [00:00<00:00, 2026.21it/s]


In [None]:
# 训练集和验证集的样本转换函数
trans_func = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=512)

#### 定义 Dataloader
下面我们基于组 batchify_fn 函数和样本转换函数 trans_func 来构造训练集的 DataLoader, 支持多卡训练


In [None]:
batch_size = 128
train_data_loader = create_dataloader(dataset=train_ds,
                        trans_fn=trans_func,
                        mode='train',
                        batch_size=batch_size,
                        tokenizer=tokenizer)

dev_data_loader = create_dataloader(dataset=dev_ds,
                        trans_fn=trans_func,
                        mode='dev',
                        batch_size=batch_size,
                        tokenizer=tokenizer)

print("train dataloader length:", len(train_data_loader))
print("dev dataloader length:", len(dev_data_loader))

train dataloader length: 1866
dev dataloader length: 69


In [None]:
import paddle.fluid as fluid


In [None]:
# 我们基于 ERNIE-Gram 模型结构搭建 Point-wise 语义匹配网络
# 所以此处先定义 ERNIE-Gram 的 pretrained_model
pretrained_model = paddlenlp.transformers.ErnieGramModel.from_pretrained('ernie-gram-zh')
# 定义 Point-wise 语义匹配网络
model = PointwiseMatching(pretrained_model)

[2021-11-17 08:50:42,034] [    INFO] - Downloading https://paddlenlp.bj.bcebos.com/models/transformers/ernie_gram_zh/ernie_gram_zh.pdparams and saved to /home/aistudio/.paddlenlp/models/ernie-gram-zh
[2021-11-17 08:50:42,036] [    INFO] - Downloading ernie_gram_zh.pdparams from https://paddlenlp.bj.bcebos.com/models/transformers/ernie_gram_zh/ernie_gram_zh.pdparams
100%|██████████| 583566/583566 [00:09<00:00, 64544.47it/s]
W1117 08:50:51.253046   104 device_context.cc:404] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1
W1117 08:50:51.257918   104 device_context.cc:422] device: 0, cuDNN Version: 7.6.


In [None]:
from paddlenlp.transformers import LinearDecayWithWarmup
epochs = 6
num_training_steps = len(train_data_loader) * epochs
lr_scheduler = LinearDecayWithWarmup(learning_rate=5E-5, total_steps=num_training_steps, warmup=0.15)
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]

optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=5e-4,
    apply_decay_param_fun=lambda x: x in decay_params)

criterion = paddle.nn.loss.CrossEntropyLoss()

metric = paddle.metric.Accuracy()

In [14]:
from visualdl import LogWriter
writer = LogWriter("./log_lcqmc_2")
save_dir = "checkpoint_lcqmc_2"
use_cuda = True # 如想使用GPU，请设置为 True
https://paddlenlp.bj.bcebos.com/models/text_matching/ernie_gram_zh_pointwise_matching_model.tar(model, optimizer, epochs, criterion, metric, save_dir, tokenizer, loader_list=[train_data_loader, dev_data_loader], lr_scheduler=lr_scheduler, writer=writer)

global step 10, epoch: 1, batch: 10, loss: 0.49566, accu: 0.72344, lr: 0.0000043, speed: 2.63 step/s
global step 20, epoch: 1, batch: 20, loss: 0.45019, accu: 0.73523, lr: 0.0000046, speed: 2.34 step/s
global step 30, epoch: 1, batch: 30, loss: 0.40357, accu: 0.74171, lr: 0.0000049, speed: 2.41 step/s
global step 40, epoch: 1, batch: 40, loss: 0.37280, accu: 0.75094, lr: 0.0000052, speed: 2.38 step/s
global step 50, epoch: 1, batch: 50, loss: 0.31138, accu: 0.76158, lr: 0.0000055, speed: 2.27 step/s
global step 60, epoch: 1, batch: 60, loss: 0.47690, accu: 0.77023, lr: 0.0000058, speed: 2.37 step/s
global step 70, epoch: 1, batch: 70, loss: 0.40897, accu: 0.78028, lr: 0.0000061, speed: 2.37 step/s
global step 80, epoch: 1, batch: 80, loss: 0.40201, accu: 0.78580, lr: 0.0000064, speed: 2.39 step/s
global step 90, epoch: 1, batch: 90, loss: 0.34516, accu: 0.79219, lr: 0.0000067, speed: 2.36 step/s
global step 100, epoch: 1, batch: 100, loss: 0.28976, accu: 0.79832, lr: 0.0000070, speed: 

In [15]:
! head -n3 "${HOME}/.paddlenlp/datasets/LCQMC/lcqmc/lcqmc/test.tsv"

谁有狂三这张高清的	这张高清图，谁有
英雄联盟什么英雄最好	英雄联盟最好英雄是什么
这是什么意思，被蹭网吗	我也是醉了，这是什么意思


#### 定义预测函数

#### 定义预测数据的 data_loader

In [16]:
# 预测数据的转换函数
# predict 数据没有 label, 因此 convert_exmaple 的 is_test 参数设为 True
trans_func = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=512,
    is_test=True)

# 加载预测数据
test_ds = load_dataset("lcqmc", splits=["test"])
print('test_ds length', len(test_ds))

test_ds length 12500


In [17]:
# 生成预测数据 data_loader
predict_data_loader = create_dataloader(dataset=test_ds,
                        trans_fn=trans_func,
                        mode='test',
                        batch_size=batch_size,
                        tokenizer=tokenizer)

print("predict dataloader length:", len(predict_data_loader))

predict dataloader length: 98


#### 定义预测模型

In [18]:
pretrained_model = paddlenlp.transformers.ErnieGramModel.from_pretrained('ernie-gram-zh')
model = PointwiseMatching(pretrained_model)

[2021-11-17 09:33:12,475] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/ernie-gram-zh/ernie_gram_zh.pdparams


#### 加载已训练好的模型参数

In [19]:
state_dict = paddle.load("checkpoint_lcqmc_2/best_model_state.pdparams")
model.set_dict(state_dict)

#### 开始预测

In [20]:
for idx, batch in enumerate(predict_data_loader):
    if idx < 1:
        print(batch)

[Tensor(shape=[128, 38], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,
       [[1   , 1022, 9   , ..., 0   , 0   , 0   ],
        [1   , 514 , 904 , ..., 0   , 0   , 0   ],
        [1   , 47  , 10  , ..., 0   , 0   , 0   ],
        ...,
        [1   , 936 , 356 , ..., 0   , 0   , 0   ],
        [1   , 614 , 356 , ..., 0   , 0   , 0   ],
        [1   , 630 , 1099, ..., 0   , 0   , 0   ]]), Tensor(shape=[128, 38], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,
       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])]


In [21]:
y_probs = predict(model, predict_data_loader)
y_preds = np.argmax(y_probs, axis=1)

#### 输出预测结果

In [22]:
test_ds = load_dataset("lcqmc", splits=["test"])
tsv_name = 'lcqmc.tsv'
write_tsv(tsv_name, test_ds, y_preds)

{'query': '谁有狂三这张高清的', 'title': '这张高清图，谁有', 'label': 0}
{'query': '英雄联盟什么英雄最好', 'title': '英雄联盟最好英雄是什么', 'label': 1}
{'query': '这是什么意思，被蹭网吗', 'title': '我也是醉了，这是什么意思', 'label': 1}
{'query': '现在有什么动画片好看呢？', 'title': '现在有什么好看的动画片吗？', 'label': 1}
{'query': '请问晶达电子厂现在的工资待遇怎么样要求有哪些', 'title': '三星电子厂工资待遇怎么样啊', 'label': 0}
{'query': '文章真的爱姚笛吗', 'title': '姚笛真的被文章干了吗', 'label': 0}
{'query': '送自己做的闺蜜什么生日礼物好', 'title': '送闺蜜什么生日礼物好', 'label': 1}
{'query': '近期上映的电影', 'title': '近期上映的电影有哪些', 'label': 1}
{'query': '求英雄联盟大神带？', 'title': '英雄联盟，求大神带~', 'label': 1}
{'query': '如加上什么部首', 'title': '给东加上部首是什么字？', 'label': 0}
{'query': '杭州哪里好玩', 'title': '杭州哪里好玩点', 'label': 1}
{'query': '这是什么乌龟值钱吗', 'title': '这是什么乌龟！值钱嘛？', 'label': 1}
{'query': '心各有所属是什么意思？', 'title': '心有所属是什么意思?', 'label': 1}
{'query': '什么东西越热爬得越高', 'title': '什么东西越热爬得很高', 'label': 1}
{'query': '世界杯哪位球员进球最多', 'title': '世界杯单界进球最多是哪位球员', 'label': 1}
{'query': '韭菜多吃什么好处', 'title': '多吃韭菜有什么好处', 'label': 1}
{'query': '云赚钱怎么样', 'title': '怎么才能赚钱', 'lab

## bq_corpus

In [None]:
train_ds, dev_ds = load_my_dataset(splits=["train", "dev"], SPLITS={'train':'data/bq_corpus/train.tsv', 'dev':'data/bq_corpus/dev.tsv'})
print("train dataset length:", len(train_ds))
print("dev dataset length:", len(dev_ds))

# 输出训练集的前 5 条样本
for idx, example in enumerate(train_ds):
    if idx <= 5:
        print(example)

tokenizer = paddlenlp.transformers.ErnieGramTokenizer.from_pretrained('ernie-gram-zh')

# 训练集和验证集的样本转换函数
trans_func = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=512,
    is_flip=True)

batch_size = 128
train_data_loader = create_dataloader(dataset=train_ds,
                        trans_fn=trans_func,
                        mode='train',
                        batch_size=batch_size,
                        tokenizer=tokenizer)

trans_func = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=512)

dev_data_loader = create_dataloader(dataset=dev_ds,
                        trans_fn=trans_func,
                        mode='dev',
                        batch_size=batch_size,
                        tokenizer=tokenizer)

print("train dataloader length:", len(train_data_loader))
print("dev dataloader length:", len(dev_data_loader))

<class 'data.BAIDUData'>
train dataset length: 100000
dev dataset length: 10000
{'query': '用微信都6年，微信没有微粒贷功能', 'title': '4。号码来微粒贷', 'label': 0}
{'query': '微信消费算吗', 'title': '还有多少钱没还', 'label': 0}
{'query': '交易密码忘记了找回密码绑定的手机卡也掉了', 'title': '怎么最近安全老是要改密码呢好麻烦', 'label': 0}
{'query': '你好我昨天晚上申请的没有打电话给我今天之内一定会打吗？', 'title': '什么时候可以到账', 'label': 0}
{'query': '“微粒贷开通"', 'title': '你好，我的微粒贷怎么没有开通呢', 'label': 0}
{'query': '为什么借款后一直没有给我回拨电话', 'title': '怎么申请借款后没有打电话过来呢！', 'label': 1}


[2021-06-13 12:57:27,594] [    INFO] - Found /home/aistudio/.paddlenlp/models/ernie-gram-zh/vocab.txt


train dataloader length: 782
dev dataloader length: 79


In [None]:
pretrained_model = paddlenlp.transformers.ErnieGramModel.from_pretrained('ernie-gram-zh')
model = PointwiseMatching(pretrained_model, dropout=0.5)

[2021-06-13 12:57:30,246] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/ernie-gram-zh/ernie_gram_zh.pdparams


In [None]:
from paddlenlp.transformers import LinearDecayWithWarmup
epochs = 10
num_training_steps = len(train_data_loader) * epochs
lr_scheduler = LinearDecayWithWarmup(learning_rate=3e-5, total_steps=num_training_steps, warmup=0.1)
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]

optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=0.0,
    apply_decay_param_fun=lambda x: x in decay_params)

criterion = paddle.nn.loss.CrossEntropyLoss()

metric = paddle.metric.Accuracy()

In [None]:
from visualdl import LogWriter
writer = LogWriter("./log_bq_corpus_3")
save_dir = "checkpoint_bq_corpus_3"

train_model(model, optimizer, epochs, criterion, metric, save_dir, tokenizer, loader_list=[train_data_loader, dev_data_loader], patience=patience, lr_scheduler=lr_scheduler, writer=writer, add_softmax=True)

global step 10, epoch: 1, batch: 10, loss: 0.69938, accu: 0.50469, lr: 0.0000003, speed: 2.00 step/s
global step 20, epoch: 1, batch: 20, loss: 0.78842, accu: 0.49688, lr: 0.0000007, speed: 2.01 step/s
global step 30, epoch: 1, batch: 30, loss: 0.73961, accu: 0.50182, lr: 0.0000011, speed: 1.97 step/s
global step 40, epoch: 1, batch: 40, loss: 0.78163, accu: 0.49355, lr: 0.0000015, speed: 2.04 step/s


In [None]:
pretrained_model = paddlenlp.transformers.ErnieGramModel.from_pretrained('ernie-gram-zh')
model = PointwiseMatching(pretrained_model, dropout=0.3)
state_dict = paddle.load("checkpoint_bq_corpus_4/best_model_state.pdparams")
model.set_dict(state_dict)

[2021-06-13 02:41:45,963] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/ernie-gram-zh/ernie_gram_zh.pdparams


In [None]:
trans_func = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=512,
    is_test=True)
    
test_ds = load_my_dataset(splits=["test"], SPLITS={'test':'data/bq_corpus/test.tsv'})
print('test_ds length', len(test_ds))

predict_data_loader = create_dataloader(dataset=test_ds,
                        trans_fn=trans_func,
                        mode='test',
                        batch_size=batch_size,
                        tokenizer=tokenizer)

print("predict dataloader length:", len(predict_data_loader))

y_probs = predict(model, predict_data_loader)

y_preds = np.argmax(y_probs, axis=1)

test_ds = load_my_dataset(splits=["test"], SPLITS={'test':'data/bq_corpus/test.tsv'})
tsv_name = 'bq_corpus.tsv'

write_tsv(tsv_name, test_ds, y_preds)

<class 'data.BAIDUData'>
test_ds length 10000
predict dataloader length: 79


## paws-x

In [None]:
train_ds, dev_ds = load_my_dataset(splits=["train", "dev"], SPLITS={'train':'data/paws-x-zh/train.tsv', 'dev':'data/paws-x-zh/dev.tsv'})
print("train dataset length:", len(train_ds))
print("dev dataset length:", len(dev_ds))

tokenizer = paddlenlp.transformers.ErnieGramTokenizer.from_pretrained('ernie-gram-zh')

trans_func = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=512)

batch_size = 128
train_data_loader = create_dataloader(dataset=train_ds,
                        trans_fn=trans_func,
                        mode='train',
                        batch_size=batch_size,
                        tokenizer=tokenizer)

dev_data_loader = create_dataloader(dataset=dev_ds,
                        trans_fn=trans_func,
                        mode='dev',
                        batch_size=batch_size,
                        tokenizer=tokenizer)

print("train dataloader length:", len(train_data_loader))
print("dev dataloader length:", len(dev_data_loader))

<class 'data.BAIDUData'>
train dataset length: 49129
dev dataset length: 2000
{'query': '1560年10月，他在巴黎秘密会见了英国大使Nicolas Throckmorton，要求他通过苏格兰返回英国。', 'title': '1560年10月，他在巴黎秘密会见了英国大使尼古拉斯·斯罗克莫顿，并要求他通过英格兰返回苏格兰的护照。', 'label': 0}
{'query': '1975年的NBA赛季 -  76赛季是全美篮球协会的第30个赛季。', 'title': '1975-76赛季的全国篮球协会是NBA的第30个赛季。', 'label': 1}
{'query': '还有具体的讨论，公众形象辩论和项目讨论。', 'title': '还有公开讨论，特定档案讨论和项目讨论。', 'label': 0}
{'query': '当可以保持相当的流速时，结果很高。', 'title': '当可以保持可比较的流速时，结果很高。', 'label': 1}


[2021-06-11 22:57:40,421] [    INFO] - Downloading vocab.txt from https://paddlenlp.bj.bcebos.com/models/transformers/ernie_gram_zh/vocab.txt
100%|██████████| 78/78 [00:00<00:00, 4909.52it/s]


train dataloader length: 384
dev dataloader length: 16


In [None]:
pretrained_model = paddlenlp.transformers.ErnieGramModel.from_pretrained('ernie-gram-zh')
model = PointwiseMatching(pretrained_model)
state_dict = paddle.load("checkpoint_paws-x-zh_1/best_model_state.pdparams")
model.set_dict(state_dict)

[2021-06-11 22:58:29,048] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/ernie-gram-zh/ernie_gram_zh.pdparams


In [None]:
from paddlenlp.transformers import LinearDecayWithWarmup

epochs = 20
num_training_steps = len(train_data_loader) * epochs

lr_scheduler = LinearDecayWithWarmup(learning_rate=5E-5, total_steps=num_training_steps, warmup=0.1)
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]

optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=0.0,
    apply_decay_param_fun=lambda x: x in decay_params)

criterion = paddle.nn.loss.CrossEntropyLoss()

metric = paddle.metric.Accuracy()

In [None]:
from visualdl import LogWriter
writer = LogWriter("./log_paws-x-zh_1")
save_dir = "checkpoint_paws-x-zh_1"

train_model(model, optimizer, epochs, criterion, metric, save_dir, tokenizer, loader_list=[train_data_loader, dev_data_loader], patience=patience, lr_scheduler=lr_scheduler, writer=writer)

global step 10, epoch: 1, batch: 10, loss: 0.74482, accu: 0.46719, lr: 0.0000006, speed: 1.18 step/s
global step 20, epoch: 1, batch: 20, loss: 0.70080, accu: 0.47578, lr: 0.0000012, speed: 1.23 step/s


In [None]:
pretrained_model = paddlenlp.transformers.ErnieGramModel.from_pretrained('ernie-gram-zh')
model = PointwiseMatching(pretrained_model)
state_dict = paddle.load("checkpoint_paws-x-zh/best_model_state.pdparams")
model.set_dict(state_dict)

[2021-06-11 17:47:52,656] [    INFO] - Downloading https://paddlenlp.bj.bcebos.com/models/transformers/ernie_gram_zh/ernie_gram_zh.pdparams and saved to /home/aistudio/.paddlenlp/models/ernie-gram-zh
[2021-06-11 17:47:52,708] [    INFO] - Downloading ernie_gram_zh.pdparams from https://paddlenlp.bj.bcebos.com/models/transformers/ernie_gram_zh/ernie_gram_zh.pdparams
100%|██████████| 583566/583566 [00:13<00:00, 43994.79it/s]


In [None]:
trans_func = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=512,
    is_test=True)
    
test_ds = load_my_dataset(splits=["test"], SPLITS={'test':'data/paws-x-zh/test.tsv'})
print('test_ds length', len(test_ds))

predict_data_loader = create_dataloader(dataset=test_ds,
                        trans_fn=trans_func,
                        mode='test',
                        batch_size=batch_size,
                        tokenizer=tokenizer)

print("predict dataloader length:", len(predict_data_loader))

y_probs = predict(model, predict_data_loader)

y_preds = np.argmax(y_probs, axis=1)

test_ds = load_my_dataset(splits=["test"], SPLITS={'test':'data/paws-x-zh/test.tsv'})
tsv_name = 'paws-x.tsv'
write_tsv(tsv_name, test_ds, y_preds)

<class 'data.BAIDUData'>
test_ds length 2000
predict dataloader length: 16
<class 'data.BAIDUData'>
{'query': '2005 年末至 2009 年期间是例外，当时他效力于瑞典的卡斯塔德联队、塞尔维亚的查查克足球俱乐部和俄罗斯的格罗兹尼特里克足球俱乐部。', 'title': '例外情况发生于 2005 年末至 2009 年期间，当时他效力于瑞典的卡斯塔德联队、塞尔维亚的查查克足球俱乐部和俄罗斯的格罗兹尼艾卡马特足球俱乐部。', 'label': 1}
{'query': 'Tabaci 河是罗马尼亚 Leurda 河的支流。', 'title': 'Leurda 河是罗马尼亚境内 Tabaci 河的一条支流。', 'label': 0}
{'query': '1993 年，他为 A 级的坎恩郡美洲狮队和 AA 级的波特兰海狗队效力。', 'title': '1993 年，他为 A 级球队波特兰海狗队和 AA 级球队凯恩县美洲狮队效力。', 'label': 0}
{'query': 'Winarsky 是 IEEE、Phi Beta Kappa、ACM 和 Sigma Xi 的成员。', 'title': '温那斯基是 ACM、IEEE、Phi Beta Kappa 和 Sigma Xi 的成员。', 'label': 1}
{'query': '1938 年，他成为英埃苏丹的政府人类学家，并领导对努巴的实地考察工作。', 'title': '1938 年，他成为英埃苏丹政府的人类学家，并与努巴一起从事野外工作。', 'label': 1}
{'query': '比利·比利·贝特森出现在 2008 年末至 2009 年初出版的前四期《黑亚当》中。', 'title': '黑亚当出现在 2008 年末至 2009 年初出版的前四期《比利·贝特森》中。', 'label': 0}
{'query': '利用太阳能满足此项要求的方法是在常规动力飞机上使用太阳能板。', 'title': '利用太阳能满足此项要求的方法是在常规动力飞机上使用太阳能板。', 'label': 1}
{'query': '在调查进行期间，警察还质询了歌手梨美·托米和演员卡薇雅·马德哈万，两人