In [None]:
import torch
import numpy as np
from torch.utils.data import Dataset
from bpemb import BPEmb
from datasets import load_dataset

In [7]:
device = torch.device("cpu")
if torch.cuda.is_available():
  device = torch.device("cuda")
device

device(type='cuda')

In [8]:
dataset = load_dataset("copenlu/answerable_tydiqa")
train_set = dataset["train"]

Using custom data configuration copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6
Reusing dataset parquet (/home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 116067
    })
    validation: Dataset({
        features: ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 13325
    })
})

In [10]:
validation_set = dataset["validation"]

In [11]:
def getLanguageDataSet(data, language):
    def printAndL(x):
        return x["language"] == language
    return data.filter(printAndL)

def getEnglishDataSet(data):
    return getLanguageDataSet(data, "english")

# keep only english data
english_train_set = getEnglishDataSet(train_set)
english_validation_set = getEnglishDataSet(validation_set)

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-dbe296a96306378b.arrow
Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-9c5553adc4f99418.arrow


In [12]:
# delete useless data
english_train_set = english_train_set.remove_columns(["document_title", "language", "document_url"])
english_validation_set = english_validation_set.remove_columns(["document_title", "language", "document_url"])

In [13]:
english_train_set

Dataset({
    features: ['question_text', 'annotations', 'document_plaintext'],
    num_rows: 7389
})

In [14]:
def label_map_func(examples):
    labels = []
    for i in examples["annotations"]:
        if i["answer_start"] == [-1]:
            labels.append([0])

        else:
            labels.append([1])
    return {"label": labels}

english_label_train_set = english_train_set.map(label_map_func , batched=True, num_proc=4, remove_columns=["annotations"])

 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-7804682ce4cfa673.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-73c4a07fb746d0f8.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-ca751636e6333f5b.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-e42d7860fb83f452.arrow


In [15]:
english_label_train_set["document_plaintext"][0]

'Quantum field theory naturally began with the study of electromagnetic interactions, as the electromagnetic field was the only known classical field as of the 1920s.[8]:1'

In [16]:
import transformers
from transformers import AutoTokenizer
transformer_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

ModuleNotFoundError: No module named 'transformers'

In [None]:
block_size = 128

def document_tokenize_function(examples):
    return transformer_tokenizer(examples['document_plaintext'], padding="max_length")

def question_tokenize_function(examples):
    return transformer_tokenizer(examples['question_text'], padding="max_length")


In [None]:
document_unsupervised_tok = english_label_train_set.map(document_tokenize_function, batched=True, num_proc=4, remove_columns=["document_plaintext"])
document_unsupervised_tok

In [None]:

def switch_name1(examples):
    return {"document_input_ids": examples['input_ids']}
document_unsupervised_tok = document_unsupervised_tok.map(switch_name1, batched=True, num_proc=4, remove_columns=["input_ids"])


In [None]:
def switch_name2(examples):
    return {"document_attention_mask": examples['attention_mask']}
document_unsupervised_tok = document_unsupervised_tok.map(switch_name2, batched=True, num_proc=4, remove_columns=["attention_mask"])

In [None]:
document_unsupervised_tok

In [None]:
question_and_document_unsupervised_tok = document_unsupervised_tok.map(question_tokenize_function, batched=True, num_proc=4, remove_columns=["question_text"])

In [None]:
def switch_name1(examples):
    return {"question_input_ids": examples['input_ids']}
question_and_document_unsupervised_tok = question_and_document_unsupervised_tok.map(switch_name1, batched=True, num_proc=4, remove_columns=["input_ids"])


In [None]:

def switch_name2(examples):
    return {"question_attention_mask": examples['attention_mask']}
question_and_document_unsupervised_tok = question_and_document_unsupervised_tok.map(switch_name2, batched=True, num_proc=4, remove_columns=["attention_mask"])

In [None]:
question_and_document_unsupervised_tok

In [None]:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@File    : preprocess.py.py
@IDE     : PyCharm
@Author  : Yaokun Li
@Date    : 2022/10/20 15:48
@Description :
'''
import torch
import numpy as np
from torch.utils.data import Dataset
from bpemb import BPEmb
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer

device = torch.device("cpu")
if torch.cuda.is_available():
  device = torch.device("cuda")


dataset = load_dataset("copenlu/answerable_tydiqa")
train_set = dataset["train"]
validation_set = dataset["validation"]

def getLanguageDataSet(data, language):
    def printAndL(x):
        return x["language"] == language
    return data.filter(printAndL)

def getEnglishDataSet(data):
    return getLanguageDataSet(data, "english")

# keep only english data
english_train_set = getEnglishDataSet(train_set)
english_validation_set = getEnglishDataSet(validation_set)

# delete useless data
english_train_set = english_train_set.remove_columns(["document_title", "language", "document_url"])
english_validation_set = english_validation_set.remove_columns(["document_title", "language", "document_url"])

def label_map_func(examples):
    labels = []
    for i in examples["annotations"]:
        if i["answer_start"] == [-1]:
            labels.append([0])

        else:
            labels.append([1])
    return {"label": labels}

english_label_train_set = english_train_set.map(label_map_func , batched=True, num_proc=4, remove_columns=["annotations"])
english_label_val_set = english_validation_set.map(label_map_func , batched=True, num_proc=4, remove_columns=["annotations"])

transformer_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

block_size = 128

def document_tokenize_function(examples):
    return transformer_tokenizer(examples['document_plaintext'], padding="max_length")

def question_tokenenglish_label_train_setize_function(examples):
    return transformer_tokenizer(examples['question_text'], padding="max_length")

def switch_doc_name1(examples):
    return {"document_input_ids": examples['input_ids']}

def switch_doc_name2(examples):
    return {"document_attention_mask": examples['attention_mask']}

def switch_ques_name1(examples):
    return {"question_input_ids": examples['input_ids']}

def switch_ques_name2(examples):
    return {"question_attention_mask": examples['attention_mask']}

def get_final_dataset(data_set):
    return data_set\
        .map(document_tokenize_function, batched=True, num_proc=4, remove_columns=["document_plaintext"])\
        .map(switch_doc_name1, batched=True, num_proc=4, remove_columns=["input_ids"])\
        .map(switch_doc_name2, batched=True, num_proc=4, remove_columns=["attention_mask"])\
        .map(question_tokenize_function, batched=True, num_proc=4, remove_columns=["question_text"])\
        .map(switch_ques_name1, batched=True, num_proc=4, remove_columns=["input_ids"])\
        .map(switch_ques_name2, batched=True, num_proc=4, remove_columns=["attention_mask"])


train_dl = get_final_dataset(english_label_train_set).remove_columns("token_type_ids")
val_dl = get_final_dataset(english_label_val_set).remove_columns("token_type_ids")

In [None]:
def collate_batch_bilstm(dataset):
    print(dataset)
    label = torch.Tensor(dataset["label"]).cuda()
    document = torch.Tensor(dataset["document_input_ids"]).cuda()
    document_mask = torch.tensor(dataset["document_attention_mask"]).cuda()
    question = torch.tensor(dataset["question_input_ids"]).cuda()
    question_mask = torch.tensor(dataset["question_attention_mask"]).cuda()

    return document, document_mask, question, question_mask, label


In [None]:
train_dl = torch.utils.data.DataLoader(train_dl, batch_size=32, collate_fn=collate_batch_bilstm)
valid_dl = torch.utils.data.DataLoader(val_dl, batch_size=32, collate_fn=collate_batch_bilstm)

In [286]:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@File    : preprocess.py.py
@IDE     : PyCharm
@Author  : Yaokun Li
@Date    : 2022/10/20 15:48
@Description :
'''
import torch
import numpy as np
from torch.utils.data import Dataset
from bpemb import BPEmb
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer

device = torch.device("cpu")
if torch.cuda.is_available():
  device = torch.device("cuda")


dataset = load_dataset("copenlu/answerable_tydiqa")
train_set = dataset["train"]
validation_set = dataset["validation"]

def getLanguageDataSet(data, language):
    def printAndL(x):
        return x["language"] == language
    return data.filter(printAndL)

def getEnglishDataSet(data):
    return getLanguageDataSet(data, "english")

# keep only english data
english_train_set = getEnglishDataSet(train_set)
english_validation_set = getEnglishDataSet(validation_set)

# delete useless data
english_train_set = english_train_set.remove_columns(["document_title", "language", "document_url"])
english_validation_set = english_validation_set.remove_columns(["document_title", "language", "document_url"])

def label_map_func(examples):
    labels = []
    for i in examples["annotations"]:
        if i["answer_start"] == [-1]:
            labels.append([0])

        else:
            labels.append([1])
    return {"label": labels}

english_label_train_set = english_train_set.map(label_map_func , batched=True, num_proc=4, remove_columns=["annotations"])
english_label_val_set = english_validation_set.map(label_map_func , batched=True, num_proc=4, remove_columns=["annotations"])

transformer_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

block_size = 128

def document_tokenize_function(examples):
    return transformer_tokenizer(examples['document_plaintext'], padding="max_length")

def question_tokenize_function(examples):
    return transformer_tokenizer(examples['question_text'], padding="max_length")

def switch_doc_name1(examples):
    return {"document_input_ids": examples['input_ids']}

def switch_doc_name2(examples):
    return {"document_attention_mask": examples['attention_mask']}

def switch_ques_name1(examples):
    return {"question_input_ids": examples['input_ids']}

def switch_ques_name2(examples):
    return {"question_attention_mask": examples['attention_mask']}

def get_final_dataset(data_set):
    return data_set\
        .map(document_tokenize_function, batched=True, num_proc=4, remove_columns=["document_plaintext"])\
        .map(switch_doc_name1, batched=True, num_proc=4, remove_columns=["input_ids"])\
        .map(switch_doc_name2, batched=True, num_proc=4, remove_columns=["attention_mask"])\
        .map(question_tokenize_function, batched=True, num_proc=4, remove_columns=["question_text"])\
        .map(switch_ques_name1, batched=True, num_proc=4, remove_columns=["input_ids"])\
        .map(switch_ques_name2, batched=True, num_proc=4, remove_columns=["attention_mask"])


train_dl = get_final_dataset(english_label_train_set).remove_columns("token_type_ids")
val_dl = get_final_dataset(english_label_val_set).remove_columns("token_type_ids")


Using custom data configuration copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6
Reusing dataset parquet (/home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-dbe296a96306378b.arrow
Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-9c5553adc4f99418.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-7804682ce4cfa673.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-73c4a07fb746d0f8.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-ca751636e6333f5b.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-e42d7860fb83f452.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-af59bc06862ed862.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-f4f593d38f6bd03f.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-efe2549cec1e00c1.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-a82c147adfac6e4a.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-80de4fb47c24e9d5.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-fb99d278a5a5db09.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-6ffd45ab2b1d88e3.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-c25a61f3d91e947a.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-b148c4fb260f83f3.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-43b57756fffc5180.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-b78f2de3970b5461.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-82c583eaa42341e4.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-554500dc1fbe0bf7.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-32b99672e2b1b026.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-966e4b547e068431.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-d1c55db869c181e6.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-9a3314ee29213a9d.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-a36a6bd66c5d68a6.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-48295ac5144ae6f2.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-3efc1616c1933780.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-9eeec115dbab7cea.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-1602791098c1686d.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-a36493d8cdabd18b.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-07c65a502debf478.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-20340675e9f35fb7.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-502f051628037aca.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-51705be2788373ba.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-132fe2296c8b858d.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-2e2e0352ddc06c01.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-23795fa239b4b779.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-3a8177c4c631f7a3.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-a27d6b5377ea799d.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-9d8d43df5df4c6d9.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-37a940c0cb242e9f.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-59e8182f4974493c.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-9640dbe1ad08e415.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-b8408da431d8b570.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-7bdc3885c59a89b4.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-93afe52c7795f310.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-2462ca218a7b4c9c.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-4eaa9c080746f273.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-2efcccef6fe54d23.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-2975685915440e60.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-3aaf73f8a7216c23.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-4f600cb617a176eb.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-53653043ac775146.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-22348c98501860e3.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-1eb7b1b8514d83e7.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-7af38acfe5bffd57.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-bf24248d1e3a002d.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-a279d569ddfc314c.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-e97e83d5a2959e59.arrow


In [287]:
def collate_batch_bilstm(dataset):
    label = []
    document = []
    document_mask = []
    question = []
    question_mask = []
    for data in dataset:
        label.append(data["label"])
        document.append(data["document_input_ids"])
        document_mask.append(data["document_attention_mask"])
        question.append(data["question_input_ids"])
        question_mask.append(data["question_attention_mask"])

    label = torch.Tensor(label).cuda()
    document = torch.Tensor(document).cuda()
    document_mask = torch.tensor(document_mask).cuda()
    question = torch.tensor(question).cuda()
    question_mask = torch.tensor(question_mask).cuda()

    return document, document_mask, question, question_mask, label


train_dl = torch.utils.data.DataLoader(train_dl, batch_size=32, collate_fn=collate_batch_bilstm)
valid_dl = torch.utils.data.DataLoader(val_dl, batch_size=32, collate_fn=collate_batch_bilstm)

In [288]:
for batch in train_dl:
    print(batch)

(tensor([[  101., 25231.,  1768.,  ...,     0.,     0.,     0.],
        [  101.,  1109., 10412.,  ...,     0.,     0.,     0.],
        [  101., 12120.,  7531.,  ...,     0.,     0.,     0.],
        ...,
        [  101.,  3637., 20756.,  ...,     0.,     0.,     0.],
        [  101.,  1109.,  1148.,  ...,     0.,     0.,     0.],
        [  101., 10117.,  1867.,  ...,     0.,     0.,     0.]],
       device='cuda:0'), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'), tensor([[ 101, 1332, 1108,  ...,    0,    0,    0],
        [ 101, 2627, 1108,  ...,    0,    0,    0],
        [ 101, 1332, 1110,  ...,    0,    0,    0],
        ...,
        [ 101, 2627, 1108,  ...,    0,    0,    0],
        [ 101, 1332, 1108,  ...,    0,    0,    0],
        [ 101, 1332, 1225,  ...,    0,    0,    0]], device='cuda:0

ValueError: expected sequence of length 512 at dim 1 (got 1023)

In [289]:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@File    : preprocess.py.py
@IDE     : PyCharm
@Author  : Yaokun Li
@Date    : 2022/10/20 15:48
@Description :
'''
import torch
import numpy as np
from torch.utils.data import Dataset
from bpemb import BPEmb
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer

device = torch.device("cpu")
if torch.cuda.is_available():
  device = torch.device("cuda")


dataset = load_dataset("copenlu/answerable_tydiqa")
train_set = dataset["train"]
validation_set = dataset["validation"]

def getLanguageDataSet(data, language):
    def printAndL(x):
        return x["language"] == language
    return data.filter(printAndL)

def getEnglishDataSet(data):
    return getLanguageDataSet(data, "english")

# keep only english data
english_train_set = getEnglishDataSet(train_set)
english_validation_set = getEnglishDataSet(validation_set)

# delete useless data
english_train_set = english_train_set.remove_columns(["document_title", "language", "document_url"])
english_validation_set = english_validation_set.remove_columns(["document_title", "language", "document_url"])

def label_map_func(examples):
    labels = []
    for i in examples["annotations"]:
        if i["answer_start"] == [-1]:
            labels.append([0])

        else:
            labels.append([1])
    return {"label": labels}

english_label_train_set = english_train_set.map(label_map_func , batched=True, num_proc=4, remove_columns=["annotations"])
english_label_val_set = english_validation_set.map(label_map_func , batched=True, num_proc=4, remove_columns=["annotations"])

transformer_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

block_size = 128

def document_tokenize_function(examples):
    return transformer_tokenizer(examples['document_plaintext'], padding=True)

def question_tokenize_function(examples):
    return transformer_tokenizer(examples['question_text'], padding=True)

def switch_doc_name1(examples):
    return {"document_input_ids": examples['input_ids']}

def switch_doc_name2(examples):
    return {"document_attention_mask": examples['attention_mask']}

def switch_ques_name1(examples):
    return {"question_input_ids": examples['input_ids']}

def switch_ques_name2(examples):
    return {"question_attention_mask": examples['attention_mask']}

def get_final_dataset(data_set):
    return data_set\
        .map(document_tokenize_function, batched=True, num_proc=4, remove_columns=["document_plaintext"])\
        .map(switch_doc_name1, batched=True, num_proc=4, remove_columns=["input_ids"])\
        .map(switch_doc_name2, batched=True, num_proc=4, remove_columns=["attention_mask"])\
        .map(question_tokenize_function, batched=True, num_proc=4, remove_columns=["question_text"])\
        .map(switch_ques_name1, batched=True, num_proc=4, remove_columns=["input_ids"])\
        .map(switch_ques_name2, batched=True, num_proc=4, remove_columns=["attention_mask"])


train_dl = get_final_dataset(english_label_train_set).remove_columns("token_type_ids")
val_dl = get_final_dataset(english_label_val_set).remove_columns("token_type_ids")


def collate_batch_bilstm(dataset):
    label = []
    document = []
    document_mask = []
    question = []
    question_mask = []
    for data in dataset:
        label.append(data["label"])
        document.append(data["document_input_ids"])
        document_mask.append(data["document_attention_mask"])
        question.append(data["question_input_ids"])
        question_mask.append(data["question_attention_mask"])

    label = torch.Tensor(label).cuda()
    document = torch.Tensor(document).cuda()
    document_mask = torch.tensor(document_mask).cuda()
    question = torch.tensor(question).cuda()
    question_mask = torch.tensor(question_mask).cuda()

    return document, document_mask, question, question_mask, label


train_dl = torch.utils.data.DataLoader(train_dl, batch_size=32, collate_fn=collate_batch_bilstm)
valid_dl = torch.utils.data.DataLoader(val_dl, batch_size=32, collate_fn=collate_batch_bilstm)

for batch in train_dl:
    print(batch)

Using custom data configuration copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6
Reusing dataset parquet (/home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-dbe296a96306378b.arrow
Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-9c5553adc4f99418.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-7804682ce4cfa673.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-73c4a07fb746d0f8.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-ca751636e6333f5b.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-e42d7860fb83f452.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-af59bc06862ed862.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-f4f593d38f6bd03f.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-efe2549cec1e00c1.arrow


 

Loading cached processed dataset at /home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-a82c147adfac6e4a.arrow


       

#2:   0%|          | 0/2 [00:00<?, ?ba/s]

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

#3:   0%|          | 0/2 [00:00<?, ?ba/s]

 

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1111 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1153 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2039 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (4202 > 512). Running this sequence through the model will result in indexing errors


        

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

#2:   0%|          | 0/2 [00:00<?, ?ba/s]

#3:   0%|          | 0/2 [00:00<?, ?ba/s]

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

       

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/2 [00:00<?, ?ba/s]

#3:   0%|          | 0/2 [00:00<?, ?ba/s]

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

       

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

#2:   0%|          | 0/2 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/2 [00:00<?, ?ba/s]

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

       

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

#2:   0%|          | 0/2 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/2 [00:00<?, ?ba/s]

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

        

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

#2:   0%|          | 0/2 [00:00<?, ?ba/s]

#3:   0%|          | 0/2 [00:00<?, ?ba/s]

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

       

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (724 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1179 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (666 > 512). Running this sequence through the model will result in indexing errors


       

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

        

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

       

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

       

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

        

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

(tensor([[  101., 25231.,  1768.,  ...,     0.,     0.,     0.],
        [  101.,  1109., 10412.,  ...,     0.,     0.,     0.],
        [  101., 12120.,  7531.,  ...,     0.,     0.,     0.],
        ...,
        [  101.,  3637., 20756.,  ...,     0.,     0.,     0.],
        [  101.,  1109.,  1148.,  ...,     0.,     0.,     0.],
        [  101., 10117.,  1867.,  ...,     0.,     0.,     0.]],
       device='cuda:0'), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'), tensor([[  101,  1332,  1108,  9539,  1768,  2749,  1872,   136,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0],
        [  101,  2627,  1108,  1103,  1148, 10412,  4716,  2981,  1111,  7594,
       

ValueError: expected sequence of length 1153 at dim 1 (got 526)