# 4章 多言語の固有表現認識

## 4.1 データセット

In [1]:
import pandas as pd
# Toy sentence annotated with IOB2-style tags: B-/I- prefixes mark the first
# and following tokens of an entity, "O" marks tokens outside any entity.
toks = "Jeff Dean is a computer scientist at Google in California".split()
lbls = ["B-PER", "I-PER"] + ["O"] * 5 + ["B-ORG", "O", "B-LOC"]
# One row per label ("Tokens", "Tags"), one column per token position.
df = pd.DataFrame.from_dict({"Tokens": toks, "Tags": lbls}, orient="index")
df
     

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,Jeff,Dean,is,a,computer,scientist,at,Google,in,California
Tags,B-PER,I-PER,O,O,O,O,O,B-ORG,O,B-LOC


In [3]:
from datasets import get_dataset_config_names

# List the configuration names published for the "xtreme" dataset and
# report how many there are.
xtreme_subsets = get_dataset_config_names(path="xtreme")
print(f"XTREME has {len(xtreme_subsets)} configurations")

  from .autonotebook import tqdm as notebook_tqdm
Downloading builder script: 100%|██████████| 37.5k/37.5k [00:00<00:00, 271kB/s]
Downloading metadata: 100%|██████████| 593k/593k [00:00<00:00, 1.05MB/s]
Downloading readme: 100%|██████████| 105k/105k [00:00<00:00, 709kB/s] 


XTREME has 183 configurations


In [5]:
from datasets import load_dataset

# Load the "PAN-X.de" configuration of "xtreme"; the bare expression lets the
# notebook display the returned object (its splits and features).
load_dataset(path="xtreme", name="PAN-X.de")

Downloading data: 100%|██████████| 234M/234M [00:56<00:00, 4.17MB/s] 
Generating train split: 100%|██████████| 20000/20000 [00:01<00:00, 14074.83 examples/s]
Generating validation split: 100%|██████████| 10000/10000 [00:00<00:00, 14974.94 examples/s]
Generating test split: 100%|██████████| 10000/10000 [00:00<00:00, 13819.91 examples/s]


DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

In [6]:
from collections import defaultdict
from datasets import DatasetDict

# Language codes and the fraction of each corpus to keep, paired by position.
langs = ["de", "fr", "it", "en"]
fracs = [0.629, 0.229, 0.084, 0.059]
# Accessing a language key that is not present yet creates an empty DatasetDict.
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    # Monolingual PAN-X corpus for this language
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    for split, split_ds in ds.items():
        # Shuffle with a fixed seed, then keep the first `frac` share of rows
        # (int() truncates the row count toward zero).
        n_keep = int(frac * split_ds.num_rows)
        panx_ch[lang][split] = split_ds.shuffle(seed=0).select(range(n_keep))

Generating train split: 100%|██████████| 20000/20000 [00:01<00:00, 12347.61 examples/s]
Generating validation split: 100%|██████████| 10000/10000 [00:01<00:00, 8402.77 examples/s]
Generating test split: 100%|██████████| 10000/10000 [00:01<00:00, 9115.01 examples/s]
Generating train split: 100%|██████████| 20000/20000 [00:01<00:00, 12831.98 examples/s]
Generating validation split: 100%|██████████| 10000/10000 [00:01<00:00, 9801.87 examples/s]
Generating test split: 100%|██████████| 10000/10000 [00:00<00:00, 10467.03 examples/s]
Generating train split: 100%|██████████| 20000/20000 [00:01<00:00, 15817.98 examples/s]
Generating validation split: 100%|██████████| 10000/10000 [00:00<00:00, 17073.02 examples/s]
Generating test split: 100%|██████████| 10000/10000 [00:00<00:00, 15144.64 examples/s]


In [7]:
import pandas as pd

# Collect the size of each language's training split as a one-element column,
# then show all languages side by side in a single-row table.
train_sizes = {}
for lang in langs:
    train_sizes[lang] = [panx_ch[lang]["train"].num_rows]
pd.DataFrame(train_sizes, index=["Number of training examples"])

Unnamed: 0,de,fr,it,en
Number of training examples,12580,4580,1680,1180


In [8]:
# Inspect the first German training example, one field per line.
element = panx_ch["de"]["train"][0]
for key in element:
    value = element[key]
    print(f"{key}: {value}")

tokens: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags: [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs: ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']


In [9]:
# Print the declared feature type of every column in the German training split.
de_train_features = panx_ch["de"]["train"].features
for key, value in de_train_features.items():
    print(f"{key}: {value}")

tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)


In [10]:
# `ner_tags` is a sequence feature; its inner `.feature` is the per-token
# label object that is kept in `tags` for later lookups.
ner_tags_feature = panx_ch["de"]["train"].features["ner_tags"]
tags = ner_tags_feature.feature
print(tags)
     

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [11]:
def create_tag_names(batch):
    """Translate integer NER tag ids into their string names.

    Args:
        batch: mapping with a "ner_tags" entry that iterates over tag ids.
            NOTE(review): despite the name, the visible call site passes no
            `batched` flag to `.map`, so this presumably receives one example
            at a time -- confirm.

    Returns:
        A dict with the single key "ner_tags_str" holding, in order, the
        result of `tags.int2str` for every id in `batch["ner_tags"]`.
    """
    # `tags` is the module-level label object defined in an earlier cell.
    tag_names = list(map(tags.int2str, batch["ner_tags"]))
    return {"ner_tags_str": tag_names}

# Apply create_tag_names across the German corpus (the progress output shows
# one pass per split) and keep the result as `panx_de`.
german_corpus = panx_ch["de"]
panx_de = german_corpus.map(create_tag_names)

Map: 100%|██████████| 12580/12580 [00:01<00:00, 12436.71 examples/s]
Map: 100%|██████████| 6290/6290 [00:00<00:00, 11384.17 examples/s]
Map: 100%|██████████| 6290/6290 [00:00<00:00, 12751.83 examples/s]


In [12]:
# Show the first German training example as two aligned rows:
# the tokens and their human-readable tag names.
de_example = panx_de["train"][0]
token_row = de_example["tokens"]
tag_row = de_example["ner_tags_str"]
pd.DataFrame(data=[token_row, tag_row], index=['Tokens', 'Tags'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,2.000,Einwohnern,an,der,Danziger,Bucht,in,der,polnischen,Woiwodschaft,Pommern,.
Tags,O,O,O,O,B-LOC,I-LOC,O,O,B-LOC,B-LOC,I-LOC,O


In [13]:
from collections import Counter

# Count entities per split and per entity type. Only "B"-prefixed tags are
# counted, so each entity contributes once however many tokens it spans.
split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
    entity_starts = (
        tag
        for row in dataset["ner_tags_str"]
        for tag in row
        if tag.startswith("B")
    )
    for tag in entity_starts:
        # "B-LOC" -> "LOC": the entity type follows the hyphen
        split2freqs[split][tag.split("-")[1]] += 1
# One row per split, one column per entity type
pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,LOC,ORG,PER
train,6186,5366,5810
validation,3172,2683,2893
test,3180,2573,3071
