<a href="https://colab.research.google.com/github/IMOKURI/chaii-Hindi-and-Tamil-QA/blob/main/chaii_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📔 About this notebook ...

- 目的
    - postprocess の改善案模索
        - oof の結果と ground truth を比較する

# Prepare for Colab

In [1]:
import os
import sys
import zipfile

# One-time environment setup. The marker file `init.txt` (created on the last
# line of this cell) turns every later run of the cell into a no-op.
if os.path.exists('init.txt'):
    print("Already initialized.")

else:
    # Colab only: mount Google Drive and unpack the competition archive into
    # the current working directory.
    if 'google.colab' in sys.modules:
        from google.colab import drive
        drive.mount('/content/drive')
        dataset_dir = "/content/drive/MyDrive/Datasets"

        # ====================================================
        # Competition datasets
        # ====================================================
        with zipfile.ZipFile(f"{dataset_dir}/chaii-hindi-and-tamil-question-answering.zip", "r") as zp:
            zp.extractall(path="./")
        # with zipfile.ZipFile(f"{dataset_dir}/chaii-external-data-mlqa-xquad-preprocessing.zip", "r") as zp:
        #     zp.extractall(path="./")
        # with zipfile.ZipFile(f"{dataset_dir}/chaii-Squad_Translated_to_Tamil.zip", "r") as zp:
        #     zp.extractall(path="./")

    # for StratifiedGroupKFold
    # !pip uninstall -y scikit-learn
    # !pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn

    # for MultilabelStratifiedKFold
    # !pip install -q iterative-stratification

    # for CosineAnnealingWarmupRestarts
    # !pip install -qU 'git+https://github.com/katsura-jp/pytorch-cosine-annealing-with-warmup'

    # The installs below run on first initialisation in every environment
    # (not only on Colab); no versions are pinned.
    !pip install -q wandb
    # !pip install -q optuna

    # ====================================================
    # Competition specific libraries
    # ====================================================
    !pip install -q transformers
    !pip install -q sentencepiece
    # !pip install -q textstat
    # !pip install -q nlpaug

    # Write the marker file that is checked at the top of this cell.
    !touch init.txt


Mounted at /content/drive
[K     |████████████████████████████████| 1.7 MB 5.1 MB/s 
[K     |████████████████████████████████| 170 kB 47.2 MB/s 
[K     |████████████████████████████████| 97 kB 6.1 MB/s 
[K     |████████████████████████████████| 133 kB 49.8 MB/s 
[K     |████████████████████████████████| 63 kB 1.1 MB/s 
[?25h  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 2.8 MB 5.0 MB/s 
[K     |████████████████████████████████| 52 kB 1.6 MB/s 
[K     |████████████████████████████████| 3.3 MB 45.0 MB/s 
[K     |████████████████████████████████| 636 kB 22.2 MB/s 
[K     |████████████████████████████████| 895 kB 70.4 MB/s 
[K     |████████████████████████████████| 1.2 MB 5.3 MB/s 
[?25h

# Library

In [2]:
# General libraries
import collections
import glob
import json
import math
import os
import random
import re
import statistics
import time
import warnings
from contextlib import contextmanager

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import torch
import torch.cuda.amp as amp
import torch.nn as nn
import torch.nn.functional as F
import wandb
# from cosine_annealing_warmup import CosineAnnealingWarmupRestarts
# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import mean_squared_error, jaccard_score
from sklearn.model_selection import KFold, StratifiedKFold  # , StratifiedGroupKFold
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm


In [3]:
# Competition specific libraries
# import nlpaug.augmenter.word as naw
# import nlpaug.augmenter.sentence as nas
# import nltk
# import textstat
import transformers as T


In [4]:
# Install a blanket "ignore" filter for all warnings.
# NOTE(review): this also hides deprecation warnings raised by later cells.
warnings.filterwarnings("ignore")


In [5]:
# Location of the .netrc file (presumably holding the wandb credentials):
# Google Drive on Colab, otherwise a path under ../input.
netrc = "/content/drive/MyDrive/.netrc" if 'google.colab' in sys.modules else "../input/wandbtoken/.netrc"
# Copy it into the home directory, then run `wandb login`.
!cp -f {netrc} ~/
!wandb login

# Tag list; the device cell appends the GPU name to it when CUDA is available.
wandb_tags = []


[34m[1mwandb[0m: Currently logged in as: [33mimokuri[0m (use `wandb login --relogin` to force relogin)


In [6]:
# Pick the compute device; when a GPU is present, also record its name as a wandb tag.
if torch.cuda.is_available():
    device = torch.device("cuda")
    wandb_tags.append(torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")


# Load Data

In [7]:
# Input location depends on the runtime: the working directory on Colab,
# otherwise the competition dataset folder under ../input.
if "google.colab" in sys.modules:
    DATA_DIR = "./"
else:
    DATA_DIR = "../input/chaii-hindi-and-tamil-question-answering/"

OUTPUT_DIR = "./"
MODEL_DIR = "./models/"

# Make sure both output locations exist; existing directories are left alone.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)


In [8]:
# Read the three competition tables that sit directly under DATA_DIR.
train = pd.read_csv(f"{DATA_DIR}train.csv")
test = pd.read_csv(f"{DATA_DIR}test.csv")
sub = pd.read_csv(f"{DATA_DIR}sample_submission.csv")

# External datasets are currently disabled.
#external_squad_translated_tamil = pd.read_csv(DATA_DIR + "squad_translated_tamil.csv")
#external_mlqa = pd.read_csv(DATA_DIR + "mlqa_hindi.csv")
#external_xquad = pd.read_csv(DATA_DIR + "xquad.csv")


# Config

In [9]:
# Fixed seed, printed for the record; the commented-out line is the random-draw alternative.
# seed = random.randrange(10000)
seed = 440
print(seed)


440


In [10]:
class Config:
    """Notebook-level switches kept outside the `config_defaults` dict."""

    # wandb entity and project; combined with a run id to build the run path.
    wandb_entity = "imokuri"
    wandb_project = "chaii"
    # Lowered to 10 by the debug override cell; presumably a logging interval — TODO confirm.
    print_freq = 100

    # Stage toggles. NOTE(review): none of these are read in the visible cells.
    preprocess = False
    train = True
    validate = False
    inference = False

    # When True, the debug override cell reduces n_fold, epochs and print_freq.
    debug = False
    # presumably the number of rows used in debug runs — TODO confirm
    num_debug_data = 50

    # presumably toggles automatic mixed precision — TODO confirm
    amp = False


In [11]:
# Default hyper-parameters and artifact references. A later cell wraps this
# dict in a Struct so the values can be read as attributes.
config_defaults = {
    "seed": seed,
    # "n_class": 1,
    "n_fold": 5,
    "epochs": 2,
    "batch_size": 4,
    "gradient_accumulation_steps": 5,
    "max_grad_norm": 1.0,
    "criterion": "ChaiiCrossEntropyLoss",
    "optimizer": "BertAdamW",
    "scheduler": "get_cosine_schedule_with_warmup",
    "max_lr": 5e-5,
    "lr": 2e-5,
    "min_lr": 1e-5,
    "weight_decay": 0.01,
    # NOTE(review): the Tokenize section loads the tokenizer of the *base*
    # checkpoint rather than this model name.
    "model_name": "deepset/xlm-roberta-large-squad2",
    # "model_name": "deepset/xlm-roberta-base-squad2",
    # "model_name": "google/rembert",
    "model_class": "bare", # bare, qa
    "max_len": 384,
    "doc_stride": 128,
    "dropout": 0.0,
    "init_weights": True,
    "init_layers": 1,
    # "freeze_layers": 0,
    # Presumably wandb artifact references; not read in the visible cells.
    "datasets": [
        "mlqa:v1",
        "xquad:v1",
        "squad_translated_tamil:v1",
    ],
    "models": [
        "base-models:v1",
    ],
    # wandb run ids whose oof_df.csv is fetched in the "Load Artifacts" section.
    "runs": [
        "16cifk7p",
    ],
}


In [12]:
# Debug mode: shrink the run (3 folds, 1 epoch) and lower print_freq to 10.
if Config.debug:
    config_defaults["n_fold"] = 3
    config_defaults["epochs"] = 1
    Config.print_freq = 10


In [13]:
class Struct:
    """Attribute-style view of a mapping: every key becomes an instance attribute."""

    def __init__(self, entries):
        # Keyword-expand the mapping straight into the instance namespace.
        vars(self).update(**entries)


In [14]:
# Wrap the defaults so they can be read as attributes (e.g. config.runs).
config = Struct(config_defaults)


# Load Artifacts

In [15]:
# Fetch the out-of-fold predictions (oof_df.csv) logged by the referenced wandb run
# into a local directory named after the run id, then load them as `oof`.
api = wandb.Api()

for n, run_id in enumerate(config.runs):
    os.makedirs(run_id, exist_ok=True)

    run_path = f"{Config.wandb_entity}/{Config.wandb_project}/{run_id}"
    run = api.run(run_path)

    oof_path = f"{run_id}/oof_df.csv"

    # Download only when no local copy exists. Checking the file explicitly
    # (instead of catching wandb.CommError) lets genuine download failures
    # surface here rather than being silenced.
    if not os.path.exists(oof_path):
        run.file("oof_df.csv").download(run_id)

    oof = pd.read_csv(oof_path)
    # Only the first listed run is analysed in this notebook.
    break


In [17]:
oof

Unnamed: 0,id,context,question,answer_text,answer_start,language,answers,fold,prediction,jaccard
0,41660850a,"குழந்தையின் அழுகையை நிறுத்தவும், தூங்க வைக்கவ...",தமிழ்நாட்டில் குழந்தைகளை தூங்க வைக்க பாடும் பா...,தாலாட்டு,68.0,tamil,"{'answer_start': [68], 'text': ['தாலாட்டு']}",0,தாலாட்டு,1.00
1,d419db018,மின்னணுவியல் (Electronics) மின்னணுக்கள் அல்லது...,திரிதடையங்களைப் பயன்படுத்திய முதல் நிறுவனம் எது?,IBM,4171.0,tamil,"{'answer_start': [4171], 'text': ['IBM']}",0,IBM,1.00
2,10ff95f4c,இந்தியாவின் தேசிய மனித உரிமை ஆணையம் ஒரு தன்னா...,இந்தியாவில் மனித உரிமை ஆணையம் எப்போது நிறுவப்ப...,"அக்டோபர் 12, 1993",90.0,tamil,"{'answer_start': [90], 'text': ['அக்டோபர் 12, ...",0,"அக்டோபர் 12, 1993",1.00
3,321e80660,மூன்று ஆழ்பள்ளத்தாக்கு அணை (Three Gorges Dam) ...,மூன்று ஆழ்பள்ளத்தாக்கு அணையின் உயரம் எவ்வளவு?,185 மீட்டர்,4241.0,tamil,"{'answer_start': [4241], 'text': ['185 மீட்டர்']}",0,172.5 மீ,0.00
4,af0f1f714,Coordinates: \n\nவால்ட் டிஸ்னி உலகம் (Walt Dis...,டிஸ்னி வேர்ல்ட் எங்கு உள்ளது?,புளோரிடாவில்,213.0,tamil,"{'answer_start': [213], 'text': ['புளோரிடாவில்']}",0,அமெரிக்காவின் புளோரிடா,0.00
...,...,...,...,...,...,...,...,...,...,...
1109,2d4a8e922,Main Page\n\nट्वाइलाइट एक रूमानी-फंतासी फ़िल...,2008 की ट्वाइलाइट रूमानी-फंतासी फ़िल्म में निक...,रोज़ाली हेल,4221.0,hindi,"{'answer_start': [4221], 'text': ['रोज़ाली हेल']}",4,रोज़ाली हेल,1.00
1110,cc4c69225,हख़ामनी वंश या अजमीढ़ साम्राज्य(अंग्रेज़ी तथा ...,आचमेनिड साम्राज्य का अंत किस साल में हुआ था?,सन ३३० ईसापूर्व,522.0,hindi,"{'answer_start': [522], 'text': ['सन ३३० ईसापू...",4,ईसापूर्व 330,0.25
1111,6c0aa4c03,इंडोनेशिया गणराज्य (दीपान्तर गणराज्य) दक्षिण प...,इंडोनेशिया की राजधानी क्या है,जकार्ता,245.0,hindi,"{'answer_start': [245], 'text': ['जकार्ता']}",4,जकार्ता,1.00
1112,26f356026,स्वामी निगमानन्द परमहंस (18 अगस्त 1880 - 29 नव...,स्वामी निगमानन्द परमहंस के तन्त्र गुरु कौन थे?,बामाक्षेपा,2691.0,hindi,"{'answer_start': [2691], 'text': ['बामाक्षेपा']}",4,उदासिनाचार्य सुमेरुदास महाराज,0.00


# Tokenize

In [18]:
# Tokenizer of the XLM-RoBERTa *base* SQuAD2 checkpoint.
# NOTE(review): config_defaults["model_name"] points at the large checkpoint;
# presumably both share one vocabulary — confirm.
tokenizer = T.AutoTokenizer.from_pretrained("deepset/xlm-roberta-base-squad2")


Downloading:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/605 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [20]:
# Encode each text column with the tokenizer loaded above: one list of input ids per row.
context_tokens, question_tokens, answer_tokens, prediction_tokens = (
    [tokenizer(text)["input_ids"] for text in oof[column]]
    for column in ("context", "question", "answer_text", "prediction")
)


In [22]:
# Length features for every text column: number of token ids and number of characters.
# NOTE(review): the texts were encoded with default tokenizer arguments, so the
# token counts presumably include special tokens — confirm.
oof["num_tokens_context"] = list(map(len, context_tokens))
oof["num_chars_context"] = list(map(len, oof["context"]))
oof["num_tokens_question"] = list(map(len, question_tokens))
oof["num_chars_question"] = list(map(len, oof["question"]))
oof["num_tokens_answer"] = list(map(len, answer_tokens))
oof["num_chars_answer"] = list(map(len, oof["answer_text"]))
oof["num_tokens_prediction"] = list(map(len, prediction_tokens))
oof["num_chars_prediction"] = list(map(len, oof["prediction"]))

In [23]:
oof.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1114 entries, 0 to 1113
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     1114 non-null   object 
 1   context                1114 non-null   object 
 2   question               1114 non-null   object 
 3   answer_text            1114 non-null   object 
 4   answer_start           1114 non-null   float64
 5   language               1114 non-null   object 
 6   answers                1114 non-null   object 
 7   fold                   1114 non-null   int64  
 8   prediction             1114 non-null   object 
 9   jaccard                1114 non-null   float64
 10  num_tokens_context     1114 non-null   int64  
 11  num_chars_context      1114 non-null   int64  
 12  num_tokens_question    1114 non-null   int64  
 13  num_chars_question     1114 non-null   int64  
 14  num_tokens_answer      1114 non-null   int64  
 15  num_

In [24]:

oof

Unnamed: 0,id,context,question,answer_text,answer_start,language,answers,fold,prediction,jaccard,num_tokens_context,num_chars_context,num_tokens_question,num_chars_question,num_tokens_answer,num_chars_answer,num_tokens_prediction,num_chars_prediction
0,41660850a,"குழந்தையின் அழுகையை நிறுத்தவும், தூங்க வைக்கவ...",தமிழ்நாட்டில் குழந்தைகளை தூங்க வைக்க பாடும் பா...,தாலாட்டு,68.0,tamil,"{'answer_start': [68], 'text': ['தாலாட்டு']}",0,தாலாட்டு,1.00,2715,8493,20,75,5,8,5,9
1,d419db018,மின்னணுவியல் (Electronics) மின்னணுக்கள் அல்லது...,திரிதடையங்களைப் பயன்படுத்திய முதல் நிறுவனம் எது?,IBM,4171.0,tamil,"{'answer_start': [4171], 'text': ['IBM']}",0,IBM,1.00,2572,8796,14,48,3,3,3,4
2,10ff95f4c,இந்தியாவின் தேசிய மனித உரிமை ஆணையம் ஒரு தன்னா...,இந்தியாவில் மனித உரிமை ஆணையம் எப்போது நிறுவப்ப...,"அக்டோபர் 12, 1993",90.0,tamil,"{'answer_start': [90], 'text': ['அக்டோபர் 12, ...",0,"அக்டோபர் 12, 1993",1.00,1069,4217,12,52,6,17,6,18
3,321e80660,மூன்று ஆழ்பள்ளத்தாக்கு அணை (Three Gorges Dam) ...,மூன்று ஆழ்பள்ளத்தாக்கு அணையின் உயரம் எவ்வளவு?,185 மீட்டர்,4241.0,tamil,"{'answer_start': [4241], 'text': ['185 மீட்டர்']}",0,172.5 மீ,0.00,4189,15681,15,45,5,11,5,9
4,af0f1f714,Coordinates: \n\nவால்ட் டிஸ்னி உலகம் (Walt Dis...,டிஸ்னி வேர்ல்ட் எங்கு உள்ளது?,புளோரிடாவில்,213.0,tamil,"{'answer_start': [213], 'text': ['புளோரிடாவில்']}",0,அமெரிக்காவின் புளோரிடா,0.00,219,733,13,29,7,12,8,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1109,2d4a8e922,Main Page\n\nट्वाइलाइट एक रूमानी-फंतासी फ़िल...,2008 की ट्वाइलाइट रूमानी-फंतासी फ़िल्म में निक...,रोज़ाली हेल,4221.0,hindi,"{'answer_start': [4221], 'text': ['रोज़ाली हेल']}",4,रोज़ाली हेल,1.00,7197,23772,32,79,7,11,7,12
1110,cc4c69225,हख़ामनी वंश या अजमीढ़ साम्राज्य(अंग्रेज़ी तथा ...,आचमेनिड साम्राज्य का अंत किस साल में हुआ था?,सन ३३० ईसापूर्व,522.0,hindi,"{'answer_start': [522], 'text': ['सन ३३० ईसापू...",4,ईसापूर्व 330,0.25,2400,7983,15,44,8,15,6,13
1111,6c0aa4c03,इंडोनेशिया गणराज्य (दीपान्तर गणराज्य) दक्षिण प...,इंडोनेशिया की राजधानी क्या है,जकार्ता,245.0,hindi,"{'answer_start': [245], 'text': ['जकार्ता']}",4,जकार्ता,1.00,1126,4101,10,29,5,7,5,8
1112,26f356026,स्वामी निगमानन्द परमहंस (18 अगस्त 1880 - 29 नव...,स्वामी निगमानन्द परमहंस के तन्त्र गुरु कौन थे?,बामाक्षेपा,2691.0,hindi,"{'answer_start': [2691], 'text': ['बामाक्षेपा']}",4,उदासिनाचार्य सुमेरुदास महाराज,0.00,2863,9832,15,46,6,10,10,30


# Checking for unknown tokens

In [25]:
# Flatten the per-row token lists into one flat list per column.
# A nested comprehension visits every token once; sum(lists, []) would copy the
# accumulated list again for every row, which is quadratic in the total size.
context_tokens_flat = [tok for toks in context_tokens for tok in toks]
question_tokens_flat = [tok for toks in question_tokens for tok in toks]
answer_tokens_flat = [tok for toks in answer_tokens for tok in toks]
prediction_tokens_flat = [tok for toks in prediction_tokens for tok in toks]


In [26]:
# Apart from the contexts, no column contains an unknown token
unk = tokenizer.unk_token_id

unk in context_tokens_flat, unk in question_tokens_flat, unk in answer_tokens_flat, unk in prediction_tokens_flat

(True, False, False, False)

In [27]:
# Number and share of unknown tokens in the contexts -> low enough to ignore?
num_unk_tokens = context_tokens_flat.count(unk)
num_unk_tokens, num_unk_tokens / len(context_tokens_flat)

(212, 5.972017716610294e-05)

# Same with MuRIL

In [31]:
# Repeat the unknown-token check with the MuRIL tokenizer.
tokenizer_muril = T.AutoTokenizer.from_pretrained("google/muril-base-cased")

# One list of input ids per row, for each text column.
context_tokens_muril = [tokenizer_muril(txt)["input_ids"] for txt in oof["context"]]
question_tokens_muril = [tokenizer_muril(txt)["input_ids"] for txt in oof["question"]]
answer_tokens_muril = [tokenizer_muril(txt)["input_ids"] for txt in oof["answer_text"]]
prediction_tokens_muril = [tokenizer_muril(txt)["input_ids"] for txt in oof["prediction"]]

# Flatten in linear time; sum(lists, []) would re-copy the accumulated list for every row.
context_tokens_flat_muril = [tok for toks in context_tokens_muril for tok in toks]
question_tokens_flat_muril = [tok for toks in question_tokens_muril for tok in toks]
answer_tokens_flat_muril = [tok for toks in answer_tokens_muril for tok in toks]
prediction_tokens_flat_muril = [tok for toks in prediction_tokens_muril for tok in toks]

unk_muril = tokenizer_muril.unk_token_id

print("Unk token in context, question, answer, prediction")
print(unk_muril in context_tokens_flat_muril, unk_muril in question_tokens_flat_muril, unk_muril in answer_tokens_flat_muril, unk_muril in prediction_tokens_flat_muril)

print("Num unk tokens in context, question, answer, prediction")
# Displayed as the cell result: occurrences of the unknown-token id per column.
(
    context_tokens_flat_muril.count(unk_muril),
    question_tokens_flat_muril.count(unk_muril),
    answer_tokens_flat_muril.count(unk_muril),
    prediction_tokens_flat_muril.count(unk_muril),
)


Unk token in context, question, answer, prediction
True True True True
Num unk tokens in context, question, answer, prediction


(1403, 19, 2, 1)

# Same with RemBERT

In [32]:
# Repeat the unknown-token check with the RemBERT tokenizer.
tokenizer_rembert = T.AutoTokenizer.from_pretrained("google/rembert")

# One list of input ids per row, for each text column.
context_tokens_rembert = [tokenizer_rembert(txt)["input_ids"] for txt in oof["context"]]
question_tokens_rembert = [tokenizer_rembert(txt)["input_ids"] for txt in oof["question"]]
answer_tokens_rembert = [tokenizer_rembert(txt)["input_ids"] for txt in oof["answer_text"]]
prediction_tokens_rembert = [tokenizer_rembert(txt)["input_ids"] for txt in oof["prediction"]]

# Flatten in linear time; sum(lists, []) would re-copy the accumulated list for every row.
context_tokens_flat_rembert = [tok for toks in context_tokens_rembert for tok in toks]
question_tokens_flat_rembert = [tok for toks in question_tokens_rembert for tok in toks]
answer_tokens_flat_rembert = [tok for toks in answer_tokens_rembert for tok in toks]
prediction_tokens_flat_rembert = [tok for toks in prediction_tokens_rembert for tok in toks]

unk_rembert = tokenizer_rembert.unk_token_id

print("Unk token in context, question, answer, prediction")
print(unk_rembert in context_tokens_flat_rembert, unk_rembert in question_tokens_flat_rembert, unk_rembert in answer_tokens_flat_rembert, unk_rembert in prediction_tokens_flat_rembert)

print("Num unk tokens in context, question, answer, prediction")
# Displayed as the cell result: occurrences of the unknown-token id per column.
(
    context_tokens_flat_rembert.count(unk_rembert),
    question_tokens_flat_rembert.count(unk_rembert),
    answer_tokens_flat_rembert.count(unk_rembert),
    prediction_tokens_flat_rembert.count(unk_rembert),
)

Downloading:   0%|          | 0.00/263 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.70M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.71M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/156 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3137 > 256). Running this sequence through the model will result in indexing errors


Unk token in context, question, answer, prediction
True False False False
Num unk tokens in context, question, answer, prediction


(119, 0, 0, 0)

# Looking at char level

In [33]:
# Character-level view, one column at a time: take the column, pool all of its
# texts into a single string, and list the distinct characters in sorted order.
contexts = oof["context"]
all_chars_ctx = "".join(contexts)
unq_chars_ctx = sorted(set(all_chars_ctx))

answers = oof["answer_text"]
all_chars_ans = "".join(answers)
unq_chars_ans = sorted(set(all_chars_ans))

predictions = oof["prediction"]
all_chars_preds = "".join(predictions)
unq_chars_preds = sorted(set(all_chars_preds))


In [34]:
# Row count next to the number of distinct texts, per column.
print("Contexts: ", len(contexts), contexts.nunique())
print("Answers: ", len(answers), answers.nunique())
print("Predictions: ", len(predictions), predictions.nunique())


Contexts:  1114 924
Answers:  1114 990
Predictions:  1114 1007


In [35]:
# All distinct characters that occur in the contexts
"".join(unq_chars_ctx)


'\t\n !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\xa0¢£¨«®¯°±²³´µ·º»½¾ÁÉÍÎÖ×ÚÜßàáâãäåæçèéêëìíîïðñóôö÷øùúüýĀāăąćčďĐēěğīİıķŁłńņŋŌōŚśŠšťūźžƒǎǐǔɐɑɒɔɖəɛɜɟɡɣɦɨɪɫɲɳɹɽɾʀʁʂʃʈʊʋʌʏʒʔʝʰʲʿˈˌː˙˚̞̥̪̰̀́̃̚ͰͱͲͳͶͷΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίαβγδεζηθικλμνξοπρςστυφχψωόύώϘϙϚϛϜϝϞϟϠϡϷϸϺϻАБВДЗИКЛМНОПРСТУФХЧЭЯабвгдежзийклмнопрстухцчшыьюяёҙԵանրևְִֵֶַָֹּׁׂאבהחיכלמנסרשתءآأؤإئابةتثجحخدذرزسشصضطظعغفقكلمنهوىيَُِّْپچډښکگہۂیܒܠऀँंःऄअआइईउऊऋऌऍऎएऐऑऒओऔकखगघङचछजझञटठडढणतथदधनऩपफबभमयरऱलळऴवशषसहऺऻ़ऽािीुूृॄॅॆेैॉॊोौ्ॎॏॐ॒॑॓॔ॕॖॗक़ख़ग़ज़ड़ढ़फ़य़ॠॡॢॣ।॥०१२३४५६७८९॰ॱॲॳॴॵॶॷॸॹॺॻॼॽॾॿংঅআউএওকগঙচছজঞটঠডঢণতথদধনপবভমযরলশষসহ়ািীুৃেো্ৎ০১২৬ਕਗਜਦਨਮਰਲਸਾਿੀੁੂ੍ੱંઈટણદનપબભમરલળવસાિીુેો્ஃஅஆஇஈஉஊஎஏஐஒஓஔகஙசஜஞடணதநனபமயரறலளழவஷஸஹாிீுூெேைொோௌ்ௗ௦௧௭௯ంగజటడదనబమయరసహాిుూెైో్ಂಅಕಗಟದನಬರಳಸಾಿುೂೆ್ംകഗതദനപബമരലവശസാിീുൊ്กคงญดทนบปพมรวสหอะัาีุเ่๋་གདལསྭကဋတပမယရသာိီူေ်ဿባዓይគងតរវអ្ḗḥḭḲṃṅṇṣṭẓảậἀἄἈἐἒἤἦἴἶἸὀὂὕὖὦὨὰὶᾶῆῑῖῥῦῶ\u2009\u200b\u200c\u200d\u200e‑–—‘’“”„†‡•…‰′⁄\u2061€₹←↑→↓↦⇒∅∆∈∑−∗∘√∝∞∠∫∮≈≠≡≤≥⊂⋅⋆⋯⏟◆●ⲁⲓⲣⲫⲱ【】あいうえかくこしだちつてとなにのはふほまやゅょらんアスプル与世中九京人住信備儲元克分初券加

In [36]:
"".join(unq_chars_ans)


' "%\',-./0123456789:;ABCEGHILMOPSUWabcdeghijklmnopqrstuvxyz°²ँंःअआइईउऊऋएऐऑओऔकखगघचछजझञटठडणतथदधनपफबभमयरलवशषसह़ािीुूृॅेैॉोौ्ॠ।०१२३४५६७८९॰ஃஅஆஇஈஉஎஏஐஒஓகஙசஜஞடணதநனபமயரறலளழவஷஸஹாிீுூெேைொோௌ்௭\u200d–‘’₹'

In [37]:
"".join(unq_chars_preds)


'\n "%\'(),-./0123456789:;ABCDEGHILMNORSTUWabcdeghijklmnopqrstuvyz°²ँंःअआइईउऊऋएऑओऔकखगघचछजझञटठडढणतथदधनपफबभमयरलवशषसह़ािीुूृॅेैॉोौ्ॠ।०१२३४५६७८९॰ஃஅஆஇஈஉஎஏஐஒகஙசஜஞடணதநனபமயரறலளழவஷஸஹாிீுூெேைொோ்\u200d–’₹'

## prediction にだけ含まれる文字の確認

In [38]:
# Characters that occur in the predictions but never in the ground-truth answers.
set(all_chars_preds) - set(all_chars_ans)

{'\n', '(', ')', 'D', 'N', 'R', 'T', 'ढ'}

In [62]:
# Reverse direction: characters that occur in the ground-truth answers but never in the predictions.
set(all_chars_ans) - set(all_chars_preds)

{'P', 'x', 'ऐ', 'ஓ', 'ௌ', '௭', '‘'}

In [40]:
# Leading and trailing newlines look safe to strip

oof.loc[oof["prediction"].str.contains("\n"), ["answer_text", "prediction"]]

Unnamed: 0,answer_text,prediction
46,1639ஆம் ஆண்டு ஆகஸ்ட் மாதம் 22,\n1639
71,1955 ஆம் ஆண்டு செப்டம்பர்,\n1955
190,होरडियम डिस्टिन,\nहोरडियम डिस्टिन
208,बाबा रामदेव,\nबाबा रामदेव
225,உடல் திசு ஆய்வு,\nஉடல் திசு ஆய்வு
231,நெமடோடா,\nஎக்டிசாசோவாக்கள்
263,வெள்ளி,\nசூரியன்
264,1757,\n1757
275,1757,\n1837 மற்றும் 1857
277,தமிழகத்திற்கு,\nஆசியா


In [43]:
# Surrounding whitespace looks safe to strip as well

# Literal substring match for "(": regex=False avoids the invalid "\(" escape
# sequence that a non-raw string literal would need for a regex pattern.
oof[oof["prediction"].str.contains("(", regex=False)][["answer_text", "prediction"]]

Unnamed: 0,answer_text,prediction
6,"சனவரி 28, 1892",(2012)
143,Hg,(Hg)
347,रिक्टर पैमाना,(MW
360,11 सितम्बर 1895,(11 सितम्बर 1895
421,15 अप्रैल 1469,(15 अप्रैल 1469
603,1926,(1926
604,27 ई.पू.,(27 ई.पू.
641,"महाराष्ट्र के कोंकण क्षेत्र में एक गांव है, गा...",(11 सितम्बर 1895 - 15 नवम्बर 1982) भारत
645,1775,(1775–1783
660,ग्राथस ड्रेपर,ग्राथस ड्रेपर (Grathus Draper) नियम


In [56]:
# Literal substring match for ")": regex=False avoids the invalid "\)" escape
# sequence that a non-raw string literal would need for a regex pattern.
oof[oof["prediction"].str.contains(")", regex=False)][["answer_text", "prediction"]]

Unnamed: 0,answer_text,prediction
6,"சனவரி 28, 1892",(2012)
65,5 டிசம்பர் 2016,5 டிசம்பர் 2016)
143,Hg,(Hg)
186,1852,1852)
209,22 जनवरी 2008,2008)
224,30368609,km²)
399,1799,-1799)
441,"21 जून, 1852","21 जून, 1852)"
443,11 फरवरी 1977,11 फरवरी 1977)
601,4 सितम्बर 2006,4 सितम्बर 2006)


In [63]:
# Rows whose prediction contains 'ढ', one of the characters found only in predictions.
oof[oof["prediction"].str.contains('ढ')][["answer_text", "prediction"]]

Unnamed: 0,answer_text,prediction
1006,प्रोटोज़ोआ,संक्रमित रक्त को चढ़ाने से


## Ground truth に含まれる文字を確認

In [57]:
# Ground-truth answers matching "கி.பி". The pattern is used as a regular
# expression, so the unescaped "." matches any single character.
oof[oof["answer_text"].str.contains("கி.பி")][["answer_text", "prediction"]]

Unnamed: 0,answer_text,prediction
669,கி.பி.1510,\nகி.பி.1510


In [58]:
# Ground-truth answers matching "கி.மு". The pattern is used as a regular
# expression, so the unescaped "." matches any single character.
oof[oof["answer_text"].str.contains("கி.மு")][["answer_text", "prediction"]]

Unnamed: 0,answer_text,prediction
250,கி.மு 3000,கி.மு 6000
260,கி.மு 1400-1000,1839
670,கி.மு. ஐந்தாம் நூற்றாண்டில்,கி.மு 470/469


In [59]:
# Ground-truth answers matching "கி.மீ". The pattern is used as a regular
# expression, so the unescaped "." matches any single character — which is why
# an answer written with ";" instead of "." (ச.கி;மீகள்) is listed as well.
oof[oof["answer_text"].str.contains("கி.மீ")][["answer_text", "prediction"]]

Unnamed: 0,answer_text,prediction
58,"10,180,000 ச.கி;மீகள்","10,180,000 ச.கி;மீகள்"
267,6400 கி.மீ.கள்,6400 கி.மீ.கள்
688,6400 கி.மீ.கள்,6400 கி.மீ.கள்
701,6650 கி.மீ,6650 கி.மீ
721,147.99 கி.மீ,147.99 கி.மீ.2
728,147.99 கி.மீ.,147.99 கி.மீ.2
922,174 கி.மீ²,174 கி.மீ².


In [60]:
# Ground-truth answers that contain the character "ई" anywhere in the text.
oof[oof["answer_text"].str.contains("ई")][["answer_text", "prediction"]]

Unnamed: 0,answer_text,prediction
81,ईक्वस कैबेलस,ईक्वस
121,चग़ताई,चग़ताई भाषा
128,२७ मई १९५७,२७ मई १९५७
135,थाई,सेंट्रल थाई
174,वाई एस आर कांग्रेस पार्टी,वाई एस आर कांग्रेस पार्टी
185,563 ईसा पूर्व,563 ईसा पूर्व
195,लातवियाई भाषा,लातवियाई
211,328 ई.,328 ई.
301,ईसाई धर्म,ईसाई
303,अप्रैल से जुलाई,अप्रैल से जुलाई तक


In [61]:
# Ground-truth answers matching "ई.पू". The pattern is used as a regular
# expression, so the unescaped "." matches any single character — which is why
# the "ई॰पू॰" spelling is listed as well.
oof[oof["answer_text"].str.contains("ई.पू")][["answer_text", "prediction"]]

Unnamed: 0,answer_text,prediction
413,331 ई.पू.,331 ई.पू.
604,27 ई.पू.,(27 ई.पू.
854,287 ई.पू.,287 ई.पू.
1090,400 ई॰पू॰ 100 ई॰ सन् के बीच,400 ई॰पू॰ 100 ई॰ सन्
