# 중복 제거 통계 내기

In [None]:
from pathlib import Path
import json
import pandas as pd
import re
from transformers import T5Tokenizer
import numpy as np

# Caption files are expected to be named by a lowercase-hex UUID (8-4-4-4-12).
# `.match` only anchors at the start, so a stem that merely begins with a UUID passes too.
uuid_pattern = re.compile(r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}')
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

p_cap_dir = Path('../data/Ego4D-processed/captions/VideoRecap/caption_2s')

# One row per caption file: caption/token counts before and after collapsing
# runs of consecutive identical captions.
records = []
for p_cap in p_cap_dir.glob('**/*.json'):
    if not uuid_pattern.match(p_cap.stem):
        continue
    with open(p_cap, 'r', encoding='utf-8') as f:
        cap = json.load(f)
    texts = cap['captions']['text']
    sr = pd.Series(texts)
    # Keep one caption per run of consecutive duplicates (the last of each run;
    # the final caption is always kept because its shifted neighbour is NaN).
    sr_comp = sr.loc[sr.shift(-1) != sr]
    # Per-caption token counts. Without padding the attention mask is all ones,
    # so len(input_ids) equals the mask sum; counting lengths directly avoids
    # asking the tokenizer for a ragged NumPy array of variable-length rows.
    # Counts include whatever special tokens the tokenizer appends.
    token_counts = np.array(
        [len(ids) for ids in tokenizer(texts)['input_ids']],
        dtype=np.int64,
    )
    records.append({
        'clip_uid': cap['clip_uid'],
        'num_caps': len(sr),
        'num_caps_comp': len(sr_comp),
        'num_tokens': int(token_counts.sum()),
        # sr_comp keeps the original positional labels, so they index token_counts directly.
        'num_tokens_comp': int(token_counts[sr_comp.index.to_numpy()].sum()),
    })
df = pd.DataFrame(records)
df

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Unnamed: 0,clip_uid,num_caps,num_caps_comp,num_tokens,num_tokens_comp
0,bcfcd2b4-7ca3-45ea-bbeb-0c6bd1ddcaac,228,141,1433,950
1,3cd6f6c1-b89f-4241-8b17-dc2dbd09d9e1,241,159,1716,1191
2,fcad17ad-ac2e-4d39-9a0f-da305e027ee7,241,177,1402,1076
3,7ba2bb39-2c15-42b5-990f-214ef1b730c3,241,177,2430,1797
4,156f510b-d740-44e8-83a7-96af31eaad5a,241,177,1428,1070
...,...,...,...,...,...
1678,2ed232bc-dc21-42d6-88be-d6ebe92f5b2a,126,88,940,684
1679,b5ae8df0-4825-4021-a01a-722947019865,241,203,2532,2232
1680,58fa07ae-2992-4dc9-842a-e5a73ee3d345,241,193,1624,1343
1681,d37ab6fe-4f57-41ef-b6ff-cb193be15303,241,138,1501,899


In [None]:
df[['num_caps', 'num_caps_comp', 'num_tokens', 'num_tokens_comp']].describe()

Unnamed: 0,num_caps,num_caps_comp,num_tokens,num_tokens_comp
count,1683.0,1683.0,1683.0,1683.0
mean,242.146168,177.080808,1897.291147,1468.387998
std,78.839344,62.716351,739.637152,669.493362
min,10.0,1.0,79.0,5.0
25%,241.0,149.0,1500.0,1060.0
50%,241.0,179.0,1776.0,1408.0
75%,241.0,199.0,2145.0,1783.5
max,601.0,547.0,7701.0,6399.0


# NER로 object, action 뽑기(를 위한 json 만들기)

기본 구조
```json
[
    {
        "id": "CrossNER_AI_0",
        "conversations": [
            {
                "from": "human",
                "value": "Text: Typical generative model approaches include naive Bayes classifier s , Gaussian mixture model s , variational autoencoders and others ."
            },
            {
                "from": "gpt",
                "value": "I've read this text."
            },
            {
                "from": "human",
                "value": "What describes organization in the text?"
            },
            {
                "from": "gpt",
                "value": "[]"
            }
        ]
    },
    ...
]
```

In [16]:
from pathlib import Path
import json
import pandas as pd
import re
from tqdm.auto import tqdm

# Lowercase-hex UUID (8-4-4-4-12). It is applied with `.match` below, which only
# anchors at the start of the string, so a file stem that merely begins with a
# UUID is accepted as well.
pattern_uuid = re.compile(r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}')

def generate_json_element(cap_id, cap_text, entity_type='object', **cap_info_kwargs):
    """Build one conversation-format NER query record for a single caption.

    Args:
        cap_id: Identifier stored under the ``'id'`` key.
        cap_text: Caption text; embedded in the first human turn as
            ``"Text: <cap_text> ."`` and stored again as ``info['caption']``.
        entity_type: Entity category asked about in the second human turn.
        **cap_info_kwargs: Extra metadata copied into ``info``. A ``caption``
            key passed here is overwritten by ``cap_text``.

    Returns:
        A dict with keys ``'id'``, ``'conversations'``, ``'entity_type'`` and
        ``'info'`` (in that order). The final ``gpt`` turn is the placeholder
        answer ``"[]"``.
    """
    # (speaker, utterance) pairs in dialogue order.
    turns = (
        ("human", f"Text: {cap_text} ."),
        ("gpt", "I've read this text."),
        ("human", f"What describes {entity_type} in the text?"),
        ("gpt", "[]"),
    )

    # Caller metadata first, then the caption so it wins on a key clash.
    info = dict(cap_info_kwargs)
    info['caption'] = cap_text

    element = {'id': cap_id}
    element['conversations'] = [
        {"from": speaker, "value": utterance} for speaker, utterance in turns
    ]
    element['entity_type'] = entity_type
    element['info'] = info
    return element

p_cap_dir = Path('../data/Ego4D-processed/captions/VideoRecap/caption_2s')
# Outputs go to a sibling directory ("caption_2s_uniner_inputs"), so they are
# never picked up by the recursive glob over p_cap_dir below.
p_out_inputs = p_cap_dir.with_name(p_cap_dir.stem + '_uniner_inputs')
p_out_inputs.mkdir(exist_ok=True, parents=True)

# Materialise the glob so tqdm knows the total and can show real progress.
for p_cap in tqdm(list(p_cap_dir.glob('**/*.json'))):
    if not pattern_uuid.match(p_cap.stem):
        continue
    with open(p_cap, 'r', encoding='utf-8') as f:
        cap = json.load(f)

    df_cap = pd.DataFrame(cap['captions'])
    # Keep only the first caption of every run of consecutive duplicates.
    # reset_index returns a fresh frame, so the column assignments below do not
    # write into a filtered view (avoids SettingWithCopyWarning).
    is_run_start = df_cap['text'] != df_cap['text'].shift(1)
    df_cap = df_cap.loc[is_run_start].reset_index(drop=True)

    # Each kept caption lasts until the next kept caption starts; the last one
    # is closed with the clip duration.
    duration = cap['video_end_sec'] - cap['video_start_sec']
    df_cap['end'] = df_cap['start'].shift(-1, fill_value=duration)

    # Text normalisation for the NER prompt; text_orig keeps the stripped original.
    df_cap['text_orig'] = df_cap['text'].str.strip()
    text = df_cap['text_orig'].str.replace(r'\s+', ' ', regex=True)
    # Drop a leading "#X " tag BEFORE expanding the leading "C": in the opposite
    # order, a caption such as "#C C opens ..." would end up as "C opens ..."
    # instead of "The camera wearer opens ...".
    text = text.str.replace(r'^#\w\s+', '', regex=True)
    text = text.str.replace(r'^[cC]\s+', 'The camera wearer ', regex=True)
    df_cap['text'] = text

    # "<row index>_<start>_<end>", with the times rounded to whole numbers.
    df_cap['cap_id'] = (
        df_cap.index.map(str)
        + '_' + df_cap['start'].map('{:.0f}'.format)
        + '_' + df_cap['end'].map('{:.0f}'.format)
    )

    # Plain records (rather than DataFrame.apply) also work for an empty frame.
    rows = df_cap[['start', 'end', 'cap_id', 'text', 'text_orig']].to_dict('records')
    json_uniner_input = [
        generate_json_element(
            row['cap_id'],
            row['text'],
            # Stored as-is in `info`: the original caption with a bare leading
            # "C " rewritten to "#C C ".
            caption_=re.sub(r'^[cC] ', '#C C ', row['text_orig']),
            start=row['start'],
            end=row['end'],
        )
        for row in rows
    ]

    # NOTE: the glob is recursive but outputs are flattened to "<stem>.json", so
    # two inputs with the same stem in different sub-directories overwrite each other.
    p_out = p_out_inputs / (p_cap.stem + '.json')
    with p_out.open('w', encoding='utf-8') as f:
        json.dump(json_uniner_input, f)

  0%|          | 0/1685 [00:00<?, ?it/s]

```bash
# Run the universal-ner evaluation module (src.eval.evaluate) with the
# Universal-NER/UniNER-7B-type model through Slurm `srun`: partition
# `debug_grad`, 4-hour time limit, one GPU with 8 CPUs and 52G of memory per
# GPU, excluding nodes ariel-v10 and ariel-v12.
# NOTE(review): --data_path names a single `uniner_input.json`, whereas the
# cell above writes one JSON file per clip under `caption_2s_uniner_inputs/` —
# confirm which file is meant to be evaluated.
srun -x 'ariel-v[10,12]' -p debug_grad -t 4:00:00 \
    --gres=gpu:1 \
    --cpus-per-gpu=8 \
    --mem-per-gpu=52G \
    --chdir=/data/gunsbrother/prjs/ltvu/llms/universal-ner \
    python -m src.eval.evaluate \
    --model_path Universal-NER/UniNER-7B-type \
    --data_path '/data/gunsbrother/prjs/ltvu/ours/notebooks/uniner_input.json' \
    --tensor_parallel_size 1
```