In [1]:
import json
import copy
import logging
from dataclasses import dataclass, field
from typing import Optional, Dict, Sequence
import gc
import tqdm
import pandas as pd
import io
import torch
import transformers
from torch.utils.data import Dataset
from transformers import LlamaModel, LlamaConfig
from transformers.models.llama.modeling_llama import LlamaRMSNorm, LlamaRotaryEmbedding, LlamaDecoderLayer
from transformers.activations import ACT2FN
from transformers import AutoTokenizer
import os
from transformers import Trainer, AdamW, get_linear_schedule_with_warmup
from transformers.trainer_pt_utils import get_parameter_names
from transformers.utils import is_sagemaker_mp_enabled
from transformers.trainer_utils import ShardedDDPOption
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS

def jload(f_name, mode="r"):
    lines = []
    with open(f_name, 'r') as f:
        for line in f:
            lines.append(json.loads(line))
    return lines

## 2023 모두의 말뭉치 text generation data processing

In [2]:
tok = AutoTokenizer.from_pretrained('./KoLLaVA-KoVicuna-7b/', use_fast=False)
TC='<table_col>'#, '<table_row>'
TI, TH='<table_important>', '<table_header>'
TSR1, TSR2, TSRL = '<table_SR1>','<table_SR2>','<table_SRL>'
TCR1, TCR2, TCRL = '<table_CR1>','<table_CR2>','<table_CRL>'

tok.add_tokens([f'<eval_token_{i}>' for i in [2, 5]])
tok.add_tokens([TC, TI, TH, TSR1, TSR2, TSRL, TCR1, TCR2, TCRL])

9

In [3]:
DEBUG=False

In [4]:
## Load 이미지 캡션 데이터셋
lines_new = []
f_name = '2023_NLG/02/nikluge-2022-image-dev.jsonl'
lines = jload(f_name)
f_name = '2023_NLG/02/nikluge-2022-image-train.jsonl'
lines.extend(jload(f_name))
if DEBUG: lines = lines[:10]
## formatter
def format_2(line):
    # 다음 이미지를 설명하시오. 
    input_ = "<image><eval_token_2>"
    image = f"{line['input'][:3]}/{line['input']}.jpg"
    output_= [f"{o}</s>" for o in line['output']]
    for oo in output_:
        assert tok.encode(oo, add_special_tokens=False)[-1]==2
    outputs= [{'image':image,
                   'conversations':[{'from':'human','value':input_},
                                 {'from':'gpt','value':f'{o}'},],} for o in output_]
    return outputs
for l in lines:
    lines_new.extend(format_2(l))
print(len(lines_new), lines_new[-1])
with open('2023_nlg_2.json', 'w') as f:
    json.dump(lines_new, f)

46820 {'image': 'S1C/S1C0152.jpg', 'conversations': [{'from': 'human', 'value': '<image><eval_token_2>'}, {'from': 'gpt', 'value': '사직구장에서 붉은 유니폼을 입은 야구선수들이 훈련을 진행하는 중이다.</s>'}]}


In [5]:
def table_reformulator(line):
    table= line['input']['table'] 
    try:
        important_cells  = line['input']['metadata']['highlighted_cells']
    except: 
        important_cells = []
    str_normal = ''
    str_import = ''
    for i, row in enumerate(table):
        str_normal += TC
        for j, col in enumerate(row):
            #str_ += TR
            text = col['value']
            if col['is_header']: text += TH
            if (i, j) in important_cells: text+= TI
            #if row['is_important']: text = f"{TI}{text}"
            
            rs, cs = int(col['row_span']), int(col['column_span'])
            if rs==0: text += TSR1
            elif rs==1: text+= TSR2
            else: text += TSRL
            
            if cs==0: text += TCR1
            elif cs==1: text+= TCR2
            else: text += TCRL            

            str_normal += f'{text}'
            if (i, j) in important_cells: str_import += text
    return str_normal, str_import

In [6]:
## Load 테이블 데이터셋
lines_new = []
f_name = '2023_NLG/05/NIKL_TABLE_2023_COMPETITION_v1.0/nikluge-2022-table-dev.jsonl'
lines = jload(f_name)
f_name = '2023_NLG/05/NIKL_TABLE_2023_COMPETITION_v1.0/nikluge-2022-table-train.jsonl'
lines.extend(jload(f_name))
if DEBUG: lines = lines[:10]
## formatter
def format_5(line):
    table_str, table_str_highlighted = table_reformulator(line)
    #if hasattr(line['input']['metadata'], 'url'):
    try:
        line['input']['metadata'].pop('url')
    except:
        pass
    #if hasattr(line['input']['metadata'], 'highlighted_cells'):
    try:
        line['input']['metadata'].pop('highlighted_cells')
    except:
        pass
    #line['input']['metadata'].pop('highlighted_cells')

    
    input_ = f"{table_str}{line['input']['metadata']}{table_str_highlighted}<eval_token_5>"
    output_= [f"{o}</s>" for o in line['output']]
    for oo in output_:
        assert tok.encode(oo, add_special_tokens=False)[-1]==2
    outputs= [{'conversations':[{'from':'human','value':input_},
                                 {'from':'gpt','value':f'{o}'},],} for o in output_]
    return outputs
for l in lines:
    tmp = format_5(l)
    if len(tok.encode(tmp[0]['conversations'][0]['value']))+len(tok.encode(tmp[0]['conversations'][1]['value']))<=768:
        lines_new.extend(tmp)
print(len(lines_new), lines_new[-1])
with open('2023_nlg_5.json', 'w') as f:
    json.dump(lines_new, f)

Token indices sequence length is longer than the specified maximum sequence length for this model (2182 > 2048). Running this sequence through the model will result in indexing errors


29985 {'conversations': [{'from': 'human', 'value': "<table_col><table_header><table_SR2><table_CR2><table_header><table_SR2><table_CR2>보육시설<table_header><table_SR2><table_CR2>백분율(100%)<table_header><table_SR2><table_CR2><table_col>배식대 보유<table_SRL><table_CR2>예<table_SR2><table_CR2>276<table_SR2><table_CR2>35.1<table_SR2><table_CR2><table_col>아니오<table_SR2><table_CR2>511<table_SR2><table_CR2>64.9<table_SR2><table_CR2><table_col>소 계<table_SR2><table_CR2>787<table_SR2><table_CR2>100.0<table_SR2><table_CR2>{'title': '서울지역 어린이 대상 급식실태 조사 및 개선방안 연구', 'date': '2008-11-00', 'publisher': '식품의약품안전처', 'table_title': '배식대 보유'}<eval_token_5>"}, {'from': 'gpt', 'value': '배식대를 보유하고 있는 곳과 그렇지 않은 곳은 조사한 보육시설 중 각각 35.1%(276곳)와 64.9%(511곳)에 해당하는 것으로 조사되었다.</s>'}]}


In [7]:
combined = []
for i in [2, 5]:
    with open(f'2023_nlg_{i}.json', 'r') as f:
        data = json.load(f)
        combined.extend(data)
with open('2023_nlg_combined.json', 'w') as f:
    json.dump(combined, f)

In [8]:
## Load 이미지 캡션 데이터셋
lines = jload('2023_NLG/02/nikluge-2022-image-test.jsonl')
lines_test = []
def format_test(line):
    input_ = "<image><eval_token_2>"
    image = f"{line['input'][:3]}/{line['input']}.jpg"
    return {'id':line['id'], 'input': line['input'], 
               'image':image, 'question': input_, 'output':''}
lines_test = [format_test(l) for l in lines]
print(len(lines_test), lines_test[-1])
with open('test_2.json', 'w') as f:
    json.dump(lines_test, f)

1040 {'id': 'nikluge-2022-image-test-001040', 'input': 'S1C0148', 'image': 'S1C/S1C0148.jpg', 'question': '<image><eval_token_2>', 'output': ''}


In [9]:
## Load 테이블 데이터셋
lines = jload('2023_NLG/05/NIKL_TABLE_2023_COMPETITION_v1.0/nikluge-2022-table-test.jsonl')

def format_test(line):
    table_str, table_str_highlighted = table_reformulator(line)
    try:
        line['input']['metadata'].pop('url')
    except:
        pass
    try:
        line['input']['metadata'].pop('highlighted_cells')
    except:
        pass
    input_ = f"{table_str}{line['input']['metadata']}{table_str_highlighted}<eval_token_5>"
    return {'id': line['id'], 'input': line['input'],
            'question': input_}
lines_test = [format_test(l) for l in lines]
print(len(lines_test), lines_test[-1])
with open('test_5.json', 'w') as f:
    json.dump(lines_test, f)

1001 {'id': 'nikluge-2022-table-test-001001', 'input': {'metadata': {'title': '서울지역 어린이 대상 급식실태 조사 및 개선방안 연구', 'date': '2008-11-00', 'publisher': '식품의약품안전처', 'table_title': '보육시설 및 유치원 위생점검횟수'}, 'table': [[{'value': '', 'is_header': True, 'row_span': 1, 'column_span': 1}, {'value': '', 'is_header': True, 'row_span': 1, 'column_span': 1}, {'value': '보육시설', 'is_header': True, 'row_span': 1, 'column_span': 1}, {'value': '유치원', 'is_header': True, 'row_span': 1, 'column_span': 1}], [{'value': ' 시,구청의위생점검횟수 ', 'is_header': False, 'row_span': '6', 'column_span': 1}, {'value': '전혀 받아 본 적이 없다', 'is_header': False, 'row_span': 1, 'column_span': 1}, {'value': '61(20.54%)', 'is_header': False, 'row_span': 1, 'column_span': 1}, {'value': '51(15.09%)', 'is_header': False, 'row_span': 1, 'column_span': 1}], [{'value': '평균3년에1회정도', 'is_header': False, 'row_span': 1, 'column_span': 1}, {'value': '9(1.14%)', 'is_header': False, 'row_span': 1, 'column_span': 1}, {'value': '6(1.78%)', 'is_header': False, 