In [1]:
import pandas as pd
import numpy as np
import datasets
from collections import Counter 
from transformers import AutoTokenizer

import pandas as pd
from datasets import load_dataset,Dataset,concatenate_datasets
import jsonlines
import json

In [2]:
translation_template='''You are a professional translator. Translate the following {} source text into {}. Refer to the word pairs in the glossary if the glossary exists when you translate. Do not translate the glossary it self.
{}
Translation:
'''
response_template="Translation:"
glossary_template="Glossary:"
sentence_template="Source:"

In [23]:
import os,sys
import random
import numpy as np
import torch
import pandas as pd
from datasets import load_dataset,Dataset,concatenate_datasets
import jsonlines
import ipdb
import json
from kiwipiepy import Kiwi
from nltk.tokenize import sent_tokenize
import ast
import ipdb

kiwi = Kiwi()

def formatting_prompt_func(template:str,*args:str):
  '''
  Template can be like ;
  ### Input :
  {}
  ### Output :
  {}
  '''

  return template.format(*args)


def make_translation_input_from_dataset(data,
                                  tokenizer,
                                  prompt_template,
                                  glossary_template=None,
                                  sentence_template=None,
                                  src:str=None, 
                                  tgt:str=None,
                                  return_output=True,
                                  text_split=True,
                                  **kwargs
                                  ):

    lang_dict={"korean":"Korean","english":"English","ko":"Korean","eng":"English","en":"English"}
    src_tgt_dict={"en":"english","eng":"english","english":"english","ko":"korean","kor":"korean","korean":"korean"}

    if not src and not tgt:
        if "src" in data.keys() and "tgt" in data.keys():
            src=src_tgt_dict[data["src"]]
            tgt=src_tgt_dict[data["tgt"]]
        else:
            raise Exception("'src'와 'tgt'가 주어지거나, data의 key로 존재해야합니다.")

    src_text=data[src]
    formatted_text=None

    if text_split:
        splited_sents=split_sents(lang_dict[src],src_text)
        sent2terms = []
        
        if data["term_dict"] is not None and len(data["term_dict"]):
            term_dict = ast.literal_eval(data["term_dict"])

            for s in splited_sents:
                new_sent_parts = {}
                for k, v in term_dict.items():
                    if k in s:
                        new_sent_parts[k]=v

                if len(new_sent_parts):
                    new_sent_parts=formatting_glossary(new_sent_parts,glossary_template)
                    new_s = f"{sentence_template}\n{s}\n{new_sent_parts}\n"
                else:
                    new_s=f"{sentence_template}\n{s}\n"

                sent2terms.append(new_s)
        else:
            # Handle case of empty term_dict (e.g., directly append sentences)
            for s in splited_sents:
                new_s = f"{sentence_template}\n{s}\n"
                sent2terms.append(new_s)

        formatted_text="".join(sent2terms).rstrip()
            

    else:
        if data["term_dict"] is not None and len(data["term_dict"]):
            term_dict = ast.literal_eval(data["term_dict"])
            term_dict=formatting_glossary(term_dict,glossary_template)
            formatted_text=f"{sentence_template}\n{src_text}\n{term_dict}"

    template=formatting_prompt_func(prompt_template,lang_dict[src],lang_dict[tgt],formatted_text)

    if return_output:
        template=template+data[tgt]+tokenizer.eos_token
        
    return {"text":template}


def formatting_glossary(term_dict,glossary_template):
    glossary=[f"{k}={v}" for k,v in term_dict.items()]
    glossary_str="\n".join(glossary)
    glossary_str=f"{glossary_template}\n{glossary_str}".strip()

    return glossary_str

def split_sents(lang,
                text,
                ):

    lang_dict={"korean":"korean","ko":"korean","kor":"korean","eng":"english","english":"english","en":"english"}
    src = lang_dict[lang.lower()]
    
    splited_sents=[]
    paras=text.split("\n") #split text into paragraphs based on linebreak to keep its original format.
    
    for idx,para in enumerate(paras):
        if len(para.strip()):
            if src=="korean":
                temp_sents=[s.text for s in kiwi.split_into_sents(para)]
            else:
                temp_sents=sent_tokenize(para)
            if idx<len(paras)-1:
                temp_sents[-1]+="\n" #keep linebreak
            splited_sents.extend(temp_sents)
        else:
            splited_sents[-1]+="\n"

    return splited_sents
            

In [4]:
train_dataset=Dataset.load_from_disk("/azurestorage/data/translation_data/aligned_dataset/prepared_for_training/training_dataset_10k/")

In [10]:
tokenizer=AutoTokenizer.from_pretrained("/azurestorage/models/production/tmt-eeve16k/")

In [39]:
result=make_translation_input_from_dataset(train_dataset[9],
                                  tokenizer,
                                  prompt_template=translation_template,
                                  glossary_template=glossary_template,
                                  sentence_template=sentence_template,
                                  return_output=True,
                                  text_split=False,
                                  )

print(result["text"])

You are a professional translator. Translate the following English source text into Korean. Refer to the word pairs in the glossary if the glossary exists when you translate. Do not translate the glossary it self.
Source:
Article 48-2 (Inspections of Large Shareholders)(1)Where any of the following persons (hereafter referred to as "large shareholder, etc." in this Article) fall under any of the following items, the Financial Services Commission may have the Governor of the Financial Supervisory Service inspect business and asset conditions of the relevant large shareholder, etc. within the minimum extent necessary for the purposes thereof: <Amended on Aug. 13, 2013>1.A person subject to conversion:(a)Where necessary to check the inspection result underArticle 16-3(2);(b)Where it is acknowledged that a person subject to conversion will be highly likely to engage in illegal transactions with a bank due to unhealthy financial status, such as rapid increase in debts and occurrence of huge

# New train dataset

In [30]:
train_dataset=datasets.Dataset.load_from_disk("/azurestorage/data/translation_data/alinged_dataset/prepared_for_training/training_dataset_20k/")

In [85]:
train_df=train_dataset.to_pandas()

In [100]:
train_df_under_100=train_df[train_df["korean"].str.len()<100].sample(frac=0.35)
train_df_over_100=train_df[train_df["korean"].str.len()>=100]

In [104]:
train_df_sample_10k=pd.concat([train_df_under_100,train_df_over_100]).sample(frac=1)
train_df_sample_10k.head(5)

Unnamed: 0,english,korean,category,term_dict,id,src,tgt,length_group
19010,Inigmaville with area for exclusive use of 244...,서울 강남구 청담동 이니그마빌 전용 244.77㎡도 2014년 10월 32억 원에 ...,경제,{'서울 강남구 청담동 이니그마빌': 'Inigmaville in Cheongdam...,618344.0,korean,english,2.0
1927,As Icarus does not itself contain a genetic ma...,Icarus 자체에는 유전자 표지자가 포함되어 있지 않기 때문에 snw 검사를 통해...,food,,1336861.0,english,korean,
6779,"Therefore, the method of obtaining ETR can be ...",이에 ETR을 구하는 방법은 크게 GAAP ETR과 Cash ETR로 나눌 수 있으...,법률,"{'ETR': 'ETR', 'GAAP ETR': 'GAAP ETR', 'Cash E...",1249742.0,english,korean,
9508,"Consequently, in this study, element decomposi...",이에 본 연구에서는 1997∼2008년과 2008∼2011년의 두 기간으로 구분하여...,경제,,1265691.0,english,korean,
15476,"In France, when signing a legal document, it i...","프랑스에서는 통상적으로 법적 문서에 서명할 경우, 서명에 앞서 ""읽고 동의함""(lu...",법률,,1274058.0,english,korean,


# Add tax dictionary & tax paper pairs

In [13]:
dict_df=dict_df.drop_duplicates(subset=[0,1])

In [20]:
dict_df=dict_df.drop_duplicates(subset=[1])

In [113]:
dict_df=dict_df.sample(frac=1)[[0,1]]

In [114]:
dict_df.columns=["korean","english"]

In [116]:
tax_law=pd.read_excel("/azurestorage/data/translation_data/raw_data/세법논문_pairing.xlsx")[["Korean","English"]]
tax_law=tax_law.dropna(axis=0)
tax_law.columns=["korean","english"]

In [118]:
idx=34

print(tax_law["korean"].iloc[idx])
print("-----------------------------")
print(tax_law["english"].iloc[idx])

많은 다국적기업이 해외 시장을 개척하거나 시장점유율을 높이는 데 소요되는 광고선전비를 유통기능을 담당하는 각국의 자회사에 부담시키고, 나아가 그룹 차원의 국제마케팅활동에 소요된 비용을 각국의 자회사들로부터 국제마케팅비 명목으로 지급받는 방식으로 전가하고 있다. 자회사는 국내 광고선전비와 해외 본사에 지급한 국제마케팅비를 손금처리하기 때문에, 원천지국 입장에서는 세원이 잠식되고 소득이 이전되는 결과가 발생한다. 다국적기업의 광고선전비를 이용한 조세회피를 막기 위해, 자회사의 국내마케팅활동에 대해서는 마케팅 무형자산을 인정하고, 자회사가 부담하는 국제마케팅비에 대해서는 사용료소득으로 인식함으로써 원천지국의 과세권을 인정하는 방안에 관하여 논의할 필요가 있다. 세계 각국은 세수 확보를 위해 마케팅 무형자산 개념을 적극 활용하여 원천지국의 과세권을 행사하고 있고, 이러한 흐름은 2017년 개정된 OECD 이전가격지침과 UN 이전가격 매뉴얼에도 반영되었다. 우리나라도 국내세법에 마케팅 무형자산 개념을 도입하여 자회사의 마케팅활동에 대한 적정한 보상을 인정함으로써 원천지국과 거주지국 간 과세소득의 공정한 분배를 도모해야 한다. 자회사가 해외 본사에 국제마케팅비를 지급하는 경우, 소득구분에 따라 원천지국 과세권의 유무와 범위가 달라진다. 그 때문에 유통업자는 원가분담약정에 따른 분담금이나 그룹 내부 용역거래에 대한 대가로 보아 원천징수를 피하려고 하고, 과세관청은 사용료소득으로 보아 원천징수의무를 부과하려고 한다. 국제마케팅비는 해외 본사가 보유한 상표의 가치를 강화하는 국제마케팅활동에 쓰였으므로, 국제마케팅활동으로 인해 상승된 상표권의 사용대가, 즉 사용료소득으로 보는 것이 타당하다.
-----------------------------
Many multinational enterprises are burdening their subsidiaries in charge of distribution function with advertising, marketing and p

In [138]:
dict_df["src"]="korean"
dict_df["tgt"]="english"
dict_df["src"].iloc[:len(dict_df)//2]="english"
dict_df["tgt"].iloc[:len(dict_df)//2]="korean"
dict_df=dict_df.sample(frac=1)

In [153]:
dict_df_sample_100=dict_df.iloc[:100]

In [146]:
tax_law["src"]="korean"
tax_law["tgt"]="english"
tax_law["src"].iloc[:len(tax_law)//2]="english"
tax_law["tgt"].iloc[:len(tax_law)//2]="korean"
tax_law=tax_law.sample(frac=1)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  tax_law["src"].iloc[:len(tax_law)//2]="english"
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original Data

In [161]:
train_df_concat=pd.concat([train_df_sample_10k,dict_df_sample_100,tax_law]).sample(frac=1).reset_index(drop=True)

In [162]:
train_dataset_10k=datasets.Dataset.from_pandas(train_df_concat)

In [238]:
train_dataset_10k[67]

{'english': 'Article XII Paragraph 1 of Article 22 (Elimination of Double Taxation) of the Convention shall be deleted and replaced by the following: “1. Subject to the provisions of Korean tax law regarding the allowance as a credit against Korean tax of tax payable in any country other than Korea (which shall not affect the general principle hereof): (a) the Swiss tax payable (excluding, in the case of a dividend, tax payable in respect of profits out of which the dividend is paid) under the laws of Switzerland and in accordance with this Convention, whether directly or by deduction, in respect of income from sources within Switzerland shall be allowed as a credit against Korean tax payable in respect of that income. The credit shall not, however, exceed that portion of Korean tax which the income from sources within Switzerland bears to the entire income subject to Korean tax. (b) In the case of a dividend paid by a company which is a resident of Switzerland to a company which is a 

In [239]:
train_dataset_10k.save_to_disk("/azurestorage/data/translation_data/alinged_dataset/prepared_for_training/training_dataset_10k/")

Saving the dataset (0/1 shards):   0%|          | 0/10819 [00:00<?, ? examples/s]

# few shot dataset

In [2]:
dastset_few=Dataset.load_from_disk("/azurestorage/data/translation_data/aligned_dataset/prepared_for_training/training_dataset_124shots/")

In [3]:
dataset_10k=Dataset.load_from_disk("/azurestorage/data/translation_data/aligned_dataset/prepared_for_training/training_dataset_10k/")

In [5]:
filtered_dataset_10k=dataset_10k.filter(lambda x:x["category"]==None or x["category"] in ["ele_sci","etc","food","human_art","경제","shatgpt",'구어체_대화체'])
filtered_dataset_10k=filtered_dataset_10k.filter(lambda x:"AAA" not in x["korean"])

In [6]:
filtered_dataset_few=dastset_few.filter(lambda x:x["category"] in [ '법률','육운·항공·관광','재경·경제일반',"ifrs_esg","pilar"])

In [79]:
new_dataset_7k=concatenate_datasets([filtered_dataset_10k,filtered_dataset_few]).shuffle()

In [80]:
new_dataset_7k_df=new_dataset_7k.to_pandas()

In [85]:
new_dataset_7k_df=new_dataset_7k_df.drop_duplicates(subset=["english","korean"])

In [86]:
new_dataset_7k=Dataset.from_pandas(new_dataset_7k_df)
new_dataset_7k.save_to_disk("/azurestorage/data/translation_data/aligned_dataset/prepared_for_training/training_dataset_7k/")

### check 7k dataset

In [2]:
dataset_7k=Dataset.load_from_disk("/azurestorage/data/translation_data/aligned_dataset/prepared_for_training/training_dataset_7k/")

In [4]:
Counter(dataset_7k["category"])

Counter({'경제': 4927,
         'etc': 1393,
         None: 277,
         'shatgpt': 246,
         'food': 29,
         'pilar': 21,
         'ele_sci': 20,
         '구어체_대화체': 14,
         'human_art': 7,
         '법률': 5,
         'ifrs esg': 3,
         '육운·항공·관광': 1,
         '재경·경제일반': 1})

In [5]:
dataset_7k_df=dataset_7k.to_pandas()

In [32]:
dataset_full=Dataset.load_from_disk("/azurestorage/data/translation_data/aligned_dataset/prepared_for_training/translation_dataset_training_290k/")
dataset_full_df=dataset_full.to_pandas()

a=dataset_full_df[dataset_full_df["category"]=="human_art"].sample(frac=1)
b=dataset_full_df[dataset_full_df["category"]=="food"].sample(frac=1)
c=dataset_full_df[dataset_full_df["category"]=="ele_sci"].sample(frac=1)
d=dataset_full_df[dataset_full_df["category"]=="구어체_대화체"].sample(frac=1)
e=dataset_full_df[dataset_full_df["category"]=="etc"].sample(frac=1).iloc[:3000]
f=dataset_full_df[dataset_full_df["category"]=="shatgpt"].sample(frac=1).iloc[:1000]


sampled_df=pd.concat([a,b,c,d,e,f]).sample(frac=1).reset_index(drop=True).drop_duplicates(["english","korean"]).sample(frac=1).reset_index(drop=True)

In [36]:
df_w_term_dict=sampled_df[sampled_df["term_dict"].str.len()>0]
df_wo_term_dict=sampled_df[sampled_df["term_dict"].str.len()<1].sample(frac=0.6)

df_11k=pd.concat([dataset_7k_df,df_w_term_dict,df_wo_term_dict]).drop_duplicates(["english","korean"]).sample(frac=1).reset_index(drop=True).drop("__index_level_0__",axis=1)

In [37]:
dataset_11k=Dataset.from_pandas(df_11k)
dataset_11k.save_to_disk("/azurestorage/data/translation_data/aligned_dataset/prepared_for_training/training_dataset_11k/")

Saving the dataset (0/1 shards):   0%|          | 0/11803 [00:00<?, ? examples/s]

In [38]:
dataset_11k

Dataset({
    features: ['english', 'korean', 'category', 'term_dict', 'id', 'src', 'tgt', 'length_group', 'length'],
    num_rows: 11803
})

In [39]:
Counter(dataset_11k["category"])

Counter({'경제': 4927,
         'etc': 3447,
         'shatgpt': 953,
         'food': 668,
         'ele_sci': 616,
         '구어체_대화체': 601,
         'human_art': 283,
         None: 277,
         'pilar': 21,
         '법률': 5,
         'ifrs esg': 3,
         '육운·항공·관광': 1,
         '재경·경제일반': 1})

### 7k dataset 

## sampling

In [9]:
Counter(filtered_dataset_10k["category"])

Counter({'경제': 4927,
         'etc': 1393,
         None: 279,
         'shatgpt': 246,
         'food': 29,
         'ele_sci': 20,
         '구어체_대화체': 14,
         'human_art': 7})

In [None]:
filtered_dataset_10k["category"]

In [28]:
filtered_dataset_10k_df["category"].unique()

array(['경제', 'etc', 'shatgpt', 'food', None, '구어체_대화체', 'human_art',
       'ele_sci'], dtype=object)

In [26]:
none_sampled=filtered_dataset_10k_df[filtered_dataset_10k_df["category"] is None].sample(frac=0.1)

KeyError: False

In [27]:
filtered_dataset_10k_df=filtered_dataset_10k.to_pandas()

In [42]:
filtered_dataset_10k_df=filtered_dataset_10k.to_pandas()
eco_sampled=filtered_dataset_10k_df[filtered_dataset_10k_df["category"]=="경제"].sample(frac=0.1)
etc_sampled=filtered_dataset_10k_df[filtered_dataset_10k_df["category"]=="etc"].sample(frac=0.1)
none_sampled=filtered_dataset_10k_df[~filtered_dataset_10k_df["category"].isin(['경제', 'etc', 'shatgpt', 'food', '구어체_대화체', 'human_art','ele_sci'])].sample(frac=0.2)

rest=filtered_dataset_10k_df[filtered_dataset_10k_df["category"].isin(["food","shatgpt","ele_sci","구어체_대화체","human_art"])]

new_dataset_1k_df=pd.concat([eco_sampled,etc_sampled,none_sampled,rest])

In [47]:
new_dataset_1k=Dataset.from_pandas(new_dataset_1k_df)
new_dataset_1k.save_to_disk("/azurestorage/data/translation_data/aligned_dataset/prepared_for_training/training_dataset_1k/")

Saving the dataset (0/1 shards):   0%|          | 0/1004 [00:00<?, ? examples/s]

## 1k + diverse topic

In [2]:
dataset_1k=Dataset.load_from_disk("/azurestorage/data/translation_data/aligned_dataset/prepared_for_training/training_dataset_1k/")

In [95]:
dataset_1k[1053]["korean"]

'그 외에도 카지노 관련주인 강원랜드(-1.06%), GKL(-1.26%), 파라다이스(-2.17%) 등도 중국인 관광객 등의 수요가 줄 것이라는 우려로 주가가 하락하고 있다.'

In [4]:
Counter(dataset_1k["category"])

Counter({'경제': 493,
         'shatgpt': 246,
         'etc': 139,
         None: 56,
         'food': 29,
         'ele_sci': 20,
         '구어체_대화체': 14,
         'human_art': 7})

In [5]:
dataset_few_shot=Dataset.load_from_disk("/azurestorage/data/translation_data/aligned_dataset/prepared_for_training/training_dataset_124shots/")

In [6]:
Counter(dataset_few_shot["category"])

Counter({'경제': 36,
         None: 23,
         'pilar': 21,
         'shatgpt': 20,
         'etc': 12,
         '법률': 5,
         'ifrs esg': 3,
         '구어체_대화체': 2,
         '재경·경제일반': 1,
         '육운·항공·관광': 1})

In [9]:
dataset_1k_df=dataset_1k.to_pandas()
dataset_few_shot_df=dataset_few_shot.to_pandas()

dataset_new_1k_df=pd.concat([dataset_1k_df,dataset_few_shot_df]).drop_duplicates(subset=["korean","english"]).sample(frac=1).drop("__index_level_0__",axis=1)
dataset_new_1k=Dataset.from_pandas(dataset_new_1k_df)
dataset_new_1k=dataset_new_1k.remove_columns( '__index_level_0__')

In [12]:
dataset_new_1k.save_to_disk("/nvme0/data/training_dataset_1.1k/")

Saving the dataset (0/1 shards):   0%|          | 0/1100 [00:00<?, ? examples/s]

In [26]:
Counter(dataset_new_1k["category"])

Counter({'경제': 528,
         'shatgpt': 246,
         'etc': 149,
         None: 76,
         'food': 29,
         'pilar': 21,
         'ele_sci': 20,
         '구어체_대화체': 14,
         'human_art': 7,
         '법률': 5,
         'ifrs esg': 3,
         '육운·항공·관광': 1,
         '재경·경제일반': 1})

In [4]:
dataset_full=Dataset.load_from_disk("/azurestorage/data/translation_data/aligned_dataset/prepared_for_training/translation_dataset_training_290k/")

In [29]:
Counter(dataset_full["category"])

Counter({'경제': 119417,
         'shatgpt': 65634,
         '법률': 50264,
         'etc': 48388,
         '구어체_대화체': 1000,
         '재경·경제일반': 961,
         'food': 946,
         'ele_sci': 903,
         '환경·노동': 649,
         '상업·무역·공업': 623,
         '주택·건축·도로': 535,
         'human_art': 412,
         '농업·수산': 388,
         '육운·항공·관광': 358,
         '민사법': 353,
         'ifrs esg': 171,
         '정보통신': 169})

In [27]:
dataset_full_df=dataset_full.to_pandas()

In [57]:
a=dataset_full_df[dataset_full_df["category"]=="human_art"].sample(frac=1).iloc[:100]
b=dataset_full_df[dataset_full_df["category"]=="food"].sample(frac=1).iloc[:100]
c=dataset_full_df[dataset_full_df["category"]=="ele_sci"].sample(frac=1).iloc[:100]
d=dataset_full_df[dataset_full_df["category"]=="구어체 대화체"].sample(frac=1).iloc[:100]
e=dataset_full_df[dataset_full_df["category"]=="etc"].sample(frac=1).iloc[:100]

sampled_df=pd.concat([a,b,c,d,e]).sample(frac=1).reset_index(drop=True)
sampled_df["length"]=np.nan
# sampled_ds=Dataset.from_pandas(sampled_df)

dataset_new_1poipd.concat([dataset_new_1k_df,sampled_df]).sample(frac=1).reset_index(drop=True)

Unnamed: 0,english,korean,category,term_dict,id,src,tgt,length_group,length
321,The most common types of loans with errors wer...,"오류가 발생한 대출 유형은 개인사업자 대출이 200건으로 가장 많았고, 이어 가계대...",경제,{'오류가 발생한 대출 유형': 'types of loans with errors'...,450035.0,korean,english,,
537,The actuator 260 receives the fourth actuator ...,"액추에이터(260)는 프로세서(210)로부터 제4 액추에이터 신호를 수신하고, 터치...",etc,,1248504.0,english,korean,,
393,All explanatory variables are the levels obser...,"모든 설명 변수는 기준 연도에 관찰된 수준이고 TFP, 상용 근로자 수(L), 매출...",경제,,1260517.0,english,korean,,
895,As Icarus does not itself contain a genetic ma...,Icarus 자체에는 유전자 표지자가 포함되어 있지 않기 때문에 snw 검사를 통해...,food,,1336861.0,english,korean,,
611,Have suggested suicide and gone missing on the...,신재민 전 기획재정부 사무관이 3일 오전 극단적 선택을 암시하고 잠적한 뒤 이날 1...,etc,,1216738.0,english,korean,,
...,...,...,...,...,...,...,...,...,...
395,"Naju-Geuman Hanok Village, Jeollanam-do; A vil...",전남 나주-금안한옥마을 전라남도 나주시의 명산 금성산을 등지고 있는 곳에 부지면적 ...,human_art,,1268643.0,english,korean,2.0,
396,"As a result, the fact that Hokusai played a de...",그 결과 ≪후지산 36경≫ 연작을 기점으로 호쿠사이가 풍경 우키요에의 등장은 물론 ...,human_art,"{'Mt. Fuji 36 Views': '후지산 36경', 'Hokusai': '호...",1224661.0,english,korean,2.0,
397,The remaining membranes bearing rPt-API-1 were...,rPt-API-1을 포함하는 나머지 막을 파렐라포스트롱길루스 테누이스에 감염된 흰꼬...,food,"{'rPt-API-1': 'rPt-API-1', 'remaining membrane...",1284583.0,english,korean,1.0,
398,The N95 mask supported by Adidas is a first-cl...,아디다스가 지원하는 N95 마스크는 미세입자를 95%까지 걸러내는 1급 방진 마스크다.,etc,,1252300.0,english,korean,0.0,


In [56]:
sampled_df

Unnamed: 0,english,korean,category,src,tgt,id,length_group,term_dict,length
0,The signal-to-noise ratio was calculated after...,"신호대 잡음 비는 0~15dB 까지 3dB 단위로 변화시키면서 20,000개 의 1...",ele_sci,english,korean,1324122,1,"{'signal-to-noise ratio': '신호대 잡음 비', 'varying...",
1,Sanggeum Lee was born in 1907 and died in 1950...,이상 금은 1907년에 태어나 1950년 한국전쟁 중에 사망하였다.,human_art,english,korean,1236263,0,"{'Sanggeum Lee': '이상 금', '1907': '1907년', '195...",
2,"The reason ramen, biscuits, biscuits, etc. do ...","라면, 비스킷, 건빵류 등이 알파 형태로 존재하면서도 오랫동안 두어도 노화가 잘 일...",ele_sci,english,korean,1221735,2,"{'ramen': '라면', 'biscuits': '건빵류', 'alpha form...",
3,"Nevertheless, GTI has maintained its instituti...","그러나 GTI는 지난 25년 동안 미국과 중국이 패권경쟁을 벌이고, 한중일 3국이 ...",human_art,english,korean,1297164,2,,
4,"Just a week earlier, on November 23, Schleiche...","슐라이허와 히틀러는 바로 1주일 전인 11월 23일에 만났고, 슐라이허는 히틀러에게...",human_art,english,korean,1292845,1,"{'Schleicher': '슐라이허', 'Hitler': '히틀러', 'a wee...",
...,...,...,...,...,...,...,...,...,...
395,The in vitro formation of an intermediate of r...,Rif-Orf20 및 아세틸-CoA를 사용하는 효소 분석 동안 리파마이신 생합성 중...,food,english,korean,1275111,2,,
396,Transit peptides mediate import by interacting...,전이 펩티드는 GTP 결합 단백질 Toc159와 상호작용한 다음 Toc 34와 상호...,food,english,korean,1223516,1,,
397,No rain fell on those plants dug on 25 October...,1966년 10월 25일과 1968년 10월 25일에 파낸 식물에는 비가 내리지 않...,food,english,korean,1220811,2,"{'25 October 1966': '1966년 10월 25일', '1968': '...",
398,"Under the theme of ""Fantastic Water Travel,"" 4...",'환상적인 물의 여행'을 테마로 16개 테마존에서 650여 종 4만여 마리의 수중생...,etc,english,korean,1336979,2,"{'Fantastic Water Travel': '환상적인 물의 여행', '16 t...",


### Datase 1.1k refined manually

In [3]:
ds=Dataset.load_from_disk("/azurestorage/data/translation_data/aligned_dataset/prepared_for_training/training_dataset_1.1k_refined/")

In [27]:
ds_df=ds.to_pandas()

In [5]:
ds7k=Dataset.load_from_disk("/azurestorage/data/translation_data/aligned_dataset/prepared_for_training/training_dataset_7k/")

In [7]:
ds7k=ds7k.filter(lambda x:x["category"]!="shatgpt")

Filter:   0%|          | 0/6944 [00:00<?, ? examples/s]

In [15]:
Counter(ds7k["category"])

Counter({'경제': 4927,
         'etc': 1393,
         None: 277,
         'food': 29,
         'pilar': 21,
         'ele_sci': 20,
         '구어체_대화체': 14,
         'human_art': 7,
         '법률': 5,
         'ifrs esg': 3,
         '육운·항공·관광': 1,
         '재경·경제일반': 1})

In [10]:
ds7k_df=ds7k.to_pandas()

In [14]:
ds7k_df["category"]=ds7k_df["category"].fillna("accounting_etc")

In [23]:
ds7k_df=ds7k_df.drop([512,538,589,1139,1399,1590,1671,1916,2205,2591,2889,2944,3068,3196,3331,3576,3642,3785,4182,4278],axis=0)

In [44]:
df_new7k=pd.concat([ds_df,ds7k_df]).reset_index(drop=True)

In [45]:
df_new7k=df_new7k.drop_duplicates(subset=["english","korean"]).reset_index(drop=True).drop("__index_level_0__",axis=1)

In [51]:
ds_new7k=Dataset.from_pandas(df_new7k)

In [53]:
ds_new7k.save_to_disk("/azurestorage/data/translation_data/aligned_dataset/prepared_for_training/training_dataset_new_7k/")

Saving the dataset (0/1 shards):   0%|          | 0/6925 [00:00<?, ? examples/s]