In [1]:
from typing import Dict, Optional
from datasets import Dataset, load_dataset, concatenate_datasets
import os
import glob
import json
import copy
import threading
import time
import random
from typing import Dict, Optional
import requests
import numpy as np
import re

import fasttext
lang_detect = fasttext.load_model('../fastchat/modules/fasttext/lid.176.bin')

from fastchat.modules.answer_refiner import generate_refiner

import chromadb
from chromadb.config import Settings
import random
from fastchat.modules.embedder_adapter import Embedder, get_embedder
from fastchat.conversation import (
    SeparatorStyle,
)
from fastchat.model.model_adapter import get_conversation_template
import copy
from fastchat.train.data_modules.sft_dataset import load_sft_dataset, combine_dataset
from fastchat.train.data_modules.dedup import (
    dedup_by_similarity,
    dedup_non_pair,
    dedup_repetition,
    dedup_math,
    dedup_too_much_token,
    dedup_short,
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the `split` split of the "Anthropic/hh-rlhf" dataset through `datasets.load_dataset`.
# NOTE(review): `dataset` and `split` are rebound by several later cells, so the value
# seen downstream depends on cell execution order.
split = 'train'
cache_dir = None  # forwarded unchanged as load_dataset's `cache_dir` argument
dataset = load_dataset("Anthropic/hh-rlhf", split=split, cache_dir=cache_dir)

In [None]:
from typing import Dict, Optional
from datasets import Dataset, load_dataset

from fastchat.model.model_adapter import get_conversation_template

def extract_anthropic_prompt(prompt_and_response, search_term="\n\nAssistant:"):
    """Return the leading part of the text, up to and including the last ``search_term``.

    Args:
        prompt_and_response: Full conversation text (prompt followed by the final response).
        search_term: Marker that introduces the final response.

    Returns:
        The prefix of ``prompt_and_response`` that ends with the last occurrence of
        ``search_term``.

    Raises:
        AssertionError: If ``search_term`` does not occur in ``prompt_and_response``.
    """
    marker_at = prompt_and_response.rfind(search_term)
    # str.rfind reports a miss as -1; every hit is a non-negative offset.
    marker_found = marker_at >= 0
    assert marker_found, f"Prompt and response does not contain '{search_term}'"
    prompt_end = marker_at + len(search_term)
    return prompt_and_response[:prompt_end]

class hankang_DPODataset:
    """Convert a chat-style preference dataset into ``prompt`` / ``chosen`` / ``rejected`` text.

    Every record is read through its ``chosen`` and ``rejected`` fields, each a list of
    turns carrying ``role`` and ``content_kr`` keys. Turns are rendered with the
    conversation template named by ``data_format`` and then split at the last
    occurrence of ``search_term``.

    Args:
        dataset_path: Path or name handed to ``load_dataset``; it must expose ``train``
            and ``test`` splits.
        data_format: Conversation template name passed to ``get_conversation_template``.
        search_term: Marker that separates the prompt from the final assistant response.
        num_train: Optional cap on the number of training rows (``None`` keeps all).
        num_eval: Optional cap on the number of evaluation rows (``None`` keeps all).
    """

    def __init__(
        self, 
        dataset_path="/data/llm_datasets/Ultrafeedback_binarized.ko.hankang/",
        data_format='chat-orca',
        search_term='\n\n### Assistant:',
        num_train=None,
        num_eval=None,
    ):
        self.dataset_path = dataset_path
        self.data_format = data_format
        self.search_term = search_term
        self.num_train = num_train
        self.num_eval = num_eval

    def get_prompt_and_response(self, data):
        """Render a list of turns into a single prompt string.

        Turns are expected to alternate ``user`` / ``assistant`` starting with ``user``.
        A turn that breaks the alternation is skipped with a printed warning; empty or
        unpaired conversations also only produce a warning.

        Args:
            data: Iterable of dicts with ``role`` and ``content_kr`` keys.

        Returns:
            The string produced by the conversation template's ``get_prompt()``.
        """
        conv = get_conversation_template(self.data_format)

        for idx, turn in enumerate(data):
            role = turn['role']
            content = turn['content_kr']
            if idx % 2 == 0 and role == 'user':
                conv.append_message(conv.roles[0], content)
            elif idx % 2 == 1 and role == 'assistant':
                conv.append_message(conv.roles[1], content)
            else:
                print("Warning: data type invaild")

        if len(conv.messages) == 0:
            print("Warning: data is empty")
        if len(conv.messages) % 2 != 0:
            print("Warning: data has weird pair")

        return conv.get_prompt()

    @staticmethod
    def _take_first(dataset, limit):
        """Return the first ``limit`` rows of ``dataset`` (all rows when ``limit`` is None).

        The cap is clamped to the length of *this* dataset, so asking for more rows
        than exist never produces out-of-range indices.
        """
        if limit is None:
            return dataset
        return dataset.select(range(min(len(dataset), limit)))

    def make_dpo_data_module(self):
        """Load the dataset and build the train/eval splits used for DPO.

        Records whose rendered text lacks ``search_term`` are filtered out; the rest
        are mapped to ``prompt``, ``chosen`` and ``rejected`` strings and the original
        columns are dropped.

        Returns:
            ``dict(train_dataset=..., eval_dataset=...)``.
        """
        def split_prompt_and_responses(data) -> Dict[str, str]:
            chosen_text = self.get_prompt_and_response(data['chosen'])
            rejected_text = self.get_prompt_and_response(data['rejected'])
            prompt = extract_anthropic_prompt(chosen_text, self.search_term)
            prompt_rejected = extract_anthropic_prompt(rejected_text, self.search_term)
            # The shared "prompt" column comes from the chosen conversation; each
            # response is cut after its own prompt prefix.
            return {
                "prompt": prompt,
                "chosen": chosen_text[len(prompt):],
                "rejected": rejected_text[len(prompt_rejected):],
            }

        def validate_prompt_and_responses(data) -> bool:
            # extract_anthropic_prompt signals a missing marker with AssertionError.
            try:
                split_prompt_and_responses(data)
            except AssertionError:
                return False
            return True

        dataset = load_dataset(self.dataset_path)

        train_dataset = dataset['train']
        eval_dataset = dataset['test']

        original_columns = list(train_dataset.features.keys())

        # Fix: the eval cap used to be clamped with len(train_dataset), which selects
        # out-of-range rows whenever the eval split is shorter than num_eval.
        train_dataset = self._take_first(train_dataset, self.num_train)
        eval_dataset = self._take_first(eval_dataset, self.num_eval)

        train_dataset = train_dataset.filter(validate_prompt_and_responses)
        train_dataset = train_dataset.map(split_prompt_and_responses, remove_columns=original_columns)

        eval_dataset = eval_dataset.filter(validate_prompt_and_responses)
        eval_dataset = eval_dataset.map(split_prompt_and_responses, remove_columns=original_columns)

        return dict(train_dataset=train_dataset, eval_dataset=eval_dataset)

In [50]:
from fastchat.train.data_modules.dpo_dataset import hankang_DPODataset

dpo_dataset = hankang_DPODataset()
dpo_datamodule = dpo_dataset.make_dpo_data_module()

In [56]:
dpo_datamodule['train_dataset'][0]

{'rejected': ' 1. 수영: 심혈관 건강과 지구력, 유연성 및 근력을 증진하는 재미있고 상쾌한 활동입니다. 아이들이 적절한 수영복을 착용하고 물에서 가까운 곳에서 감독을 철저히 해야 합니다.\n2. 하이킹: 자연을 탐험하고 균형 감각과 조정 능력을 향상시키며 지역 동식물에 대해 배울 수 있는 좋은 기회입니다. 튼튼한 신발과 충분한 물, 간식을 준비하세요\n3. 가드닝: 호기심과 자연에 대한 감사함을 키우고 신체 활동 및 건강한 식습관을 장려하는 풍요로운 활동입니다. 땅(정원 또는 화분)과 씨앗/식물, 도구 등을 제공합니다\n4. 요리 수업: 요리 기술, 창의력 및 건강한 식습관을 개발할 수 있는 훌륭한 방법입니다. 어린이들은 책임감 있는 성인이 감독하고 적절한 주방 공간(가정이나 정원)에 접근할 수 있어야 합니다\n5 . 예술 및 공예 : 상상력과 표현력을 자극하는 동시에 소근육 운동 능력과 눈썰미를 향상시키는 다재다능한 활동입니다.. 페인트, 마커, 종이 등 미술 재료와 안전한 작업 공간을 제공합니다 \n6 . 팀 스포츠 : 체력 단련, 사회성 발달 및 스포츠맨십을 촉진하는 사교적인 팀 기반 활동입니다.. 적절한 장비와 감독을 제공하고 연령에 적합한 규칙과 기술 수준을 고려하세요 ^^7 . 요가 : 유연성과 균형 감각을 향상시키고 정신 건강을 증진하는 온화한 마음의 휴식 시간 입니다.. 편안한 매트와 숙련된 강사를 제공하며 이상적으로 아이들과 함께 일하는 강사가 좋습니다 ^^(영어).8 댄스 : 음악이 흐르는 동안 아이들의 리듬감각, 조정능력 그리고 자신감을 길러주는 재미있는 놀이활동 입니다... 연령에 맞는 음악과 움직임을 제공하고 이동하기 편안하고 정리 정돈이 잘 된 공간에서 책임감 있는 어른이 감독하도록 하세요 ^^(영어).9 무술: 힘 , 조정 , 집중력을 기르는 훈련 방식이다 ... 편안한 옷 과 장비를 제공하고 어린이들과 함께 일하는 경험이 풍부 한 전문 강사를 선택하십시오 (영어).10 달리기 : 지구력이 증가하고 심장과 폐 기능을 개

In [None]:
# Cell-level scratch version of the similarity de-duplication (a function named
# `dedup_by_similarity` is also imported at the top of the notebook); kept unrolled
# so the intermediate lists can be inspected afterwards.
# NOTE(review): relies on `train_dataset` already existing in the kernel.
_dataset = train_dataset
prompt_template = 'vicuna'
target_text_len = 100       # only the first N characters of each question are embedded
n_results = 100             # neighbours requested per query
distance_threshold = 0.35   # cosine distance below which two questions count as duplicates

if prompt_template == 'chat-orca':
    conv = get_conversation_template(prompt_template)
    system_message = conv.system_message
    sep_style = conv.sep_style
    sep = conv.sep
    prompt_user, prompt_bot = conv.roles

    # presumably accounts for the ':' appended after role names — TODO confirm
    len_sep_style = 0
    if sep_style == SeparatorStyle.ADD_COLON_TWO:
        len_sep_style = 1

    # Character counts of the template text before and after the user question.
    len_front = len(system_message) + len(sep) + len(prompt_user) + len_sep_style + 1
    len_rear = len(sep) + len(prompt_bot) + len_sep_style

    def filter_question(data):
        """Strip the template wrapper from `prompt` and keep its first characters."""
        return {
            'prompt': data['prompt'][len_front:-len_rear][:target_text_len]
        }
elif prompt_template == 'vicuna':
    def filter_question(data):
        """Use the first characters of the first conversation turn as the question."""
        return {
            'prompt': data['conversations'][0]['value'][:target_text_len]
        }
else:
    # Previously this fell through and failed later with a NameError on filter_question.
    raise ValueError(f"Unsupported prompt_template: {prompt_template!r}")

question_dataset = _dataset.map(filter_question)

chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
embedder = get_embedder("ddobokki/klue-roberta-base-nli-sts-ko-en")
collection = chroma_client.create_collection(name="context", embedding_function=embedder.embed, metadata={"hnsw:space": "cosine"})
ids = []
# Index every question under the id "id<row index>".
texts = question_dataset['prompt']
last_id = -1
new_ids = [f"id{i+last_id+1}" for i in range(len(texts))]
ids += new_ids
collection.add(documents=texts, ids=new_ids)

query_ids = list(new_ids)   # ids still waiting to be selected or discarded
selected_ids = []           # ids kept as representatives
duplicated_ids = []         # every id removed from the vector store (representatives included)

weird_ids = []              # ids drawn again after having been selected
error_ids = []              # ids whose own entry was not returned below the threshold
remove_ids = []             # defined up front so the final report works even if the loop never runs


def _dedup_status():
    """Format the running counters shown while de-duplicating."""
    return (f"Total:{len(new_ids)} Selected:{len(selected_ids)} current_dup:{len(remove_ids)} "
            f"vector_store:{collection.count()} remained:{len(query_ids)} total_dup:{len(duplicated_ids)}")


while query_ids:
    current_id = random.choice(query_ids)
    if current_id in selected_ids:
        # Defensive guard: drop the id so the loop cannot spin on it forever.
        print("Warning: this is weird..")
        weird_ids.append(current_id)
        query_ids.remove(current_id)
        continue
    selected_ids.append(current_id)
    search_strings = [texts[int(current_id[2:])]]
    if collection.count() == 0:
        print("Warning: collection is empty. Forced break")
        break
    result = collection.query(query_texts=search_strings, n_results=min(n_results, len(query_ids)), include=['distances']) #'documents'

    remove_ids = [
        sid
        for sid, dist in zip(result['ids'][0], result['distances'][0])
        if dist < distance_threshold
    ]

    if current_id not in remove_ids:
        # Fix: the queried id itself must always leave the pool. Before, it stayed in
        # both `query_ids` and the collection whenever the search did not return it,
        # which could make the loop hit the "weird" branch indefinitely.
        print("Warning: this is error..")
        error_ids.append(current_id)
        remove_ids.append(current_id)

    pending = set(remove_ids)
    query_ids = [qid for qid in query_ids if qid not in pending]

    duplicated_ids += remove_ids
    collection.delete(ids=remove_ids)

    print(_dedup_status(), '\t\t\t\t\t', end='\r')

print('finished dedup data:', _dedup_status())

# Sorted so the surviving rows keep their original relative order
# (iterating a set gave an arbitrary order).
selected_ids = sorted(int(sid[2:]) for sid in set(selected_ids))

_dataset = _dataset.select(selected_ids)

In [None]:
# First variant of the SFT source lists, grouped by task.
# NOTE(review): `qna_list`, `summary_list`, `translation_list` and `dataset_list` are
# rebound by the "dedup2" and "refine" cells further down; whichever of these cells
# ran last decides what the loading cells see.
qna_list = [
    "/data/llm_datasets/custom/vicuna_format/gpt_evol_1.3k-vicuna.json",
    "/data/llm_datasets/custom/vicuna_format/koalpaca_v1.1-vicuna.json",
    "/data/llm_datasets/custom/deduped/alpaca-gpt4-korean_dedup/",
    "/data/llm_datasets/custom/vicuna_format/korquad-chat-vicuna.json",
    "/data/llm_datasets/custom/vicuna_format/wizardlm_orca_vicuna.json",
    "/data/llm_datasets/sharegpt_gpt4/sharegpt_gpt4.jsonl",
    "/data/llm_datasets/custom/vicuna_format/sharegpt_V3_format_others.json",
    "/data/llm_datasets/custom/deduped/sharegpt_V3_format_ko_selected.json",
    "/data/llm_datasets/custom/vicuna_format/lima_vicuna_format_ko.json",
]

correction_list = [
    "/data/llm_datasets/custom/vicuna_format/KoreaSpellingCorrection/"
]

summary_list = [
    "/data/llm_datasets/custom/deduped/aihub_summary_data_tech_dedup/",
    "/data/llm_datasets/aihub_summary_data/도서/",
    "/data/llm_datasets/aihub_summary_data/법률/",
    "/data/llm_datasets/custom/deduped/naver-news-summarization-ko-vicuna_dedup/",
    
]

translation_list = [
    "/data/llm_datasets/custom/vicuna_format/sharegpt_V3_format_translation(enko).json",
    "/data/llm_datasets/custom/vicuna_format/sharegpt_V3_format_translation(koen).json",
]


# Only this variant includes the spelling-correction set in the combined list.
dataset_list = qna_list + correction_list + summary_list + translation_list

In [None]:
# "dedup2" variant of the SFT source lists: several entries point at files under
# custom/deduped2/ instead of the earlier locations.
# NOTE(review): rebinds the same names as the other list cells; run exactly one of them.
qna_list = [
    "/data/llm_datasets/custom/vicuna_format/gpt_evol_1.3k-vicuna.json",
    "/data/llm_datasets/custom/vicuna_format/koalpaca_v1.1-vicuna.json",
    "/data/llm_datasets/custom/deduped2/alpaca-gpt4-korean_dedup2.json",
    "/data/llm_datasets/custom/vicuna_format/korquad-chat-vicuna.json",
    "/data/llm_datasets/custom/deduped2/wizardlm_orca_vicuna_dedup2.json",
    "/data/llm_datasets/sharegpt_gpt4/sharegpt_gpt4.jsonl",#
    "/data/llm_datasets/custom/vicuna_format/sharegpt_V3_format_others.json",#
    "/data/llm_datasets/custom/deduped2/sharegpt_V3_format_ko_selected_dedup2.json",
    "/data/llm_datasets/custom/deduped2/lima_vicuna_format_ko.json",
]

# Spelling-correction data is disabled in this variant (it is also left out of dataset_list below).
# correction_list = [
#     "/data/llm_datasets/custom/deduped2/KoreaSpellingCorrection-10000.json",
# ]

summary_list = [
    "/data/llm_datasets/custom/deduped2/aihub_summary_data_tech_dedup-5000.json",
    "/data/llm_datasets/custom/deduped2/aihub_summary_data_book-5000.json",
    "/data/llm_datasets/custom/deduped2/aihub_summary_data_law-5000.json",
    "/data/llm_datasets/custom/deduped2/naver-news-summarization-ko-vicuna_dedup-5000.json",
    
]

translation_list = [
    "/data/llm_datasets/custom/deduped2/sharegpt_V3_format_translation(enko)-10000.json",
    "/data/llm_datasets/custom/deduped2/sharegpt_V3_format_translation(koen)-10000.json",
]


dataset_list = qna_list + summary_list + translation_list

In [99]:
# "refine" variant of the SFT source lists: the Q&A entries that were under
# custom/deduped2/ now point at custom/refined/, and sharegpt_gpt4 is read from the
# vicuna_format folder. Summary and translation lists match the "dedup2" variant.
# NOTE(review): rebinds the same names as the other list cells; run exactly one of them.
qna_list = [
    "/data/llm_datasets/custom/vicuna_format/gpt_evol_1.3k-vicuna.json",
    "/data/llm_datasets/custom/vicuna_format/koalpaca_v1.1-vicuna.json",
    "/data/llm_datasets/custom/refined/alpaca-gpt4-korean_dedup2.json",
    "/data/llm_datasets/custom/vicuna_format/korquad-chat-vicuna.json",
    "/data/llm_datasets/custom/refined/wizardlm_orca_vicuna_dedup2.json",
    "/data/llm_datasets/custom/vicuna_format/sharegpt_gpt4.json",#
    "/data/llm_datasets/custom/vicuna_format/sharegpt_V3_format_others.json",#
    "/data/llm_datasets/custom/refined/sharegpt_V3_format_ko_selected_dedup2.json",
    "/data/llm_datasets/custom/refined/lima_vicuna_format_ko.json",
]

# Spelling-correction data is disabled in this variant (it is also left out of dataset_list below).
# correction_list = [
#     "/data/llm_datasets/custom/deduped2/KoreaSpellingCorrection-10000.json",
# ]

summary_list = [
    "/data/llm_datasets/custom/deduped2/aihub_summary_data_tech_dedup-5000.json",
    "/data/llm_datasets/custom/deduped2/aihub_summary_data_book-5000.json",
    "/data/llm_datasets/custom/deduped2/aihub_summary_data_law-5000.json",
    "/data/llm_datasets/custom/deduped2/naver-news-summarization-ko-vicuna_dedup-5000.json",
    
]

translation_list = [
    "/data/llm_datasets/custom/deduped2/sharegpt_V3_format_translation(enko)-10000.json",
    "/data/llm_datasets/custom/deduped2/sharegpt_V3_format_translation(koen)-10000.json",
]


dataset_list = qna_list + summary_list + translation_list

In [98]:
# DPO (preference) sources, version 2.
# `dpo_list` mixes one parquet file with two directories.
dpo_list = [
    "/data/llm_datasets/ultrafeedback_binarized/data/train_prefs-00000-of-00001-17309c769bfe5733.parquet",
    "/data/llm_datasets/orca_dpo_pairs/",
    "/data/llm_datasets/distilabel-math-preference-dpo/data/",
]

# JSON files under custom/kodpo/untranslated/ whose names mirror the three sources above.
# NOTE(review): how they relate to `dpo_list` is not shown in this notebook.
dpo_list2 = [
    "/data/llm_datasets/custom/kodpo/untranslated/ultrafeedback_binarized.json",
    "/data/llm_datasets/custom/kodpo/untranslated/orca_dpo_pairs.json",
    "/data/llm_datasets/custom/kodpo/untranslated/distilabel-math-preference-dpo.json",
]


In [45]:
def infer_dpo_builder(dataset_path):
    """Map a path suffix to the ``load_dataset`` builder name, or ``None`` if it has no known suffix.

    ``json`` and ``jsonl`` both map to the ``"json"`` builder; ``parquet`` maps to
    ``"parquet"``. Anything else (for example a directory path) yields ``None``.
    """
    if dataset_path.endswith(("json", "jsonl")):
        return "json"
    if dataset_path.endswith("parquet"):
        return "parquet"
    return None


def load_dpo_dataset(dataset_path, split='train'):
    """Load a preference dataset from a single file or from a dataset directory/name.

    Args:
        dataset_path: A ``.json`` / ``.jsonl`` / ``.parquet`` file, or anything else
            ``load_dataset`` accepts as its first argument.
        split: Split to return; ``None`` is forwarded unchanged to ``load_dataset``.

    Returns:
        Whatever ``load_dataset`` returns for the given arguments.
    """
    builder = infer_dpo_builder(dataset_path)
    if builder is None:
        return load_dataset(dataset_path, split=split)
    # Single files are loaded through the matching builder. `.jsonl` used to fall
    # through to the branch above, which treats the path as a dataset name.
    return load_dataset(builder, data_files=dataset_path, split=split)

dataset = load_dpo_dataset(dpo_list[2])

In [None]:
# Print every dataset path wrapped in double quotes, separated by single spaces
# (presumably for pasting as command-line arguments — TODO confirm).
new_dataset_list = ['"' + d + '"' for d in dataset_list]
print(' '.join(new_dataset_list))

In [None]:
dataset_path = dataset_list[7]
print(dataset_path)
dataset_train = load_sft_dataset(dataset_path)
dataset_train

In [100]:
# from fastchat.train.data_modules.sft_dataset import load_sft_dataset, combine_dataset
combined_dataset = concatenate_datasets([load_sft_dataset(dataset_path) for dataset_path in dataset_list])
combined_dataset.features

{'id': Value(dtype='string', id=None),
 'conversations': [{'from': Value(dtype='string', id=None),
   'value': Value(dtype='string', id=None)}],
 'instruction': Value(dtype='string', id=None),
 'task_name': Value(dtype='string', id=None),
 'system': Value(dtype='string', id=None),
 'task': Value(dtype='string', id=None)}

In [151]:
# Materialise every row of `train_dataset` into a plain Python list.
new_dataset = list(train_dataset)


In [156]:
# combined_dataset.to_json("/data/llm_datasets/custom/ados_sft_v4.json")
with open("/data/llm_datasets/custom/ados_sft_v4.1.json", "w") as json_file:
    json.dump(new_dataset, json_file)

In [136]:
""" find odd code blocks"""
# Scan every non-human turn for ``` fences and flag conversations where
#   (a) an opening fence is followed by an info string outside `available_code_prefixes`, or
#   (b) a turn contains an odd number of fences.
# NOTE(review): `available_code_prefixes` is defined in a cell further down the
# notebook; that cell has to run first.
dataset_path = "/data/llm_datasets/custom/ados_sft_v4.json"
dataset = load_sft_dataset(dataset_path, split=None)
train_dataset = dataset['train']

code_prefixes = []      # every rejected info string, repeats included
odd_dataset = []        # (info string, record) pairs for case (a)
oddd_dataset = []       # ('odd', record) pairs for case (b)
odd_idxs = set()        # row indices flagged by either case
normal_dataset = []     # records that passed and whose last scanned turn contained a fence
num_normal = 0          # fix: was incremented below without being initialised in this cell
flag_normal = True
flag_code = False
for idx, data in enumerate(train_dataset):
    conversations = data['conversations']
    flag_normal = True
    for conv in conversations:
        _from = conv['from']
        if _from == 'human':
            continue
        _value = conv['value']
        # NOTE(review): reset per turn, so after the loop it reflects only the last
        # scanned turn — confirm that "any turn has code" was not intended.
        flag_code = False
        temp_num = 0
        for fidx, ftext in enumerate(re.finditer('```', _value)):
            flag_code = True
            start_index = ftext.start() + 3
            # TODO: handle the case where a space directly follows the fence.
            # The info string is whatever follows the fence up to the end of the line.
            candidate = _value[start_index:].split('\n', 1)[0]
            # Even-numbered fences (0, 2, ...) are treated as opening fences.
            if fidx % 2 == 0 and '```' not in candidate and candidate not in available_code_prefixes:
                odd_dataset.append((candidate, data))
                code_prefixes.append(candidate)
                odd_idxs.add(idx)
                flag_normal = False
                break
            temp_num += 1
        if temp_num % 2 != 0:
            oddd_dataset.append(('odd', data))
            odd_idxs.add(idx)
            flag_normal = False

        if not flag_normal:
            break

    if flag_code and flag_normal:
        normal_dataset.append(data)
        num_normal += 1

print(len(train_dataset), len(normal_dataset), len(odd_dataset), len(oddd_dataset), len(odd_idxs))

165129 19379 27 21 48


In [146]:
odd_dataset[4]

('python ',
 {'system': None,
  'task_name': None,
  'conversations': [{'from': 'human',
    'value': '주어진 배열을 역순으로 정렬하는 Python 스크립트를 만드세요.\n'},
   {'from': 'gpt',
    'value': '주어진 배열을 역순으로 정렬하는 파이썬 스크립트는 다음과 같습니다:\n\n```python \narray = [10, 2, 5, -4, 92, 101]\narray.sort(reverse=True)\nprint(array)\n```\n\n결과는 다음과 같습니다:\n```python\n[101, 92, 10, 5, 2, -4]\n```\n\n이 스크립트는 리스트의 `sort()` 메소드를 사용하여 배열을 오름차순으로 정렬하며, `reverse` 매개 변수를 `True`로 설정하여 정렬된 리스트의 순서를 뒤집습니다.'}],
  'task': None,
  'instruction': None,
  'id': '35752'})

In [147]:
# Keep every row whose index was not flagged in `odd_idxs`.
# A single filtered pass replaces repeated list.remove() calls, each of which
# rescanned the whole index list and raised ValueError for an index that was
# already gone (for example when the cell is re-run on the shrunken dataset).
# NOTE(review): this rebinds `train_dataset`, so re-running the cell with stale
# `odd_idxs` drops different rows.
selected_idxs = [row_idx for row_idx in range(len(train_dataset)) if row_idx not in odd_idxs]
train_dataset = train_dataset.select(selected_idxs)

In [157]:
load_sft_dataset("/data/llm_datasets/custom/ados_sft_v4.1.json", split=None)

Downloading data files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 4848.91it/s]
Extracting data files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1036.65it/s]
Generating train split: 165081 examples [00:20, 8243.63 examples/s]


DatasetDict({
    train: Dataset({
        features: ['system', 'task_name', 'conversations', 'task', 'instruction', 'id'],
        num_rows: 165081
    })
})

In [127]:
code_prefixes = set(code_prefixes)
code_prefixes

{' ',
 '                      ',
 '  _|←_cW_→_|_↓_',
 ' - This is similar to the first pattern, but uses the shorthand character class \\d to represent any digit.',
 ' - This pattern includes word boundaries (\\b) at the beginning and end, which means it will only match strings that contain a single lowercase hexadecimal word, separated by spaces or other non-word characters. ',
 ' - This pattern matches any alphanumeric word, including uppercase letters and non-hexadecimal characters. ',
 ' - This pattern matches any string that consists only of lowercase letters a-f, but does not require the string to be a valid hexadecimal word.',
 ' - This pattern matches uppercase hexadecimal words, not lowercase.',
 ' - 참조용으로 MSDN을 참조하세요.',
 ' command is used to make the book title bold for emphasis.',
 ' command. ',
 ' find -type l |',
 ' find -type l | while IFS= read -r lnkname; do if [ "$(readlink \'$lnkname\')" == "/your/exact/path" ]; then rm -- "$lnkname"; fi; done',
 ' package, you can us

In [125]:
available_code_prefixes = set([
    '',
    'CSS',
    'HTML',
    'JavaScript',
    'Python',
    'SQL',
    'bash',
    'c',
    'c++',
    'cpp',
    'csharp',
    'css',
    'for',
    'html',
    'java',
    'javascript',
    'js',
    'json',
    'php',
    'python',
    'ruby',
    'sass',
    'scss',
    'sql',
    'sum',
    'svg',
    'swift',
    'xml',
    'yaml',
    'C#',
    'C++',
    'CSS',
    'Go',
    'HTML',
    'Java',
    'LaTeX',
    'MATLAB',
    'Markdown',
    'Proposals',
    'Python',
    'R',
    'SELECT',
    'SQL',
    'Swift',
    'VBA',
    'echo',
    'excel-vba',
    'find',
    'go',
    'gpg',
    'jsx',
    'kotlin',
    'latex',
    'markdown',
    'math',
    'matlab',
    'meditation',
    'mermaid',
    'mutt',
    'nano',
    'r',
    'rust',
    'scala',
    'sh',
    'shell',
    'sudo',
    'xpath',
    '{r}',
    '.',
    '$',
    'curl',
    'xslt',
    'Apex',
    'DAX',
    'Dockerfile',
    'apex',
    'applescript',
    'arduino',
    'asm',
    'assembly',
    'astro',
    'autoit',
    'batch',
    'bicep',
    'blade',
    'cmake',
    'cmd',
    'coffee',
    'coffeescript',
    'cql',
    'csv',
    'cypher',
    'dart',
    'delphi',
    'diff',
    'dockerfile',
    'dot',
    'emacs',
    'erb',
    'fsharp',
    'glsl',
    'gradle',
    'graphql',
    'graphviz',
    'groovy',
    'haskell',
    'hcl',
    'hlsl',
    'html+erb',
    'ini',
    'jinja',
    'ladder',
    'lasso',
    'less',
    'lisp',
    'lldb',
    'llvm',
    'logo',
    'lua',
    'makefile',
    'mathematica',
    'metal',
    'nginx',
    'nix',
    'objc',
    'objective',
    'objectivec',
    'pascal',
    'perl',
    'plaintext',
    'plantuml',
    'powershell',
    'prisma',
    'properties',
    'proto',
    'protobuf',
    'py',
    'reg',
    'rego',
    'scheme',
    'scratch',
    'solidity',
    'spss',
    'stata',
    'stencil',
    'terraform',
    'toml',
    'ts',
    'tsx',
    'txt',
    'typescript',
    'vb',
    'vba',
    'vbnet',
    'verilog',
    'yml',
    'jsp',
    'prolog',
    'razor',
    'CMD',
    'G',
    'GraphQL',
    'Makefile',
    'apache',
    'c#',
    'cython',
    'elixir',
    'jinja2',
    'julia',
    'ocaml',
    'systemverilog',
    'vbscript',
    'vhdl',
    'vue',
    'wasm',
    'wolfram',
    'zsh',
    'regex',
    ' Java',
    ' Python',
    ' c++',
    ' python',
    ' java',
     ' css',
     ' html',
     ' js',
     ' python',
     ' scala',
     'md',
])

# Persist the whitelist, one prefix per line. Sorting makes the file identical from
# run to run (a set has no stable iteration order); the empty-string prefix sorts
# first, so the file starts with a blank line.
with open("available_code_prefixes.txt", "w", encoding="utf-8") as f:
    f.write('\n'.join(sorted(available_code_prefixes)))

In [None]:
# `available_code_prefixes` is a set, and sets do not support indexing; show the
# first entry in sorted order instead.
sorted(available_code_prefixes)[0]

In [49]:
from collections import defaultdict

def get_lang_distribution(dataset_list, text_key='chosen_response', load_fn=None, detect_fn=None):
    """Count the dominant detected language of every record, per dataset and per split.

    Args:
        dataset_list: Dataset paths to scan.
        text_key: Record field to inspect. Its value may be a single string or a list
            of turns (strings, or dicts exposing the text under ``value`` or ``content``).
        load_fn: Callable ``load_fn(path, split=None)`` returning a mapping of split
            name to an iterable of records. Defaults to ``load_dpo_dataset``.
        detect_fn: Callable ``detect_fn(text)`` returning ``(labels, confidences)``;
            ``labels[0]`` is used. Defaults to ``lang_detect.predict``.

    Returns:
        A dict with one entry per dataset path (``{split: {label: count}}``), plus
        ``'total'`` (counts summed over all datasets, per split) and ``'stat'``
        (the same totals as percentage strings). ``'train'`` and ``'test'`` are always
        present in ``'total'`` and ``'stat'``; other splits are added when encountered.
    """
    if load_fn is None:
        load_fn = load_dpo_dataset
    if detect_fn is None:
        detect_fn = lang_detect.predict

    def iter_texts(value):
        # Yield each piece of text stored under `text_key`.
        if isinstance(value, str):
            yield value
            return
        for turn in value or []:
            if isinstance(turn, str):
                yield turn
            else:
                yield turn.get('value') or turn.get('content') or ''

    lang_distribution = {}
    global_lang_dict = {'train': defaultdict(int), 'test': defaultdict(int)}
    for dataset_path in dataset_list:
        dataset = load_fn(dataset_path, split=None)

        lang_splits = {}
        for split in list(dataset.keys()):
            print(f"{dataset_path}:{split}")
            # Fix: splits other than train/test used to raise KeyError here.
            split_total = global_lang_dict.setdefault(split, defaultdict(int))

            lang_dict = defaultdict(int)
            for data in dataset[split]:
                langs = defaultdict(int)
                for text in iter_texts(data[text_key]):
                    # Newlines are replaced with spaces before detection.
                    labels, _conf = detect_fn(text.replace('\n', ' '))
                    langs[labels[0]] += 1

                if len(langs) == 0:
                    dominent_lang = "empty"
                else:
                    # Fix: pick the most frequent label. A bare max(langs) compares
                    # the label strings themselves, not their counts.
                    dominent_lang = max(langs, key=langs.get)

                split_total[dominent_lang] += 1
                lang_dict[dominent_lang] += 1
            lang_splits[split] = lang_dict
        lang_distribution[dataset_path] = lang_splits

    lang_distribution['total'] = global_lang_dict

    stat_dict = {}
    for split, counts in global_lang_dict.items():
        total_cnt = sum(counts.values())
        # An empty split yields an empty dict, so the division is never reached with total_cnt == 0.
        stat_dict[split] = {key: f"{value / total_cnt:.2%}" for key, value in counts.items()}

    lang_distribution['stat'] = stat_dict

    return lang_distribution

lang_distribution = get_lang_distribution([dpo_list[2]])
lang_distribution

/data/llm_datasets/distilabel-math-preference-dpo/data/:train


{'/data/llm_datasets/distilabel-math-preference-dpo/data/': {'train': defaultdict(int,
              {'__label__en': 2418})},
 'total': {'train': defaultdict(int, {'__label__en': 2418}),
  'test': defaultdict(int, {})},
 'stat': {'train': {'__label__en': '100.00%'}, 'test': {}}}

In [6]:
with open("/workspaces/data/llm_datasets/custom/lang_distribution_SFT_v4.json", "w") as json_file:
    json.dump(lang_distribution, json_file, indent=4)

In [None]:
dataset = load_dataset("json", data_files="/workspaces/data/llm_datasets/custom/deduped/translated_sharegpt_V3_format_ko.json")
dataset

In [None]:
dataset_koen = dataset['train'].select(range(10000, 15000))
dataset_enko = dataset['train'].select(range(15000, 20000))

In [None]:
dataset_koen[1]

In [None]:
# Build English -> Korean translation samples by pairing each Korean turn with the
# English turn at the same position of the conversation that has the same id.
# NOTE(review): `lang_dict` is not defined anywhere in the visible notebook; it must
# already exist in the kernel with the shape lang_dict['__label__en'][id]['conversations'].
new_dataset = []
for record in dataset_enko:
    turns_ko = record['conversations']
    record_id = record['id']

    turns_en = lang_dict['__label__en'][record_id]['conversations']

    for turn_idx, turn_ko in enumerate(turns_ko):
        text_ko = turn_ko['value']
        text_en = turns_en[turn_idx]['value']
        # Drop samples that are too short, because their quality is poor.
        if len(text_ko) <= 150:
            continue
        new_dataset.append({
            'id': f"sharegpt_V3_format_{record_id}_{turn_idx}",
            'task': 'enkotranslation',
            'conversations': [
                {'from': 'human', 'value': text_en},
                {'from': 'gpt', 'value': text_ko},
            ],
        })
        
    

In [None]:
# Convert the AIHub context/summary JSON files into vicuna-style summarization rows,
# one list of rows per source directory.
#
# The earlier pattern "*[!tar|!sh]" did not mean "not .tar and not .sh": in glob
# syntax [!...] is a one-character class, so it dropped every entry whose name ends
# in t, a, r, s, h, '|' or '!'. Each entry is used as a directory below, so directories
# are selected explicitly. Sorting keeps the generated ids stable between runs.
file_paths = sorted(
    entry
    for entry in glob.glob("/workspaces/data/llm_datasets/aihub/*")
    if os.path.isdir(entry)
)

dataset_dict = {}
for file_path in file_paths:
    file_name = os.path.basename(file_path)

    new_dataset = []
    idx = 0  # running row counter, used as the id suffix within this directory

    paths = sorted(glob.glob(os.path.join(file_path, '*.json')))

    for path in paths:
        with open(path, "r", encoding="utf-8") as f:
            json_data = json.load(f)

        for context_data in json_data['dataset']['context_info']:
            data_row = {
                'id': f"{file_name}_{idx}",
                'task': 'summarization',
                'conversations': [
                    {'from': 'human', 'value': context_data['context']},
                    {'from': 'gpt', 'value': context_data['summary']},
                ],
            }
            new_dataset.append(data_row)
            idx += 1

    print(f"file_name:{file_name} idx:{idx}", '\t\t\t\t\t\t', end='\r')
    dataset_dict[file_name] = new_dataset

In [None]:
train_dataset_list = dataset_dict['TL_EE_train'] + dataset_dict['TL_LA_train'] + dataset_dict['TL_ED_train'] + dataset_dict['TL_NA_train']
eval_dataset_list = dataset_dict['TL_EE_val'] + dataset_dict['TL_LA_val'] + dataset_dict['TL_ED_val'] + dataset_dict['TL_NA_val']

In [None]:
with open("/workspaces/data/llm_datasets/aihub_summary_data/train.json", "w") as json_file:
    json.dump(train_dataset_list, json_file)
with open("/workspaces/data/llm_datasets/aihub_summary_data/test.json", "w") as json_file:
    json.dump(eval_dataset_list, json_file)

In [None]:
dataset = load_dataset("/workspaces/data/llm_datasets/aihub_summary_data")

In [None]:

# Convert the AIHub "*summary*" document files into vicuna-style summarization rows,
# one list of rows per source directory.
#
# The earlier pattern suffix "[!tar|!sh]" did not mean "not .tar and not .sh": in glob
# syntax [!...] is a one-character class, so it dropped every entry whose name ends
# in t, a, r, s, h, '|' or '!'. Each entry is used as a directory below, so directories
# are selected explicitly. Sorting keeps the generated ids stable between runs.
file_paths = sorted(
    entry
    for entry in glob.glob("/workspaces/data/llm_datasets/aihub/*summary*")
    if os.path.isdir(entry)
)

dataset_dict = {}
for file_path in file_paths:
    file_name = os.path.basename(file_path)

    new_dataset = []
    idx = 0  # running row counter, used as the id suffix within this directory

    paths = sorted(glob.glob(os.path.join(file_path, '*.json')))

    for path in paths:
        with open(path, "r", encoding="utf-8") as f:
            json_data = json.load(f)

        for document in json_data['documents']:
            # The first abstractive summary is used as the target.
            summary = document['abstractive'][0]
            # `text` is a list of groups of sentence entries: sentences inside a group
            # are joined with spaces, groups are joined with newlines.
            context = '\n'.join(
                ' '.join(entry['sentence'] for entry in group)
                for group in document['text']
            )

            data_row = {
                'id': f"{file_name}_{idx}",
                'task': 'summarization',
                'conversations': [
                    {'from': 'human', 'value': context},
                    {'from': 'gpt', 'value': summary},
                ],
            }
            new_dataset.append(data_row)
            idx += 1

        print(f"file_name:{file_name} idx:{idx}", '\t\t\t\t\t\t', end='\r')
    dataset_dict[file_name] = new_dataset

In [None]:
for key in dataset_dict.keys():
    print(key, len(dataset_dict[key]))

In [None]:
train_dataset_list = dataset_dict['summary_law_train']
eval_dataset_list = dataset_dict['summary_law_val']

In [None]:
with open("/workspaces/data/llm_datasets/aihub_summary_data/법률/train.json", "w") as json_file:
    json.dump(train_dataset_list, json_file)
with open("/workspaces/data/llm_datasets/aihub_summary_data/법률/test.json", "w") as json_file:
    json.dump(eval_dataset_list, json_file)

In [None]:
# Convert the AIHub "*summary_book*" files (one passage/summary pair per JSON file)
# into vicuna-style summarization rows, one list of rows per source directory.
#
# The earlier pattern suffix "[!tar|!sh]" did not mean "not .tar and not .sh": in glob
# syntax [!...] is a one-character class, so it dropped every entry whose name ends
# in t, a, r, s, h, '|' or '!'. Each entry is used as a directory below, so directories
# are selected explicitly. Sorting keeps the generated ids stable between runs.
file_paths = sorted(
    entry
    for entry in glob.glob("/workspaces/data/llm_datasets/aihub/*summary_book*")
    if os.path.isdir(entry)
)

dataset_dict = {}
for file_path in file_paths:
    file_name = os.path.basename(file_path)

    new_dataset = []
    idx = 0  # running row counter, used as the id suffix within this directory

    # NOTE(review): without recursive=True, '**' behaves like '*', so this matches
    # JSON files exactly one directory level below `file_path` — confirm that deeper
    # nesting does not occur.
    paths = sorted(glob.glob(os.path.join(file_path, '**/*.json')))

    for path in paths:
        with open(path, "r", encoding="utf-8") as f:
            json_data = json.load(f)

        data_row = {
            'id': f"{file_name}_{idx}",
            'task': 'summarization',
            'conversations': [
                {'from': 'human', 'value': json_data['passage']},
                {'from': 'gpt', 'value': json_data['summary']},
            ],
        }
        new_dataset.append(data_row)
        idx += 1

        print(f"file_name:{file_name} idx:{idx}", '\t\t\t\t\t\t', end='\r')
    dataset_dict[file_name] = new_dataset

In [None]:
for key in dataset_dict.keys():
    print(key, len(dataset_dict[key]))

In [None]:
train_dataset_list = dataset_dict['summary_book_train']
eval_dataset_list = dataset_dict['summary_book_val']

In [None]:
with open("/workspaces/data/llm_datasets/aihub_summary_data/도서/train.json", "w") as json_file:
    json.dump(train_dataset_list, json_file)
with open("/workspaces/data/llm_datasets/aihub_summary_data/도서/test.json", "w") as json_file:
    json.dump(eval_dataset_list, json_file)

In [None]:
# Load the directory written by the previous cell and display the resulting dataset object.
dataset = load_dataset("/workspaces/data/llm_datasets/aihub_summary_data/도서")
dataset

In [None]:
# Display the second record of the train split.
dataset['train'][1]

In [None]:
# NOTE(review): in the visible part of this notebook `qas` is only assigned inside
# the loop of the next cell, so this display cell relies on that cell having run first.
qas

In [None]:

# Build context-QA rows from the AI Hub "VL" directories, keeping only the
# questions whose 'question_level' is '상'.
#
# The original pattern ended in "[!tar|!sh]". In glob syntax that is a
# one-character class (last character not in "tar|!sh"), not an extension
# filter, so directories are selected explicitly instead.
file_paths = sorted(
    candidate
    for candidate in glob.glob("/workspaces/data/llm_datasets/aihub/*VL*")
    if os.path.isdir(candidate)
)

dataset_dict = {}
for file_path in file_paths:
    file_name = os.path.basename(file_path)

    new_dataset = []
    idx = 0

    # Sorted so the sequential ids do not depend on filesystem listing order.
    paths = sorted(glob.glob(os.path.join(file_path, '*.json')))

    for path in paths:
        # Read as UTF-8 explicitly instead of relying on the locale default.
        with open(path, "r", encoding="utf-8") as f:
            json_data = json.load(f)

        for context_data in json_data['dataset']['context_info']:
            context = context_data['context']
            qas = context_data['qas']

            for _qas in qas:
                # Skip every question that is not at the '상' level.
                if _qas['question_level'] != '상':
                    continue
                data_row = {
                    'id': f"{file_name}_{idx}",
                    'task': 'contextqa',
                    'context': context,
                    'question': _qas['question-1'],
                    'answer': _qas['answer'],
                }
                new_dataset.append(data_row)
                idx += 1

        print(f"file_name:{file_name} idx:{idx}", '\t\t\t\t\t\t', end='\r')
    dataset_dict[file_name] = new_dataset

In [None]:
# Row count for every collected directory.
for key, rows in dataset_dict.items():
    print(key, len(rows))

In [None]:
# Concatenate the four VL sub-corpora into one train list and one validation list.
train_keys = ('VL_EE_train', 'VL_NA_train', 'VL_LA_train', 'VL_ED_train')
eval_keys = ('VL_EE_val', 'VL_NA_val', 'VL_LA_val', 'VL_ED_val')
train_dataset_list = [row for split_key in train_keys for row in dataset_dict[split_key]]
eval_dataset_list = [row for split_key in eval_keys for row in dataset_dict[split_key]]

In [None]:
# Write the merged context-QA splits: train rows -> train.json, validation rows -> test.json.
for out_path, rows in (
    ("/workspaces/data/llm_datasets/aihub_contextqa_data_hard/기술과학/train.json", train_dataset_list),
    ("/workspaces/data/llm_datasets/aihub_contextqa_data_hard/기술과학/test.json", eval_dataset_list),
):
    with open(out_path, "w") as json_file:
        json.dump(rows, json_file)

In [None]:
# Cut the train list into three shards (first 120000 rows, next 120000 rows,
# remainder) and write each shard to its own file.
# NOTE(review): these files go to "aihub_contextqa_data", whereas the neighbouring
# cells write to and read from "aihub_contextqa_data_hard" - confirm this is intended.
train_dataset_list_0, train_dataset_list_1, train_dataset_list_2 = (
    train_dataset_list[:120000],
    train_dataset_list[120000:240000],
    train_dataset_list[240000:],
)

for out_path, rows in (
    ("/workspaces/data/llm_datasets/aihub_contextqa_data/기술과학/train_split0.json", train_dataset_list_0),
    ("/workspaces/data/llm_datasets/aihub_contextqa_data/기술과학/train_split1.json", train_dataset_list_1),
    ("/workspaces/data/llm_datasets/aihub_contextqa_data/기술과학/train_split2.json", train_dataset_list_2),
):
    with open(out_path, "w") as json_file:
        json.dump(rows, json_file)

In [None]:
# Load the "aihub_contextqa_data_hard" directory and display the resulting dataset object.
dataset = load_dataset("/workspaces/data/llm_datasets/aihub_contextqa_data_hard/기술과학")
dataset

In [None]:
# Display the first record of the train split.
dataset['train'][0]

In [None]:
dataset = load_dataset("/workspaces/data/llm_datasets/gpt4_evol_1.3k/data/")

In [None]:
# Display the first train record to see which fields it carries.
data = dataset['train'][0]
# answer = data['answer']
# question = data['question']
data

In [None]:
# Convert every train record into a two-turn conversation row:
# 'question' -> human turn, 'answer' -> gpt turn, with a sequential id.
new_dataset = [
    {
        'id': f"gpt_evol_1.3k_{idx}",
        'conversations': [
            {'from': 'human', 'value': data['question']},
            {'from': 'gpt', 'value': data['answer']},
        ],
    }
    for idx, data in enumerate(dataset['train'])
]

In [None]:
# Display the third converted row.
new_dataset[2]

In [None]:
# Save the converted gpt_evol rows as a single JSON file.
gpt_evol_out_path = "/data/llm_datasets/custom/vicuna_format/gpt_evol_1.3k-vicuna.json"
with open(gpt_evol_out_path, "w") as out_file:
    json.dump(new_dataset, out_file)

In [None]:
# Load wizardlm_orca.json with the "json" loader; this rebinds `dataset` for the cells below.
dataset = load_dataset("json", data_files="/data/llm_datasets/WizardLM_Orca/wizardlm_orca.json")

In [None]:
# Convert every train record into a conversation row:
# 'instruction' -> human turn, 'output' -> gpt turn, 'system' copied through.
new_dataset = [
    {
        'id': f"WizardLM_Orca_{idx}",
        'conversations': [
            {'from': 'human', 'value': data['instruction']},
            {'from': 'gpt', 'value': data['output']},
        ],
        'task': 'system_instruct',
        'system': data['system'],
    }
    for idx, data in enumerate(dataset['train'])
]
    

In [None]:
# Save the converted WizardLM_Orca rows as a single JSON file.
wizardlm_out_path = "/data/llm_datasets/custom/vicuna_format/wizardlm_orca_vicuna.json"
with open(wizardlm_out_path, "w") as out_file:
    json.dump(new_dataset, out_file)

In [None]:
# Load the KoreaSpellingCorrection directory; this rebinds `dataset` for the cells below.
dataset = load_dataset("/data/llm_datasets/KoreaSpellingCorrection/")

In [None]:
# Convert every record of the 'test' split into a correction row:
# 'wrong' -> human turn, 'correct' -> gpt turn.
new_dataset = [
    {
        'id': f"KoreaSpelling_Correction_{idx}",
        'conversations': [
            {'from': 'human', 'value': data['wrong']},
            {'from': 'gpt', 'value': data['correct']},
        ],
        'task': 'correction',
    }
    for idx, data in enumerate(dataset['test'])
]

In [None]:
# Display the third converted row.
new_dataset[2]

In [None]:
# Save the converted spelling-correction rows as test.json.
spelling_out_path = "/data/llm_datasets/custom/vicuna_format/KoreaSpellingCorrection/test.json"
with open(spelling_out_path, "w") as out_file:
    json.dump(new_dataset, out_file)

In [91]:
# Print the third entry of `dpo_list`, load it with `load_dpo_dataset` and display the result.
# NOTE(review): `dpo_list` and `load_dpo_dataset` are not defined in this part of the
# notebook; they must already exist in the kernel.
print(dpo_list[2])
dataset = load_dpo_dataset(dpo_list[2])
dataset

/data/llm_datasets/distilabel-math-preference-dpo/data/


Dataset({
    features: ['metadata', 'instruction', 'chosen_response', 'chosen_rating', 'rejected_response', 'rejected_rating'],
    num_rows: 2418
})

In [84]:
# Build DPO rows keyed by 'prompt_id': the prompt is the input, and the 'content'
# of the second message of 'chosen' / 'rejected' supplies the two responses.
new_dataset = [
    {
        'id': f"Ko_ultrafeedback_binarized_{data['prompt_id']}",
        'input': data['prompt'],
        'chosen': data['chosen'][1]['content'],
        'rejected': data['rejected'][1]['content'],
        'task': "dpo",
    }
    for data in dataset
]

In [89]:
# Build DPO rows with a system prompt: 'question' is the input and
# 'chosen' / 'rejected' / 'system' are copied through; ids are sequential.
new_dataset = [
    {
        'id': f"Ko_orca_dpo_pairs_{idx}",
        'input': data['question'],
        'chosen': data['chosen'],
        'rejected': data['rejected'],
        'task': "dpo_system",
        'system': data['system'],
    }
    for idx, data in enumerate(dataset)
]


In [92]:
# Display the first record of the currently loaded dataset.
dataset[0]

{'metadata': '{"length-input": 139, "length-generations-1": 539, "length-generations-2": 493, "length-generations-3": 801, "rating-generations-1": 8.0, "rating-generations-2": 7.0, "rating-generations-3": 9.0, "distance-best-rated": 1.0}',
 'instruction': 'How can I simplify the algebraic expression `(3x^2 - 4y^3) / (2x)`?.\n Take a deep breath, think step by step, and give an accurate response',
 'chosen_response': 'To simplify the algebraic expression `(3x^2 - 4y^3) / (2x)`, we can follow a few steps:\n\nStep 1: Distribute the division symbol by multiplying the expression by the reciprocal of the denominator. The reciprocal of `2x` is `1/(2x)`, so the expression becomes `(3x^2 - 4y^3) * (1/(2x))`.\n\nStep 2: Simplify within the parentheses by dividing each term separately. \n\n- For the first term, `3x^2`, divide `3x^2` by `2x`. This gives us `(3x^2) / (2x) = (3/2) * (x^2 / x) = (3/2) * x`.\n\n- For the second term, `-4y^3`, divide `-4y^3` by `2x`. This gives us `(-4y^3) / (2x) = (-2

In [95]:
# Build DPO rows: 'instruction' is the input, and 'chosen_response' /
# 'rejected_response' supply the two responses; ids are sequential.
new_dataset = [
    {
        'id': f"Ko_distilabel-math-preference-dpo_{idx}",
        'input': data['instruction'],
        'chosen': data['chosen_response'],
        'rejected': data['rejected_response'],
        'task': "dpo",
    }
    for idx, data in enumerate(dataset)
]

In [96]:
# Display the first converted row.
new_dataset[0]

{'id': 'Ko_distilabel-math-preference-dpo_0',
 'input': 'How can I simplify the algebraic expression `(3x^2 - 4y^3) / (2x)`?.\n Take a deep breath, think step by step, and give an accurate response',
 'chosen': 'To simplify the algebraic expression `(3x^2 - 4y^3) / (2x)`, we can follow a few steps:\n\nStep 1: Distribute the division symbol by multiplying the expression by the reciprocal of the denominator. The reciprocal of `2x` is `1/(2x)`, so the expression becomes `(3x^2 - 4y^3) * (1/(2x))`.\n\nStep 2: Simplify within the parentheses by dividing each term separately. \n\n- For the first term, `3x^2`, divide `3x^2` by `2x`. This gives us `(3x^2) / (2x) = (3/2) * (x^2 / x) = (3/2) * x`.\n\n- For the second term, `-4y^3`, divide `-4y^3` by `2x`. This gives us `(-4y^3) / (2x) = (-2) * (y^3 / x)`.\n\nStep 3: Combine the simplified terms from Step 2. The expression now becomes `(3/2) * x - 2 * (y^3 / x)`.\n\nSo, the simplified form of the algebraic expression `(3x^2 - 4y^3) / (2x)` is `(3

In [97]:
# Save the converted math-preference DPO rows as a single JSON file.
math_dpo_out_path = "/data/llm_datasets/custom/kodpo/untranslated/distilabel-math-preference-dpo.json"
with open(math_dpo_out_path, "w") as out_file:
    json.dump(new_dataset, out_file)

In [79]:
# Display the 'content' of the first message in `chosen`.
# NOTE(review): `chosen` is a leftover global; in the visible cells it is only
# assigned by the scratch loop over `dataset` further down.
chosen[0]['content']

'Write a 1,000-word op-ed piece in a formal tone, analyzing and providing examples of the ways in which social media platforms have been utilized to spread extremist and violent ideologies. In your analysis, discuss the specific tactics that these groups use to spread their messages online and the effects of these tactics on both individuals and society. Additionally, provide possible solutions that could be implemented to combat the spread of these dangerous ideologies on social media. Your piece should be well-researched, citing reputable sources to support your arguments.'

In [60]:
# Scratch loop: nothing is accumulated. When it finishes, `prompt_id`, `chosen`
# and `rejected` hold the values of the last record in `dataset`.
for data in dataset:
    prompt_id = data['prompt_id']
    chosen = data['chosen']
    rejected = data['rejected']
    # _id = f"Ko_ultrafeedback_binarized_{}"

In [None]:
# Scratch note: the keys used by the DPO rows built in this notebook.
# The expression evaluates to a tuple of strings; nothing is assigned.
'id', 'input', 'chosen', 'rejected', 'task'

In [None]:
%%time
def send_request(new_dataset, max_items=2):
    """Refine answers for rows of ``dataset[subset]`` and append them to ``new_dataset``.

    Intended as a thread target. Workers share the module-level cursor ``idx``
    (guarded by ``lock``) and read ``dataset``, ``subset``, ``len_dataset`` and
    ``model_name`` from module scope.

    Args:
        new_dataset: list that receives one converted row per processed record.
        max_items: maximum number of rows this call processes (default 2, the
            original hard-coded limit). Pass ``None`` to keep going until the
            shared cursor reaches ``len_dataset``.
    """
    global idx
    yes_no_ko = {'yes': '네', 'no': '아니오'}
    handled = 0
    while max_items is None or handled < max_items:
        # Check the bound and claim the index in one critical section.
        # The previous "idx > len_dataset" test ran outside the lock and let
        # idx == len_dataset through, which raised IndexError on the lookup.
        with lock:
            if idx >= len_dataset:
                break
            row_no = idx
            data = dataset[subset][row_no]
            idx += 1
        handled += 1

        print(f"{row_no + 1}/{len_dataset}", '\t\t\t\t\t\t', end='\r')

        _id = data['id']
        context = data['context']
        question = data['question']
        # Map English yes/no answers (case-insensitive) to Korean; keep anything else.
        answer = yes_no_ko.get(data['answer'].lower(), data['answer'])

        # The value returned by generate_refiner becomes the gpt turn.
        result = generate_refiner(
            model_name,
            context,
            question,
            answer
        )

        new_dataset.append({
            'id': _id,
            'conversations': [
                                {'from': 'human', 'value': question},
                                {'from': 'gpt', 'value': result},
                             ],
            'task_name': "instruct",
            'instruction': context,
        })


# Shared state read by send_request, followed by a fan-out over worker threads.
# send_request handles at most two rows per call by default, so one run of this
# cell covers at most 2 * n_thread rows.
model_name = "MingAI-70B-chat-orca_v0.42_2_dpo-GPTQ"
subset = 'train'
len_dataset = len(dataset[subset])
n_thread = 64

new_dataset = []
idx = 0
lock = threading.Lock()

threads = [
    threading.Thread(target=send_request, args=(new_dataset,))
    for _ in range(n_thread)
]
for worker in threads:
    worker.start()

# Block until every worker has finished.
for worker in threads:
    worker.join()

In [None]:
dataset = load_dataset("json", data_files="/data/llm_datasets/lima_vicuna_format/lima_vicuna_format.json")
enko_dataset = []
remained_dataset = []

# Group the rows by the language label that wins the per-turn vote.
lang_dict = {}
for data in dataset['train']:
    # Count how many turns of this conversation get each language label.
    langs = {}
    for conv in data['conversations']:
        # Newlines are replaced by spaces before prediction
        # (fastText predicts one line at a time).
        labels, _conf = lang_detect.predict(conv['value'].replace('\n', ' '))
        lang = labels[0]
        langs[lang] = langs.get(lang, 0) + 1

    # English is handicapped by one vote before the winner is chosen.
    if '__label__en' in langs:
        langs['__label__en'] -= 1

    # Pick the label with the highest count. A bare max(langs) compares the
    # label strings themselves and returns the alphabetically last one.
    dominent_lang = max(langs, key=langs.get)
    lang_dict.setdefault(dominent_lang, []).append(data)

In [None]:
# Number of rows assigned to each language label.
for key in lang_dict:
    print(key, len(lang_dict[key]))

In [None]:
import requests
api_server_url = "http://localhost:41002"
def count_total_token(conversations, model_name="MingAI-70B-chat-orca_v0.42_2_dpo-GPTQ"):
    """Sum the token counts the API server reports for every turn.

    One POST request is sent to ``api_server_url + "/count_token"`` per turn.

    Args:
        conversations: iterable of turn dicts; each turn's ``'value'`` is sent
            as the prompt.
        model_name: model name sent with every request (defaults to the value
            that used to be hard-coded).

    Returns:
        The sum of the ``'count'`` fields of all responses (0 for no turns).

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    num_token = 0
    for conv in conversations:
        input_json = {
            "model_name": model_name,
            "prompt": conv['value'],
        }
        ret = requests.post(api_server_url + "/count_token", json=input_json)
        # Surface HTTP failures here instead of failing later on a missing
        # 'count' key or a non-JSON body.
        ret.raise_for_status()
        num_token += ret.json()['count']
    return num_token


# Print token counts for up to the first ten English-dominant rows.
# Each entry of lang_dict is a whole dataset row, so its 'conversations' list is
# what must be passed; iterating the row dict itself yields its keys, and
# indexing a key with 'value' raises TypeError. Slicing also copes with fewer
# than ten rows.
for row in lang_dict['__label__en'][:10]:
    num_total_token = count_total_token(row['conversations'])
    print(num_total_token)

In [None]:
%%time
api_server_url = "http://localhost:41002"
def _translate_en_to_ko(text_block, model_name):
    """Send one text block to the generation endpoint and return the generated text."""
    prompt = f"### 영어:\n{text_block}\n### 한국어:\n"
    input_json = {
        "model_name": model_name,
        "prompt": prompt,
        "temperature": 0.7,
        "top_p": 0.8,
        "max_new_tokens": 4096,
        "stop": ["</끝>", "###"],
    }

    ret = requests.post(
        api_server_url + "/worker_generate_stream",
        json=input_json,
        stream=True,
    )
    ret.raise_for_status()

    # Only the last non-empty chunk of the stream is used.
    result_data = None
    for chunk in ret.iter_lines(decode_unicode=False, delimiter=b"\0"):
        if chunk:
            result_data = json.loads(chunk.decode())
    if result_data is None:
        # Previously this fell through to a NameError, or silently reused the
        # result of an earlier block.
        raise RuntimeError("translation stream returned no data")

    # Drop the first len(prompt) characters of the returned text (the code treats
    # them as the echoed prompt) and any trailing newlines.
    return result_data['text'][len(prompt):].rstrip('\n')


def send_translate_request(new_dataset, model_name="Gugugo-koen-7B-V1.1"):
    """Translate rows of ``lang_dict['__label__en']`` and append them to ``new_dataset``.

    Intended as a thread target. Workers share the module-level cursor ``idx``
    (guarded by ``lock``) and run until it reaches ``len_dataset``. Each turn is
    split on triple backticks: the pieces between fences are copied through
    unchanged and the remaining text pieces are translated.

    Args:
        new_dataset: list that receives ``{'conversations': ..., 'id': ...}``
            for every processed row.
        model_name: model name sent with every translation request (defaults to
            the value that used to be hard-coded).
    """
    global idx
    while True:
        # Check the bound and claim the index in one critical section.
        # The previous "idx > len_dataset" test ran outside the lock and let
        # idx == len_dataset through, which raised IndexError on the lookup.
        with lock:
            if idx >= len_dataset:
                break
            pidx = idx
            data = lang_dict['__label__en'][pidx]
            idx += 1

        print(f"{pidx + 1}/{len_dataset}", '\t\t\t\t\t\t', end='\r')

        new_conv = []
        for _data in data['conversations']:
            # After splitting on the fence, even positions are text and odd
            # positions are fenced code.
            blocks = _data['value'].split("```")
            text_blocks = blocks[0::2]
            code_blocks = blocks[1::2]

            results = ""
            for tidx, text_block in enumerate(text_blocks):
                if text_block.strip():
                    results += _translate_en_to_ko(text_block, model_name)
                else:
                    # Nothing to translate (e.g. the turn starts with a code
                    # fence): keep the whitespace and skip the request.
                    results += text_block
                if tidx < len(code_blocks):
                    results += "```" + code_blocks[tidx] + "```"

            new_conv.append({
                'from': _data['from'],
                'value': results,
            })
        new_dataset.append({
            'conversations': new_conv,
            'id': data['id'],
        })


# code_prefixes = ["python", "c++", "minikube", "docker", "json", "java", 
#                  "php", "bash", "c#", "cpp", "css", "perl", "html", "xml", 
#                  "ruby", "sql", "ini", "apt", "socat", "tcp", "localhost",
#                  "git"]
# Shared state read by send_translate_request, followed by a fan-out over
# 64 * 7 worker threads that drain lang_dict['__label__en'].
new_dataset = []
idx = 0
lock = threading.Lock()
len_dataset = len(lang_dict['__label__en'])
n_thread = 64 * 7

threads = [
    threading.Thread(target=send_translate_request, args=(new_dataset,))
    for _ in range(n_thread)
]
for worker in threads:
    worker.start()

# Block until every worker has finished.
for worker in threads:
    worker.join()

In [None]:
# Append, unchanged, the rows grouped under these five non-English labels.
extra_labels = ('__label__pt', '__label__es', '__label__de', '__label__zh', '__label__fr')
new_dataset += [row for label in extra_labels for row in lang_dict[label]]

In [None]:
# Save the combined rows as a single JSON file.
lima_ko_out_path = "/data/llm_datasets/custom/vicuna_format/lima_vicuna_format_ko.json"
with open(lima_ko_out_path, "w") as out_file:
    json.dump(new_dataset, out_file)