In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import warnings
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
from tqdm.notebook import tqdm
sns.set()
rcParams['figure.figsize'] = (20,10)
pd.options.display.max_columns = None
warnings.filterwarnings('ignore')
from pprint import pprint
import json
from collections import defaultdict
from googletrans import Translator
import time
from datasets import Dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModel, AutoConfig, pipeline
from datasets import load_dataset, Dataset

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bigscience/bloom-3b')

In [None]:
translator = Translator()
def translate_df(df, batch_size=1, translator=translator):
    """Translate every column of ``df`` from English ("en") to Vietnamese ("vi").

    Rows are processed in batches of ``batch_size``. For each column the batch
    values are joined with ``' ## '``, translated in a single request and split
    on ``'##'`` again. If, for any column, the number of pieces that come back
    differs from the number of rows in the batch, the whole batch is translated
    again cell by cell.

    Args:
        df: DataFrame whose cells are strings; None / NaN / '' cells are
            treated as empty text.
        batch_size: number of rows sent per translation request.
        translator: object exposing ``translate(text, src=..., dest=...)``
            whose result has a ``.text`` attribute.

    Returns:
        A new DataFrame holding only the ``<col>_translated`` columns.
    """
    def is_blank(value):
        # None, float NaN (the only value that differs from itself) and ''
        # all mean "nothing to translate".
        return value is None or (isinstance(value, float) and value != value) or value == ''

    def translate_text(text, sleep_time=2):
        if is_blank(text):
            return ''
        # Retry until the request succeeds (same best-effort policy as before).
        while True:
            try:
                return translator.translate(text, src="en", dest="vi").text
            except Exception as err:
                # `Exception` instead of a bare `except` so Ctrl-C / kernel
                # interrupt can still break out of the retry loop; the error
                # is shown so a permanent failure is distinguishable.
                print("Requests error", repr(err))
                time.sleep(sleep_time)

    def clean_text(text):
        # Placeholder hook: currently returns the text unchanged.
        return text

    data = defaultdict(list)
    for i in tqdm(range(0, len(df), batch_size)):
        # Copy so the per-column reassignment below never writes into a slice
        # of the caller's frame.
        batch_df = df.iloc[i : i + batch_size].copy()
        temp_data = {}
        for col in df.columns:
            batch_df[col] = batch_df[col].apply(clean_text)
            col_vals = ' ## '.join('' if is_blank(v) else v for v in batch_df[col].values)
            temp_data[col] = [x.strip() for x in translate_text(col_vals).split('##')]

        # The batch result is usable only if every column came back with one
        # piece per row (the separator survived translation).
        can_insert_batch = all(len(temp_data[col]) == len(batch_df) for col in df.columns)

        if can_insert_batch:
            for col in df.columns:
                data[col].extend(batch_df[col].values)
                data[f'{col}_translated'].extend(temp_data[col])
            continue

        # Otherwise fall back to translating each cell on its own.
        for _, row in batch_df.iterrows():
            for col in df.columns:
                text = clean_text(row[col])
                data[col].append(text)
                data[f'{col}_translated'].append(translate_text(text))

    translated = pd.DataFrame(data)
    return translated.drop(columns=[col for col in translated.columns if '_translated' not in col])

In [None]:
# Vietnamese single-turn chat template: a header line, one human turn, one AI
# turn, then an opening human tag.
a = (
    "Cuộc trò chuyện giữa con người và trợ lý AI.\n"
    "[|Con người|] {prompt}\n"
    "[|AI|] {response}\n"
    "[|Con người|]"
)

def get_prompt(row):
    """Fill template ``a`` with the row's 'prompt' and 'response' values."""
    fields = {'prompt': row['prompt'], 'response': row['response']}
    return a.format(**fields)

In [None]:
# Read the translated Alpaca records (JSON Lines) and render each row through get_prompt.
temp = pd.read_json('../../data/translated/vi_alpaca_reduced.jsonl', lines=True)
temp2 = temp.apply(lambda x: get_prompt(x), axis=1)
# Wrap the rendered Series in a one-column frame and name that column 'prompt'.
temp2 = pd.DataFrame(temp2)
temp2.columns = ['prompt']

In [None]:
temp3 = pd.read_json('../../data/translated/quora_chat_data_translated.json', orient='records')

In [None]:
# NOTE(review): the first line overwrites quora_chat_data_translated.json in JSON Lines format,
# while the cell that loads `temp3` reads the same path without lines=True. Re-running that load
# after this cell will presumably fail — confirm this one-off in-place conversion is intended.
temp3.to_json('../../data/translated/quora_chat_data_translated.json',lines=True,orient='records')
# Save the rendered Alpaca prompts as JSON Lines.
temp2.to_json('../../data/translated/alpaca_chat_cleaned_translated.json',lines=True,orient='records')

## Clean ShareGPT

In [None]:
share_gpt = pd.read_json('../../data/original/ShareGPT_V3_unfiltered_cleaned_split.json')

In [None]:
# Words that mark a text as code-related when they appear as a whole
# whitespace-delimited token.
popular_programming_keywords = [
    "python", "java", "javascript", "c#", "c++", "typescript", "ruby", "swift",
    "kotlin", "php", "rust", "scala", "dart", "matlab", "objective-c", "perl",
    "lua", "assert", "async", "await", "def", "elif", "lambda", "nonlocal",
    "function", "var", "cuda", "torch", "code", "sudo", "bash",
]
# Names that mark a text as code-related when they appear anywhere in it
# (substring match).
popular_programming_languages = [
    "python", "java", "javascript", "typescript", "kotlin", "objective-c",
]


def check_text_not_contain_code(text):
    """Return True when ``text`` triggers none of the code heuristics.

    The text is lower-cased; it is rejected (False) if any entry of
    ``popular_programming_keywords`` equals one of its whitespace-separated
    tokens, or if any entry of ``popular_programming_languages`` occurs as a
    substring.
    """
    lowered = text.lower()
    tokens = set(lowered.split())
    if not tokens.isdisjoint(popular_programming_keywords):
        return False
    return not any(name in lowered for name in popular_programming_languages)

# Boolean mask over share_gpt: each conversation is stringified as a whole and checked with the code heuristics.
temp = share_gpt['conversations'].apply(lambda x: str(x)).apply(check_text_not_contain_code)


In [None]:
# remove duplicate of a Dataset
from pandas import DataFrame


# NOTE(review): this runs on a freshly created, empty DataFrame and the result is not assigned —
# it looks like a leftover scratch snippet; confirm whether it can be removed.
DataFrame().drop_duplicates(subset=['id']).reset_index(drop=True)

In [None]:
# Keep the rows selected by the boolean mask `temp` and only the 'conversations' column.
# NOTE(review): `temp` is reused for unrelated values in other cells, so this only works when run
# right after the cell that builds the ShareGPT mask.
share_gpt_no_code = share_gpt[temp][['conversations']]
# Keep conversations with at least 4 entries.
share_gpt_no_code = share_gpt_no_code[share_gpt_no_code['conversations'].apply(lambda x: len(x) >= 4)]
share_gpt_no_code.shape

In [None]:
from numpy import nan


def create_conversation(turns):
    """Render a list of turns as one tagged English transcript.

    Each turn is a mapping with a 'from' role and a 'value' text. 'human' turns
    are prefixed with "[|Human|] ", 'gpt' turns with "[|AI|] ". Any other role
    makes the whole conversation invalid and ``nan`` is returned instead.
    """
    role_prefix = {'human': "[|Human|] ", 'gpt': "[|AI|] "}
    pieces = ["The conversation between human and AI assistant.\n"]
    for turn in turns:
        prefix = role_prefix.get(turn['from'])
        if prefix is None:
            # Unknown speaker: signal "unusable" to the caller.
            return nan
        pieces.append(prefix + turn['value'] + "\n")
    return "".join(pieces)

# One transcript string per row (or nan where create_conversation rejected the conversation).
share_gpt_no_code_conversations = share_gpt_no_code.apply(lambda x: create_conversation(x['conversations']), axis=1)

In [None]:
print(share_gpt_no_code_conversations.sample(1).iloc[0])

In [None]:
# Write the transcripts as JSON Lines.
# NOTE(review): entries for which create_conversation returned nan are not dropped before saving — confirm.
share_gpt_no_code_conversations.to_json('../../data/original/share_gpt_no_code_conversations.json', orient='records', lines=True)

### MMLU

In [None]:
# Load the four MMLU evaluation files: zero-shot and five-shot prompts, test and val splits.
mmlu_zero_shot_test = Dataset.from_json('../../../data/mmlu_eval_test/zero_shot_mmlu_test.json')
mmlu_five_shot_test = Dataset.from_json('../../../data/mmlu_eval_test/five_shot_mmlu_test.json')
mmlu_zero_shot_val = Dataset.from_json('../../../data/mmlu_eval_test/zero_shot_mmlu_val.json')
mmlu_five_shot_val = Dataset.from_json('../../../data/mmlu_eval_test/five_shot_mmlu_val.json')

def mapper(example):
    """Rewrite one MMLU example into the chat format used by this notebook.

    Every occurrence of the MMLU header followed by a space in
    ``example['input']`` is replaced by the conversation preamble, a
    "[|Human|] " tag and the header (the space that followed the header is
    consumed by the replacement). ``example['output']`` is wrapped as an AI
    turn followed by an opening human tag. The example is modified in place
    and returned.
    """
    header = 'The following are multiple choice questions (with answers) about'
    needle = f'{header} '
    chat_header = f'The conversation between human and AI assistant.\n[|Human|] {header}'
    example['input'] = example['input'].replace(needle, chat_header)
    example['output'] = '[|AI|] {}\n[|Human|]'.format(example["output"])
    return example

# Apply the chat-format mapper to all four MMLU splits.
# NOTE(review): each name is rebound to its mapped version, so re-running this cell without reloading maps twice.
mmlu_zero_shot_test = mmlu_zero_shot_test.map(mapper)
mmlu_zero_shot_val = mmlu_zero_shot_val.map(mapper)
mmlu_five_shot_test = mmlu_five_shot_test.map(mapper)
mmlu_five_shot_val = mmlu_five_shot_val.map(mapper)

In [None]:
temp = mmlu_five_shot_test.shuffle()[0]
print(temp['input'])
print(temp['output'])

In [None]:
temp = mmlu_zero_shot_test.shuffle()[0]
print(temp['input'])
print(temp['output'])

In [None]:
mmlu_zero_shot_test.to_json('../../../data/mmlu_eval_test/zero_shot_mmlu_chat_test.jsonl')
mmlu_five_shot_test.to_json('../../../data/mmlu_eval_test/five_shot_mmlu_chat_test.jsonl')
mmlu_zero_shot_val.to_json('../../../data/mmlu_eval_test/zero_shot_mmlu_chat_val.jsonl')
mmlu_five_shot_val.to_json('../../../data/mmlu_eval_test/five_shot_mmlu_chat_val.jsonl')

## FAQ

In [None]:
path = '../../../data/translated/all_faqs.json'
faqs = pd.read_json(path)

In [None]:
faqs

In [None]:
template = """{instruction}\n{input}\nCâu trả lời: {output}"""
print(template.format(**faqs.sample(1).iloc[0]))

In [None]:
# Keep only FAQ rows whose input does not end with a bare 'Điều luật liên quan: ' suffix.
temp = faqs['input'].apply(lambda x: not x.endswith('Điều luật liên quan: '))
# .copy() makes the filtered frame independent, so the later `faqs['output_len'] = ...`
# column assignment does not write into a slice flagged by pandas (the resulting
# SettingWithCopyWarning is otherwise hidden by the global warnings filter).
faqs = faqs[temp].copy()

In [None]:
faqs

In [None]:
faqs['output_len'] = faqs['output'].apply(lambda x: len(x.split()))

In [None]:
faqs['output_len'].hist(bins=100)

In [None]:
faqs_standard = faqs[faqs['output_len'] < 200]

In [None]:
template = """{instruction}\n{input}\nCâu trả lời: {output}"""
print(template.format(**faqs_standard.sample(1).iloc[0]))

In [None]:
template = """{instruction}\n{input}\nCâu trả lời: {output}"""
print(template.format(**faqs.sample(1).iloc[0]))

In [None]:
faqs_standard

In [None]:
# NOTE(review): absolute, machine-specific path — other cells use paths relative to the notebook.
temp = Dataset.from_json('/Users/phamhoang1408/Desktop/Phase 2 Viettel/main_repo/data/training/alpaca_chat_cleaned_51k_translated.json')
# The renamed dataset is only shown as the cell output; it is not assigned back to `temp`.
temp.rename_column(temp.column_names[0], 'text')

In [None]:
ds = Dataset.from_json('../../data/original/gpt4-instruct-similarity-0.8-dataset.json')

In [None]:
len(ds.shuffle()[0]['response'].split())

In [None]:
ds

In [None]:
from datasets import Dataset, concatenate_datasets
from glob import glob

def load_dataset(folder_path, dataset_size=None):
    """Load every ``*.jsonl`` file in ``folder_path`` into one shuffled Dataset.

    Each file must contain exactly one column, which is renamed to "text".
    NOTE: this definition reuses the name ``load_dataset`` that the notebook
    also imports from ``datasets``.

    Args:
        folder_path: directory that holds the ``.jsonl`` files.
        dataset_size: if given, keep only the first ``dataset_size`` rows of
            the shuffled result.

    Raises:
        ValueError: if a file has more or fewer than one column.
    """
    parts = []
    for jsonl_path in glob(folder_path + "/*.jsonl"):
        part = Dataset.from_json(jsonl_path)
        columns = part.column_names
        if len(columns) != 1:
            raise ValueError("Dataset must have only one text column")
        parts.append(part.rename_column(columns[0], "text"))
    shuffled = concatenate_datasets(parts, axis=0).shuffle()
    if dataset_size is None:
        return shuffled
    return shuffled.select(range(dataset_size))

In [None]:
ds = load_dataset('../../data/training_31_7')

In [None]:
print(ds.shuffle()[0]['text'])

In [None]:
from glob import glob
from datasets import Dataset, concatenate_datasets
def load_dataset(folder_path, dataset_size=None):
    """Load every ``*.jsonl`` file in ``folder_path`` into one shuffled Dataset.

    Each file must contain exactly one column, which is renamed to "text".
    The concatenated data is shuffled with a fixed seed (42), optionally
    truncated to ``dataset_size`` rows, and rows whose text is None or empty
    are then removed (so the result can be shorter than ``dataset_size``).
    NOTE: this definition reuses the name ``load_dataset`` that the notebook
    also imports from ``datasets``.

    Args:
        folder_path: directory that holds the ``.jsonl`` files.
        dataset_size: number of shuffled rows to keep, or None for all rows.

    Raises:
        ValueError: if a file has more or fewer than one column.
    """
    data = []
    for path in glob(folder_path + "/*.jsonl"):
        ds = Dataset.from_json(path)
        if len(ds.column_names) != 1:
            raise ValueError("Dataset must have only one text column")
        data.append(ds.rename_column(ds.column_names[0], "text"))

    final_ds = concatenate_datasets(data, axis=0).shuffle(seed=42)
    # Only truncate when a size was requested; previously the select ran
    # unconditionally and `range(None)` raised for the default argument.
    if dataset_size is not None:
        final_ds = final_ds.select(range(dataset_size))
    # A row is kept only if its text is neither None nor empty; the former
    # `!= "" or is not None` test was true for every value.
    return final_ds.filter(lambda x: x["text"] is not None and x["text"] != "")

In [None]:
ds = load_dataset('../../data/training_31_7')

In [None]:
df = ds.to_pandas()

In [None]:
def check(x):
    """Return True when ``x`` is an empty string or None, otherwise False."""
    is_missing = x == '' or x is None
    return True if is_missing else False
# Boolean Series marking the rows of df whose 'text' is empty or None.
temp = df['text'].apply(check)

In [None]:
temp.sum()

In [None]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bigscience/bloomz-3b')

In [None]:
tokenizer

In [None]:
tokenizer.tokenize('')

In [None]:
# Collect the OASST trees whose prompt language is English; the file holds one JSON document per line.
en_tree = []
# Explicit UTF-8: the default text encoding of open() is platform-dependent.
with open('../../data/original/oasst_ready_trees.json', encoding='utf-8') as f:
    for line in f:
        tree_record = json.loads(line)
        if tree_record['prompt']['lang'] == 'en':
            en_tree.append(tree_record)

In [None]:
# Collect the OASST trees whose prompt language is Vietnamese; the file holds one JSON document per line.
vi_tree = []
# Explicit UTF-8: the default text encoding of open() is platform-dependent.
with open('../../data/original/oasst_ready_trees.json', encoding='utf-8') as f:
    for line in f:
        tree_record = json.loads(line)
        if tree_record['prompt']['lang'] == 'vi':
            vi_tree.append(tree_record)

In [None]:
def traverse(root):
    """Enumerate every root-to-leaf path of a message tree.

    ``root`` is a mapping with 'role', 'text' and 'replies' (a list of child
    nodes of the same shape). Returns a list of paths; each path is a list of
    ``(role, text)`` tuples starting at ``root`` and ending at a leaf.
    """
    head = (root['role'], root['text'])
    children = root['replies']
    if len(children) == 0:
        return [[head]]
    paths = []
    for child in children:
        for tail in traverse(child):
            paths.append([head] + tail)
    return paths

In [None]:
# Flatten each language's trees into a single list of root-to-leaf conversations.
en_conversations = [
    conversation
    for tree in en_tree
    for conversation in traverse(tree['prompt'])
]

vi_conversations = [
    conversation
    for tree in vi_tree
    for conversation in traverse(tree['prompt'])
]

In [None]:
len(en_conversations)

In [None]:
len(vi_conversations)

In [None]:
templates = "Cuộc trò chuyện giữa con người và trợ lý AI.\n{conversation}"
def format(conversations):
    """Render conversations as tagged chat transcripts.

    Each conversation is a sequence of turns whose first item is the role and
    second item the text. 'prompter' turns get the "[|Con người|]" tag, every
    other role gets "[|AI|]"; the joined turns are inserted into ``templates``.
    NOTE: this function shadows the built-in ``format``.
    NOTE(review): the Vietnamese header and tags are used for every input,
    including the English conversations it is called on — confirm intended.
    """
    def render(conversation):
        turns = [
            f"[|Con người|] {turn[1]}\n" if turn[0] == 'prompter' else f"[|AI|] {turn[1]}\n"
            for turn in conversation
        ]
        return templates.format(conversation="".join(turns))

    return [render(conversation) for conversation in conversations]

In [None]:
en_data = format(en_conversations)

In [None]:
vi_data = format(vi_conversations)

In [None]:
print(en_data[11])

In [None]:
# Put the rendered English OASST transcripts in a one-column frame and save them as JSON Lines.
en_df = pd.DataFrame(en_data, columns=['text'])
en_df.to_json('../../data/original/en_oasst.json', orient='records', lines=True)

### WiVi (Wizard-Vicuna)

In [None]:
wivi = Dataset.from_json('/Users/phamhoang1408/Desktop/Phase 2 Viettel/main_repo/data/original/wizard_vicuna_dataset_v2.json')
def mapper(x):
    """Collapse ``x['conversations']`` (a list of turns) into one tagged string.

    'gpt' turns are prefixed with "[|AI|]" and 'human' turns with "[|Human|]";
    any other speaker raises ``Exception``. The joined text is stripped and
    stored back under 'conversations'; ``x`` is returned.
    """
    speaker_tags = {'gpt': "[|AI|]", 'human': "[|Human|]"}
    rendered = []
    for turn in x['conversations']:
        speaker = turn['from']
        if speaker not in speaker_tags:
            raise Exception()
        rendered.append(f"{speaker_tags[speaker]} {turn['value']}\n")
    x['conversations'] = "".join(rendered).strip()
    return x


# Entries matched against whole whitespace-delimited tokens of the lower-cased text.
popular_programming_keywords = [
    "python", "java", "javascript", "c#", "c++", "typescript", "ruby", "swift",
    "kotlin", "php", "rust", "scala", "dart", "matlab", "objective-c", "perl",
    "elif", "lambda", "nonlocal", "function", "cuda", "torch", "code", "sudo",
    "bash", "int", "html", "main()", "chinese",
    # Raw string: in a normal literal "\b" is a backspace character, so the
    # LaTeX command would never be matched.
    r"\begin",
]
# Entries matched as substrings anywhere in the lower-cased text.
popular_programming_languages = [
    "sql", "linux", "</", "/>", "bash", "python", "java", "javascript",
    "typescript", "swift", "kotlin", "rust", "scala", "dart", "matlab",
    "latex",
    r"\begin",  # raw string, see above
]


def check_text_not_contain_code(text):
    """Return True when ``text`` triggers none of the code/markup heuristics.

    The text is lower-cased; it is rejected (False) if any entry of
    ``popular_programming_keywords`` equals one of its whitespace-separated
    tokens, or if any entry of ``popular_programming_languages`` occurs as a
    substring (which also rejects words that merely contain an entry).
    """
    lowered = text.lower()
    tokens = set(lowered.split())
    for keyword in popular_programming_keywords:
        if keyword in tokens:
            return False
    for fragment in popular_programming_languages:
        if fragment in lowered:
            return False
    return True

# Flatten each Wizard-Vicuna conversation with mapper, then keep only the rows accepted by the code filter.
wivi_no_code = wivi.map(mapper).filter(lambda x: check_text_not_contain_code(x['conversations']))
wivi_no_code

In [None]:
print(wivi_no_code.shuffle()[0]['conversations'])

In [None]:
wivi_no_code.to_json('../../../data/original/wizard_vicuna_nocode.jsonl', orient='records', lines=True)

### Wizard

In [None]:
wizard = Dataset.from_json('/Users/phamhoang1408/Desktop/Phase 2 Viettel/main_repo/data/original/wizard_full.jsonl')
def mapper(x):
    """Turn the list of turns in ``x['conversations']`` into one tagged string.

    A 'gpt' turn becomes "[|AI|] <value>" and a 'human' turn becomes
    "[|Human|] <value>", one per line; any other 'from' value raises
    ``Exception``. The stripped result replaces ``x['conversations']`` and
    ``x`` is returned.
    """
    tag_for = {'gpt': "[|AI|]", 'human': "[|Human|]"}
    lines = []
    for turn in x['conversations']:
        role = turn['from']
        if role not in tag_for:
            raise Exception()
        lines.append(f"{tag_for[role]} {turn['value']}\n")
    x['conversations'] = "".join(lines).strip()
    return x


# Entries matched against whole whitespace-delimited tokens of the lower-cased text.
popular_programming_keywords = [
    "python", "java", "javascript", "c#", "c++", "typescript", "ruby", "swift",
    "kotlin", "php", "rust", "scala", "dart", "matlab", "objective-c", "perl",
    "elif", "lambda", "nonlocal", "function", "cuda", "torch", "code", "sudo",
    "bash", "int", "html", "main()", "chinese",
    # Raw string: in a normal literal "\b" is a backspace character, so the
    # LaTeX command would never be matched.
    r"\begin",
]
# Entries matched as substrings anywhere in the lower-cased text.
popular_programming_languages = [
    "sql", "linux", "</", "/>", "bash", "python", "java", "javascript", "c#",
    "c++", "typescript", "swift", "kotlin", "rust", "scala", "dart", "matlab",
    # These were previously fused into a single entry by a missing comma
    # (adjacent string literals are concatenated), so neither one matched.
    "latex",
    r"\begin",
]


def check_text_not_contain_code(text):
    """Return True when ``text`` triggers none of the code/markup heuristics.

    The text is lower-cased; it is rejected (False) if any entry of
    ``popular_programming_keywords`` equals one of its whitespace-separated
    tokens, or if any entry of ``popular_programming_languages`` occurs as a
    substring (which also rejects words that merely contain an entry).
    """
    lowered = text.lower()
    tokens = set(lowered.split())
    for keyword in popular_programming_keywords:
        if keyword in tokens:
            return False
    for fragment in popular_programming_languages:
        if fragment in lowered:
            return False
    return True

# Flatten each Wizard conversation with mapper, then keep only the rows accepted by the code filter.
wizard_no_code = wizard.map(mapper).filter(lambda x: check_text_not_contain_code(x['conversations']))
wizard_no_code

In [None]:
print(wizard_no_code.shuffle()[0]['conversations'])

In [None]:
path = '/Users/phamhoang1408/Desktop/Phase 2 Viettel/main_repo/data/original/wizard_no_code.jsonl'
wizard_no_code.to_json(path, orient='records', lines=True)

In [None]:
# NOTE(review): neither literal below is a raw string, so "\b" is parsed as a backspace character
# in both the needle and the sample text. The expression is therefore True here, but that does not
# show that a real backslash followed by "begin" in loaded data would be matched.
'\begin' in """[|Human|] Can you provide a list of 3 popular tourist attractions in Tokyo? 
\begin{itemize}
    \item The first tourist attraction is the Tokyo Tower, which is a communications and observation tower located in the Shiba-koen district of Minato, Tokyo. It stands at a height of 333 meters, making it one of the tallest structures in Japan.
    \item The second tourist attraction is the Sensoji Temple, which is a Buddhist temple located in the Asakusa district of Tokyo. It is one of the oldest and most famous temples in Tokyo, and it attracts millions of visitors every year.
    \item The third tourist attraction is the Meiji Shrine, which is a Shinto shrine located in the Shibuya district of Tokyo. It is dedicated to the deified spirits of Emperor Meiji and Empress Shoken, and it is one of the most popular shrines in Tokyo.
\end{itemize}
[|AI|] These are the 3 popular tourist attractions in Tokyo:
\begin{enumerate}
    \item Tokyo Tower
    \item Sensoji Temple
    \item Meiji Shrine
\end{enumerate}""".lower()

### Okapi

In [None]:
# Load the three Vietnamese Okapi files (instruct, rm, rl).
# NOTE(review): absolute, machine-specific paths — other cells use paths relative to the notebook.
okapi_instruct = Dataset.from_json('/Users/phamhoang1408/Desktop/Phase 2 Viettel/main_repo/data/original/okapi/okapi_instruct_vi.json')
okapi_rm = Dataset.from_json('/Users/phamhoang1408/Desktop/Phase 2 Viettel/main_repo/data/original/okapi/okapi_rm_vi.json')
okapi_rl = Dataset.from_json('/Users/phamhoang1408/Desktop/Phase 2 Viettel/main_repo/data/original/okapi/okapi_rl_vi.json')
# Align the rl file with the other two by exposing its 'prefered_output' column as 'output'.
okapi_rl = okapi_rl.rename_column('prefered_output','output')
# Stack the shared instruction/input/output columns of all three sets.
full_okapi =  concatenate_datasets([
    okapi_instruct.select_columns(['instruction', 'input', 'output']),
    okapi_rm.select_columns(['instruction', 'input', 'output']),
    okapi_rl.select_columns(['instruction', 'input', 'output'])
])
# De-duplicate on the 'output' text only (rows with different instruction/input but equal output collapse to one).
full_okapi = Dataset.from_pandas(full_okapi.to_pandas().drop_duplicates(subset=['output']).reset_index(drop=True))
print(okapi_instruct.shape, okapi_rm.shape, okapi_rl.shape, full_okapi.shape)
full_okapi.shuffle()[0]

In [None]:
# Substrings whose presence marks a text as code-related.
key_words = [
    # "code" and "python" were previously fused into the single entry
    # "codepython" by a missing comma, so neither one was detected.
    "code",
    "python",
    "java", "javascript", "c#", "c++", "typescript", "ruby", "swift", "kotlin",
    "php", "rust", "scala", "dart", "matlab", "objective-c", "perl", "elif",
    "lambda", "nonlocal", "function", "def", "html", "css", "sql", "bash",
    "latex", "print", "import", "return", "from",
]


def check_text_contain_code(text):
    """Return True if the lower-cased ``text`` contains any ``key_words`` entry.

    Matching is by substring, so an entry embedded in a longer word also
    counts. None and the empty string return False.
    """
    if text is None or text == '':
        return False
    lowered = text.lower()
    return any(kw in lowered for kw in key_words)


def filter_code(x):
    """Return True if the row's 'input' or 'output' looks code-related.

    The 'instruction' field is not inspected.
    """
    return check_text_contain_code(x['input']) or check_text_contain_code(x['output'])

# Rows of full_okapi whose input or output matches one of the code keywords.
code_okapi = full_okapi.filter(filter_code)
code_okapi

In [None]:
temp = code_okapi.shuffle()[0]
print(temp['instruction'] + '\n----\n' + temp['input'] + '\n----\n' + temp['output'])

In [None]:
# Complement of the code-related subset: neither input nor output matches a code keyword.
no_code_okapi = full_okapi.filter(lambda x: not check_text_contain_code(x['input']) and not check_text_contain_code(x['output']))

In [None]:
no_code_okapi

In [None]:
temp = full_okapi.shuffle()[0]
print(temp['instruction'] + '\n----\n' + temp['input'] + '\n----\n' + temp['output'])

In [None]:
# full_okapi.to_json('../../../data/original/okapi/full_okapi.jsonl', orient='records', lines=True)
code_okapi.to_json('../../../data/original/okapi/code_related_okapi.jsonl', orient='records', lines=True)
# no_code_okapi.to_json('../../../data/original/okapi/no_code_okapi.jsonl', orient='records', lines=True)