In [None]:
from FlagEmbedding import FlagModel
import numpy as np
from sklearn.manifold import TSNE
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
from pprint import pprint as pp
import time
import umap
import os
import random
import time
from contextlib import contextmanager
import torch
from sentence_transformers import SentenceTransformer

@contextmanager
def timer():
    """Context manager that prints how long its ``with`` body took to run.

    The report is printed from a ``finally`` clause, so it is emitted even
    when the body raises; the exception then propagates unchanged.
    """
    # perf_counter() is monotonic and high resolution. time.time() follows the
    # system clock, which can be adjusted mid-measurement and yield wrong or
    # even negative durations.
    start_time = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - start_time
        print(f"Elapsed time: {elapsed:.4f} seconds")

from datasets import load_dataset, load_from_disk
from datasets import load_dataset, concatenate_datasets, load_from_disk
import pandas as pd
import datasets
from datasets import Dataset
from pprint import pprint as pp
from datasets import Dataset
from sklearn.cluster import KMeans
from tqdm import tqdm
import torch
import heapq

# Load the "train" split of each raw instruction dataset from the Hugging Face
# Hub. The position of each repo id matches the position of the variable it is
# unpacked into.
code_data, fin_data, med_data, general_data, math_data = [
    load_dataset(repo_id)["train"]
    for repo_id in (
        "sahil2801/CodeAlpaca-20k",
        "FinGPT/fingpt-sentiment-train",
        "medalpaca/medical_meadow_medical_flashcards",
        "tatsu-lab/alpaca",
        "TIGER-Lab/MathInstruct",
    )
]

def alpaca_format(example):
    """Fold an Alpaca-style record into ``instruction`` / ``response`` fields.

    A non-empty ``input`` is appended to ``instruction`` after a single space;
    ``output`` is copied to ``response``. The mapping is modified in place and
    the same object is returned.
    """
    extra_context = example['input']
    if extra_context != "":
        example["instruction"] = example["instruction"] + " " + extra_context
    example["response"] = example['output']
    return example

def process_sft_dataset(dataset_name, dataset, dataset_sample=None)->datasets.Dataset:
    """Convert a raw instruction dataset to the unified ``instruction`` / ``response`` schema.

    Args:
        dataset_name: Identifier that selects the preprocessing recipe. It only
            has to match one of the branches below; any name containing
            ``"math"`` that matches no earlier branch falls into the generic
            math branch.
        dataset: The dataset to convert. It must expose the columns that the
            selected branch renames or removes.
        dataset_sample: Optional cap on the number of examples kept after
            shuffling. Falsy values (``None`` or ``0``) keep every example.

    Returns:
        The converted dataset, shuffled with seed 42 and truncated to at most
        ``dataset_sample`` examples when a cap is given.

    Raises:
        NotImplementedError: If ``dataset_name`` matches no branch.
    """
    if dataset_name in ["lucasmccabe-lmi/CodeAlpaca-20k", "yahma/alpaca-cleaned", "FinGPT/fingpt-sentiment-train"]:
        dataset = dataset.map(alpaca_format, remove_columns=['input', 'output'], desc=f"Preprocessing {dataset_name} for unified format.")
    elif dataset_name in ["WizardLM/WizardLM_evol_instruct_70k"]:
        dataset = dataset.rename_column("output", "response")
    elif dataset_name in ["tatsu-lab/alpaca", "vicgalle/alpaca-gpt4", "gbharti/finance-alpaca"]:
        dataset = dataset.map(alpaca_format, remove_columns=['input', 'output', 'text'], desc=f"Preprocessing {dataset_name} for unified format.")
    elif dataset_name in ["TIGER-Lab/MathInstruct"]:
        # De-duplicate on the instruction text via pandas.
        df = dataset.to_pandas()
        df = df.drop_duplicates(subset=['instruction'])
        # drop_duplicates leaves gaps in the index; without preserve_index=False
        # from_pandas would store that index as an extra `__index_level_0__`
        # column, so this dataset would no longer share the unified schema.
        dataset = datasets.Dataset.from_pandas(df, preserve_index=False)
        dataset = dataset.rename_column("output", "response")
        dataset = dataset.remove_columns(['source'])
    elif dataset_name in ["lighteval/MATH"]:
        dataset = dataset.rename_column("solution", "response")
        dataset = dataset.rename_column("problem", "instruction")
        dataset = dataset.remove_columns(['level', 'type'])
    elif dataset_name in ['gsm8k']:
        dataset = dataset.rename_column("question", "instruction")
        dataset = dataset.rename_column("answer", "response")
    elif dataset_name in ['medalpaca/medical_meadow_medical_flashcards']:       # TODO: 'lavita/ChatDoctor-HealthCareMagic-100k'. not sure whether to discard the instruction.
        # The original `instruction` column is discarded; `input` takes its place.
        dataset = dataset.remove_columns(['instruction'])
        dataset = dataset.rename_column("input", "instruction")
        dataset = dataset.rename_column("output", "response")
    elif "math" in dataset_name:
        # Generic fallback for any other dataset whose name contains "math".
        dataset = dataset.remove_columns(['source'])
        dataset = dataset.rename_column("output", "response")
    else:
        raise NotImplementedError(f"Dataset {dataset_name} is not supported.")
    # Shuffle with a fixed seed so the optional truncation below is reproducible.
    dataset = dataset.shuffle(seed=42)
    if dataset_sample:
        num_sample = min(len(dataset), dataset_sample)
        dataset = dataset.select(range(num_sample))
    print(f">> ===== After processing, Dataset {dataset_name} has {len(dataset)} examples. =====")
    return dataset

In [None]:
processed_data = []
# Be very careful here!!! The order of the names and the order of the datasets
# must be changed together: each name is paired with the dataset at the same
# position. Note that `code_data` was loaded from "sahil2801/CodeAlpaca-20k"
# but is processed under the "lucasmccabe-lmi/CodeAlpaca-20k" recipe name.
dataset_names = [
    "lucasmccabe-lmi/CodeAlpaca-20k",
    "TIGER-Lab/MathInstruct",
    "FinGPT/fingpt-sentiment-train",
    "medalpaca/medical_meadow_medical_flashcards",
    "tatsu-lab/alpaca",
]
raw_datasets = [code_data, math_data, fin_data, med_data, general_data]
for name, dataset in zip(dataset_names, raw_datasets):
    tmp: datasets.Dataset = process_sft_dataset(name, dataset)
    # Show the resulting schema of each processed dataset.
    print(tmp.column_names)
    processed_data.append(tmp)

In [None]:
# Merge every processed dataset into a single dataset.
data_concated = concatenate_datasets(processed_data)

# Prompt template for records that carry both an instruction and an input.
_TEMPLATE_WITH_INPUT = (
    "Below is an instruction that describes a task, paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
)
# Prompt template for records that only carry an instruction.
_TEMPLATE_NO_INPUT = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Response:"
)
PROMPT_DICT = dict(
    prompt_input=_TEMPLATE_WITH_INPUT,
    prompt_no_input=_TEMPLATE_NO_INPUT,
)
# NOTE: despite its name, `prompt_input` holds the template WITHOUT an input
# section; later cells format it with only `{instruction}`.
prompt_input = PROMPT_DICT["prompt_no_input"]

# code: HumanEval prompts

In [None]:
import copy

# Take the first 5 HumanEval problems and attach a rendered prompt and the
# reference completion to each.
humaneval_file = "/mnt/bn/data-tns-live-llm/leon/human-eval/data/HumanEval.jsonl"
data = load_dataset("json", data_files=humaneval_file)["train"]
data = data.select(range(5))
data.column_names
# The raw `prompt` column becomes `instruction` so that it fills the
# `{instruction}` slot of the template; the rendered text is then stored back
# under `prompt`.
data = data.rename_columns({"prompt":"instruction"})
rendered_prompts = [prompt_input.format_map(example) for example in data]
data = data.add_column("prompt", rendered_prompts)
data = data.add_column("completion", data["canonical_solution"])
# NOTE(review): the target path ends in ".parquet" but save_to_disk is used —
# presumably this writes the datasets on-disk directory format rather than a
# Parquet file; confirm what the consumer of this path expects.
data.save_to_disk("/mnt/bn/data-tns-live-llm/leon/less/prompt_human_eval.parquet")

# med: MMLU medical-subject few-shot prompts

In [None]:
# Location of the MMLU CSV files and the subjects to load.
root = "/mnt/bn/data-tns-live-llm/leon/datasets/less-data/eval/mmlu/"
names = [
    "anatomy",
    "clinical_knowledge",
    "college_biology",
    "college_medicine",
    "medical_genetics",
    "professional_medicine",
]
# One header-less DataFrame per subject, in the same order as `names`.
test_dfs, dev_dfs = [], []
for name in names:
    for split, frames in (("test", test_dfs), ("dev", dev_dfs)):
        frames.append(pd.read_csv(f"{root}/{split}/{name}_{split}.csv", header=None))
test_dfs[0]
dev_dfs[0]
# Letters used to label the answer options.
choices = list("ABCD")
def format_example(df, idx, include_answer=True):
    """Render row ``idx`` of ``df`` as a lettered multiple-choice question.

    Column 0 is the question text, the last column is the answer, and every
    column in between is one option, labelled from the module-level
    ``choices`` list. The text ends with ``"\\nAnswer:"``; when
    ``include_answer`` is true the answer and a blank line are appended.
    """
    num_options = df.shape[1] - 2
    pieces = [df.iloc[idx, 0]]
    pieces.extend(
        "\n{}. {}".format(choices[col], df.iloc[idx, col + 1])
        for col in range(num_options)
    )
    pieces.append("\nAnswer:")
    if include_answer:
        pieces.append(" {}\n\n".format(df.iloc[idx, num_options + 1]))
    return "".join(pieces)
def format_subject(subject):
    """Turn an underscore-separated subject into space-separated words.

    Every word, including the first, is prefixed with a space, so the result
    always starts with a space (``"college_biology"`` -> ``" college biology"``).
    """
    return "".join(" " + word for word in subject.split("_"))
def gen_prompt(train_df, subject, k=-1):
    """Build the few-shot preamble for ``subject``.

    The preamble is a header sentence naming the subject followed by the first
    ``k`` rows of ``train_df`` rendered with their answers. ``k == -1`` means
    "use every row". ``k`` is not clamped to the number of rows.
    """
    header = "The following are multiple choice questions (with answers) about {}.\n\n".format(
        format_subject(subject)
    )
    num_shots = train_df.shape[0] if k == -1 else k
    shots = "".join(format_example(train_df, row) for row in range(num_shots))
    return header + shots
# Build one few-shot prompt per subject: the 5 dev examples (with answers)
# followed by the first test question (without its answer).
prompts = []
completion = []
for idx, name in enumerate(names):
    test_df = test_dfs[idx]
    dev_df = dev_dfs[idx]
    subject = name
    print(test_df.shape[0])
    # The few-shot preamble does not depend on the test row, so build it once
    # per subject instead of once per row.
    train_prompt = gen_prompt(dev_df, subject, k=5)
    # Only the first test question of each subject is kept (the original loop
    # stopped after its first iteration).
    for i in range(min(1, test_df.shape[0])):
        prompt_end = format_example(test_df, i, include_answer=False)
        prompt = train_prompt + prompt_end
        prompts.append(prompt)
        # Record the gold answer of this exact row. Previously `completion`
        # was overwritten with a single scalar on every subject, so the
        # DataFrame below broadcast the last subject's answer to every prompt.
        completion.append(test_df.iloc[i, -1])
df = pd.DataFrame({
    "prompt": prompts,
    "completion": completion
})
print(df["prompt"][1])
df = Dataset.from_pandas(df)
df.to_json("/mnt/bn/data-tns-live-llm/leon/less/prompt_med.jsonl")

# fin: financial sentiment prompts (FiQA, FPB, TFNS)

In [None]:
# Integer class id -> sentiment word, per dataset. The id order differs
# between the two datasets.
label_dict = {
    "fpb": dict(enumerate(("negative", "neutral", "positive"))),
    "tfns": dict(enumerate(("negative", "positive", "neutral"))),
}

# Load the "train" split of each locally stored sentiment dataset.
fpb = load_from_disk(
    "/mnt/bn/data-tns-live-llm/leon/FinGPT/fingpt/FinGPT_Benchmark/data/financial_phrasebank-sentences_50agree"
)["train"]
fiqa = load_from_disk(
    "/mnt/bn/data-tns-live-llm/leon/FinGPT/fingpt/FinGPT_Benchmark/data/fiqa-2018"
)["train"]
tfns = load_from_disk(
    "/mnt/bn/data-tns-live-llm/leon/FinGPT/fingpt/FinGPT_Benchmark/data/twitter-financial-news-sentiment"
)["train"]

# Show the raw schema of each dataset before it is reshaped.
for loaded_split in (fpb, fiqa, tfns):
    print(loaded_split.column_names)
def format_example(example: dict) -> dict:
    """Render one record as an ``Instruction / Input / Answer`` prompt.

    The ``Input:`` line is only emitted when ``example`` has a truthy
    ``"input"`` value. Returns a dict with the prompt text under ``"context"``
    and the record's ``"output"`` under ``"target"``, in that key order.
    """
    lines = [f"Instruction: {example['instruction']}\n"]
    if example.get("input"):
        lines.append(f"Input: {example['input']}\n")
    lines.append("Answer: ")
    return {"context": "".join(lines), "target": example["output"]}
def make_label(x):
    """Bucket a numeric sentiment score into a sentiment word.

    Scores below -0.1 are "negative", scores in [-0.1, 0.1) are "neutral" and
    scores of 0.1 or more are "positive". A value that satisfies none of the
    comparisons (e.g. NaN) yields ``None``.
    """
    if x < -0.1:
        return "negative"
    if x < 0.1:
        return "neutral"
    if x >= 0.1:
        return "positive"
    return None
# Reshape FiQA into input / output / instruction columns: the sentence is the
# input and the bucketed sentiment score is the expected output.
fiqa_frame = (
    fiqa.to_pandas()
    .assign(
        instruction="What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}.",
        output=lambda frame: frame.sentiment_score.apply(make_label),
    )
    [['sentence', 'output', "instruction"]]
    .rename(columns={"sentence": "input"})
)
# format_example returns two values per row; they are stored as the `prompt`
# and `completion` columns.
fiqa_rendered = fiqa_frame.apply(format_example, axis=1, result_type="expand")
fiqa_frame[["prompt","completion"]] = fiqa_rendered
fiqa = Dataset.from_pandas(fiqa_frame)
# `format_example` is defined once earlier in this cell; the byte-identical
# redefinition that used to sit here only shadowed it and has been removed.

# Reshape the Financial PhraseBank split into input / output / instruction
# columns, replacing the integer class id with its sentiment word.
dic = label_dict["fpb"]
fpb = fpb.to_pandas()
fpb.columns = ["input", "output"]
fpb["output"] = fpb["output"].apply(lambda x:dic[x])
fpb["instruction"] = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}."
# format_example returns two values per row; they are stored as the `prompt`
# and `completion` columns.
fpb[["prompt","completion"]] = fpb.apply(format_example, axis = 1, result_type="expand")
fpb = Dataset.from_pandas(fpb)
# `format_example` is defined once earlier in this cell; the byte-identical
# redefinition that used to sit here only shadowed it and has been removed.

# Reshape the Twitter financial news split into input / output / instruction
# columns, replacing the integer class id with its sentiment word.
dic = label_dict["tfns"]
tfns = tfns.to_pandas()
tfns['label'] = tfns['label'].apply(lambda x:dic[x])
tfns["instruction"] = 'What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.'
# The positional rename relies on `instruction` having been added last.
tfns.columns = ['input', 'output', 'instruction']
# format_example returns two values per row; they are stored as the `prompt`
# and `completion` columns.
tfns[["prompt","completion"]] = tfns.apply(format_example, axis = 1, result_type="expand")
print(tfns[:2])
tfns = Dataset.from_pandas(tfns)
# Collect the first two prompt/completion pairs of each sentiment dataset, in
# the order FiQA, FPB, TFNS.
prompts = []
completions = []
for sample_source in (fiqa, fpb, tfns):
    prompts.extend(sample_source["prompt"][:2])
    completions.extend(sample_source["completion"][:2])

df = pd.DataFrame({"prompt": prompts, "completion": completions})
# Spot-check the second pair before writing the file.
print(df["prompt"][1])
print(df["completion"][1])
df = Dataset.from_pandas(df)
df.to_json("/mnt/bn/data-tns-live-llm/leon/less/prompt_fin.jsonl")

# math: GSM8K prompts

In [None]:
# Build prompt/completion pairs from the first 5 GSM8K problems.
data = load_dataset("json", data_files="/mnt/bn/data-tns-live-llm/leon/MAmmoTH2/math_eval/dataset/gsm8k/gsm8k.jsonl")["train"]
data = data.rename_columns({
    "question":"instruction",
    "answer": "completion"
})
# Keep the sample first so that prompts are only rendered for the rows that
# are actually written out (previously every row was rendered and then all
# but 5 were discarded). The count is clamped so a shorter file does not fail.
data = data.select(range(min(5, len(data))))
data = data.add_column("prompt",[prompt_input.format_map(row) for row in data])
data.to_json("/mnt/bn/data-tns-live-llm/leon/less/prompt_math.jsonl")