In [1]:
#Q&Aをもとに､データセットをつくってみる

In [1]:
import random
import numpy as np
import json
random.seed(0)
np.random.seed(0)

In [2]:
qa_path="database/output/qa0926.json"
context_path="database/output/context0926.json"


with open(qa_path,"r") as f:
    q_a_with_context_list=json.load(f)

with open(context_path,"r") as f:
    context_list=json.load(f)
len(q_a_with_context_list)

238917

In [3]:
context_id_set=list(set([c["context_id"] for c in q_a_with_context_list]))
len(context_id_set)

42394

In [5]:
q_a_with_context_list[0]


'A 5′-intron near the promoter is often sufficient for IME.5′-introns can increase gene expression levels by increasing recruitment of RNA polymerase II to the transcription initiation site. Promoter-proximal introns can also affect transcription initiation via chromatin modifications. A 5′-intron is sufficient to recruit chromatin opening marks. Previous reports measured 5′-intron IME in terms of RNA abundance, protein activity or phenotype (related directly to protein activity). The contemporaneous best available technologies for these studies had technical caveats like the inability to control transgene copy number and locus (either of which could have affected transgene expression level). The in vitro measurements of IME at the RNA and protein level were assayed via biochemical extracts. Finally, in some cases, reporter gene expression was transient (temporally unstable due to the nature of transfection/transduction). With modern genome editing and microfluidic technologies, we are

In [4]:
from tqdm import tqdm
#記述式と選択式の割合
multiple_qa_ratio=0.6
#コンテキスト有無の割合
inc_context_ratio=0.9

question_datset=[]

for context_id in tqdm(context_id_set):
    #特定のcontextを持つ問題を集める
    part_question_list=[c for c in q_a_with_context_list if c["context_id"]==context_id]

    #物切れの回答類を削除
    for typ in ["question","answer"]:
        part_question_list=[c for c in part_question_list if len(c[typ])>10]
        part_question_list=[c for c in part_question_list if c[typ][-1]!=":"]
        for ng_words in ["Claude","http","URL","Oops!"]:
            part_question_list=[c for c in part_question_list if c[typ].find(ng_words)==-1]

    if len(part_question_list)<1:
        continue

    #ランダムに問題形式を決定
    if np.random.random()<multiple_qa_ratio:
        mode="mult"
    else:
        mode="gen"
    if len(part_question_list)<4:
        mode="gen"


    #質問をrandom select
    sampled_qa=random.sample(part_question_list,min(4,len(part_question_list)))
    target_qa_id=random.randint(0,min(3,len(sampled_qa)-1))
    target_qa=sampled_qa[target_qa_id]

    question_dict={
        "question":target_qa["question"],
        "answer":target_qa["answer"],
        "ref_id":target_qa["ref_id"],
    }
    #choices
    if mode=="mult":
        question_dict["choices"]=[qa["answer"] for qa in sampled_qa]

    #contextを使うか否か
    use_context=np.random.random()>inc_context_ratio

    #質問にtheseとかを含む場合は､contextを使う

    filt_keys=["the","these","those","this","its","et al"]

    for key in filt_keys:
        if key in target_qa["question"]:
            use_context=True
            break

    context=context_list[target_qa["context_id"]]
    if use_context:
        question_dict["context"]=context
    else:
        question_dict["context"]=""

    question_dict["question_type"]="original"
    question_datset.append(question_dict)

  0%|          | 0/42394 [00:00<?, ?it/s]

100%|██████████| 42394/42394 [05:05<00:00, 138.78it/s]


In [5]:
with open("database/output/qa_dataset1028.json","w") as f:
    json.dump(question_datset,f,indent=1)

In [6]:
wo_context_data=[d for d in question_datset if d["context"]==""]
len(wo_context_data)

11849

In [7]:
import random
random.seed(0)
random.shuffle(question_datset)

In [8]:
test_nums=160
gen_nums=40
test_dataset=[]
train_dataset=[]

gen_count=0
for d in question_datset:

    #add testing
    if len(test_dataset)<test_nums and d["context"]=="":
        if "choices" not in d.keys() and gen_count<gen_nums:
            gen_count+=1
            test_dataset.append(d)
            continue

        test_dataset.append(d)
    
    #add training
    else:
        train_dataset.append(d)



In [9]:
train_dataset[:3]

[{'question': 'Even if the Seebeck coefficient increases at small wire sizes, what prevents gains in the overall power factor?',
  'answer': 'Even though the Seebeck coefficient may increase at small wire sizes, the electrical conductivity decreases due to a stronger sensitivity to carrier scattering mechanisms and geometry, possibly preventing gains in the overall power factor.',
  'ref_id': 2574,
  'choices': ['Calculations by Kim et al. applying the Landauer formalism showed that while the Seebeck coefficient per mode can be improved by lower dimensionality, a large packing density of nanowires with small diameters is required to realize this advantage.',
   'Even though the Seebeck coefficient may increase at small wire sizes, the electrical conductivity decreases due to a stronger sensitivity to carrier scattering mechanisms and geometry, possibly preventing gains in the overall power factor.',
   'The initially predicted quantum size effects drew the interest of the thermoelectri

In [10]:
#load mmlu
import glob
import pandas as pd
mmlu_path_list=glob.glob("database/mmlu/*.csv")

df=None
for path in mmlu_path_list:
    temp_df=pd.read_csv(path)
    temp_df.columns=["Q","A","B","C","D","Ans"]
    if df is None:
        df=temp_df
    else:
        df=pd.concat([df,temp_df])
#dfを行でシャッフル
df=df.sample(frac=1).reset_index(drop=True)
df[:3]

Unnamed: 0,Q,A,B,C,D,Ans
0,A spring of force constant k is stretched a ce...,k,2k,4k,8k,D
1,A student performs five titrations and obtains...,Accurate but not precise,Precise but not accurate,Both accurate and precise,Neither accurate nor precise,B
2,The 1H spectrum of a mixture of dimethylsulpho...,1:1,1:3,1:6,2:3,C


In [11]:
mmlu_test_dataset=[]
mmlu_test_nums=50
for record in df.to_dict(orient="records"):
    if len(mmlu_test_dataset)>=mmlu_test_nums:
        break

    answer=record[record["Ans"]]
    
    question_dict={
        "question":record["Q"],
        "answer":answer,
        "choices":[record["A"],record["B"],record["C"],record["D"]],
        "context":"",
        "question_type":"mmlu"
    }
    mmlu_test_dataset.append(question_dict)

mmlu_test_dataset[:3]

[{'question': 'A spring of force constant k is stretched a certain distance. It takes twice as much work to stretch a second spring by half this distance. The force constant of the second spring is',
  'answer': '8k',
  'choices': ['k', '2k', '4k', '8k'],
  'context': '',
  'question_type': 'mmlu'},
 {'question': 'A student performs five titrations and obtains a mean result of 0.110 M, with a standard deviation of 0.001 M. If the actual concentration of the titrated solution is 0.100 M, which of the following is true about the titration results?',
  'answer': 'Precise but not accurate',
  'choices': ['Accurate but not precise',
   'Precise but not accurate',
   'Both accurate and precise',
   'Neither accurate nor precise'],
  'context': '',
  'question_type': 'mmlu'},
 {'question': 'The 1H spectrum of a mixture of dimethylsulphoxide (DMSO) and acetonitrile (AN) contains lines with relative intensities α and 3α, respectively. What is the ratio of the two concentrations, [DMSO]:[AN]?',


In [18]:

with open("database/output/1028qa_dataset_train.json","w") as f:
    json.dump(train_dataset,f,indent=1)

with open("database/output/1028qa_dataset_test.json","w") as f:
    json.dump(test_dataset+mmlu_test_dataset,f,indent=1)

In [14]:
#irrelevant mmlu qa dataset


#load mmlu
import glob
import pandas as pd
mmlu_path_list=glob.glob("database/mmlu/train/*.csv")

df=None
for path in mmlu_path_list:
    temp_df=pd.read_csv(path)
    temp_df.columns=["Q","A","B","C","D","Ans"]
    if df is None:
        df=temp_df
    else:
        df=pd.concat([df,temp_df])
#dfを行でシャッフル
df=df.sample(frac=1).reset_index(drop=True)
df.shape

(6171, 6)

In [17]:

mmlu_train_dataset=[]
mmlu_train_nums=6000
for record in df.to_dict(orient="records"):
    if len(mmlu_test_dataset)>=mmlu_train_nums:
        break

    answer=record[record["Ans"]]
    
    question_dict={
        "question":record["Q"],
        "answer":answer,
        "choices":[record["A"],record["B"],record["C"],record["D"]],
        "context":"",
        "question_type":"mmlu"
    }
    mmlu_train_dataset.append(question_dict)


with open("database/output/1028mmlu_dataset_train.json","w") as f:
    json.dump(mmlu_train_dataset,f,indent=1)