In [None]:
# finetuning Mixtral-8x7B-Instruct (title originally: finetuning llama2)

# (Q)分子構造+(R)理由+(A)物性データセットのLLMによる学習と予測
- Q&A: 融点データセットを使用
- R: 自分自身で考えさせて､正解のデータを学習させる

In [None]:
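# NOTE: Mixtral support was added in transformers 4.36.0, so the 4.35.0 pin below is too old for the model used in this notebook.
#!pip install transformers==4.35.0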
#!pip install peft==0.5.0
#!pip install bitsandbytes==0.41.1
#!pip install accelerate==0.23.0
#!pip install flash-attn==2.3.1.post1
#!pip install datasets==2.14.5

In [4]:
import os
#os.environ["CUDA_VISIBLE_DEVICES"]="1"

from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer,pipeline
from datasets import Dataset
import copy
from tqdm import tqdm
# Problem setup: the first n_test records of the dataset are used as test data
n_test=50

In [5]:
# Hyperparameters
# Model name
model_name="mistralai/Mixtral-8x7B-Instruct-v0.1"
# LoRA settings
r=32
lora_alpha=r
# Precision passed to init_model (4, 8 or 16)
bit=16
#bit=8
#bit=4

# Names of the modules that receive a LoRA adapter
target_modules= [
    "lm_head",
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate",
    "w1",
    "w2",
    "w3"
]



# Training settings
#gradient_checkpointing =True  # use this when VRAM needs to be saved
gradient_checkpointing =False
per_device_train_batch_size=1
epochs=3
lr=10**-5
# Set to False to skip trainer.train() in the training cell
do_train=True
#do_train=False

In [6]:

# Forwarded to from_pretrained() as `device_map` by init_model
device_map="auto"

# bitsandbytes quantization settings; init_model uses them only when bit == 4
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

def init_model(model_name, r, lora_alpha, target_modules, bit=4):
    """Load a causal LM at the requested precision and optionally wrap it with LoRA.

    Args:
        model_name: Model identifier or path passed to ``from_pretrained``.
        r: LoRA rank.
        lora_alpha: LoRA scaling factor.
        target_modules: Names of the modules that receive a LoRA adapter.
            When empty, the plain base model is returned without any adapter.
        bit: Precision mode. 4 loads with the module-level ``bnb_config``,
            8 loads with ``load_in_8bit=True``, 16 loads in ``torch.float16``.

    Returns:
        The loaded model, wrapped by ``get_peft_model`` unless
        ``target_modules`` is empty.

    Raises:
        ValueError: If ``bit`` is not 4, 8 or 16. Raised before anything is loaded.
    """
    # Validate first so that a typo never triggers a multi-GB checkpoint load.
    if bit not in (4, 8, 16):
        raise ValueError("bit must be 4, 8 or 16")

    # Options shared by every precision mode.
    load_kwargs = {
        "device_map": device_map,
        # `use_flash_attention_2=True` is deprecated (the loader prints a warning);
        # `attn_implementation` is the replacement named by that warning.
        "attn_implementation": "flash_attention_2",
    }
    if bit == 4:
        print("Using 4-bit mode")
        load_kwargs["quantization_config"] = bnb_config
    elif bit == 8:
        print("Using 8-bit mode")
        load_kwargs["load_in_8bit"] = True
    else:
        print("Using fp16 mode")
        load_kwargs["torch_dtype"] = torch.float16

    model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

    # No adapter requested: return the base model as-is.
    if len(target_modules)==0:
        return model

    peft_config = LoraConfig(
        task_type="CAUSAL_LM", inference_mode=False, r=r, lora_alpha=lora_alpha,
        lora_dropout=0.1,
        target_modules=target_modules,
    )
    model = get_peft_model(model, peft_config)
    return model


In [7]:

# Initialize the model
model=init_model(model_name, r, lora_alpha, target_modules, bit=bit)

Using fp16 mode


The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
Loading checkpoint shards: 100%|██████████| 19/19 [00:24<00:00,  1.28s/it]


In [8]:
# Show the layers: print the name of every model parameter
for name, param in model.named_parameters():
    print(name)

base_model.model.model.embed_tokens.weight
base_model.model.model.layers.0.self_attn.q_proj.weight
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight
base_model.model.model.layers.0.self_attn.k_proj.weight
base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight
base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight
base_model.model.model.layers.0.self_attn.v_proj.weight
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight
base_model.model.model.layers.0.self_attn.o_proj.weight
base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight
base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight
base_model.model.model.layers.0.block_sparse_moe.gate.weight
base_model.model.model.layers.0.block_sparse_moe.gate.lora_A.default.weight
base_model.model.model.layer

In [9]:
# Print the module structure of the model
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MixtralForCausalLM(
      (model): MixtralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MixtralDecoderLayer(
            (self_attn): MixtralFlashAttention2(
              (q_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(
                in_features=4096, out_features=1024, bias=False
        

In [10]:


tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

#pipe = pipeline("text-generation", model=model,
#                tokenizer=tokenizer, max_new_tokens=1000)

# データセットの準備

In [11]:
# Load the dataset
import pandas as pd
import random
df=pd.read_csv("dataset/231225AutoReasoning/240104best_reason_record.csv")
# One dict per CSV row
dataset=df.to_dict(orient="records")
# Shuffle with a fixed seed; the first n_test records of this order become the test set
random.seed(0)
random.shuffle(dataset)

print(df.shape)
df[:2]

(2406, 9)


Unnamed: 0,name,smiles,csid,link,source,Reason,mpC,Prediction(integer),Abs error
0,"2,3-dibromo-2-methyl-butane",BrC(C)(C)C(Br)C,71392,http://www.srcinc.com/what-we-do/product.aspx?...,PHYSPROP,"Starting with butane, which melts around -138....",7.0,6.6,0.4
1,5-chlorobenzofuroxan,c1cc2c(cc1Cl)no[n+]2[O-],123661,http://www.alfa.com/en/GP100W.pgm?DSSTK=A14261,Alfa Aesar,"The basic structure for comparison is benzene,...",47.0,45.0,2.0


In [12]:
import random
#system_prompt="You are a professional chemist. Predict the melting point of the following compound."


def gen_compound_text(chemical_record, reason="", prediction=""):
    """Render one compound as a prompt section.

    The section always starts with the problem header (name and SMILES read
    from ``chemical_record``). When both ``reason`` and ``prediction`` are
    given, a solved example is produced; otherwise the section ends with an
    open ``##Reason:`` line for the model to complete (test mode).
    """
    header = "\n#Problem\n##Name: {}\n##SMILES: {}".format(
        chemical_record["name"], chemical_record["smiles"])

    has_answer = reason != "" and prediction != ""
    if not has_answer:
        # test mode: leave the reason open
        return header + "\n##Reason: \n"

    return header + f"\n##Reason: {reason}\n##Prediction: {prediction}\n"



def generate_question_prompt(dataset,
                             test_id,
                             n_prompt_examples=5,
                             prompt_dataset=None):
    """Build a few-shot prompt that ends with the open question for ``dataset[test_id]``.

    ``n_prompt_examples`` solved examples are drawn with ``random.choice``
    (i.e. with replacement). They come from ``prompt_dataset`` when it is given,
    otherwise from ``dataset`` itself with ``test_id`` excluded.
    """
    if prompt_dataset is not None:
        example_ids = list(range(len(prompt_dataset)))
    else:
        example_ids = list(range(len(dataset)))
        example_ids.remove(test_id)
        prompt_dataset = dataset

    pieces = []

    # solved examples
    for _ in range(n_prompt_examples):
        example = prompt_dataset[random.choice(example_ids)]
        pieces.append(gen_compound_text(example,
                                        reason=example["Reason"],
                                        prediction=example["Prediction(integer)"]))
        pieces.append("\n")

    # open question for the target compound
    pieces.append(gen_compound_text(dataset[test_id]))

    return "".join(pieces)


def prepare_dataset(context_list, tokenizer):
    """Shuffle the given texts and return them as a tokenized ``Dataset``.

    The result has a ``text`` column plus whatever columns ``tokenizer``
    produces when applied to a batch of texts.
    """
    texts = list(context_list)
    random.shuffle(texts)

    def _tokenize(samples):
        return tokenizer(samples['text'])

    # tokenize
    return Dataset.from_dict({"text": texts}).map(_tokenize, batched=True)


# モデル自身によるデータセットの生成

In [13]:
#予測周りのutility funcs
import re
import torch
import gc
from IPython.display import clear_output
from trl import AutoModelForCausalLMWithValueHead
def gen_text_stop_word(prompt,model,tokenizer,
                       device="cuda:0",
                       stop_words=["#Problem","#Reason","# Problem"],
                       double_stop_words=["#Prediction"],
                       stream=False,
                       #stream=True,
                       max_tokens=400,
                       ):
    """Greedily generate a continuation of ``prompt`` until a stop condition is met.

    Tokens are produced one at a time with argmax decoding. Generation ends when
    the model emits the tokenizer's EOS token, when any of ``stop_words`` appears
    in the generated text, when any of ``double_stop_words`` appears at least
    twice, or after ``max_tokens`` tokens.

    Args:
        prompt: Text to continue.
        model: Callable taking ``input_ids`` and returning either an object with
            a ``logits`` attribute or a tuple whose first element is the logits
            (the shape handled for value-head models).
        tokenizer: Provides ``encode``/``decode`` and optionally ``eos_token_id``.
        device: Device the input ids are moved to.
        stop_words: Substrings that end generation on first occurrence.
        double_stop_words: Substrings that end generation on the second occurrence.
        stream: If True, print the text generated so far after every token.
        max_tokens: Maximum number of tokens to generate.

    Returns:
        The generated text only (without the prompt). A stop word that ended the
        generation is included in the returned text.
    """
    gc.collect()
    torch.cuda.empty_cache()

    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    # Remember where the prompt ends so only new tokens are decoded. Slicing the
    # decoded string by len(prompt) silently misaligns whenever decode() does not
    # reproduce the prompt character-for-character.
    prompt_length = input_ids.shape[-1]
    eos_token_id = getattr(tokenizer, "eos_token_id", None)

    generated_text = ""

    # Inference only: without no_grad every forward pass records an autograd
    # graph through the trainable (LoRA) parameters and wastes GPU memory.
    with torch.no_grad():
        for i in range(max_tokens):
            # Predict the next token
            outputs = model(input_ids)
            # Model outputs expose `.logits`; tuple outputs carry the logits first.
            logits = outputs.logits if hasattr(outputs, "logits") else outputs[0]
            next_token = torch.argmax(logits[:, -1, :], dim=-1).unsqueeze(-1)
            # With a sharded model the logits may live on another device.
            next_token = next_token.to(input_ids.device)

            # The model signalled the end of its answer: stop instead of
            # running on until max_tokens.
            if eos_token_id is not None and next_token.item() == eos_token_id:
                break

            # Append the new token to the running input
            input_ids = torch.cat([input_ids, next_token], dim=-1)

            # Refresh the generated text
            generated_text = tokenizer.decode(input_ids[0, prompt_length:],
                                              skip_special_tokens=True)

            if stream:
                if i%30==0:
                    clear_output()
                print(generated_text)

            # Stop words: a single occurrence ends generation
            if any(stop_word in generated_text for stop_word in stop_words):
                break

            # Words that end generation once they have appeared twice
            if any(generated_text.count(check_word)>=2 for check_word in double_stop_words):
                break

    return generated_text

def ask_value(prompt,model,tokenizer):
    """Generate an answer for ``prompt`` and extract the predicted number from it.

    The generated text is printed, then searched for the first
    ``Prediction: <number>`` occurrence.

    Returns:
        A tuple ``(generated_text, value)`` where ``value`` is the matched
        number as a string (optionally signed, optionally with a decimal part),
        or ``None`` when no prediction is found.
    """
    res=gen_text_stop_word(prompt,model,tokenizer)
    #res=pipe(prompt)[0]["generated_text"]
    print("----\n\n")
    print(res.strip())

    # One pattern with an optional sign. Trying an unsigned pattern first would
    # skip a negative first prediction and pick up a later positive one.
    match = re.search(r"Prediction:\s*(-?\d+\.?\d*)", res)
    value = match.group(1) if match else None

    return res,value



In [14]:
example_prompt_list=[
    {
        "mpC": 298.5,
        "name": "a-aminocaproic acid",
        "smiles": "CCCCC(N)C(=O)O",
        "csid": 9103,
        "link": "http://www.srcinc.com/what-we-do/product.aspx?id=133",
        "source": "PHYSPROP",
        "Reason": "The target compound is a-amino caproic acid, which contains several functional groups and structural features that affect its melting point. The basic unit here is a straight-chain alkane, which we can use as a starting reference. A general approximation is given that, for each methylene (-CH2-) group in a saturated alkane chain, the melting point increases by about 20\u00b0C from methane as baseline, which is approximately -182.5\u00b0C. In a-amino caproic acid, there are five methylene groups leading to an increase of 100\u00b0C. The amino group (-NH2) typically increases the melting point due to intermolecular hydrogen bonding with approximately 100\u00b0C, depending on the structure. Lastly, the carboxylic acid group (-COOH) also highly contributes to melting point elevation, possibly more than 100\u00b0C due to strong hydrogen-bonding and dimer formation. There may be interactions between the amino and carboxylic acid groups that can further bolster these effects. Considering the combination of these functional groups and the lack of any large branching or rings, we estimate a moderate to high increase over and above the linear alkane equivalent.",
        "Prediction(integer)": "299",
    },
    {
        "mpC": 240.0,
        "name": "tryptazan",
        "smiles": "O=C(O)C(N)Cc2nnc1ccccc12",
        "csid": 86015,
        "link": "http://www.srcinc.com/what-we-do/product.aspx?id=133",
        "source": "PHYSPROP",
        "Reason": "The target compound tryptazan contains various functional groups that affect its melting point. Starting with the basic unit of benzene, which has a melting point of 5.5 \u00b0C, we consider the following adjustments: 1) The amide group (-CONH-) is known to increase the melting point significantly due to its capacity for hydrogen bonding and resonance, which can contribute to stronger intermolecular forces (+90 \u00b0C). 2) The carboxylic acid group (-COOH) further increases the melting point because it can form strong hydrogen bonds and dimers, which significantly raises intermolecular attraction (+100 \u00b0C). 3) The presence of the fused pyrazine ring introduces additional aromaticity and planarity, which can enhance \u03c0-\u03c0 stacking interactions between molecules, leading to an increased melting point (+40 \u00b0C). 4) The amine group (-NH-) also participates in hydrogen bonding, adding to the melting point increase; however, with one less proton donor than the amide or carboxylic acid, its contribution is lower (+10 \u00b0C). Adding these values to the melting point of benzene provides a predicted melting point for tryptazan.",
        "Prediction(integer)": "240",
    },
    {
        "mpC": 45.0,
        "name": "1,7,7-trimethyl-3-methylenebicyclo[2.2.1]heptan-2-one",
        "smiles": "O=C1C(=C)C2CCC1(C)C2(C)C",
        "csid": 2031466,
        "link": "http://dx.doi.org/10.1021/ci0500132",
        "source": "Karthikeyan M.; Glen R.C.; Bender A. General melting point prediction based on a diverse compound dataset and artificial neural networks. J. Chem. Inf. Model.; 2005; 45(3); 581-3070",
        "Reason": "The compound in question is 1,7,7-trimethyl-3-methylenebicyclo[2.2.1]heptan-2-one, which contains several functional groups and structural features that can influence its melting point. A reference compound for comparison could be norbornanone with a melting point of 28 degrees Celsius, which is comparable to the bicyclic core of the target compound without the additional substitutions we need to consider.\n\n- The bicyclic structure inherently has ring strain and steric hindrance which may increase its melting point compared to acyclic compounds.\n- The ketone group (C=O) typically raises melting points due to the polarity it introduces, allowing for stronger intermolecular forces (dipole-dipole interaction). Estimating this effect at about +20 degrees Celsius.\n- The presence of three methyl (CH3) groups, which are electron-donating, would slightly increase the melting point due to increased molecular weight and van der Waals forces. Estimating the effect of each methyl group as about +5 degrees Celsius considering the size and potential increase in van der Waals interactions.\n- The methylene (CH2=) group could increase the melting point marginally due to added rigidity in the structure. Estimating this effect at about +5 degrees Celsius.\n\nSumming these effects for prediction, we have the baseline norbornanone value of 28 degrees Celsius (+20 for ketone group) + (3 * +5 for each methyl group) + (+5 for the methylene group).",
        "Prediction(integer)": "45",
    },
    {
        "mpC": 30.0,
        "name": "dimethyl 1,10-decanedicarboxylate",
        "smiles": "COC(=O)CCCCCCCCCCC(=O)OC",
        "csid": 67007,
        "link": "http://www.alfa.com/en/GP100W.pgm?DSSTK=L08757",
        "source": "Alfa Aesar",
        "Reason": "The compound dimethyl 1,10-decanedicarboxylate consists of a long aliphatic chain (decanedicarboxylate) and two ester functional groups. For such a long carbon chain, the melting point is heavily influenced by the crystalline packing efficiency and van der Waals interactions. Alkanes with around 10 carbon atoms have a typical melting point around 30 degrees Celsius. Each ester group in the compound will tend to disrupt the orderly packing because of the oxygen atoms' polar nature, which might slightly decrease the melting point compared to a pure hydrocarbon chain, while also increasing molecular weight which could slightly increase the melting point. As there are two ester groups, we can estimate their effect on the melting point to somewhat balance out, leading to a small overall modification from the base hydrocarbon melting point. Without specific literature on similar structures, a precise numerical prediction of the ester groups' effect is challenging, but it can be anticipated to be within +/- 5 degrees celsius.",
        "Prediction(integer)": "30",
    },
    {
        "mpC": 28.0,
        "name": "4-bromotoluene",
        "smiles": "Cc1ccc(cc1)Br",
        "csid": 13875258,
        "link": "http://www.alfa.com/en/GP100W.pgm?DSSTK=A15843",
        "source": "Alfa Aesar",
        "Reason": "The basic unit, benzene, has a melting point of 5.5\u00b0C. The methyl group typically raises the melting point due to a larger molecular weight and slight increases in van der Waals forces; however, for the melting point the influence might be smaller than for the boiling point, so we estimate a milder increase of +5\u00b0C. The bromine atom, being substantially heavier than a hydrogen atom, will indeed increase the van der Waals forces and subsequently the melting point, estimated at +50\u00b0C. Yet, the presence of bulky groups like bromine can interfere with the crystalline packing of molecules, which may mitigate the rise in melting point. Since molecular packing is crucial for the melting point, this hindrance effect can be estimated as a significant reduction, about -30\u00b0C for the overall compound's structural disruption effect.",
        "Prediction(integer)": "28",


    }
]

In [16]:
import json
# Put the model in evaluation mode before generating text
model.eval()
#model.train()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MixtralForCausalLM(
      (model): MixtralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MixtralDecoderLayer(
            (self_attn): MixtralFlashAttention2(
              (q_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(
                in_features=4096, out_features=1024, bias=False
        

In [None]:
from datetime import datetime
import json
import os

# Self-study: predict on the training data with randomly varied few-shot
# prompts, and keep the generated reasoning whenever the prediction is close
# enough to the measured value.

# Prediction hyperparameters
n_max_trials=4  # maximum number of attempts per compound
error_threshold=30 # a record is saved only if |mpC - prediction| is below this
prediction_results={}

res_list=[]

# NOTE(review): the original assigned a "dataset/240116mixtral_reasoning/..." path
# and immediately overwrote it with this llama2 directory, so only this one was
# ever in effect. Confirm which directory is intended for the current model.
#save_dir="dataset/240116mixtral_reasoning"
save_dir="dataset/240116llama2_reasoning"
os.makedirs(save_dir,exist_ok=True)

random.seed(1234)
# Training records only: the first n_test records are the held-out test set, and
# iterating past len(dataset) would only raise (and swallow) IndexError.
for train_id in tqdm(range(n_test,len(dataset))):
    #clear_output()
    gc.collect()
    torch.cuda.empty_cache()
    for _ in range(n_max_trials):
        try:
            n_prompt_examples=random.randint(1,5) # number of few-shot examples in the prompt
            prompt=generate_question_prompt(dataset,train_id,n_prompt_examples=n_prompt_examples,prompt_dataset=example_prompt_list)
            reason,value=ask_value(prompt,model,tokenizer)
        except Exception as e:
            # Best effort: report the failure and retry
            print(e)
            continue

        # Too short to be a real explanation: retry
        if len(reason)<30:
            continue

        # No number found in the answer: retry
        if value is None:
            continue

        try:
            value=float(value)
        except ValueError:
            continue

        record=copy.deepcopy(dataset[train_id])
        record["Reason"]=reason
        record["Prediction(integer)"]=value
        record["model_name"]=model_name

        err=abs(record["mpC"]-float(value))
        print("actual: ",record["mpC"],"predicted: ", value,"err: ",err)
        print(reason)

        if err<error_threshold:
            # str.replace returns a new string, so the cleaned value must be kept;
            # the original discarded it and left "-" and ":" in the file name.
            timestamp=str(datetime.now()).replace("-","").replace(":","")
            save_path=f"{save_dir}/{timestamp}.json"
            with open(save_path,"w") as f:
                json.dump(record,f,indent=4)

            break

# モデルの訓練

In [17]:
import transformers
from datetime import datetime


def gen_train_text(dataset):
    """Render every record of ``dataset`` as a solved example (reason and prediction included).

    Returns the texts as a list, in dataset order.
    """
    records = (dataset[idx] for idx in range(len(dataset)))
    return [
        gen_compound_text(record,
                          reason=record["Reason"],
                          prediction=record["Prediction(integer)"])
        for record in records
    ]


#tokenized_dataset = prepare_dataset(gen_train_text(example_prompt_list), tokenizer)
# Train on everything after the first n_test records (those are held out for evaluation)
tokenized_dataset = prepare_dataset(gen_train_text(dataset)[n_test:], tokenizer)

#train
train_args = transformers.TrainingArguments(
        per_device_train_batch_size=per_device_train_batch_size,
        #gradient_accumulation_steps=1,
        warmup_steps=0,
        num_train_epochs=epochs,
        learning_rate=lr,
        fp16=True,
        logging_steps=100,
        save_total_limit=1,
        # One output directory per run, named by the start time
        output_dir='outputs/'+datetime.now().strftime('%Y%m%d%H%M%S'),
        gradient_checkpointing=gradient_checkpointing,
    )

# trainer
#callbacks = [EarlyStoppingCallback()]
callbacks = []

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_dataset,
    args=train_args,
    callbacks=callbacks,
    # mlm=False: causal language modelling collator
    data_collator=transformers.DataCollatorForLanguageModeling(
        tokenizer, mlm=False)
)

if do_train:
    training_result = trainer.train()
    # A bare expression inside an `if` block is not echoed by the notebook,
    # so the final loss has to be printed explicitly.
    print("training loss:", training_result.training_loss)

Map: 100%|██████████| 2356/2356 [00:00<00:00, 7871.98 examples/s]
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,1.2393
200,1.0236
300,0.9824
400,0.9336
500,0.9495
600,0.9182
700,0.8943
800,0.9234
900,0.9077
1000,0.8941


In [None]:


# Save the model: use this when saving only the adapter
#from datetime import datetime
#current_datetime = datetime.now()
#model.save_pretrained(f"./outputs/{current_datetime}")
model.save_pretrained(f"./outputs/mixtral_3epoch_0117")

# Load the model: use this when loading through a saved adapter
from peft import AutoPeftModelForCausalLM
model_path="./outputs/7b_ft"
#model_path="./outputs/7b_ft_with_self_prediction_0115"

# The loading snippet below is kept as an inert string literal; it is not executed as code.
"""
model = AutoPeftModelForCausalLM.from_pretrained(model_path,
                                                 device_map=device_map,
                                                     torch_dtype=torch.float16,
                                                     use_flash_attention_2=True,
                                                 )
"""

# モデル性能の評価

In [None]:
model.eval()

# Fixed seed for the (optional) few-shot example sampling
random.seed(0)

# Prediction hyperparameters
n_prompt_examples = 0  # few-shot examples per prompt: predictions tend to be pulled toward the examples, so try 0
n_max_trials = 1       # maximum number of attempts when no value is returned

res_list = []
prediction_results = {}

for test_id in tqdm(range(n_test)):
    print(f"promlem {test_id+1} / {n_test}")
    for _attempt in range(n_max_trials):
        try:
            prompt = generate_question_prompt(
                dataset, test_id, n_prompt_examples=n_prompt_examples)
            reason, value = ask_value(prompt, model, tokenizer)
        except Exception as e:
            print(e)
            continue

        # No number in the answer: try again (if attempts remain)
        if value is None:
            continue

        record = copy.deepcopy(dataset[test_id])
        record.update({
            "Test (Predicted reason)": reason,
            "Test (Predicted value)": value,
        })
        print("actual: ", record["mpC"], "predicted: ", value,)
        res_list.append(record)
        break

# Results are keyed by the number of few-shot examples used
prediction_results[n_prompt_examples] = res_list

In [None]:
#plot
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from datetime import datetime
import json
current_datetime = datetime.now()
# Axis range shared by x and y
vmin=-200
vmax=300

# Build the output name before the loop: it does not depend on the loop, and the
# JSON dump below needs it even when every group is skipped.
# NOTE(review): the name does not contain n_prompt_examples, so with several
# groups each figure overwrites the previous file.
os.makedirs("results",exist_ok=True)
formatted_filename = f"results/model=mixtral_{current_datetime.strftime('%Y%m%d_%H%M%S')}_train={do_train}.png"

#plot prediction results
for n_prompt_examples,records in prediction_results.items():
    sel_df=pd.DataFrame(records)
    # No record at all for this group: the prediction column does not exist
    if sel_df.empty:
        continue
    # Convert to float where possible; anything else becomes NaN and is dropped
    sel_df["Test (Predicted value)"] = pd.to_numeric(sel_df["Test (Predicted value)"], errors='coerce')
    sel_df=sel_df[sel_df["Test (Predicted value)"].notnull()]
    if len(sel_df)==0:
        continue
    mse=mean_squared_error(sel_df["mpC"],sel_df["Test (Predicted value)"])

    fig=plt.figure()
    sns.scatterplot(data=sel_df,x="mpC",y="Test (Predicted value)")
    plt.title(f"n_prompt_examples={n_prompt_examples} MSE={mse:.0f}")

    # Use the same range on both axes
    plt.xlim(vmin,vmax)
    plt.ylim(vmin,vmax)
    # Draw the diagonal (perfect prediction)
    plt.plot([vmin,vmax],[vmin,vmax],color="gray")
    plt.savefig(formatted_filename)
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures
    plt.close(fig)
    #break

# Store the raw predictions next to the figure
save_json_filename=formatted_filename.replace(".png",".json")
with open(save_json_filename,"w") as f:
    json.dump(prediction_results,fp=f,
              indent=4)

In [None]:
# Scores, computed on sel_df (the filtered frame left over from the plotting loop)
print("MSE: ",mean_squared_error(sel_df["mpC"],sel_df["Test (Predicted value)"]))
print("MAE: ",mean_absolute_error(sel_df["mpC"],sel_df["Test (Predicted value)"]))
print("R2: ", r2_score(sel_df["mpC"],sel_df["Test (Predicted value)"]))


In [None]:
# Fraction of the n_test problems for which a numeric prediction was obtained
sel_df.shape[0]/n_test