## Prepare dataset for fine-tuning with mp_20

In [5]:
import pandas as pd
df = pd.read_csv("../data/mp_20/mp_20.csv")
df.head(2)

Unnamed: 0.1,Unnamed: 0,material_id,formation_energy_per_atom,band_gap,pretty_formula,e_above_hull,elements,cif,spacegroup.number
0,37228,mp-1221227,-1.63746,0.2133,Na3MnCoNiO6,0.043001,"['Co', 'Mn', 'Na', 'Ni', 'O']",# generated using pymatgen\ndata_Na3MnCoNiO6\n...,8
1,19480,mp-974729,-0.314759,0.0,Nd(Al2Cu)4,0.0,"['Al', 'Cu', 'Nd']",# generated using pymatgen\ndata_Nd(Al2Cu)4\n_...,139


In [6]:
def create_input(row):
    return f"band_gap={row['band_gap']} spacegroup.number={row['spacegroup.number']}"

df['cif'] = df['cif'].str.replace('# generated using pymatgen\n', '', regex=False)

new_data = {
    'input': df.apply(create_input, axis=1),
    'output': df['cif']
}

df_for_llama = pd.DataFrame(new_data)
df_for_llama.to_csv('../data/df_for_llama.csv', index=False)

In [7]:
df_for_llama.head(2)

Unnamed: 0,input,output
0,band_gap=0.2132999999999998 spacegroup.number=8,data_Na3MnCoNiO6\n_symmetry_space_group_name_H...
1,band_gap=0.0 spacegroup.number=139,data_Nd(Al2Cu)4\n_symmetry_space_group_name_H-...


## Fine-tuning llama

In [1]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# –ó–∞–≥—Ä—É–∑–∫–∞ CSV —Ñ–∞–π–ª–∞
df = pd.read_csv('../data/df_for_llama_m100.csv')

# –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ DataFrame –≤ Dataset
dataset = Dataset.from_pandas(df)

# –ó–∞–≥—Ä—É–∑–∫–∞ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞ LLaMA
model_name = "NousResearch/Llama-2-7b-chat-hf"  # –£–∫–∞–∂–∏—Ç–µ –∞–∫—Ç—É–∞–ª—å–Ω—É—é –≤–µ—Ä—Å–∏—é LLaMA
tokenizer = AutoTokenizer.from_pretrained(model_name)

# –î–æ–±–∞–≤–ª–µ–Ω–∏–µ —Å–ø–µ—Ü–∏–∞–ª—å–Ω–æ–≥–æ —Ç–æ–∫–µ–Ω–∞ EOS (End of Sentence)
tokenizer.pad_token = tokenizer.eos_token

# –§—É–Ω–∫—Ü–∏—è –¥–ª—è —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏ –¥–∞–Ω–Ω—ã—Ö
def preprocess_function(examples):
    inputs = examples['input']
    outputs = examples['output']
    
    # –°–∫–ª–µ–∏–≤–∞–µ–º –≤—Ö–æ–¥ –∏ –≤—ã—Ö–æ–¥ —Å –ø–æ–º–æ—â—å—é —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(outputs, max_length=512, truncation=True, padding='max_length')
    
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# –ü—Ä–∏–º–µ–Ω—è–µ–º —Ñ—É–Ω–∫—Ü–∏—é –ø—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∏ –∫ –¥–∞—Ç–∞—Å–µ—Ç—É
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/45129 [00:00<?, ? examples/s]

In [2]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

# –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏
model = AutoModelForCausalLM.from_pretrained(model_name)

# –ï—Å–ª–∏ –≤—ã —Ö–æ—Ç–∏—Ç–µ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å –º–µ—Ç–æ–¥ LoRA (Low-Rank Adaptation) –¥–ª—è —ç–∫–æ–Ω–æ–º–∏–∏ —Ä–µ—Å—É—Ä—Å–æ–≤:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,  # —Ä–∞–Ω–≥ –º–∞—Ç—Ä–∏—Ü—ã
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # —Ü–µ–ª–µ–≤—ã–µ —Å–ª–æ–∏
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [None]:
training_args = TrainingArguments(
    output_dir="../model/results/",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=2,
    logging_dir='../model/logs/',
    logging_steps=10,
    fp16=True,  # –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ mixed precision –¥–ª—è —É—Å–∫–æ—Ä–µ–Ω–∏—è
    push_to_hub=False,  # –ï—Å–ª–∏ –≤—ã —Ö–æ—Ç–∏—Ç–µ –∑–∞–≥—Ä—É–∑–∏—Ç—å –º–æ–¥–µ–ª—å –Ω–∞ Hugging Face Hub
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# –ó–∞–ø—É—Å–∫ –æ–±—É—á–µ–Ω–∏—è
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [1]:
model.save_pretrained("../model/fine_tuned_Llama-2-7b")
tokenizer.save_pretrained("../model/fine_tuned_Llama-2-7b")

NameError: name 'model' is not defined

In [None]:
from transformers import pipeline
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import AutoTokenizer

# –ó–∞–≥—Ä—É–∑–∫–∞ fine-tuned –º–æ–¥–µ–ª–∏
model = AutoModelForCausalLM.from_pretrained("..model/fine_tuned_Llama-2-7b")
tokenizer = AutoTokenizer.from_pretrained("..model/fine_tuned_Llama-2-7b")

# –°–æ–∑–¥–∞–Ω–∏–µ –ø–∞–π–ø–ª–∞–π–Ω–∞ –¥–ª—è –≥–µ–Ω–µ—Ä–∞—Ü–∏–∏ —Ç–µ–∫—Å—Ç–∞
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)






Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


RuntimeError: probability tensor contains either `inf`, `nan` or element < 0

In [2]:
# –ì–µ–Ω–µ—Ä–∞—Ü–∏—è —Ç–µ–∫—Å—Ç–∞
input_text = "band_gap=0.0 spacegroup.number=139"
output = generator(input_text, max_length=50)
print(output)

RuntimeError: probability tensor contains either `inf`, `nan` or element < 0

## Valid fine-tuning models

for valid models we synthesised a small dataset with inputs using description of banging_gap & spacegroup.numbers. We assume that generated CIF files will be converted to pymatgen.Structure without any problems. So metric quality of model will be percent of converted structures.

In [9]:
df.describe()

Unnamed: 0.1,Unnamed: 0,formation_energy_per_atom,band_gap,e_above_hull,spacegroup.number
count,45229.0,45229.0,45229.0,45229.0,45229.0
mean,22614.18371,-1.214873,0.789139,0.017189,123.126843
std,13056.910572,1.027732,1.417502,0.023133,79.655043
min,0.0,-5.153569,0.0,0.0,1.0
25%,11307.0,-2.001845,0.0,0.0,57.0
50%,22614.0,-0.806754,0.0,0.003469,139.0
75%,33921.0,-0.403862,1.1334,0.030189,194.0
max,45230.0,0.079825,17.9023,0.079999,229.0


In [10]:
import numpy as np
import pandas as pd

# –°—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏–µ —Ö–∞—Ä–∞–∫—Ç–µ—Ä–∏—Å—Ç–∏–∫–∏ –∏–∑ –¥–∞—Ç–∞—Å–µ—Ç–∞
stats = {
    'band_gap': {'mean': 0.789139, 'std': 1.417502, 'min': 0.0, 'max': 17.902300},
    'spacegroup.number': {'min': 1, 'max': 229}
}

# –ì–µ–Ω–µ—Ä–∞—Ü–∏—è 100 —Å–ª—É—á–∞–π–Ω—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π
np.random.seed(42)  # –§–∏–∫—Å–∏—Ä—É–µ–º seed –¥–ª—è –≤–æ—Å–ø—Ä–æ–∏–∑–≤–æ–¥–∏–º–æ—Å—Ç–∏

# –ì–µ–Ω–µ—Ä–∞—Ü–∏—è –∑–Ω–∞—á–µ–Ω–∏–π –¥–ª—è band_gap (–Ω–æ—Ä–º–∞–ª—å–Ω–æ–µ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ)
band_gap = np.clip(
    np.random.normal(stats['band_gap']['mean'], stats['band_gap']['std'], 100),
    stats['band_gap']['min'],
    stats['band_gap']['max']
)

# –ì–µ–Ω–µ—Ä–∞—Ü–∏—è –∑–Ω–∞—á–µ–Ω–∏–π –¥–ª—è spacegroup.number (—Ä–∞–≤–Ω–æ–º–µ—Ä–Ω–æ–µ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ)
spacegroup_number = np.random.randint(stats['spacegroup.number']['min'], stats['spacegroup.number']['max'] + 1, 100)

# –§–æ—Ä–º–∏—Ä–æ–≤–∞–Ω–∏–µ —Å—Ç—Ä–æ–∫ input
inputs = [
    f"band_gap={bg:.4f} spacegroup.number={sg}"
    for bg, sg in zip(band_gap, spacegroup_number)
]

# –°–æ–∑–¥–∞–Ω–∏–µ DataFrame –¥–ª—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤
synthetic_data = pd.DataFrame({'input': inputs})

# –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –≤ CSV —Ñ–∞–π–ª
synthetic_data.to_csv('../data/banchmark/synthetic_inputs.csv', index=False)


In [12]:
synthetic_data.head(2)

Unnamed: 0,input
0,band_gap=1.4932 spacegroup.number=137
1,band_gap=0.5931 spacegroup.number=62


let's take 100 samples from mp_20 dataset for comparison between synthesed and real data for model

In [11]:
real_sample = df_for_llama.sample(n=100)
remaining_data = df_for_llama.drop(real_sample.index)

real_sample[['input']].to_csv('../data/banchmark/real_inputs.csv', index=False)

# –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –æ—Å—Ç–∞–≤—à–∏—Ö—Å—è –¥–∞–Ω–Ω—ã—Ö –≤ CSV —Ñ–∞–π–ª
remaining_data.to_csv('../data/df_for_llama_m100.csv', index=False)

In [12]:
remaining_data.head(2)

Unnamed: 0,input,output
0,band_gap=0.2132999999999998 spacegroup.number=8,data_Na3MnCoNiO6\n_symmetry_space_group_name_H...
1,band_gap=0.0 spacegroup.number=139,data_Nd(Al2Cu)4\n_symmetry_space_group_name_H-...


In [None]:
import pandas as pd
from pymatgen.core.structure import Structure
from pymatgen.io.cif import CifParser
from transformers import pipeline

def validate_cif(real_csv, synthetic_csv, model_path):
    # –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏
    generator = pipeline("text-generation", model=model_path)

    def process_dataset(csv_file, dataset_type):
        # –ó–∞–≥—Ä—É–∑–∫–∞ CSV —Ñ–∞–π–ª–∞
        df = pd.read_csv(csv_file)
        
        results = []
        successful_count = 0

        for idx, row in df.iterrows():
            input_text = row['input']
            
            # –ì–µ–Ω–µ—Ä–∞—Ü–∏—è CIF —Å—Ç—Ä–æ–∫–∏ —Å –ø–æ–º–æ—â—å—é –º–æ–¥–µ–ª–∏
            generated_cif = generator(input_text, max_length=500)[0]['generated_text']
            
            try:
                # –ü–æ–ø—ã—Ç–∫–∞ –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç—å CIF –≤ —Å—Ç—Ä—É–∫—Ç—É—Ä—É pymatgen
                structure = Structure.from_str(generated_cif, fmt="cif")
                conclusion = "Successful"
                successful_count += 1
            except Exception as e:
                # –ï—Å–ª–∏ –≤–æ–∑–Ω–∏–∫–ª–∞ –æ—à–∏–±–∫–∞, —Å—á–∏—Ç–∞–µ–º –≥–µ–Ω–µ—Ä–∞—Ü–∏—é –Ω–µ—É–¥–∞—á–Ω–æ–π
                conclusion = "Failed"

            # –î–æ–±–∞–≤–ª—è–µ–º —Ä–µ–∑—É–ª—å—Ç–∞—Ç –≤ —Å–ø–∏—Å–æ–∫
            results.append({
                'input': input_text,
                'generated_cif': generated_cif,
                'conclusion': conclusion
            })

        # –°–æ–∑–¥–∞–Ω–∏–µ DataFrame —Å —Ä–µ–∑—É–ª—å—Ç–∞—Ç–∞–º–∏
        result_df = pd.DataFrame(results)
        
        # –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤ –≤ CSV —Ñ–∞–π–ª
        result_df.to_csv(f'{dataset_type}_validation_results.csv', index=False)
        
        # –ü–æ–¥—Å—á–µ—Ç –ø—Ä–æ—Ü–µ–Ω—Ç–∞ —É—Å–ø–µ—à–Ω—ã—Ö –≥–µ–Ω–µ—Ä–∞—Ü–∏–π
        success_rate = (successful_count / len(df)) * 100
        
        return result_df, success_rate

    # –û–±—Ä–∞–±–æ—Ç–∫–∞ —Ä–µ–∞–ª—å–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö
    real_results, real_success_rate = process_dataset(real_csv, 'real')

    # –û–±—Ä–∞–±–æ—Ç–∫–∞ —Å–∏–Ω—Ç–µ–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö
    synthetic_results, synthetic_success_rate = process_dataset(synthetic_csv, 'synthetic')

    # –í—ã–≤–æ–¥ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤
    print(f"Real data success rate: {real_success_rate:.2f}%")
    print(f"Synthetic data success rate: {synthetic_success_rate:.2f}%")

    return real_results, synthetic_results

# –ü—Ä–∏–º–µ—Ä –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è —Ñ—É–Ω–∫—Ü–∏–∏
real_csv = 'real_inputs_100.csv'
synthetic_csv = 'synthetic_inputs.csv'
model_path = './fine_tuned_llama'  # –£–∫–∞–∂–∏—Ç–µ –ø—É—Ç—å –∫ –≤–∞—à–µ–π fine-tuned –º–æ–¥–µ–ª–∏

real_results, synthetic_results = validate_cif(real_csv, synthetic_csv, model_path)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pymatgen.core.structure import Structure
from transformers import pipeline

def analyze_attempts(real_csv, synthetic_csv, model_path, attempts_list):
    # –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏
    generator = pipeline("text-generation", model=model_path)

    def process_dataset(csv_file, dataset_type):
        # –ó–∞–≥—Ä—É–∑–∫–∞ CSV —Ñ–∞–π–ª–∞
        df = pd.read_csv(csv_file)
        
        success_rates = []  # –î–ª—è —Ö—Ä–∞–Ω–µ–Ω–∏—è –ø—Ä–æ—Ü–µ–Ω—Ç–æ–≤ —É—Å–ø–µ—à–Ω—ã—Ö –≥–µ–Ω–µ—Ä–∞—Ü–∏–π
        
        for attempts in attempts_list:
            successful_count = 0

            for idx, row in df.iterrows():
                input_text = row['input']
                success = False  # –§–ª–∞–≥ —É—Å–ø–µ—Ö–∞ –¥–ª—è —Ç–µ–∫—É—â–µ–≥–æ input

                for _ in range(attempts):
                    # –ì–µ–Ω–µ—Ä–∞—Ü–∏—è CIF —Å—Ç—Ä–æ–∫–∏ —Å –ø–æ–º–æ—â—å—é –º–æ–¥–µ–ª–∏
                    generated_cif = generator(input_text, max_length=500)[0]['generated_text']

                    try:
                        # –ü–æ–ø—ã—Ç–∫–∞ –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞—Ç—å CIF –≤ —Å—Ç—Ä—É–∫—Ç—É—Ä—É pymatgen
                        structure = Structure.from_str(generated_cif, fmt="cif")
                        success = True  # –ï—Å–ª–∏ —É—Å–ø–µ—à–Ω–æ, –≤—ã—Ö–æ–¥–∏–º –∏–∑ —Ü–∏–∫–ª–∞
                        break
                    except Exception:
                        continue  # –ï—Å–ª–∏ –Ω–µ—É–¥–∞—á–Ω–æ, –ø—Ä–æ–±—É–µ–º —Å–Ω–æ–≤–∞

                if success:
                    successful_count += 1

            # –ü–æ–¥—Å—á–µ—Ç –ø—Ä–æ—Ü–µ–Ω—Ç–∞ —É—Å–ø–µ—à–Ω—ã—Ö –≥–µ–Ω–µ—Ä–∞—Ü–∏–π
            success_rate = (successful_count / len(df)) * 100
            success_rates.append(success_rate)

        return success_rates

    # –û–±—Ä–∞–±–æ—Ç–∫–∞ —Ä–µ–∞–ª—å–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö
    real_success_rates = process_dataset(real_csv, 'real')

    # –û–±—Ä–∞–±–æ—Ç–∫–∞ —Å–∏–Ω—Ç–µ–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö
    synthetic_success_rates = process_dataset(synthetic_csv, 'synthetic')

    # –ü–æ—Å—Ç—Ä–æ–µ–Ω–∏–µ –≥—Ä–∞—Ñ–∏–∫–∞
    plt.figure(figsize=(10, 6))
    plt.plot(attempts_list, real_success_rates, label='Real Data', marker='o')
    plt.plot(attempts_list, synthetic_success_rates, label='Synthetic Data', marker='x')
    
    plt.title('Success Rate vs Number of Attempts')
    plt.xlabel('Number of Attempts per Input')
    plt.ylabel('Success Rate (%)')
    plt.legend()
    plt.grid(True)
    
    # –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –≥—Ä–∞—Ñ–∏–∫–∞ –≤ —Ñ–∞–π–ª
    plt.savefig('success_rate_vs_attempts.png')  # –°–æ—Ö—Ä–∞–Ω—è–µ–º –≥—Ä–∞—Ñ–∏–∫ –≤ PNG —Ñ–∞–π–ª
    plt.show()

    # –í—ã–≤–æ–¥ –¥–∞–Ω–Ω—ã—Ö –≤ —Ç–µ–∫—Å—Ç–æ–≤–æ–º —Ñ–æ—Ä–º–∞—Ç–µ
    with open('success_rates.txt', 'w') as f:
        f.write("Number of Attempts: {}\n".format(attempts_list))
        f.write("Real Data Success Rates (%): {}\n".format(real_success_rates))
        f.write("Synthetic Data Success Rates (%): {}\n".format(synthetic_success_rates))

    return real_success_rates, synthetic_success_rates

# –ü—Ä–∏–º–µ—Ä –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è —Ñ—É–Ω–∫—Ü–∏–∏
real_csv = 'real_inputs_100.csv'
synthetic_csv = 'synthetic_inputs.csv'
model_path = './fine_tuned_llama'  # –£–∫–∞–∂–∏—Ç–µ –ø—É—Ç—å –∫ –≤–∞—à–µ–π fine-tuned –º–æ–¥–µ–ª–∏
attempts_list = [1, 2, 3, 5, 10]  # –°–ø–∏—Å–æ–∫ –∫–æ–ª–∏—á–µ—Å—Ç–≤–∞ –ø–æ–ø—ã—Ç–æ–∫

real_success_rates, synthetic_success_rates = analyze_attempts(real_csv, synthetic_csv, model_path, attempts_list)