## Prepare dataset for fine-tuning with mp_20

In [5]:
import pandas as pd
df = pd.read_csv("../data/mp_20/mp_20.csv")
df.head(2)

Unnamed: 0.1,Unnamed: 0,material_id,formation_energy_per_atom,band_gap,pretty_formula,e_above_hull,elements,cif,spacegroup.number
0,37228,mp-1221227,-1.63746,0.2133,Na3MnCoNiO6,0.043001,"['Co', 'Mn', 'Na', 'Ni', 'O']",# generated using pymatgen\ndata_Na3MnCoNiO6\n...,8
1,19480,mp-974729,-0.314759,0.0,Nd(Al2Cu)4,0.0,"['Al', 'Cu', 'Nd']",# generated using pymatgen\ndata_Nd(Al2Cu)4\n_...,139


In [6]:
def create_input(row):
    return f"band_gap={row['band_gap']} spacegroup.number={row['spacegroup.number']}"

df['cif'] = df['cif'].str.replace('# generated using pymatgen\n', '', regex=False)

new_data = {
    'input': df.apply(create_input, axis=1),
    'output': df['cif']
}

df_for_llama = pd.DataFrame(new_data)
df_for_llama.to_csv('../data/df_for_llama.csv', index=False)

In [7]:
df_for_llama.head(2)

Unnamed: 0,input,output
0,band_gap=0.2132999999999998 spacegroup.number=8,data_Na3MnCoNiO6\n_symmetry_space_group_name_H...
1,band_gap=0.0 spacegroup.number=139,data_Nd(Al2Cu)4\n_symmetry_space_group_name_H-...


## Fine-tuning llama

In [1]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# Загрузка CSV файла
df = pd.read_csv('../data/df_for_llama_m100.csv')

# Преобразование DataFrame в Dataset
dataset = Dataset.from_pandas(df)

# Загрузка токенизатора LLaMA
model_name = "NousResearch/Llama-2-7b-chat-hf"  # Укажите актуальную версию LLaMA
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Добавление специального токена EOS (End of Sentence)
tokenizer.pad_token = tokenizer.eos_token

# Функция для токенизации данных
def preprocess_function(examples):
    inputs = examples['input']
    outputs = examples['output']
    
    # Склеиваем вход и выход с помощью токенизатора
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(outputs, max_length=512, truncation=True, padding='max_length')
    
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Применяем функцию предобработки к датасету
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/45129 [00:00<?, ? examples/s]

In [2]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

# Загрузка модели
model = AutoModelForCausalLM.from_pretrained(model_name)

# Если вы хотите использовать метод LoRA (Low-Rank Adaptation) для экономии ресурсов:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,  # ранг матрицы
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # целевые слои
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [None]:
training_args = TrainingArguments(
    output_dir="../model/results/",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=2,
    logging_dir='../model/logs/',
    logging_steps=10,
    fp16=True,  # Использование mixed precision для ускорения
    push_to_hub=False,  # Если вы хотите загрузить модель на Hugging Face Hub
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Запуск обучения
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [1]:
model.save_pretrained("../model/fine_tuned_Llama-2-7b")
tokenizer.save_pretrained("../model/fine_tuned_Llama-2-7b")

NameError: name 'model' is not defined

In [None]:
from transformers import pipeline
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import AutoTokenizer

# Загрузка fine-tuned модели
model = AutoModelForCausalLM.from_pretrained("..model/fine_tuned_Llama-2-7b")
tokenizer = AutoTokenizer.from_pretrained("..model/fine_tuned_Llama-2-7b")

# Создание пайплайна для генерации текста
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)






Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


RuntimeError: probability tensor contains either `inf`, `nan` or element < 0

In [2]:
# Генерация текста
input_text = "band_gap=0.0 spacegroup.number=139"
output = generator(input_text, max_length=50)
print(output)

RuntimeError: probability tensor contains either `inf`, `nan` or element < 0

## Valid fine-tuning models

for valid models we synthesised a small dataset with inputs using description of banging_gap & spacegroup.numbers. We assume that generated CIF files will be converted to pymatgen.Structure without any problems. So metric quality of model will be percent of converted structures.

In [9]:
df.describe()

Unnamed: 0.1,Unnamed: 0,formation_energy_per_atom,band_gap,e_above_hull,spacegroup.number
count,45229.0,45229.0,45229.0,45229.0,45229.0
mean,22614.18371,-1.214873,0.789139,0.017189,123.126843
std,13056.910572,1.027732,1.417502,0.023133,79.655043
min,0.0,-5.153569,0.0,0.0,1.0
25%,11307.0,-2.001845,0.0,0.0,57.0
50%,22614.0,-0.806754,0.0,0.003469,139.0
75%,33921.0,-0.403862,1.1334,0.030189,194.0
max,45230.0,0.079825,17.9023,0.079999,229.0


In [10]:
import numpy as np
import pandas as pd

# Статистические характеристики из датасета
stats = {
    'band_gap': {'mean': 0.789139, 'std': 1.417502, 'min': 0.0, 'max': 17.902300},
    'spacegroup.number': {'min': 1, 'max': 229}
}

# Генерация 100 случайных значений
np.random.seed(42)  # Фиксируем seed для воспроизводимости

# Генерация значений для band_gap (нормальное распределение)
band_gap = np.clip(
    np.random.normal(stats['band_gap']['mean'], stats['band_gap']['std'], 100),
    stats['band_gap']['min'],
    stats['band_gap']['max']
)

# Генерация значений для spacegroup.number (равномерное распределение)
spacegroup_number = np.random.randint(stats['spacegroup.number']['min'], stats['spacegroup.number']['max'] + 1, 100)

# Формирование строк input
inputs = [
    f"band_gap={bg:.4f} spacegroup.number={sg}"
    for bg, sg in zip(band_gap, spacegroup_number)
]

# Создание DataFrame для сохранения результатов
synthetic_data = pd.DataFrame({'input': inputs})

# Сохранение в CSV файл
synthetic_data.to_csv('../data/banchmark/synthetic_inputs.csv', index=False)


In [12]:
synthetic_data.head(2)

Unnamed: 0,input
0,band_gap=1.4932 spacegroup.number=137
1,band_gap=0.5931 spacegroup.number=62


let's take 100 samples from mp_20 dataset for comparison between synthesed and real data for model

In [11]:
real_sample = df_for_llama.sample(n=100)
remaining_data = df_for_llama.drop(real_sample.index)

real_sample[['input']].to_csv('../data/banchmark/real_inputs.csv', index=False)

# Сохранение оставшихся данных в CSV файл
remaining_data.to_csv('../data/df_for_llama_m100.csv', index=False)

In [12]:
remaining_data.head(2)

Unnamed: 0,input,output
0,band_gap=0.2132999999999998 spacegroup.number=8,data_Na3MnCoNiO6\n_symmetry_space_group_name_H...
1,band_gap=0.0 spacegroup.number=139,data_Nd(Al2Cu)4\n_symmetry_space_group_name_H-...


In [None]:
import pandas as pd
from pymatgen.core.structure import Structure
from pymatgen.io.cif import CifParser
from transformers import pipeline

def validate_cif(real_csv, synthetic_csv, model_path):
    # Загрузка модели
    generator = pipeline("text-generation", model=model_path)

    def process_dataset(csv_file, dataset_type):
        # Загрузка CSV файла
        df = pd.read_csv(csv_file)
        
        results = []
        successful_count = 0

        for idx, row in df.iterrows():
            input_text = row['input']
            
            # Генерация CIF строки с помощью модели
            generated_cif = generator(input_text, max_length=500)[0]['generated_text']
            
            try:
                # Попытка преобразовать CIF в структуру pymatgen
                structure = Structure.from_str(generated_cif, fmt="cif")
                conclusion = "Successful"
                successful_count += 1
            except Exception as e:
                # Если возникла ошибка, считаем генерацию неудачной
                conclusion = "Failed"

            # Добавляем результат в список
            results.append({
                'input': input_text,
                'generated_cif': generated_cif,
                'conclusion': conclusion
            })

        # Создание DataFrame с результатами
        result_df = pd.DataFrame(results)
        
        # Сохранение результатов в CSV файл
        result_df.to_csv(f'{dataset_type}_validation_results.csv', index=False)
        
        # Подсчет процента успешных генераций
        success_rate = (successful_count / len(df)) * 100
        
        return result_df, success_rate

    # Обработка реальных данных
    real_results, real_success_rate = process_dataset(real_csv, 'real')

    # Обработка синтезированных данных
    synthetic_results, synthetic_success_rate = process_dataset(synthetic_csv, 'synthetic')

    # Вывод результатов
    print(f"Real data success rate: {real_success_rate:.2f}%")
    print(f"Synthetic data success rate: {synthetic_success_rate:.2f}%")

    return real_results, synthetic_results

# Пример использования функции
real_csv = 'real_inputs_100.csv'
synthetic_csv = 'synthetic_inputs.csv'
model_path = './fine_tuned_llama'  # Укажите путь к вашей fine-tuned модели

real_results, synthetic_results = validate_cif(real_csv, synthetic_csv, model_path)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pymatgen.core.structure import Structure
from transformers import pipeline

def analyze_attempts(real_csv, synthetic_csv, model_path, attempts_list):
    # Загрузка модели
    generator = pipeline("text-generation", model=model_path)

    def process_dataset(csv_file, dataset_type):
        # Загрузка CSV файла
        df = pd.read_csv(csv_file)
        
        success_rates = []  # Для хранения процентов успешных генераций
        
        for attempts in attempts_list:
            successful_count = 0

            for idx, row in df.iterrows():
                input_text = row['input']
                success = False  # Флаг успеха для текущего input

                for _ in range(attempts):
                    # Генерация CIF строки с помощью модели
                    generated_cif = generator(input_text, max_length=500)[0]['generated_text']

                    try:
                        # Попытка преобразовать CIF в структуру pymatgen
                        structure = Structure.from_str(generated_cif, fmt="cif")
                        success = True  # Если успешно, выходим из цикла
                        break
                    except Exception:
                        continue  # Если неудачно, пробуем снова

                if success:
                    successful_count += 1

            # Подсчет процента успешных генераций
            success_rate = (successful_count / len(df)) * 100
            success_rates.append(success_rate)

        return success_rates

    # Обработка реальных данных
    real_success_rates = process_dataset(real_csv, 'real')

    # Обработка синтезированных данных
    synthetic_success_rates = process_dataset(synthetic_csv, 'synthetic')

    # Построение графика
    plt.figure(figsize=(10, 6))
    plt.plot(attempts_list, real_success_rates, label='Real Data', marker='o')
    plt.plot(attempts_list, synthetic_success_rates, label='Synthetic Data', marker='x')
    
    plt.title('Success Rate vs Number of Attempts')
    plt.xlabel('Number of Attempts per Input')
    plt.ylabel('Success Rate (%)')
    plt.legend()
    plt.grid(True)
    
    # Сохранение графика в файл
    plt.savefig('success_rate_vs_attempts.png')  # Сохраняем график в PNG файл
    plt.show()

    # Вывод данных в текстовом формате
    with open('success_rates.txt', 'w') as f:
        f.write("Number of Attempts: {}\n".format(attempts_list))
        f.write("Real Data Success Rates (%): {}\n".format(real_success_rates))
        f.write("Synthetic Data Success Rates (%): {}\n".format(synthetic_success_rates))

    return real_success_rates, synthetic_success_rates

# Пример использования функции
real_csv = 'real_inputs_100.csv'
synthetic_csv = 'synthetic_inputs.csv'
model_path = './fine_tuned_llama'  # Укажите путь к вашей fine-tuned модели
attempts_list = [1, 2, 3, 5, 10]  # Список количества попыток

real_success_rates, synthetic_success_rates = analyze_attempts(real_csv, synthetic_csv, model_path, attempts_list)