In [1]:
# Установите необходимые библиотеки
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig
from datasets import Dataset

# 1. Загрузка данных
df = pd.read_csv("C:\Programs\crystall_generation\data\df_for_llama_m100.csv")  # Ваш CSV файл с колонками input и output
train_data = Dataset.from_pandas(df)

# 2. Токенизация
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Для GPT-2

def preprocess_function(examples):
    inputs = examples["input"]
    outputs = examples["output"]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(outputs, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_data = train_data.map(preprocess_function, batched=True)



  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 45129/45129 [00:22<00:00, 2004.94 examples/s]


In [29]:
df

Unnamed: 0,input,output
0,band_gap=0.2132999999999998 spacegroup.number=8,data_Na3MnCoNiO6\r\n_symmetry_space_group_name...
1,band_gap=0.0 spacegroup.number=139,data_Nd(Al2Cu)4\r\n_symmetry_space_group_name_...
2,band_gap=0.0 spacegroup.number=225,data_LiMnIr2\r\n_symmetry_space_group_name_H-M...
3,band_gap=3.8556 spacegroup.number=62,data_LiCSN\r\n_symmetry_space_group_name_H-M ...
4,band_gap=0.0 spacegroup.number=71,data_Yb3Ga9Pt2\r\n_symmetry_space_group_name_H...
...,...,...
45124,band_gap=1.697 spacegroup.number=164,data_WS2\r\n_symmetry_space_group_name_H-M '...
45125,band_gap=0.0 spacegroup.number=225,data_Y2ZnPt\r\n_symmetry_space_group_name_H-M ...
45126,band_gap=1.9239 spacegroup.number=74,data_RbMgCoF6\r\n_symmetry_space_group_name_H-...
45127,band_gap=7.2758 spacegroup.number=82,data_BPO4\r\n_symmetry_space_group_name_H-M ...


In [4]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Используем устройство: {device}")

Используем устройство: cuda


In [2]:
import torch
torch.cuda.is_available()

True

In [7]:
# 3. Настройка LoRA
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,  # Ранг матриц LoRA
    lora_alpha=32,
    lora_dropout=0.1,
)
model = get_peft_model(model, peft_config)

# 4. Конфигурация обучения
training_args = TrainingArguments(
    output_dir="lora_model",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    learning_rate=1e-4,
    logging_steps=500,
    save_strategy="epoch",
    fp16=True,  # Для ускорения на GPU
)

# 5. Обучение
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    tokenizer=tokenizer,
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [18]:
trainer.train()

Step,Training Loss
500,3.2566
1000,3.2378
1500,3.2332
2000,3.2397
2500,3.2189
3000,3.2213
3500,3.2148
4000,3.2082
4500,3.2094
5000,3.1919


TrainOutput(global_step=33849, training_loss=3.1193139339015277, metrics={'train_runtime': 3026.9333, 'train_samples_per_second': 44.727, 'train_steps_per_second': 11.183, 'total_flos': 8996472283594752.0, 'train_loss': 3.1193139339015277, 'epoch': 3.0})

In [20]:
# 6. Сохранение модели
model.save_pretrained("lora_finetuned_model_gpt2")
tokenizer.save_pretrained("lora_finetuned_model_gpt2")

('lora_finetuned_model_gpt2\\tokenizer_config.json',
 'lora_finetuned_model_gpt2\\special_tokens_map.json',
 'lora_finetuned_model_gpt2\\vocab.json',
 'lora_finetuned_model_gpt2\\merges.txt',
 'lora_finetuned_model_gpt2\\added_tokens.json',
 'lora_finetuned_model_gpt2\\tokenizer.json')

In [27]:
from transformers import pipeline

generator = pipeline("text-generation", model="lora_finetuned_model_gpt2")
output = generator("Созданный cif файл со свойствами: band_gap=0.0 spacegroup.number=139", max_length=500)
print(output)

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Созданный cif файл со свойствами: band_gap=0.0 spacegroup.number=139 kc=10000 mp=0:05:45 cif=0:05:35 bh=0:54:29 cif=0:54:13 cif=0:53:35 f3=0 самх следитеняноество kb/mkt=2.45 bh/b,1 f2h/f,1 mt=16 kd/p,4 1 g6/s:1 m/s km/s/1/1 f2h/f:1,1,1 2 g6/s:1 km/s/1/1 f2h/f:1,1 2,2 3 g6/s:1 km/s/1/1 f2h/f:1,1,1 3,2 g6/s:1 km/s/1/1 f2h/f:1,2,2 4 g6/s:1 km/s/2/2 f2h/f:2,2,2,2 1 g6/s:1 km/s/2/2 f2h/f:2,2,2 4 g6/s:1 km/s/2/2 f2h/f:2,2,2 5 g6/s:1 km/s/2/2 f2h/f:2,2,2 6 g6/s:1 km/s/2/2 f2h/f:2,3,2 7 g6/s:1 km/s/2/2 f2h/f:2,3,3 12 g6/s:1 km/s/3/2 f2h/f:2,3,3 16 g6/s:1 km/s/3/2 f2h/f:3,3,3 17 g6/s:1 km/s/3/2 f2h/f:3,3,3 18 g'}]


In [28]:
print(output[0]['generated_text'])

Созданный cif файл со свойствами: band_gap=0.0 spacegroup.number=139 kc=10000 mp=0:05:45 cif=0:05:35 bh=0:54:29 cif=0:54:13 cif=0:53:35 f3=0 самх следитеняноество kb/mkt=2.45 bh/b,1 f2h/f,1 mt=16 kd/p,4 1 g6/s:1 m/s km/s/1/1 f2h/f:1,1,1 2 g6/s:1 km/s/1/1 f2h/f:1,1 2,2 3 g6/s:1 km/s/1/1 f2h/f:1,1,1 3,2 g6/s:1 km/s/1/1 f2h/f:1,2,2 4 g6/s:1 km/s/2/2 f2h/f:2,2,2,2 1 g6/s:1 km/s/2/2 f2h/f:2,2,2 4 g6/s:1 km/s/2/2 f2h/f:2,2,2 5 g6/s:1 km/s/2/2 f2h/f:2,2,2 6 g6/s:1 km/s/2/2 f2h/f:2,3,2 7 g6/s:1 km/s/2/2 f2h/f:2,3,3 12 g6/s:1 km/s/3/2 f2h/f:2,3,3 16 g6/s:1 km/s/3/2 f2h/f:3,3,3 17 g6/s:1 km/s/3/2 f2h/f:3,3,3 18 g


In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Путь к файлам модели
model_path = "C:/Programs/crystall_generation/Llama-3.2-3B-Instruct/original"

# Загрузка весов напрямую из .pth
state_dict = torch.load(f"{model_path}/consolidated.00.pth", map_location="cuda")

# Загрузка конфигурации из params.json
from transformers import AutoConfig

config = AutoConfig.from_pretrained(f"{model_path}/params.json")

# Создание модели вручную
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=None, config=config, state_dict=state_dict)

# Загрузка токенизатора
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Пример генерации
input_text = "What is the capital of France?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

output = model.generate(input_ids, max_new_tokens=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))


  from .autonotebook import tqdm as notebook_tqdm


ValueError: Unrecognized model in C:/Programs/crystall_generation/Llama-3.2-3B-Instruct/original/params.json. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, aria, aria_text, audio-spectrogram-transformer, autoformer, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, colpali, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dab-detr, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deformable_detr, deit, depth_anything, depth_pro, deta, detr, diffllama, dinat, dinov2, dinov2_with_registers, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, emu3, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, git, glm, glpn, got_ocr2, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gptj, gptsan-japanese, granite, granitemoe, granitemoeshared, granitevision, graphormer, grounding-dino, groupvit, helium, hiera, hubert, ibert, idefics, idefics2, idefics3, idefics3_vision, ijepa, imagegpt, informer, instructblip, instructblipvideo, jamba, jetmoe, jukebox, kosmos-2, layoutlm, layoutlmv2, layoutlmv3, led, levit, lilt, llama, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, mistral, mixtral, mllama, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, modernbert, moonshine, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmo2, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, persimmon, phi, phi3, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_5_vl, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rt_detr_v2, rwkv, sam, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, siglip, siglip_vision_model, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superglue, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, table-transformer, tapas, textnet, time_series_transformer, timesformer, timm_backbone, timm_wrapper, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vitpose, vitpose_backbone, vits, vivit, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xmod, yolos, yoso, zamba, zamba2, zoedepth

In [1]:
import transformers
print(transformers.__version__)
# Убедитесь, что версия соответствует требованиям для Llama 3.2

  from .autonotebook import tqdm as notebook_tqdm


4.49.0
