In [1]:
import docx
import os
import pandas as pd
from tqdm.auto import tqdm

import torch
from unsloth import FastLanguageModel

from prompts import prompt, system

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [2]:
path_to_test = "./test data"

In [3]:
numbers = [int(file.split('.')[0].split('-')[1]) for file in os.listdir(f"{path_to_test}/HMI/")]
numbers

[114671,
 259571,
 259572,
 261611,
 29448,
 30364,
 30365,
 30370,
 315231,
 65831,
 65832,
 65833,
 86921,
 88001,
 88002]

In [4]:
name2cls = {
    "Fully compliant": "FC",
    "Largely compliant": "LC",
    "Partially compliant": "PC",
    "Not compliant": "NC"
}

In [5]:
# загружаем дообученную модель
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="./Mistral-finetuned",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

==((====))==  Unsloth 2024.9.post4: Fast Mistral patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla V100-PCIE-32GB. Max memory: 31.739 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.0+cu121. CUDA = 7.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.24+cu118. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Downloading shards: 100%|██████████| 2/2 [00:00<00:00, 615.86it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.21s/it]
Unsloth 2024.9.post4 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [6]:
FastLanguageModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(131072, 5120)
        (layers): ModuleList(
          (0-39): 40 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=5120, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=5120, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit

In [7]:
def get_differences(name, uc_text, ssts_text):
    inputs = tokenizer([prompt.format(system["Differences"], name, uc_text, ssts_text, "Differences", "")], return_tensors = "pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result.split("### Differences:")[-1].strip()

def get_description(name, uc_text, ssts_text):
    inputs = tokenizer([prompt.format(system["Description"], name, uc_text, ssts_text, "Description", "")], return_tensors = "pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result.split("### Description:")[-1].strip()

def get_cls(name, uc_text, ssts_text):
    inputs = tokenizer([prompt.format(system["Complience Level"], name, uc_text, ssts_text, "Complience Level", "")], return_tensors = "pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return name2cls[result.split("### Complience Level:")[-1].strip()]

In [8]:
%%time

submit = []
for number in tqdm(numbers):
    uc_doc = docx.Document(f"{path_to_test}/HMI/UC-{number}.docx")
    uc_text = "\n".join(paragraph.text for paragraph in uc_doc.paragraphs[1:])
    name = uc_doc.paragraphs[0].text.split('\xa0')[-1].strip()
    
    if not os.path.exists(f"{path_to_test}/SSTS/SSTS-{number}.docx"):
        submit.append({
            "Number": number,
            "Name": name,
            "Differences": "ssts hasn't info about this",
            "Description": "-",
            "Complience Level": "NA"
        })
        continue
        
    ssts_doc = docx.Document(f"{path_to_test}/SSTS/SSTS-{number}.docx")
    ssts_text = "\n".join(paragraph.text for paragraph in ssts_doc.paragraphs[1:])
    
    submit.append({
        "Number": number,
        "Name": name,
        "Differences": get_differences(name, uc_text, ssts_text),
        "Description": get_description(name, uc_text, ssts_text),
        "Complience Level": get_cls(name, uc_text, ssts_text)
    })

100%|██████████| 15/15 [04:08<00:00, 16.54s/it]

CPU times: user 4min 6s, sys: 2.17 s, total: 4min 8s
Wall time: 4min 8s





скорость 16.54s на итерацию

In [9]:
submit = pd.DataFrame(submit)
submit.head()

Unnamed: 0,Number,Name,Differences,Description,Complience Level
0,114671,Revoke access to the vehicle from a driver or ...,No differences,The driver opens the ATOM application on his i...,NC
1,259571,Mute/unmute the FM Radio playback,There is no soft button icon and description i...,The user clicks on the radio and clicks on the...,LC
2,259572,Mute/unmute the FM Radio playback,There is no soft button icon and description i...,The user clicks on the radio and clicks on the...,NC
3,261611,FM Radio Stations switching,No online music playback page on in_5.\nNo net...,Users can switch the next/previous FM Radio st...,NC
4,29448,Configure heat preservation,No online&offline mode.,Users can configure heat preservation via in_2...,LC


In [10]:
submit["Complience Level"].value_counts()

LC    11
NC     4
Name: Complience Level, dtype: int64

In [11]:
submit.to_csv("submission.csv", index=False)