In [1]:
import time
# time.sleep(3600)

In [2]:
# dataset
# use bert to encode text
import json
import dataclasses
from pathlib import Path
from typing import List, Optional
from collections import defaultdict

import torch
import tqdm
from IPython.core.usage import default_banner

from transformers import TextStreamer
from unsloth import FastLanguageModel
import time
# Wall-clock timestamp (seconds since the epoch) captured when this cell runs.
start_time = time.time()

# Loader settings passed to FastLanguageModel.from_pretrained by the loader functions in this file.
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.


@dataclasses.dataclass
class Config:
    """Static settings for the UID prediction run.

    The attribute is annotated so that ``dataclass`` registers it as a real
    field (an un-annotated attribute is ignored by the decorator). Class-level
    access such as ``Config.path_uid_output`` keeps returning the default.
    """

    # Directory whose entries are read as JSON-lines files of UID records.
    path_uid_output: Path = Path('uid_candidate_with_banner')


@dataclasses.dataclass
class UID:
    """A single scan record plus the optional results attached to it later."""

    # Required fields; every one must be supplied when the record is built.
    banner: str
    unique_id: str
    ip: str
    port: int
    timestamp: str
    service_name: str
    rule_name: str
    req_data: str
    vendor: str
    product: str
    model: str
    # Optional fields; they default to None until something fills them in.
    uid_predict: Optional[str] = None
    bert_encoding: Optional[List[torch.FloatTensor]] = None
    cluster_label: Optional[str] = None

    def __repr__(self):
        """Return ``cluster_label:rule_name:ip:port:timestamp`` as one string."""
        shown = (
            self.cluster_label,
            self.rule_name,
            self.ip,
            self.port,
            self.timestamp,
        )
        return ":".join(str(value) for value in shown)


def get_file_data(filename: Path) -> List[UID]:
    """Parse a JSON-lines file into a list of ``UID`` records.

    Each non-blank line must hold one JSON object whose keys match the
    ``UID`` constructor arguments.

    Args:
        filename: Path of the JSON-lines file to read.

    Returns:
        One ``UID`` per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        TypeError: If a JSON object has keys ``UID`` does not accept, or
            lacks a required one.
    """
    # Iterate the open file instead of read_text().splitlines(): the latter
    # also splits on Unicode separators such as U+2028, which would cut a
    # record in half. UTF-8 is set explicitly so the result does not depend
    # on the machine's locale.
    with filename.open(encoding="utf-8") as handle:
        # Blank lines (e.g. a trailing newline) are skipped rather than
        # handed to json.loads, which would raise on them.
        return [UID(**json.loads(line)) for line in handle if line.strip()]


def get_tokenizer_and_model():
    """Load the model stored under ``lora_model`` and prepare it for inference.

    Returns:
        A ``(tokenizer, model)`` tuple. Note the order is the reverse of what
        ``FastLanguageModel.from_pretrained`` returns.
    """
    load_options = dict(
        model_name="lora_model",  # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    loaded_model, loaded_tokenizer = FastLanguageModel.from_pretrained(**load_options)
    # Enable native 2x faster inference
    FastLanguageModel.for_inference(loaded_model)
    return loaded_tokenizer, loaded_model

# Name of the base checkpoint, WITHOUT the "unsloth/" namespace:
# get_raw_tokenizer_and_model() prepends "unsloth/" itself, so a prefixed
# value here would resolve to "unsloth/unsloth/...".
#
# Only the last assignment ever took effect; the other checkpoints that were
# tried are kept here for reference:
#   Meta-Llama-3.1-8B, mistral-7b-v0.3-bnb-4bit, Phi-3-mini-4k-instruct,
#   Mistral-Nemo-Instruct-2407-bnb-4bit, gemma-2-9b-bnb-4bit,
#   Meta-Llama-3.1-8B-Instruct-bnb-4bit, Mistral-Nemo-Base-2407-bnb-4bit
model_name = "Meta-Llama-3.1-8B-bnb-4bit"

 
def get_raw_tokenizer_and_model():
    """Load the checkpoint ``unsloth/<model_name>`` and prepare it for inference.

    ``model_name`` is the module-level variable, read when this function is
    called.

    Returns:
        A ``(tokenizer, model)`` tuple. Note the order is the reverse of what
        ``FastLanguageModel.from_pretrained`` returns.
    """
    hub_id = f"unsloth/{model_name}"
    # Pass token="hf_..." as well if using gated models like meta-llama/Llama-2-7b-hf
    raw_model, raw_tokenizer = FastLanguageModel.from_pretrained(
        model_name=hub_id,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    # Enable native 2x faster inference
    FastLanguageModel.for_inference(raw_model)
    return raw_tokenizer, raw_model


# Task description placed in the "### Instruction:" section of every prompt.
# The text (including its leading space, line breaks and indentation) is kept
# byte-for-byte so the prompt sent to the model is unchanged.
_FINGERPRINT_INSTRUCTION = ''' You are an expert in data mining and network security. I will provide you with some network scan data, which is divided by lines, with each line representing a piece of data. Not every record can successfully extract a fingerprint; you only need to find the data that you believe can serve as a fingerprint. You need to fully understand the semantics of this data, determine the meaning of the string or the strings that follow, and utilize your knowledge of web development, such as the Date field in the HTTP header, nonce in requests, and csrf_token that frequently change. Filter out data that frequently changes and cannot be used as a fingerprint, and then identify the specific strings that meet the following requirements for a fingerprint:
        The fingerprint must uniquely identify an individual device, not just a type of device.
        The fingerprint must not change upon repeated network requests.
        The fingerprint must remain consistent over a long period.
        The fingerprint must not change upon device reboot.
        '''


def _build_prompt(banner: str) -> str:
    """Return the instruction/input/response prompt for one banner.

    Single source for the prompt that was previously copy-pasted into each
    predict function. The result ends with ``"### Response:"``, a newline and
    four spaces, exactly as before.
    """
    return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

    ### Instruction:
    {_FINGERPRINT_INSTRUCTION}

    ### Input:
    {banner}

    ### Response:
    """


def predict_streamer(banner: str, tokenizer, model, max_new_tokens: int = 128):
    """Generate a response for ``banner`` and stream it to stdout.

    Args:
        banner: Scan data inserted into the prompt's input section.
        tokenizer: Tokenizer used to encode the prompt and by the streamer.
        model: Model whose ``generate`` method is called.
        max_new_tokens: Upper bound on generated tokens (default 128).

    Returns:
        None; the generated text is only emitted through ``TextStreamer``.
    """
    inputs = tokenizer([_build_prompt(banner)], return_tensors="pt").to("cuda")

    # The streamer writes the text to stdout as generation proceeds.
    text_streamer = TextStreamer(tokenizer)
    model.generate(**inputs, streamer=text_streamer, max_new_tokens=max_new_tokens)


def predict_direct(banner: str, tokenizer, model, max_new_tokens: int = 128):
    """Generate a response for ``banner`` and return it as text.

    Args:
        banner: Scan data inserted into the prompt's input section.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Model whose ``generate`` method is called.
        max_new_tokens: Upper bound on generated tokens (default 128).

    Returns:
        The first sequence returned by ``model.generate``, decoded with
        special tokens skipped.
        NOTE(review): for causal LMs ``generate`` usually returns the prompt
        tokens followed by the new ones, so this text presumably includes
        the prompt - confirm before parsing it.
    """
    inputs = tokenizer([_build_prompt(banner)], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # Decode the tokens generated by the model.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# batch pred
def predict_direct_batch(banner_list: List[str], tokenizer, model, max_new_tokens: int = 128):
    """Generate responses for several banners in one padded batch.

    Args:
        banner_list: Scan-data strings, one prompt is built per entry.
        tokenizer: Tokenizer used to encode (with padding) and decode.
        model: Model whose ``generate`` method is called.
        max_new_tokens: Upper bound on generated tokens (default 128).

    Returns:
        A list of decoded strings, one per entry of ``banner_list`` and in
        the same order; an empty list when ``banner_list`` is empty.
        NOTE(review): as with ``predict_direct``, each string presumably
        contains the prompt as well as the response - confirm.
    """
    # Nothing to encode: return early instead of failing inside the tokenizer.
    if not banner_list:
        return []

    questions = [_build_prompt(banner) for banner in banner_list]

    # NOTE(review): batched generation with a decoder-only model normally
    # needs left padding; this relies on the tokenizer's configured
    # padding side - confirm it is "left".
    inputs = tokenizer(questions, return_tensors="pt", padding=True).to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # Decode the tokens generated by the model.
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


# Load the tokenizer/model pair from the "lora_model" checkpoint.
llama_tokenizer, llama_model = get_tokenizer_and_model()
# Alternative: load the checkpoint named by `model_name` instead.
# llama_tokenizer, llama_model = get_raw_tokenizer_and_model()
# Bare expression: as the last statement of a notebook cell it displays the model's repr.
llama_model

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla V100-PCIE-32GB. Max memory: 31.739 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0. CUDA = 7.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(
      

In [8]:
# model_name = "Meta-Llama-3.1-8B-bnb-4bit"

# Number of banners sent to the model per generate() call.
BATCH_SIZE = 15
# Stop reading a file once MORE than this many records have been predicted.
# The check runs after whole batches, so the actual count is the first
# multiple of BATCH_SIZE above the limit.
MAX_RECORDS_PER_FILE = 50 * 10
# Prefix of the output directory name.
RUN_PREFIX = "500-"

# The output directory is named after the (prefixed) model name. Guard the
# prefixing so that re-running this cell does not produce "500-500-...".
if not model_name.startswith(RUN_PREFIX):
    model_name = f"{RUN_PREFIX}{model_name}"


def _predict_into(batch):
    """Run one batched prediction and store each result on its record."""
    predictions = predict_direct_batch(
        [uid_obj.banner for uid_obj in batch],
        tokenizer=llama_tokenizer,
        model=llama_model,
    )
    for uid_obj, prediction in zip(batch, predictions):
        uid_obj.uid_predict = prediction


# Maps input file name -> records that received a prediction.
all_result = defaultdict(list)
for uid_file in Config.path_uid_output.iterdir():
    # Skip sub-directories (e.g. notebook checkpoint folders); only regular
    # files can be parsed as JSON lines.
    if not uid_file.is_file():
        continue

    records = get_file_data(uid_file)
    print(uid_file, len(records))

    predicted = []
    pending = []
    for uid_obj in tqdm.tqdm(records):
        pending.append(uid_obj)
        if len(pending) == BATCH_SIZE:
            _predict_into(pending)
            predicted.extend(pending)
            pending = []

        if len(predicted) > MAX_RECORDS_PER_FILE:
            break

    # Records left over when the file ends between batch boundaries used to
    # be dropped without a prediction; flush them as a final, smaller batch.
    # (After a limit-triggered break `pending` is always empty.)
    if pending:
        _predict_into(pending)
        predicted.extend(pending)

    if predicted:
        all_result[uid_file.name].extend(predicted)

# Write one JSON-lines file per input file into the run's output directory.
for filename, uid_objs in all_result.items():
    filepath = Path(model_name) / filename
    filepath.parent.mkdir(parents=True, exist_ok=True)
    filepath.write_text(
        "\n".join(json.dumps(dataclasses.asdict(uid_obj)) for uid_obj in uid_objs),
        encoding="utf-8",
    )

 


uid_candidate_with_banner/ftp.jsonl 299591


  0%|▏                                                                          | 509/299591 [00:44<7:17:10, 11.40it/s]


uid_candidate_with_banner/rtsp.jsonl 7520


  7%|█████▎                                                                         | 509/7520 [00:46<10:40, 10.95it/s]


uid_candidate_with_banner/onvif.jsonl 3118


 16%|████████████▉                                                                  | 509/3118 [03:00<15:24,  2.82it/s]
