In [None]:
!pip install pytesseract transformers datasets evaluate rouge-score nltk tensorboard py7zr --upgrade
# !pip install pytesseract transformers==4.28.1 datasets evaluate rouge-score nltk tensorboard py7zr
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import glob
from datasets import load_dataset
import datasets
import random



## Load the dataset

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Read the full spreadsheet (81620 rows in total, per the original note)
cowrie_data = pd.read_excel('cowrie_train_data.xlsx')

# Feature columns (X) followed by the target column 'eventid' (Y)
selected_columns = ["username", "input", "protocol", "duration", "keyAlgs", "data", "message", "kexAlgs", "eventid"]
# Work on an independent copy so the assignment below does not raise SettingWithCopyWarning
cowrie_selected_data = cowrie_data[selected_columns].copy()

# T5 is text-to-text, so the target is stored as a string
cowrie_selected_data.loc[:, 'eventid'] = cowrie_selected_data['eventid'].astype(str)

# Cap the working set at 20000 rows; the fixed seed keeps the draw repeatable
subset_size = min(len(cowrie_selected_data), 20000)
cowrie_selected_data = cowrie_selected_data.sample(n=subset_size, random_state=42)

# 80/20 split: draw the training rows first, whatever is left becomes the test set
train_df = cowrie_selected_data.sample(frac=0.8, random_state=42)
test_df = cowrie_selected_data.drop(index=train_df.index)

# Report the size of each split
print(f"Number of samples in the training set: {len(train_df)}")
print(f"Number of samples in the test set: {len(test_df)}")


# Combine feature columns into a single text input
def combine_features(row):
    """Flatten one record into the single text string used as model input.

    Every feature is rendered as ``name: value`` and the pairs are joined with
    single spaces in a fixed column order. Values are formatted as-is, so a
    missing value shows up as its string form (e.g. ``nan``).

    Args:
        row: Record indexable by feature name (e.g. a DataFrame row).

    Returns:
        str: ``"username: ... input: ... protocol: ... ... kexAlgs: ..."``.
    """
    feature_names = ("username", "input", "protocol", "duration", "keyAlgs", "data", "message", "kexAlgs")
    return " ".join(f"{name}: {row[name]}" for name in feature_names)

# Give both splits the model input ('text') and the target ('label') columns
for split_df in (train_df, test_df):
    split_df['text'] = split_df.apply(combine_features, axis=1)
    split_df['label'] = split_df['eventid']

# Wrap each split in a Hugging Face Dataset, keyed by split name
dataset = {
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (('train', train_df), ('test', test_df))
}

# Confirm the sizes after conversion to the Hugging Face Dataset format
print(f"Train dataset size (Hugging Face format): {len(dataset['train'])}")
print(f"Test dataset size (Hugging Face format): {len(dataset['test'])}")

# Tokenizer for the chosen checkpoint ("google/flan-t5-base" is the larger alternative)
# model_id = "google/flan-t5-base"
model_id = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)

Number of samples in the training set: 16000
Number of samples in the test set: 4000
Train dataset size (Hugging Face format): 16000
Test dataset size (Hugging Face format): 4000




In [None]:
# Display the training record at index 567: the raw columns alongside the combined 'text' field
dataset['train'][567]

{'username': None,
 'input': None,
 'protocol': None,
 'duration': None,
 'keyAlgs': "['ssh-rsa-cert-v01@openssh.com', 'ssh-dss-cert-v01@openssh.com', 'ecdsa-sha2-nistp256-cert-v01@openssh.com', 'ecdsa-sha2-nistp384-cert-v01@openssh.com', 'ecdsa-sha2-nistp521-cert-v01@openssh.com', 'ssh-ed25519-cert-v01@openssh.com', 'ecdsa-sha2-nistp256', 'ecdsa-sha2-nistp384', 'ecdsa-sha2-nistp521', 'ssh-rsa', 'ssh-dss', 'ssh-ed25519']",
 'data': None,
 'message': 'Remote SSH client fingerprint: 5fbd5748aa8458f11223e71353612e16',
 'kexAlgs': "['curve25519-sha256@libssh.org', 'ecdh-sha2-nistp256', 'ecdh-sha2-nistp384', 'ecdh-sha2-nistp521', 'diffie-hellman-group14-sha1']",
 'eventid': 'cowrie.client.kex',
 'text': "username: nan input: nan protocol: nan duration: nan keyAlgs: ['ssh-rsa-cert-v01@openssh.com', 'ssh-dss-cert-v01@openssh.com', 'ecdsa-sha2-nistp256-cert-v01@openssh.com', 'ecdsa-sha2-nistp384-cert-v01@openssh.com', 'ecdsa-sha2-nistp521-cert-v01@openssh.com', 'ssh-ed25519-cert-v01@openssh.co

## Get max padding length

In [None]:
from datasets import concatenate_datasets

# Measure the real tokenized lengths of the inputs and the targets.
#
# No padding is requested here on purpose: with padding='max_length' every
# sequence is padded out to the tokenizer limit, so both maxima would always
# report that limit (512) no matter what the data looks like, and the labels
# would later be padded to that same inflated length. Truncation is kept so
# over-long inputs are still capped at the tokenizer limit.
all_examples = concatenate_datasets([dataset["train"], dataset["test"]])

# Tokenize the inputs
tokenized_inputs = all_examples.map(
    lambda x: tokenizer(x["text"], truncation=True),
    batched=True,
    remove_columns=['text', 'label']
)

# Longest input sequence, in tokens
max_source_length = max(len(ids) for ids in tokenized_inputs["input_ids"])
print(f"Max source length: {max_source_length}")

# Tokenize the targets
tokenized_targets = all_examples.map(
    lambda x: tokenizer(x["label"], truncation=True),
    batched=True,
    remove_columns=['text', 'label']
)

# Longest target sequence, in tokens
max_target_length = max(len(ids) for ids in tokenized_targets["input_ids"])
print(f"Max target length: {max_target_length}")


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Max source length: 512


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Max target length: 512


## Preprocess the dataset (tokenized)

In [None]:
def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of examples into model inputs plus label ids.

    Args:
        sample: Batch with a "text" entry (model inputs) and a "label" entry
            (target strings).
        padding: Padding strategy handed to the tokenizer for both inputs and
            targets. With "max_length", padded label positions are rewritten
            to -100 so they are ignored by the loss.

    Returns:
        The tokenizer output for the inputs, with an extra "labels" entry
        holding the target token ids.
    """
    # Inputs: truncated/padded against the measured source length
    encoded = tokenizer(sample["text"], max_length=max_source_length, padding=padding, truncation=True)

    # Targets: truncated/padded against the measured target length
    target_ids = tokenizer(sample["label"], max_length=max_target_length, padding=padding, truncation=True)["input_ids"]

    # Mask out padding in the targets so it does not contribute to the loss
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        target_ids = [[-100 if token == pad_id else token for token in sequence] for sequence in target_ids]

    encoded["labels"] = target_ids
    return encoded

# `dataset` is a plain dict (no .map()), so every Dataset split is preprocessed on its own
tokenized_dataset = {
    split_name: dataset[split_name].map(preprocess_function, batched=True, remove_columns=['text', 'label'])
    for split_name in ('train', 'test')
}
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['username', 'input', 'protocol', 'duration', 'keyAlgs', 'data', 'message', 'kexAlgs', 'eventid', '__index_level_0__', 'input_ids', 'attention_mask', 'labels']


## Load the model

In [None]:
# Checkpoint to fine-tune; "google/flan-t5-base" is kept as a commented-out alternative
# model_id="google/flan-t5-base"
model_id = "google/flan-t5-small"

# Load the pretrained sequence-to-sequence model for that checkpoint from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

## Define the evaluation metrics

In [None]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
# Fetch the Punkt tokenizer data (sent_tokenize is used by postprocess_text below)
# NOTE(review): recent NLTK releases may look for "punkt_tab" instead of "punkt" — confirm against the installed version
nltk.download("punkt")

# Load the F1 score metric (or any other metric you are interested in)
# NOTE(review): presumably this metric expects integer class ids rather than strings — verify before passing decoded text
metric = evaluate.load("f1")

def postprocess_text(preds, labels):
    """Normalise decoded predictions and references for scoring.

    Each text is stripped of surrounding whitespace and re-joined with one
    sentence per line (the layout rougeLSum expects).

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        tuple[list[str], list[str]]: The processed predictions and references.
    """
    def _one_sentence_per_line(texts):
        # Strip first, then split into sentences and put each on its own line
        return ["\n".join(sent_tokenize(text.strip())) for text in texts]

    return _one_sentence_per_line(preds), _one_sentence_per_line(labels)

def compute_metrics(eval_preds):
    """Score generated class names against the references.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may be a tuple, in which case its first element
            is used.

    Returns:
        dict: The scores returned by ``metric`` (macro-averaged), scaled to
        percentages and rounded to 4 decimals, plus ``"gen_len"``: the mean
        number of non-padding tokens per prediction.
    """
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and is not a valid token id, so it cannot be
    # decoded. Swap it for the pad id in both arrays before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Apply postprocessing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # The F1 metric scores integer class ids, not free text. Build one shared
    # name -> id table over every class seen in either list (sorted so the
    # mapping is deterministic) and score the ids.
    class_names = sorted(set(decoded_labels) | set(decoded_preds))
    class_to_id = {name: idx for idx, name in enumerate(class_names)}
    pred_ids = [class_to_id[name] for name in decoded_preds]
    label_ids = [class_to_id[name] for name in decoded_labels]

    result = metric.compute(predictions=pred_ids, references=label_ids, average='macro')

    # Express scores as percentages and add the average generated length
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return result


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Prepare DataCollator (for batch processing)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Padded label positions get this id so they are ignored in the loss
# (the same value the preprocessing step writes over label padding)
label_pad_token_id = -100
# Data collator: prepares the batches (padding etc.) that are fed into the T5 model
data_collator = DataCollatorForSeq2Seq(
    tokenizer, # the same tokenizer object that was used for preprocessing
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8 # pad sequence lengths up to a multiple of 8
)

## Define training arguments and trainer

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # per_device_train_batch_size=128,
    # per_device_eval_batch_size=128,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=3e-4,
    num_train_epochs=2,
    # num_train_epochs=1,
    logging_strategy="epoch", # log the training metrics at the end of each epoch
    # No evaluation strategy is passed: the default is "no" (no intermediate
    # evaluation), and leaving the keyword out keeps this cell working across
    # transformers versions, where `evaluation_strategy` was renamed to
    # `eval_strategy` and the old spelling is no longer accepted.
    save_strategy="epoch", # save model checkpoint after each epoch
    save_total_limit=2, # max # of checkpoints to keep
    load_best_model_at_end=False,
    report_to="tensorboard",
)

# Create Trainer instance.
# With no evaluation scheduled during training, `compute_metrics` only runs
# if an evaluation is triggered explicitly.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)



## Start training

In [None]:
import torch
# Ask PyTorch to release its cached CUDA memory before training starts
torch.cuda.empty_cache()

In [None]:
# Start training: runs the fine-tuning loop on the trainer's train dataset;
# the returned TrainOutput (global step, training loss, runtime metrics) is shown as the cell output
trainer.train()

Step,Training Loss
2000,0.0252
4000,0.0004


TrainOutput(global_step=4000, training_loss=0.012806741669774056, metrics={'train_runtime': 1629.4729, 'train_samples_per_second': 19.638, 'train_steps_per_second': 2.455, 'total_flos': 5948496150528000.0, 'train_loss': 0.012806741669774056, 'epoch': 2.0})

In [None]:
# Pick one example (index 4640) from the training split of the original, untokenized dataset
raw_sample = dataset['train'][4640]  # You can change the index to view different samples

# Print the raw input features (X) and label (Y)
print(f"Raw Input (X): {raw_sample['text']}")
print(f"Raw Label (Y): {raw_sample['label']}")

Raw Input (X): username: nan input: nan protocol: nan duration: 37.35070705 keyAlgs: nan data: nan message: Connection lost after 37 seconds kexAlgs: nan
Raw Label (Y): cowrie.session.closed


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The inputs below are moved to `device`, so the model has to live there too;
# otherwise generate() fails with tensors on two different devices whenever
# the model is not already on that device.
model.to(device)

# Ensure model is in evaluation mode (disables dropout)
model.eval()

# Input text for inference
input_text = "username: nan input: nan protocol: nan duration: nan keyAlgs: ['rsa-sha2-512-cert-v01@openssh.com', 'ext-info-c']"

# Tokenize the input text and move tensors to the correct device
inputs = tokenizer(input_text, return_tensors="pt").to(device)

# Perform inference without tracking gradients
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=50,  # Adjust max length as needed
        num_beams=4,   # Adjust beam search parameters as needed
        early_stopping=True  # beam-search stopping option
    )

# Decode the generated output ids back into text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Input Text:", input_text)
print("Generated Text:", generated_text)

Input Text: username: nan input: nan protocol: nan duration: nan keyAlgs: ['rsa-sha2-512-cert-v01@openssh.com', 'ext-info-c']
Generated Text: cowrie.client.kex


## Evaluation

## Run Inference and classification report

In [None]:
from tqdm.auto import tqdm

# Materialise both columns once. Indexing dataset['test']['text'][i] inside the
# loop would rebuild the whole column on every iteration (quadratic work).
test_texts = list(dataset['test']['text'])
test_labels = list(dataset['test']['label'])
samples_number = len(test_texts)

predictions_list = []
labels_list = []

# Generation must run with dropout disabled, whatever mode training left the model in
model.eval()

progress_bar = tqdm(zip(test_texts, test_labels), total=samples_number)
for text, label in progress_bar:
    # Truncate to the same 512-token cap used for the training inputs. A single
    # example needs no padding, and the tensors follow the model's device
    # instead of assuming 'cuda'.
    inputs = tokenizer(text, truncation=True, max_length=512, return_tensors='pt').to(model.device)
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=150,
        num_beams=4,
        early_stopping=True,
    )
    predictions_list.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    labels_list.append(label)

  0%|          | 0/4000 [00:00<?, ?it/s]

In [None]:
# Reference labels as plain strings, in the same order as the predictions
str_labels_list = [str(label) for label in labels_list]

In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 of the generated labels against the references;
# zero_division=0 reports 0 for undefined scores (e.g. a class with zero support, as seen in the output)
report = classification_report(str_labels_list, predictions_list, zero_division=0)
print(report)

                                     precision    recall  f1-score   support

                  cowrie.client.kex       1.00      1.00      1.00       524
                 cowrie.client.size       1.00      1.00      1.00         1
              cowrie.client.version       1.00      1.00      1.00       632
              cowrie.command.failed       1.00      1.00      1.00       102
               cowrie.command.input       1.00      1.00      1.00       213
           cowrie.direct-tcpip.data       1.00      1.00      1.00       179
        cowrie.direct-tcpip.request       1.00      1.00      1.00       211
               cowrie.file_download       0.00      0.00      0.00         0
                  cowrie.log.closed       1.00      1.00      1.00       177
                cowrie.login.failed       1.00      1.00      1.00       125
               cowrie.login.success       1.00      1.00      1.00       269
              cowrie.session.closed       1.00      1.00      1.00       67