In [None]:
# Install fixed versions
# Every line below is a notebook shell escape (`!`), so this cell only runs
# under IPython/Jupyter, not as a plain Python script.
# torch is pinned and pulled from the CUDA 11.8 wheel index; torchvision and
# torchaudio are requested from the same index without a version pin.
# NOTE(review): each command uses --force-reinstall on its own, which may also
# reinstall that package's dependencies and replace a version pinned by an
# earlier line (e.g. numpy or torch) — confirm the final versions with `pip list`.
!pip install torch==2.3.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --force-reinstall -q
!pip install numpy==1.24.4 --force-reinstall -q
!pip install scipy==1.11.4 --force-reinstall -q
!pip install datasets==2.16.1 --force-reinstall -q
!pip install huggingface-hub==0.24.7 --force-reinstall -q  # Fixed version in range
!pip install transformers==4.36.2 --force-reinstall -q
!pip install accelerate==0.27.2 --force-reinstall -q
!pip install scikit-learn==1.3.2 --force-reinstall -q
# peft is removed from the environment entirely
# (presumably to avoid a conflict with the pinned transformers — TODO confirm).
!pip uninstall peft -y -q

import os
import warnings

import torch

# Show which PyTorch build is actually importable in this session.
print(f"PyTorch: {torch.__version__}")

# Suppress all Python warnings from here on.
warnings.filterwarnings("ignore")

# Process-wide environment switches, set before the tokenizer and the model
# are created further down.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "CUDA_LAUNCH_BLOCKING": "1",
    }
)

# Imports (minimal)
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback
from torch import nn

# Read both parquet files and keep a reproducible 5% random sample of each
# (fixed random_state) so the rest of the notebook runs quickly.
train_path = "/kaggle/input/unsw/UNSW_NB15_training-set.parquet"
test_path = "/kaggle/input/unsw/UNSW_NB15_testing-set.parquet"

train_df, test_df = (
    pd.read_parquet(parquet_file).sample(frac=0.05, random_state=42)
    for parquet_file in (train_path, test_path)
)
print("Train subsample:", train_df.shape, "Test subsample:", test_df.shape)

# Columns serialised into the text input (kept minimal: 4 features).
cols_for_text = ["proto", "service", "state", "dur"]


def make_text(row):
    """Render one record as a ``col=value | col=value`` string.

    Only the columns listed in ``cols_for_text`` are used, in that order.
    A column that is not present in ``row`` is skipped.
    """
    parts = []
    for col in cols_for_text:
        if col in row:
            parts.append(f"{col}={row[col]}")
    return " | ".join(parts)

# Add the model input text to every row of both subsamples.
for frame in (train_df, test_df):
    frame["log_text"] = frame.apply(make_text, axis=1)
print(f"Using {len(cols_for_text)} features:" , cols_for_text)


# Labels
def _binary_label(category):
    """Return 0 when the category reads "normal"/"benign" (any case), else 1."""
    if str(category).lower() in ["normal", "benign"]:
        return 0
    return 1


for frame in (train_df, test_df):
    frame["y"] = frame["attack_cat"].apply(_binary_label)

# Pool both subsamples and re-split them, stratified on the target:
# 20% is held out for test, then 15% of the remainder becomes validation.
keep_cols = ['log_text', 'y']
combined = pd.concat([train_df[keep_cols], test_df[keep_cols]], ignore_index=True)
train_val, test_split = train_test_split(
    combined, test_size=0.2, stratify=combined["y"], random_state=42
)
train_split, val_split = train_test_split(
    train_val, test_size=0.15, stratify=train_val["y"], random_state=42
)

# Wrap each split as a Hugging Face Dataset: the old pandas index is dropped
# and the target column is renamed from "y" to "label".
hf = DatasetDict(
    {
        split_name: Dataset.from_pandas(frame.reset_index(drop=True)).rename_column("y", "label")
        for split_name, frame in (
            ("train", train_split),
            ("validation", val_split),
            ("test", test_split),
        )
    }
)
print(hf)

# Tokenization: every text is truncated/padded to exactly 64 tokens (for speed).
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def preprocess(batch):
    """Tokenize the ``log_text`` entries of one batch to a fixed length of 64."""
    texts = batch["log_text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length", max_length=64)
    return encoded


# Replace the raw text column with token ids and expose the model inputs plus
# the label as torch tensors.
hf = hf.map(preprocess, batched=True, remove_columns=["log_text"])
hf.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
print("Tokenization complete.")

# "Balanced" class weights computed from the training labels, stored as a
# float tensor so the weighted loss can use them later.
labels = np.asarray(hf["train"]["label"])
balanced_weights = compute_class_weight("balanced", classes=np.unique(labels), y=labels)
class_weights = torch.tensor(balanced_weights, dtype=torch.float)
print("Class weights:", class_weights)

# Pretrained checkpoint loaded with a two-label classification head.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
print("Model loaded.")

# Metrics
def compute_metrics(eval_pred):
    """Compute binary classification metrics for the Trainer.

    Args:
        eval_pred: ``(logits, labels)`` pair. The predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dict with ``"f1"``, ``"precision"`` and ``"recall"`` computed with
        ``average="binary"``. A score whose denominator is zero is reported
        as 0 (``zero_division=0``).
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0
    )
    # "f1" keeps its key because it is the metric used for best-model
    # selection; precision and recall were already computed, so they are
    # reported as well instead of being thrown away.
    return {
        "f1": float(f1),
        "precision": float(precision),
        "recall": float(recall),
    }

# Training args (2 epochs, small batch)
training_args = TrainingArguments(
    output_dir="/kaggle/working/results",
    # Evaluate and checkpoint once per epoch; both strategies must match so the
    # best checkpoint (highest validation F1) can be reloaded at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    num_train_epochs=2,
    per_device_train_batch_size=4,  # Small batch
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Mixed precision is only enabled when a CUDA device is visible: a
    # hard-coded fp16=True makes TrainingArguments raise on CPU-only sessions.
    fp16=torch.cuda.is_available(),
    logging_steps=20,  # Frequent log but small
    report_to="none",
    save_total_limit=1,  # Minimal save
    seed=42,
    warmup_steps=20,  # Small warmup
    lr_scheduler_type="linear",
)

# Batches are assembled (and padded, if needed) by the tokenizer-aware collator.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer with a class-weighted loss
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch dict; its ``"labels"`` entry is removed before the
                remaining entries are passed to ``model``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            **kwargs: Extra keyword arguments are accepted and ignored. Newer
                transformers releases pass ``num_items_in_batch`` here, which
                would otherwise raise ``TypeError`` on this override.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # The weights live on the CPU; move them to wherever the logits are.
        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(outputs.logits.device))
        loss = loss_fct(outputs.logits, labels)
        return (loss, outputs) if return_outputs else loss

import inspect

# The keyword that hands the tokenizer to Trainer depends on the installed
# transformers release: the 4.36.2 pinned at the top of this notebook only
# accepts `tokenizer` (passing `processing_class` raises TypeError), while
# newer releases expect `processing_class`. Pick whichever the installed
# Trainer actually declares.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=hf["train"],
    eval_dataset=hf["validation"],
    data_collator=collator,
    compute_metrics=compute_metrics,
    # Stop as soon as one evaluation fails to improve the tracked metric.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
    **{_tokenizer_kwarg: tokenizer},
)

print("Trainer ready.")

# Fine-tune the model on the training split.
print("Starting training...")
trainer.train()
print("Training finished!")

# Score the trained model on the held-out test split.
print("Evaluating...")
metrics = trainer.evaluate(eval_dataset=hf["test"])
print("TEST METRICS:", metrics)

# Export: write model and tokenizer into one directory, then zip that
# directory so it can be downloaded as a single file.
import shutil

output_dir = "/kaggle/working/soc_model"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
shutil.make_archive(
    base_name="/kaggle/working/soc_model",
    format="zip",
    root_dir=output_dir,
)
print("Download: /kaggle/working/soc_model.zip")