In [1]:
!pip install pandas scikit-learn torch transformers datasets numpy tqdm


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m547.8/547.8 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting

In [4]:
import csv
import random
from datetime import datetime, timedelta
import ipaddress
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset

# Check if CUDA is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def generate_ip_range(base_ip, num_ips):
    base = ipaddress.ip_address(base_ip)
    return [str(base + i) for i in range(num_ips)]

def generate_attack_campaign():
    campaigns = [
        {"name": "Brute Force Campaign", "type": "Brute Force Attack", "duration": timedelta(hours=random.randint(1, 4))},
        {"name": "Data Exfiltration Campaign", "type": "Potential Data Exfiltration", "duration": timedelta(hours=random.randint(2, 6))},
        {"name": "Malware Infection Campaign", "type": "Possible Malware Infection", "duration": timedelta(days=random.randint(1, 3))},
        {"name": "Insider Threat Campaign", "type": "Potential Insider Threat", "duration": timedelta(days=random.randint(1, 2))},
        {"name": "Configuration Change Campaign", "type": "Configuration Change", "duration": timedelta(hours=random.randint(1, 3))},
        {"name": "Security Control Tampering Campaign", "type": "Security Control Tampering", "duration": timedelta(hours=random.randint(2, 5))},
        {"name": "DDoS Campaign", "type": "DDoS Attack on HTTPS", "duration": timedelta(hours=random.randint(1, 6))}
    ]
    return random.choice(campaigns)

def generate_log_entries(num_entries):
    log_entries = []
    current_time = datetime.now()

    while len(log_entries) < num_entries:
        campaign = generate_attack_campaign()
        campaign_start = current_time - timedelta(days=random.randint(0, 30))
        campaign_end = campaign_start + campaign['duration']

        if campaign['type'] in ["Brute Force Attack", "Potential Data Exfiltration", "DDoS Attack on HTTPS"]:
            base_ip = f"192.168.{random.randint(1, 254)}.0"
            ip_range = generate_ip_range(base_ip, 50 if campaign['type'] == "DDoS Attack on HTTPS" else 10)
        else:
            ip_range = [f"10.0.{random.randint(1, 254)}.{random.randint(1, 254)}"]

        num_events = random.randint(20, 100) if campaign['type'] == "DDoS Attack on HTTPS" else random.randint(3, 10)
        server = f"srv-{random.randint(100, 999)}"
        user = f"user{random.randint(100, 999)}"

        for _ in range(num_events):
            timestamp = campaign_start + (campaign_end - campaign_start) * random.random()
            ip = random.choice(ip_range)

            if campaign['type'] == "Brute Force Attack":
                event = f"Failed login attempt for {user} on {server} from IP {ip}"
                if random.random() < 0.1:
                    event = f"Successful login for {user} on {server} from IP {ip}"
            elif campaign['type'] == "Potential Data Exfiltration":
                event = f"Unusual outbound traffic spike detected from {server} to IP {ip} on port 443"
            elif campaign['type'] == "Possible Malware Infection":
                event = f"Suspicious process activity detected on {server} with IP {ip}"
            elif campaign['type'] == "Potential Insider Threat":
                event = f"Unexpected privileged account creation for {user} from {server} (IP {ip})"
            elif campaign['type'] == "Configuration Change":
                event = f"Firewall rule change detected on {server} from admin IP {ip}"
            elif campaign['type'] == "Security Control Tampering":
                event = f"Antivirus software disabled on {server} with IP {ip}"
            elif campaign['type'] == "DDoS Attack on HTTPS":
                event = f"High volume of HTTPS requests to {server} from IP {ip}"

            log_entries.append((
                f"{timestamp.strftime('%Y-%m-%d %H:%M:%S')} - {event}",
                campaign['type'],
                campaign['name'],
                server,
                user,
                ip
            ))

    return log_entries[:num_entries]

def generate_analysis(log_entry, event_type, campaign_name, server, user, ip):
    locations = ["New York, USA", "London, UK", "Tokyo, Japan", "Sydney, Australia", "Berlin, Germany"]
    roles = ["Database Administrator", "System Administrator", "Network Engineer", "Security Analyst", "Software Developer"]

    analysis = f"""Alert Title: {event_type} Detected
Date and Time: {log_entry.split(' - ')[0]} UTC
Event Description: {log_entry.split(' - ')[1]}
Event Type: {event_type}
Contextual Information:
Source IP and Host: {ip} (location: {random.choice(locations)})
Destination IP and Host: {server} (location: Internal Network)
User Information: {user} (Role: {random.choice(roles)})
Geolocation Data: {random.choice(locations)}
Technical Details:
Log Entries:
{log_entry}
Network Protocols: {"HTTPS" if event_type == "DDoS Attack on HTTPS" else random.choice(['SSH', 'HTTP', 'HTTPS', 'FTP', 'RDP'])}
Severity and Priority:
Severity Level: {"Critical" if event_type == "DDoS Attack on HTTPS" else "High" if event_type in ["Brute Force Attack", "Potential Data Exfiltration"] else "Medium"}
Priority Level: {"Urgent" if event_type in ["DDoS Attack on HTTPS", "Brute Force Attack", "Potential Data Exfiltration"] else "High"}
Impact Assessment:
Affected Systems: {server}
Business Impact: {"Potential service disruption and unavailability of web services." if event_type == "DDoS Attack on HTTPS" else "Potential unauthorized access to sensitive information." if event_type == "Brute Force Attack" else "Possible data loss or system compromise."}
Threat Intelligence:
Indicator of Compromise (IOC): IP {ip} associated with {campaign_name}.
Recommended Actions:
Immediate Actions: {"Implement traffic filtering and rate limiting on affected services." if event_type == "DDoS Attack on HTTPS" else "Isolate " + server + " from the network, reset " + user + "'s password." if event_type == "Brute Force Attack" else "Monitor " + server + " for unusual activity, restrict " + user + "'s access."}
Investigative Steps: {"Analyze traffic patterns and payload signatures." if event_type == "DDoS Attack on HTTPS" else "Review logs for further suspicious activity, check for any changes made by " + user + " during the session."}
Remediation Steps: {"Scale infrastructure resources, implement DDoS mitigation services." if event_type == "DDoS Attack on HTTPS" else "Implement account lockout policies after multiple failed login attempts" if event_type == "Brute Force Attack" else "Update security policies and access controls"}, ensure all systems have updated security patches.
"""
    return analysis

# Generate the dataset
num_entries = 3000
dataset = []

log_entries = generate_log_entries(num_entries)
for log_entry, event_type, campaign_name, server, user, ip in log_entries:
    analysis = generate_analysis(log_entry, event_type, campaign_name, server, user, ip)
    dataset.append([log_entry, analysis])

# Write to CSV
with open('security_logs.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['log_entry', 'analysis'])
    writer.writerows(dataset)

print(f"Generated {num_entries} log entries and saved to security_logs.csv")

# Load the dataset
df = pd.read_csv('security_logs.csv')

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load T5 tokenizer and model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

# Tokenize the dataset
def preprocess_data(examples):
    inputs = ["analyze security log: " + ex for ex in examples['log_entry']]
    targets = examples['analysis']
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(preprocess_data, batched=True)
val_dataset = val_dataset.map(preprocess_data, batched=True)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir='./logs',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_t5_security_model")
tokenizer.save_pretrained("./fine_tuned_t5_security_model")

# Function to generate analysis for a new log entry
def generate_analysis(log_entry):
    input_text = "analyze security log: " + log_entry
    input_ids = tokenizer(input_text, return_tensors="pt", max_length=128, truncation=True).input_ids.to(device)
    with torch.no_grad():
        outputs = model.generate(input_ids, max_length=512, num_return_sequences=1, no_repeat_ngram_size=2)
    outputs = outputs.cpu()
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
test_log = "2024-07-19 10:15:30 - High volume of HTTPS requests to srv-789 from IP 192.168.1.100"
analysis = generate_analysis(test_log)
print(f"Log Entry: {test_log}")
print(f"Generated Analysis:\n{analysis}")




Using device: cuda
Generated 3000 log entries and saved to security_logs.csv


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/2700 [00:00<?, ? examples/s]



Map:   0%|          | 0/300 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,1.6667,0.12285
2,0.3104,0.05794
3,0.1378,0.049345


Evaluation Results: {'eval_loss': 0.04934464022517204, 'eval_runtime': 5.8935, 'eval_samples_per_second': 50.904, 'eval_steps_per_second': 12.726, 'epoch': 3.0}
Log Entry: 2024-07-19 10:15:30 - High volume of HTTPS requests to srv-789 from IP 192.168.1.100
Generated Analysis:
Alert Title: DDoS Attack on HTTPS Detected Date and Time: 2024-07-19 10:15:30 UTC Event Description: High volume of HTTPs requests to srv-789 from IP 192.168.1.100 Event Type: Detentate on D DoS Campaign. Recommended Actions: Immediate Action: Implement traffic filtering and rate limiting on affected services. Investigative Action : Monitor a user's logs for suspicious activity, check for any changes made by user‚Äôs


In [None]:
#download folder
