In [18]:
import pandas as pd
import random
from datetime import datetime, timedelta



In [41]:

def generate_synthetic_logs(existing_logs, num_iterations=20, logs_per_iteration=100, seed=None):
    """
    Generate synthetic logs by resampling rows of an existing structured log dataset.

    Every synthetic row is a copy of a source row with a fresh ``LineId`` and
    new ``Month``/``Date``/``Time`` values. Rows are produced in batches: each
    batch starts at a random day offset and advances its clock by 1-900 seconds
    per row. About 10% of the draws take 5 consecutive source rows instead of a
    single random row (only when the dataset has at least 5 rows).

    Args:
        existing_logs: A ``pandas.DataFrame`` to resample. Any other value (for
            example a file path string) is ignored and the dataset is read from
            ``Linux_2k.log_structured.csv`` in the working directory; this keeps
            existing call sites that pass a path working unchanged.
        num_iterations: Number of batches to generate.
        logs_per_iteration: Number of synthetic rows per batch.
        seed: Optional seed for the private random generator. With a fixed seed
            the sequence of sampled rows is reproducible; the generated
            timestamps still depend on ``datetime.now()``.

    Returns:
        pandas.DataFrame: the source rows followed by the synthetic rows.

    Raises:
        AssertionError: if a required column is missing from the dataset.
        IndexError: if the dataset has no rows and at least one row must be sampled.

    Side effects:
        Writes ``combined_logs.csv`` and ``combined_logs.log`` to the working
        directory and prints one status line for each file.
    """

    # Load the dataset (a DataFrame argument is used as-is; see docstring)
    if isinstance(existing_logs, pd.DataFrame):
        df = existing_logs
    else:
        df = pd.read_csv("Linux_2k.log_structured.csv")

    # Ensure dataset has the required columns
    required_columns = ["LineId", "Month", "Date", "Time", "Level", "Component", "PID", "Content", "EventId", "EventTemplate"]
    assert all(col in df.columns for col in required_columns), "Dataset missing required columns"

    # Convert existing dataset to a list of dictionaries
    log_entries = df.to_dict(orient='records')

    # Get starting values
    current_line_id = df["LineId"].max() + 1
    start_date = datetime.now() - timedelta(days=len(df) // 30)  # Start from an estimated older date

    # A private generator leaves the module-level `random` state untouched and
    # lets callers reproduce a run through `seed`.
    rng = random.Random(seed)

    # Consecutive runs need at least `run_length` source rows; smaller datasets
    # fall back to single-row sampling instead of failing in randint().
    run_length = 5
    can_pick_run = len(log_entries) >= run_length

    synthetic_logs = []

    for _ in range(num_iterations):
        date_pointer = start_date + timedelta(days=rng.randint(0, 30))

        # Ensure no Feb 29 date is generated, replace with Feb 28
        if date_pointer.month == 2 and date_pointer.day == 29:
            date_pointer = date_pointer.replace(day=28)

        i = 0
        while i < logs_per_iteration:
            # 10% chance to pick a run of consecutive logs, as long as the run
            # does not reach the end of the batch quota.
            if can_pick_run and rng.random() < 0.1 and i + run_length < logs_per_iteration:
                # Last valid start is len - run_length, so the final window is reachable.
                start_index = rng.randint(0, len(log_entries) - run_length)
                selected_logs = log_entries[start_index:start_index + run_length]
                i += run_length
            else:
                selected_logs = [rng.choice(log_entries)]
                i += 1

            for log in selected_logs:
                log_copy = log.copy()

                # Assign new LineId
                log_copy["LineId"] = current_line_id
                current_line_id += 1

                # Advance the batch clock by 1 to 900 seconds (15 minutes max)
                date_pointer += timedelta(seconds=rng.randint(1, 900))

                # Ensure no Feb 29 date is generated after incrementing time.
                # This rewinds the pointer by exactly one day, so timestamps are
                # not strictly increasing around that date.
                if date_pointer.month == 2 and date_pointer.day == 29:
                    date_pointer = date_pointer.replace(day=28)

                log_copy["Month"] = date_pointer.strftime('%b')  # Abbreviated month name (e.g., Jan, Feb)
                log_copy["Date"] = date_pointer.day
                log_copy["Time"] = date_pointer.strftime('%H:%M:%S')

                synthetic_logs.append(log_copy)

    # Convert synthetic logs to DataFrame
    synthetic_df = pd.DataFrame(synthetic_logs)

    # Combine original dataset with synthetic logs
    combined_df = pd.concat([df, synthetic_df], ignore_index=True)
    combined_df.to_csv("combined_logs.csv", index=False)
    print(f"Generated {len(synthetic_logs)} synthetic logs and saved combined dataset to combined_logs.csv")

    # Convert to simple text log format. The encoding is explicit so the output
    # does not depend on the platform default.
    with open("combined_logs.log", "w", encoding="utf-8") as f:
        for _, row in combined_df.iterrows():
            # Removes every ".0" occurrence from the stringified component
            # (presumably to undo float formatting -- TODO confirm).
            component_cleaned = str(row['Component']).replace('.0', '')
            pid_value = int(row['PID']) if pd.notna(row['PID']) else "-"  # Handle NaN PIDs
            log_entry = f"{row['Month']} {row['Date']} {row['Time']} combo {component_cleaned}[{pid_value}]: {row['Content']}\n"
            f.write(log_entry)
    print("Saved combined dataset in simple text log format as combined_logs.log")

    return combined_df  # Return dataframe for preview

# Build the combined dataset. The path given here is not read by
# generate_synthetic_logs as defined in this notebook: for string arguments
# it loads "Linux_2k.log_structured.csv".
combined_df = generate_synthetic_logs(existing_logs="server_logs.csv")
# Preview the leading rows of the result
combined_df.head(n=5)



Generated 2000 synthetic logs and saved combined dataset to combined_logs.csv
Saved combined dataset in simple text log format as combined_logs.log


Unnamed: 0,LineId,Month,Date,Time,Level,Component,PID,Content,EventId,EventTemplate
0,1,Jun,14,15:16:01,combo,sshd(pam_unix),19939.0,authentication failure; logname= uid=0 euid=0 ...,E16,authentication failure; logname= uid=0 euid=0 ...
1,2,Jun,14,15:16:02,combo,sshd(pam_unix),19937.0,check pass; user unknown,E27,check pass; user unknown
2,3,Jun,14,15:16:02,combo,sshd(pam_unix),19937.0,authentication failure; logname= uid=0 euid=0 ...,E16,authentication failure; logname= uid=0 euid=0 ...
3,4,Jun,15,02:04:59,combo,sshd(pam_unix),20882.0,authentication failure; logname= uid=0 euid=0 ...,E18,authentication failure; logname= uid=0 euid=0 ...
4,5,Jun,15,02:04:59,combo,sshd(pam_unix),20884.0,authentication failure; logname= uid=0 euid=0 ...,E18,authentication failure; logname= uid=0 euid=0 ...


In [None]:
Optional: run the next cell to create and grow the dataset.

In [51]:
def generate_synthetic_logs(existing_logs, num_iterations=40, logs_per_iteration=100, seed=None):
    """
    Grow the combined log dataset by appending resampled synthetic rows.

    Every synthetic row is a copy of a source row with a fresh ``LineId`` and
    new ``Month``/``Date``/``Time`` values. Rows are produced in batches: each
    batch starts at a random day offset and advances its clock by 1-900 seconds
    per row. About 10% of the draws take 5 consecutive source rows instead of a
    single random row (only when the dataset has at least 5 rows).

    Args:
        existing_logs: A ``pandas.DataFrame`` to resample. Any other value (for
            example a file path string) is ignored and the dataset is read from
            ``combined_logs_growth.csv`` in the working directory; this keeps
            existing call sites that pass a path working unchanged.
        num_iterations: Number of batches to generate.
        logs_per_iteration: Number of synthetic rows per batch.
        seed: Optional seed for the private random generator. With a fixed seed
            the sequence of sampled rows is reproducible; the generated
            timestamps still depend on ``datetime.now()``.

    Returns:
        pandas.DataFrame: the source rows followed by the synthetic rows.

    Raises:
        AssertionError: if a required column is missing from the dataset.
        IndexError: if the dataset has no rows and at least one row must be sampled.

    Side effects:
        Overwrites ``combined_logs_growth.csv`` (the same file it reads by
        default, so repeated runs keep growing it), writes
        ``combined_logs_growth.log``, and prints one status line for each file.
    """

    # Load the dataset (a DataFrame argument is used as-is; see docstring)
    if isinstance(existing_logs, pd.DataFrame):
        df = existing_logs
    else:
        df = pd.read_csv("combined_logs_growth.csv")

    # Ensure dataset has the required columns
    required_columns = ["LineId", "Month", "Date", "Time", "Level", "Component", "PID", "Content", "EventId", "EventTemplate"]
    assert all(col in df.columns for col in required_columns), "Dataset missing required columns"

    # Convert existing dataset to a list of dictionaries
    log_entries = df.to_dict(orient='records')

    # Get starting values
    current_line_id = df["LineId"].max() + 1
    start_date = datetime.now() - timedelta(days=len(df) // 30)  # Start from an estimated older date

    # A private generator leaves the module-level `random` state untouched and
    # lets callers reproduce a run through `seed`.
    rng = random.Random(seed)

    # Consecutive runs need at least `run_length` source rows; smaller datasets
    # fall back to single-row sampling instead of failing in randint().
    run_length = 5
    can_pick_run = len(log_entries) >= run_length

    synthetic_logs = []

    for _ in range(num_iterations):
        date_pointer = start_date + timedelta(days=rng.randint(0, 30))

        # Ensure no Feb 29 date is generated, replace with Feb 28
        if date_pointer.month == 2 and date_pointer.day == 29:
            date_pointer = date_pointer.replace(day=28)

        i = 0
        while i < logs_per_iteration:
            # 10% chance to pick a run of consecutive logs, as long as the run
            # does not reach the end of the batch quota.
            if can_pick_run and rng.random() < 0.1 and i + run_length < logs_per_iteration:
                # Last valid start is len - run_length, so the final window is reachable.
                start_index = rng.randint(0, len(log_entries) - run_length)
                selected_logs = log_entries[start_index:start_index + run_length]
                i += run_length
            else:
                selected_logs = [rng.choice(log_entries)]
                i += 1

            for log in selected_logs:
                log_copy = log.copy()

                # Assign new LineId
                log_copy["LineId"] = current_line_id
                current_line_id += 1

                # Advance the batch clock by 1 to 900 seconds (15 minutes max)
                date_pointer += timedelta(seconds=rng.randint(1, 900))

                # Ensure no Feb 29 date is generated after incrementing time.
                # This rewinds the pointer by exactly one day, so timestamps are
                # not strictly increasing around that date.
                if date_pointer.month == 2 and date_pointer.day == 29:
                    date_pointer = date_pointer.replace(day=28)

                log_copy["Month"] = date_pointer.strftime('%b')  # Abbreviated month name (e.g., Jan, Feb)
                log_copy["Date"] = date_pointer.day
                log_copy["Time"] = date_pointer.strftime('%H:%M:%S')

                synthetic_logs.append(log_copy)

    # Convert synthetic logs to DataFrame
    synthetic_df = pd.DataFrame(synthetic_logs)

    # Combine original dataset with synthetic logs
    combined_df = pd.concat([df, synthetic_df], ignore_index=True)
    combined_df.to_csv("combined_logs_growth.csv", index=False)
    # The status line names the file that was actually written above.
    print(f"Generated {len(synthetic_logs)} synthetic logs and saved combined dataset to combined_logs_growth.csv")

    # Convert to simple text log format. The encoding is explicit so the output
    # does not depend on the platform default.
    with open("combined_logs_growth.log", "w", encoding="utf-8") as f:
        for _, row in combined_df.iterrows():
            # Removes every ".0" occurrence from the stringified component
            # (presumably to undo float formatting -- TODO confirm).
            component_cleaned = str(row['Component']).replace('.0', '')
            pid_value = int(row['PID']) if pd.notna(row['PID']) else "-"  # Handle NaN PIDs
            log_entry = f"{row['Month']} {row['Date']} {row['Time']} combo {component_cleaned}[{pid_value}]: {row['Content']}\n"
            f.write(log_entry)
    print("Saved combined dataset in simple text log format as combined_logs_growth.log")

    return combined_df  # Return dataframe for preview

# Grow the dataset. The path given here is not read by
# generate_synthetic_logs as defined in this notebook: for string arguments
# it loads "combined_logs_growth.csv".
combined_df = generate_synthetic_logs(existing_logs="server_logs.csv")
# Preview the leading rows of the result
combined_df.head(n=5)

Generated 4000 synthetic logs and saved combined dataset to combined_logs.csv
Saved combined dataset in simple text log format as combined_logs_growth.log


Unnamed: 0,LineId,Month,Date,Time,Level,Component,PID,Content,EventId,EventTemplate
0,1,Jun,14,15:16:01,combo,sshd(pam_unix),19939.0,authentication failure; logname= uid=0 euid=0 ...,E16,authentication failure; logname= uid=0 euid=0 ...
1,2,Jun,14,15:16:02,combo,sshd(pam_unix),19937.0,check pass; user unknown,E27,check pass; user unknown
2,3,Jun,14,15:16:02,combo,sshd(pam_unix),19937.0,authentication failure; logname= uid=0 euid=0 ...,E16,authentication failure; logname= uid=0 euid=0 ...
3,4,Jun,15,02:04:59,combo,sshd(pam_unix),20882.0,authentication failure; logname= uid=0 euid=0 ...,E18,authentication failure; logname= uid=0 euid=0 ...
4,5,Jun,15,02:04:59,combo,sshd(pam_unix),20884.0,authentication failure; logname= uid=0 euid=0 ...,E18,authentication failure; logname= uid=0 euid=0 ...
