In [10]:
import os
import pandas as pd
import re
import numpy as np
from datetime import datetime

from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from scipy.stats import zscore
from sklearn.metrics import classification_report
from gensim.models import Word2Vec

from nltk.corpus import stopwords, wordnet


# Fetch the NLTK corpora this notebook relies on: the English stop-word list
# and the WordNet database used for synonym lookup.
import nltk
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

def get_synonym(word):
    """Return the set of WordNet lemma names found for ``word``.

    Every synset returned by ``wordnet.synsets(word)`` is visited and the name
    of each of its lemmas is collected, with underscores replaced by spaces.

    Args:
        word: The term to look up in WordNet.

    Returns:
        A set of lemma-name strings; empty when WordNet yields no synsets.
    """
    return {
        lemma.name().replace('_', ' ')
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }

# English stop words from the NLTK corpus.
stop_words = set(stopwords.words('english'))

# Primary phrases treated as signs of an error in a log message.
error_keywords = {
    'does not exist', 'error', 'Failed', 'fail', 'failure',
    'forbidden', 'critical', 'warning', 'alert',
    'password failure', 'connection timeout', 'disk full', 'out of memory',
    'segmentation fault', 'not found', 'invalid request', 'service unavailable',
}

# Secondary phrases that widen the keyword rule.
additional_keywords = {
    'denied', 'unauthorized', 'invalid', 'timeout', 'abort', 'blocked',
    'rejected', 'unreachable', 'not found', 'no such file', 'broken pipe',
    'protocol error', 'access violation', 'stack trace', 'core dumped',
}

# Expand each keyword set with the WordNet synonyms of its members; the
# original keywords are always kept in the expanded set.
expanded_error_keywords = error_keywords.union(
    *(get_synonym(term) for term in error_keywords)
)
expanded_additional_keywords = additional_keywords.union(
    *(get_synonym(term) for term in additional_keywords)
)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ismayilzadamaharram/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ismayilzadamaharram/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
#EXPANDING ANOMALY RULES

def detect_anomaly_extended(message):
    """Return True when ``message`` contains any expanded anomaly keyword.

    Matching is a case-insensitive *substring* test: both the message and each
    keyword are lower-cased before comparison, so capitalised keywords (for
    example 'Failed') can match as well. Both module-level keyword sets are
    scanned in place instead of building a merged set on every call.

    Args:
        message: The log message text.

    Returns:
        bool: True if at least one keyword from ``expanded_error_keywords`` or
        ``expanded_additional_keywords`` occurs in the message.
    """
    message_lower = message.lower()
    return any(
        keyword.lower() in message_lower
        for keyword_set in (expanded_error_keywords, expanded_additional_keywords)
        for keyword in keyword_set
    )

#Regex
def detect_pattern_anomalies(message):
    """Return True when ``message`` matches any suspicious regex pattern.

    The message is lower-cased once and every pattern is written in lower
    case, so matching is effectively case-insensitive.

    Args:
        message: The log message text.

    Returns:
        bool: True if at least one pattern is found anywhere in the message.
    """
    # NOTE: patterns must stay lower-case because they are searched against
    # the lower-cased message; an upper-case pattern could never match.
    suspicious_patterns = [
        r'failed\slogin',    # Failed login attempts
        r'sql\sinjection',   # SQL injection patterns
        r'invalid\suser',    # Invalid user patterns
        r'possible\sbreak-in\sattempt',  # Specific break-in attempt message
        r'connection\stimeout',    # Connection timeout patterns
        r'disk\sfull',             # Disk full errors
        r'segmentation\sfault',    # Segmentation fault
        r'service\sunavailable',   # Service unavailable errors
        r'protocol\serror',        # Protocol errors
        r'access\sviolation',      # Access violation errors
        r'authentication\sfailed', # Authentication failure patterns
        r'unable\sto\sconnect',    # Connection issues
        r'login\sattempt\sfailure' # Login attempt failures
    ]
    message_lower = message.lower()
    return any(re.search(pattern, message_lower) for pattern in suspicious_patterns)

#Statistical outliers
def detect_statistical_outliers(df):
    """Flag rows whose message occurs unusually often (or rarely).

    The occurrence count of every distinct message is z-scored against the
    counts of all distinct messages; a row is flagged when the absolute
    z-score of its message's count exceeds 2.5.

    Args:
        df: DataFrame with a 'message' column. It is modified in place: a
            boolean 'statistical_anomaly' column is added or overwritten.

    Returns:
        None.
    """
    message_counts = df['message'].value_counts()
    if message_counts.empty:
        # Nothing to score (no rows, or no non-null messages).
        df['statistical_anomaly'] = False
        return

    z_scores = zscore(message_counts.to_numpy(dtype=float))
    # A NaN z-score (all counts identical) compares False, i.e. not an outlier.
    is_outlier = np.abs(z_scores) > 2.5

    # Look the flag up by the message itself. Mapping the message to its count
    # first and then searching for that count in the message index never
    # matches, which leaves every row unflagged.
    outlier_by_message = dict(zip(message_counts.index, is_outlier))
    df['statistical_anomaly'] = df['message'].map(
        lambda msg: bool(outlier_by_message.get(msg, False))
    )


# Clustering anomalies using Word2Vec
def detect_clustering_anomalies(df):
    """Flag messages that DBSCAN labels as noise in Word2Vec embedding space.

    A skip-gram Word2Vec model is trained on the whitespace-tokenized,
    lower-cased messages. Each message is represented by the mean of its token
    vectors, and the message vectors are clustered with DBSCAN.

    Args:
        df: DataFrame with a string 'message' column. It is modified in place:
            'cluster' receives the DBSCAN label of each row and
            'clustering_anomaly' is True where that label is -1.

    Returns:
        None.
    """
    vector_size = 100

    if len(df) == 0:
        # Word2Vec cannot build a vocabulary from an empty corpus; emit empty
        # result columns instead of failing.
        df['cluster'] = np.array([], dtype=int)
        df['clustering_anomaly'] = np.array([], dtype=bool)
        return

    # Tokenize once and reuse the tokens for both training and embedding.
    tokenized_messages = [msg.lower().split() for msg in df['message']]
    word2vec_model = Word2Vec(sentences=tokenized_messages, vector_size=vector_size, window=5, min_count=1, workers=4, sg=1)
    vectors = word2vec_model.wv

    def embed(tokens):
        # Mean of the known token vectors. A message with no usable tokens
        # (e.g. whitespace only) gets a zero vector, because np.mean([]) would
        # yield a scalar NaN that cannot be stacked with the other rows.
        token_vectors = [vectors[token] for token in tokens if token in vectors]
        if not token_vectors:
            return np.zeros(vector_size, dtype=np.float32)
        return np.mean(token_vectors, axis=0)

    word2vec_features = np.vstack([embed(tokens) for tokens in tokenized_messages])
    clustering = DBSCAN(eps=0.3, min_samples=5, metric='euclidean').fit(word2vec_features)
    df['cluster'] = clustering.labels_
    df['clustering_anomaly'] = df['cluster'] == -1


#Contextual and sequential analysis
def detect_sequential_anomalies(df, window_size=5, threshold=5):
    """Flag rows that close a window of accumulated anomaly score.

    Sums 'combined_anomaly' over a trailing window of ``window_size`` rows and
    marks a row when that sum reaches ``threshold``. Rows without a full
    window have no sum and are therefore not flagged.

    Args:
        df: DataFrame with a numeric 'combined_anomaly' column. It is modified
            in place: a boolean 'sequential_anomaly' column is written.
        window_size: Number of consecutive rows in each window.
        threshold: Minimum windowed sum that counts as a sequential anomaly.

    Returns:
        None.
    """
    windowed_total = df['combined_anomaly'].rolling(window=window_size).sum()
    df['sequential_anomaly'] = windowed_total.ge(threshold)


#APPLY
def apply_anomaly_detection(df):
    """Run every anomaly detector on ``df`` and combine them into one score.

    Adds (in place) the columns 'keyword_anomaly', 'pattern_anomaly',
    'statistical_anomaly', 'clustering_anomaly', 'combined_anomaly' and
    'sequential_anomaly', plus any columns the individual detectors write.

    'combined_anomaly' is the weighted sum of the four detector flags, clipped
    to the range [0, 1].

    Args:
        df: DataFrame with a string 'message' column.

    Returns:
        None.
    """
    df['keyword_anomaly'] = df['message'].apply(detect_anomaly_extended)
    df['pattern_anomaly'] = df['message'].apply(detect_pattern_anomalies)
    detect_statistical_outliers(df)
    df['statistical_anomaly'] = df['statistical_anomaly'].astype(float)
    detect_clustering_anomalies(df)
    df['clustering_anomaly'] = df['clustering_anomaly'].astype(float)

    # Relative contribution of each detector; the weights sum to 1.
    weights = {
        'keyword_anomaly': 0.50,
        'pattern_anomaly': 0.25,
        'statistical_anomaly': 0.10,
        'clustering_anomaly': 0.15
    }

    # Weighted sum of the detector flags. Every flag is *multiplied* by its
    # weight; adding the clustering weight instead would raise every row's
    # score by 0.15 and saturate any row with a clustering hit.
    df['combined_anomaly'] = sum(
        df[column].astype(float) * weight for column, weight in weights.items()
    )

    df['combined_anomaly'] = df['combined_anomaly'].clip(0, 1)

    # Contextual anomaly detection
    detect_sequential_anomalies(df)



In [12]:
#Function to parse SSH logs
def parse_ssh_log(line):
    """Parse one sshd syslog line.

    Expected shape: ``<Mon> <day> <HH:MM:SS> <host> sshd[<pid>]: <message>``.
    The day may be space-padded (``Jan  7``) and the host may contain any
    non-whitespace characters.

    Args:
        line: A single raw log line.

    Returns:
        A tuple ``(date, 'ssh', message, message_length)``, or
        ``(None, None, None, None)`` when the line does not match the expected
        shape or its timestamp is not a valid date in the current year.
    """
    match = re.match(
        r'^(?P<date>\w+ +\d+ \d+:\d+:\d+) (?P<source>\S+) sshd\[\d+\]: (?P<message>.+)$',
        line,
    )
    if not match:
        return None, None, None, None

    # SSH logs do not include the year, so assume the current year. The year is
    # parsed together with the rest of the timestamp: parsing without one
    # defaults to 1900, which rejects "Feb 29".
    current_year = datetime.now().year
    try:
        date = datetime.strptime(
            f"{current_year} {match.group('date')}", '%Y %b %d %H:%M:%S'
        )
    except ValueError:
        # Not a real date (unknown month, day out of range, ...): skip the line.
        return None, None, None, None

    message = match.group('message')
    return date, 'ssh', message, len(message)



# Function to parse Apache logs
def parse_apache_log(line):
    """Parse one Apache error-log line.

    Expected shape: ``[<timestamp>] [<level>] <message>`` where the timestamp
    looks like ``Sun Dec 04 04:47:44 2005``; the trailing year is optional.

    Args:
        line: A single raw log line.

    Returns:
        A tuple ``(date, 'apache', message, message_length)``, or
        ``(None, None, None, None)`` when the line does not match the expected
        shape or its timestamp cannot be parsed.
    """
    # The date group stops at the first ']' so it cannot swallow brackets that
    # belong to the message.
    match = re.match(r'^\[(?P<date>[^\]]+)\] \[(?P<log_level>\w+)\] (?P<message>.+)$', line)
    if not match:
        return None, None, None, None

    date_str = match.group('date')
    date_format = '%a %b %d %H:%M:%S %Y'
    try:
        # Attempt to parse the date with the year first
        date = datetime.strptime(date_str, date_format)
    except ValueError:
        try:
            # If year is not present, assume the current year. The year is
            # appended before parsing so "Feb 29" is validated against it.
            date = datetime.strptime(f'{date_str} {datetime.now().year}', date_format)
        except ValueError:
            # Neither layout fits: skip the line rather than abort the load.
            return None, None, None, None

    message = match.group('message')
    return date, 'apache', message, len(message)





#Path to log files
log_files = {
    'ssh': '../archive-4/SSH.log',
    'apache': '../archive-4/Apache.log',
}

#Parsing functions
parsers = {
    'ssh': parse_ssh_log,
    'apache': parse_apache_log,
}

# Maximum number of successfully parsed lines kept per log file. Change this
# single value to scale the dataset; the output file name follows it.
MAX_LINES_PER_LOG = 100000
OUTPUT_PATH = '../merged_dataset/merged_logs%d.csv' % MAX_LINES_PER_LOG


#Merging all log files
all_logs = []
for log_type, file in log_files.items():
    parse_line = parsers[log_type]
    # Undecodable bytes are replaced so one bad byte cannot abort the load.
    with open(file, 'r', encoding='utf-8', errors='replace') as f:
        count = 0
        for line in f:
            if count >= MAX_LINES_PER_LOG:
                break
            date, _, message, message_length = parse_line(line)
            # The parsers return None values for lines they cannot parse.
            if date and message:
                all_logs.append([date, log_type, message, message_length])
                count += 1


#Create DataFrame
df = pd.DataFrame(all_logs, columns=['date', 'log_type', 'message', 'message_length'])

apply_anomaly_detection(df)

#Dropping the per-detector columns; only the combined results are exported
df.drop(
    columns=['keyword_anomaly', 'pattern_anomaly', 'statistical_anomaly', 'clustering_anomaly'],
    inplace=True,
)

# Make sure the target directory exists before writing the merged dataset.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
df.to_csv(OUTPUT_PATH, index=False)