## Define class for logs


In [1]:
import dataclasses
import datetime, time
from typing import Union, List, Dict, Type

@dataclasses.dataclass
class Logs:
    priority: int = None
    protocol_ver: int = 1
    timestamp: Union[datetime.datetime, float] = None
    host_name: str = None
    app_name: str = None
    process_id: int = None
    message_id: str = None
    struct_data: Dict = dataclasses.field(default_factory=dict)
    message: str = None

    raw: str = None
    creation_time: float = None
    error: bool = False

    def __str__(self):
        if self.struct_data:
            struct_data = '[' + ' '.join([f'{key}="{value}"' for key, value in self.struct_data.items()]) + ']'
        else:
            struct_data = '-'

        # transform each None value to '-'
        for key, value in self.__dict__.items():
            if value is None:
                self.__dict__[key] = '-'

        return f'<{self.priority}>{self.protocol_ver} {self.timestamp:%Y-%m-%dT%H:%M:%S.%f+%z} {self.host_name} {self.app_name} {self.process_id} {self.message_id} {struct_data} {self.message}'

### UNIX Logs

In [2]:
class UnixLogs(Logs):
    def __init__(self, raw):
        attributes = re.match(f'^(?P<timestamp>{MONTH} {DATE} {TIME}) (?P<host_name>\S+) (?:(?P<app_name>\S+)(?:\[(?P<process_id>\d+)\]): )?(?P<message>.*)$', raw).groupdict()
        # parse time
        attributes['timestamp'] = datetime.datetime.strptime(attributes['timestamp'], '%b %d %H:%M:%S').replace(year=datetime.datetime.now().year)
        super().__init__(**attributes, raw=raw)


## Log rules

In [3]:
import re

MONTH = '(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'
DAY = '(Mon|Tue|Wed|Thu|Fri|Sat|Sun)'
DATE = '([0 ][1-9]|[12][0-9]|3[01])'
TIME = '([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9]'

# rules for parsing logs
rules = {
    'boot.log(\.\d+)?': UnixLogs,
    'cron(\.\d+)?': UnixLogs,
}


## Read logs

In [4]:
def read_logs(log_file: str) -> List[str]:
    with open(log_file, 'r', encoding='utf8') as f:
        return [line for line in f]

def parse_logs(logs: List[str], parser_class: Type) -> List[Logs]:
    parsed_logs = []
    for log in logs:
        try:
            parsed_logs.append(parser_class(log))
        except AttributeError:
            print('AttributeError', log)
            continue
    return parsed_logs

## Walk logs

In [5]:
import os, os.path, pathlib

LOG_DIR = 'hnet-hon-var-log-02282006'

for root, dirs, files in os.walk(LOG_DIR):
    for file in files:
        # skip files with weird encoding or compressed files
        try:
            logs = read_logs(os.path.join(root, file))
        except UnicodeDecodeError:
            print('UnicodeDecodeError', os.path.join(root, file))
            continue

        for key in rules:
            if re.match(key, file):
                logs = parse_logs(logs, UnixLogs)
                break
        else:
            continue

        print(*logs, sep='\n')



<->1 2023-01-26T11:53:05.000000+ combo - - - - sendmail: sendmail shutdown failed
<->1 2023-01-26T11:53:06.000000+ combo - - - - sendmail: sm-client shutdown failed
<->1 2023-01-26T12:20:03.000000+ combo - - - - messagebus: messagebus -TERM succeeded
<->1 2023-01-26T12:20:05.000000+ combo - - - - atd: atd shutdown succeeded
<->1 2023-01-26T12:20:06.000000+ combo - - - - privoxy: privoxy shutdown succeeded
<->1 2023-01-26T12:20:06.000000+ combo - - - - cups: cupsd shutdown succeeded
<->1 2023-01-26T12:20:07.000000+ combo - - - - xfs: xfs shutdown succeeded
<->1 2023-01-26T12:20:08.000000+ combo - - - - FreeWnn: jserver shutdown succeeded
<->1 2023-01-26T12:20:08.000000+ combo - - - - canna: Stopping Canna server: succeeded
<->1 2023-01-26T12:20:09.000000+ combo - - - - mysqld: Stopping MySQL:  failed
<->1 2023-01-26T12:20:09.000000+ combo - - - - gpm: gpm shutdown succeeded
<->1 2023-01-26T12:20:14.000000+ combo - - - - httpd: httpd shutdown succeeded
<->1 2023-01-26T12:20:15.000000+ co

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



<->1 2023-06-12T04:05:00.000000+ combo CROND 8882 - - (root) CMD (/usr/bin/mrtg /etc/mrtg/mrtg.cfg) 
<->1 2023-06-12T04:05:00.000000+ combo CROND 8884 - - (mailman) CMD (/usr/bin/python -S /var/mailman/cron/gate_news) 
<->1 2023-06-12T04:10:00.000000+ combo CROND 9080 - - (root) CMD (/usr/bin/mrtg /etc/mrtg/mrtg.cfg) 
<->1 2023-06-12T04:10:00.000000+ combo CROND 9082 - - (mailman) CMD (/usr/bin/python -S /var/mailman/cron/gate_news) 
<->1 2023-06-12T04:10:00.000000+ combo CROND 9078 - - (root) CMD (/usr/lib/sa/sa1 1 1) 
<->1 2023-06-12T04:15:00.000000+ combo CROND 9136 - - (root) CMD (/usr/bin/mrtg /etc/mrtg/mrtg.cfg) 
<->1 2023-06-12T04:15:00.000000+ combo CROND 9138 - - (mailman) CMD (/usr/bin/python -S /var/mailman/cron/gate_news) 
<->1 2023-06-12T04:20:00.000000+ combo CROND 9140 - - (root) CMD (/usr/lib/sa/sa1 1 1) 
<->1 2023-06-12T04:20:00.000000+ combo CROND 9143 - - (root) CMD (/usr/bin/mrtg /etc/mrtg/mrtg.cfg) 
<->1 2023-06-12T04:20:00.000000+ combo CROND 9145 - - (mailman) CM

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

