In [14]:
import re
from datetime import datetime

log_file_path = "confucius_access.log"

# Regular expression matching one access-log entry of the form:
#   ip - - [timestamp] "METHOD url HTTP/x.y" status size
log_pattern = re.compile(
    r'(?P<ip>\S+) - - \[(?P<timestamp>[^\]]+)\] "(?P<method>[A-Z]+) (?P<url>\S+) HTTP/\d\.\d" (?P<status>\d+) (?P<size>\S+)'
)


def collect_visitors(lines):
    """Group access-log entries by month and by year, keyed by IP prefix.

    Args:
        lines: Iterable of raw log lines (for example an open text file).

    Returns:
        A ``(monthly, yearly)`` tuple of nested dicts:
        ``monthly[YYYY-MM][ip_prefix]`` and ``yearly[YYYY][ip_prefix]`` are
        lists holding one entry dict per calendar day on which that prefix
        appeared (the first matching request of the day is the one kept).

    Lines that do not match ``log_pattern``, or whose timestamp cannot be
    parsed, are skipped.
    """
    monthly = {}
    yearly = {}
    # (ip_prefix, date) pairs already recorded. A set lookup replaces
    # rebuilding the list of stored dates for every single log line.
    seen_days = set()

    for line in lines:
        match = log_pattern.search(line)
        if not match:
            continue

        ip = match.group("ip")
        time_str = match.group("timestamp")
        try:
            # Only the date/time part is parsed; the UTC offset after the
            # space is ignored.
            parsed = datetime.strptime(time_str.split()[0], "%d/%b/%Y:%H:%M:%S")
        except (ValueError, IndexError):
            # One malformed timestamp must not abort the whole run.
            continue

        month_key = parsed.strftime("%Y-%m")
        year_key = parsed.strftime("%Y")
        date_key = parsed.strftime("%Y-%m-%d")

        # Use first three octets of IP for grouping similar visitors
        # (e.g., "100.23.34"). An address with no dots is kept whole.
        ip_prefix = ".".join(ip.split(".")[:3])

        # The prefix is registered for the month and the year even when the
        # entry itself turns out to be a same-day duplicate.
        month_entries = monthly.setdefault(month_key, {}).setdefault(ip_prefix, [])
        year_entries = yearly.setdefault(year_key, {}).setdefault(ip_prefix, [])

        if (ip_prefix, date_key) in seen_days:
            continue
        seen_days.add((ip_prefix, date_key))

        month_entries.append({
            "date": date_key,
            "ip": ip,
            "timestamp": time_str,
            "method": match.group("method"),
            "url": match.group("url"),
        })
        # NOTE(review): yearly entries store the day under "date_key" while
        # monthly entries use "date"; kept as-is so existing consumers of
        # either structure keep working.
        year_entries.append({
            "date_key": date_key,
            "ip": ip,
            "timestamp": time_str,
            "method": match.group("method"),
            "url": match.group("url"),
        })

    return monthly, yearly


if __name__ == "__main__":
    # Parse the log file. Undecodable bytes are replaced rather than raising,
    # so a single bad byte in the log cannot stop the parse.
    with open(log_file_path, "r", encoding="utf-8", errors="replace") as file:
        monthly_visitors, yearly_visitors = collect_visitors(file)

output_file_month = "monthly_visitors_report.txt"
output_file_year = "yearly_visitors_report.txt"


def write_yearly_report(yearly, out):
    """Write the per-year visitor report to *out* and return the grand total.

    Args:
        yearly: Mapping of year -> mapping of IP prefix -> entries.
        out: Writable text stream.

    Returns:
        The number of IP prefixes summed over all years (a prefix that
        appears in several years is counted once per year).
    """
    total = 0
    for year, visitors in yearly.items():
        year_count = len(visitors)
        total += year_count
        out.write(f"Year: {year}\n")
        # Report this year's own count; the running total across years is
        # returned separately instead of being written under each year.
        out.write(f"Total visitors: {year_count}\n")
        for ip in visitors:
            out.write(f"{ip}\n")
    return total


def write_monthly_report(monthly, out):
    """Write the per-month unique-visitor counts to *out*.

    Args:
        monthly: Mapping of month (YYYY-MM) -> mapping of IP prefix -> entries.
        out: Writable text stream.
    """
    for month, visitors in monthly.items():
        out.write(f"Month: {month}\n")
        out.write(f"Total unique visitors: {len(visitors)}\n")
        out.write("\n")


if __name__ == "__main__":
    # Dump the raw per-year structure to stdout, as the script always did.
    for year, visitors in yearly_visitors.items():
        print(f'{year}: {visitors}')

    with open(output_file_year, "w", encoding="utf-8") as file:
        total_visitors = write_yearly_report(yearly_visitors, file)

    with open(output_file_month, "w", encoding="utf-8") as file:
        write_monthly_report(monthly_visitors, file)

    print(f"Total visitors in a year: {total_visitors}")


2024: {'193.226.40': [{'date_key': '2024-02-02', 'ip': '193.226.40.43', 'timestamp': '02/Feb/2024:14:06:00 +0000', 'method': 'GET', 'url': '/'}, {'date_key': '2024-02-05', 'ip': '193.226.40.59', 'timestamp': '05/Feb/2024:13:39:03 +0000', 'method': 'GET', 'url': '/'}, {'date_key': '2024-02-06', 'ip': '193.226.40.59', 'timestamp': '06/Feb/2024:00:00:06 +0000', 'method': 'POST', 'url': '/wp-admin/admin-ajax.php'}, {'date_key': '2024-02-21', 'ip': '193.226.40.59', 'timestamp': '21/Feb/2024:14:25:42 +0000', 'method': 'GET', 'url': '/'}, {'date_key': '2024-02-22', 'ip': '193.226.40.59', 'timestamp': '22/Feb/2024:07:56:52 +0000', 'method': 'POST', 'url': '/inscriere_cursuri/'}, {'date_key': '2024-03-04', 'ip': '193.226.40.59', 'timestamp': '04/Mar/2024:08:21:26 +0000', 'method': 'GET', 'url': '/admin'}, {'date_key': '2024-03-13', 'ip': '193.226.40.54', 'timestamp': '13/Mar/2024:09:08:12 +0000', 'method': 'GET', 'url': '/'}, {'date_key': '2024-09-09', 'ip': '193.226.40.43', 'timestamp': '09/Se