<a href="https://colab.research.google.com/github/L-SanthoshKumar/log-analysis-script/blob/main/log_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
import re
from collections import defaultdict
import csv

# Module-level configuration read by main():
#   log_file_path   - the log file handed to parse_log_file().
#   output_csv_path - where save_to_csv() writes the report; being a relative
#                     path, it resolves against the current working directory.
# NOTE(review): '/content/' looks like a Colab working directory - confirm and
# point this at your own log file when running elsewhere.
log_file_path = '/content/sample.log'  # Replace with your actual log file path
output_csv_path = 'log_analysis_results.csv'

def parse_log_file(file_path):
    """Tally requests per IP, requests per endpoint and failed logins per IP.

    Every line of the file is examined independently:

    * The first run of non-whitespace characters at the very start of the
      line is taken as the client IP address.
    * The token following a quoted ``GET`` or ``POST`` verb is taken as the
      requested endpoint.
    * A line that contains the substrings ``POST /login``, ``401`` and
      ``Invalid credentials`` is counted as a failed login for that line's
      IP. These are plain substring checks, not a parse of the status field.

    Lines that do not start with a non-whitespace token (blank or indented
    lines) have no IP. They may still contribute an endpoint hit, but they
    are never attributed a failed login, rather than raising
    ``AttributeError`` on the missing match.

    Args:
        file_path: Path of the log file to read.

    Returns:
        A tuple ``(ip_requests, endpoint_requests, failed_logins)``; each
        element is a ``defaultdict(int)`` mapping a key (IP, endpoint, IP)
        to its count.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # Compile once up front instead of looking the patterns up on every line.
    ip_pattern = re.compile(r'(\S+)')
    endpoint_pattern = re.compile(r'\"(?:GET|POST) (\S+)')

    endpoint_requests = defaultdict(int)
    failed_logins = defaultdict(int)
    ip_requests = defaultdict(int)

    # Explicit encoding keeps results independent of the platform default;
    # errors='replace' stops a single undecodable byte from aborting the run.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        for line in file:
            # match() anchors at column 0, so indented/blank lines yield None.
            ip_match = ip_pattern.match(line)
            ip = ip_match.group(1) if ip_match else None
            if ip is not None:
                ip_requests[ip] += 1

            endpoint_match = endpoint_pattern.search(line)
            if endpoint_match:
                endpoint_requests[endpoint_match.group(1)] += 1

            # Only attribute a failed login when there is an IP to blame.
            if (
                ip is not None
                and "POST /login" in line
                and "401" in line
                and "Invalid credentials" in line
            ):
                failed_logins[ip] += 1

    return ip_requests, endpoint_requests, failed_logins

def analyze_logs(ip_requests, endpoint_requests, failed_logins, failed_login_threshold=3):
    """Turn the raw tallies into the three report sections.

    Args:
        ip_requests: Mapping of IP address to request count.
        endpoint_requests: Mapping of endpoint to access count.
        failed_logins: Mapping of IP address to failed-login count.
        failed_login_threshold: Minimum failed-login count (inclusive) for an
            IP to be reported as suspicious.

    Returns:
        A tuple ``(sorted_ip_requests, most_accessed_endpoint,
        suspicious_activity)``:

        * a list of ``(ip, count)`` pairs, highest count first;
        * the ``(endpoint, count)`` pair with the highest count, or ``None``
          when no endpoint was recorded;
        * a dict of the IPs whose failed-login count reaches the threshold.
    """
    def hits(entry):
        # Every entry is a (key, count) pair; rank on the count.
        return entry[1]

    # Busiest IPs first; the sort is stable, so ties keep insertion order.
    ranked_ips = list(ip_requests.items())
    ranked_ips.sort(key=hits, reverse=True)

    # default=None covers the case where no GET/POST request was seen at all.
    top_endpoint = max(endpoint_requests.items(), key=hits, default=None)

    # Keep only the IPs at or above the failed-login threshold.
    flagged = {}
    for address, failures in failed_logins.items():
        if failures >= failed_login_threshold:
            flagged[address] = failures

    return ranked_ips, top_endpoint, flagged

def save_to_csv(sorted_ip_requests, most_accessed_endpoint, suspicious_activity, output_path):
    """Write the analysis report to a CSV file, one section after another.

    The file holds three sections separated by an empty row: requests per
    IP, the most accessed endpoint, and suspicious activity. The last two
    sections start with a title row. When there is no most accessed endpoint
    or no suspicious activity, a placeholder ``None,0`` row is written.

    Args:
        sorted_ip_requests: Iterable of ``(ip, count)`` pairs, written in the
            order given.
        most_accessed_endpoint: ``(endpoint, count)`` pair, or a falsy value
            when there is none.
        suspicious_activity: Mapping of IP address to failed-login count.
        output_path: Destination path; an existing file is overwritten.
    """
    placeholder = ['None', '0']

    # newline='' lets the csv module control line endings itself.
    with open(output_path, 'w', newline='') as handle:
        out = csv.writer(handle)

        # Section 1: requests per IP.
        out.writerow(['IP Address', 'Request Count'])
        out.writerows([address, hits] for address, hits in sorted_ip_requests)
        out.writerow([])

        # Section 2: most accessed endpoint.
        out.writerow(['Most Accessed Endpoint'])
        out.writerow(['Endpoint', 'Access Count'])
        if not most_accessed_endpoint:
            out.writerow(placeholder)
        else:
            out.writerow([most_accessed_endpoint[0], most_accessed_endpoint[1]])
        out.writerow([])

        # Section 3: suspicious activity.
        out.writerow(['Suspicious Activity Detected'])
        out.writerow(['IP Address', 'Failed Login Count'])
        if not suspicious_activity:
            out.writerow(placeholder)
        else:
            out.writerows(
                [address, failures]
                for address, failures in suspicious_activity.items()
            )

def main():
    """Run the full pipeline: parse, analyze, print the report, save the CSV.

    Reads the log at the module-level ``log_file_path``, prints the three
    report sections to stdout, then writes the same results to the
    module-level ``output_csv_path``.
    """
    # Parse, then analyze with the default failed-login threshold.
    ip_counts, endpoint_counts, login_failures = parse_log_file(log_file_path)
    ranked_ips, top_endpoint, flagged_ips = analyze_logs(
        ip_counts, endpoint_counts, login_failures
    )

    # Section 1: per-IP request counts, in the order analyze_logs returned.
    print("Requests per IP:")
    print(f"{'IP Address':<20} {'Request Count'}")
    for address, hits in ranked_ips:
        print(f"{address:<20} {hits}")

    # Section 2: the single most accessed endpoint, if any was recorded.
    print("\nMost Frequently Accessed Endpoint:")
    if not top_endpoint:
        print("No endpoint accessed.")
    else:
        print(f"{top_endpoint[0]:<20} {top_endpoint[1]}")

    # Section 3: IPs flagged for failed logins; the header is always printed.
    print("\nSuspicious Activity Detected:")
    print(f"{'IP Address':<20} {'Failed Login Count'}")
    if not flagged_ips:
        print("No Suspicious Activity Detected.")
    else:
        for address, failures in flagged_ips.items():
            print(f"{address:<20} {failures}")

    # Persist the same results to disk and tell the user where they went.
    save_to_csv(ranked_ips, top_endpoint, flagged_ips, output_csv_path)
    print(f"\nResults saved to {output_csv_path}")

# Run the analysis only when this file is executed directly as a script;
# importing it as a module defines the functions without running main().
if __name__ == "__main__":
    main()


Requests per IP:
IP Address           Request Count
203.0.113.5          8
198.51.100.23        8
192.168.1.1          7
10.0.0.2             6
192.168.1.100        5

Most Frequently Accessed Endpoint:
/login               13

Suspicious Activity Detected:
IP Address           Failed Login Count
203.0.113.5          8
192.168.1.100        5

Results saved to log_analysis_results.csv
