In [30]:
import gzip
import os
import shutil
import boto3
from IPython import embed
import pandas


# Shared S3 client used by the download helper; no credentials or region are
# passed explicitly here.
s3 = boto3.client('s3')

bucket_name = 'skoove-aws-logs'
# Key prefix of the log objects to fetch: distribution ID and date.
# NOTE(review): the original comment said "CloudWatch", but the fields parsed
# later in this notebook are CloudFront access-log fields — confirm.
prefix = 'lesson_content/E151JJXT4MVQ3V.2024-04-24'

# Create the directory that stores the downloaded files. exist_ok=True makes
# re-running the cell a no-op without a separate existence check.
os.makedirs('lesson_content', exist_ok=True)

def download_file(bucket_name, prefix, client=None):
    """Download every object under ``prefix`` in ``bucket_name`` to local disk.

    Each object is written to a local path equal to its S3 key, relative to
    the current working directory. Missing parent directories are created.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        prefix: Key prefix; only objects whose key starts with it are listed.
        client: Optional S3 client exposing ``get_paginator`` and
            ``download_file``. Defaults to the module-level ``s3`` client.

    Returns:
        None. A message is printed when nothing was downloaded.
    """
    if client is None:
        client = s3

    # One list_objects_v2 response holds a single page of keys only, so walk
    # all pages through the paginator instead of reading just the first one.
    paginator = client.get_paginator('list_objects_v2')
    found_any = False
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # Pages without matches carry no 'Contents' entry at all.
        for item in page.get('Contents', []):
            file_name = item['Key']
            if file_name.endswith('/'):
                # "Folder" placeholder keys cannot be written as a file.
                continue
            found_any = True
            # The local path mirrors the key, so its directory must exist.
            parent_dir = os.path.dirname(file_name)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            # Download each file to the local file system
            client.download_file(bucket_name, file_name, file_name)

    if not found_any:
        print("No files found with the specified prefix.")

def unzip_all_files_in_dir(dir_name):
    """Decompress every ``*.gz`` file found directly inside ``dir_name``.

    The decompressed copy is written next to the archive, under the same name
    minus the trailing ``.gz``. The archive itself is left in place.
    """
    gz_suffix = '.gz'
    for entry in os.listdir(dir_name):
        # Anything that is not a gzip archive is left alone
        if not entry.endswith(gz_suffix):
            continue

        source_path = os.path.join(dir_name, entry)
        # Drop the '.gz' suffix to get the name of the decompressed file
        target_path = os.path.join(dir_name, entry[:-len(gz_suffix)])

        # Stream the decompressed bytes straight into the target file
        with gzip.open(source_path, 'rb') as compressed, \
                open(target_path, 'wb') as plain:
            shutil.copyfileobj(compressed, plain)

def load_dataframe(dir_name, headers, skiprows):
    """Load every non-gzipped file in ``dir_name`` into a single DataFrame.

    Args:
        dir_name: Directory holding the tab-separated files to parse.
        headers: Column names to assign; every file must have exactly this
            many columns.
        skiprows: Forwarded to ``pandas.read_csv``; the leading lines to skip
            in every file before the first record.

    Returns:
        The per-file DataFrames concatenated row-wise, with ``headers`` as
        columns. An empty DataFrame with those columns is returned when the
        directory contains nothing to load.

    Raises:
        AssertionError: If a file's column count differs from ``len(headers)``.
    """
    # Create an empty list to store the dataframes
    dataframes = []
    # Sorted so the row order of the result does not depend on the
    # arbitrary order in which os.listdir returns entries.
    for filename in sorted(os.listdir(dir_name)):
        file_path = os.path.join(dir_name, filename)
        # Skip the compressed originals and anything that is not a regular
        # file (e.g. sub-directories), which read_csv cannot parse.
        if filename.endswith('.gz') or not os.path.isfile(file_path):
            continue

        try:
            df = pandas.read_csv(
                file_path,
                delimiter='\t',
                skiprows=skiprows,
                # No column-name row remains once `skiprows` lines are
                # dropped. Without header=None pandas would consume the first
                # record as the header, and that record would be lost when
                # the columns are renamed below.
                header=None,
                on_bad_lines='warn')
        except pandas.errors.EmptyDataError:
            # File has no records after the skipped lines; nothing to add.
            continue

        assert df.shape[1] == len(headers), (
            f"{file_path}: expected {len(headers)} columns, got {df.shape[1]}")
        df.columns = headers
        # Append the DataFrame to the list
        dataframes.append(df)

    # pandas.concat raises on an empty list, so handle "nothing loaded" here.
    if not dataframes:
        return pandas.DataFrame(columns=headers)
    # Concatenate all DataFrames in the list into a single DataFrame
    return pandas.concat(dataframes, axis=0)

# Fetch the objects matching the prefix, then decompress the downloaded
# .gz files next to their archives.
download_file(bucket_name=bucket_name, prefix=prefix)
unzip_all_files_in_dir(dir_name='lesson_content')

In [31]:
# Parse every decompressed log file into one frame. `headers` lists the
# column names in file order; `skiprows=2` skips the two leading lines of
# each file before the first record.
df = load_dataframe(
    dir_name='lesson_content',
    headers=[
        'date',
        'time',
        'x-edge-location',
        'sc-bytes',
        'c-ip',
        'cs-method',
        'cs(Host)',
        'cs-uri-stem',
        'sc-status',
        'cs(Referer)',
        'cs(User-Agent)',
        'cs-uri-query',
        'cs(Cookie)',
        'x-edge-result-type',
        'x-edge-request-id',
        'x-host-header',
        'cs-protocol',
        'cs-bytes',
        'time-taken',
        'x-forwarded-for',
        'ssl-protocol',
        'ssl-cipher',
        'x-edge-response-result-type',
        'cs-protocol-version',
        'fle-status',
        'fle-encrypted-fields',
        'c-port',
        'time-to-first-byte',
        'x-edge-detailed-result-type',
        'sc-content-type',
        'sc-content-len',
        'sc-range-start',
        'sc-range-end',
    ],
    skiprows=2,
)

Number of 400 errors

In [32]:
# Keep only the requests whose response status is 400
error_400 = df[df['sc-status'].eq(400)]
# Number of such requests
len(error_400)

3288

#### 400-causing URIs

Q: @workspace What does %25C0 mean in CloudFront logs?

A:
In CloudFront logs, `%25C0` is a URL-encoded string. It represents the `%C0` string when decoded. 

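`%C0` is the URL-encoded form of the single byte `0xC0`. That byte is not an ASCII character (ASCII ends at `0x7F`) and it is not the `NULL` character (which would be `%00`); in UTF-8, `0xC0` can only occur as the lead byte of an invalid "overlong" two-byte sequence. 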

However, `%C0` is often used in overlong UTF-8 encoding tricks to bypass security measures, as it can be used to represent `/` (`%C0%AF` is the overlong UTF-8 encoding of `/`; `%C0%2F` is a related malformed variant) or `.` (`%C0%AE` is the overlong UTF-8 encoding of `.`; `%C0%2E` is a related malformed variant) in a URL, which could lead to path traversal attacks if not properly handled. 

If you see `%25C0` in your logs, it's because the `%` character was also encoded to `%25`, resulting in `%25C0`. This might be a sign of someone trying to exploit a security vulnerability in your system.

In [33]:
# The 20 request paths that produced the most 400 responses, most frequent first
(
    error_400
    .groupby('cs-uri-stem')
    .size()
    .sort_values(ascending=False)
    .head(20)
)


cs-uri-stem
/%25C0                          405
/%25C0/login                     62
/%25C0/                          60
/%25C0/index.php                 27
/%25C0/Login.aspx                21
/%25C0/login.html                20
/%25C0/login.php                 19
/%25C0/login/                    18
/%25C0/index.html                18
/%25C0/ui/                       16
/%25C0/admin.php                 14
/%25C0/admin/login/              11
/%25C0/user/login                11
/%25C0/favicon.ico               11
/%25C0/auth/login                11
/%25C0/user.php                  10
/%25C0/admin/index.php           10
/%25C0/accounts/login/           10
/%25C0/dashboard/                 9
/%25C0/webclient/Login.xhtml      8
dtype: int64