# Log Puzzle Starter

In [None]:
"""
Log Puzzle exercise

Copyright 2010 Google Inc.
Licensed under the Apache License, Version 2.0
http://www.apache.org/licenses/LICENSE-2.0

Given an Apache logfile, find the puzzle URLs and download the images.

Here's what a log entry containing a puzzle URL looks like (a single line in
the log file, spread out onto multiple lines here):
10.254.254.28 - - [06/Aug/2007:00:13:48 -0700] "GET /~foo/puzzle-bar-aaab.jpg
HTTP/1.0" 302 528 "-" "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US;
rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6"
"""

import os
import re
import sys
import urllib.request
import argparse


# Matches a final URL path segment ending in "-<word>-<word>.<ext>".
_TWO_WORD_TAIL = re.compile(r"-(\w+)-(\w+)\.\w+$")


def _puzzle_sort_key(url):
    """Return the sort key for one puzzle URL.

    URLs whose last path segment ends in ``-<word>-<word>.<ext>`` are keyed
    by the second word; every other URL is keyed by the full URL. The URL
    itself is appended as a tie-breaker so the ordering is deterministic.
    """
    last_segment = url.rsplit("/", 1)[-1]
    match = _TWO_WORD_TAIL.search(last_segment)
    return (match.group(2) if match else url, url)


def read_urls(filename):
    """Return the sorted, de-duplicated puzzle URLs found in a log file.

    The server hostname is taken from the log file's own name: it is
    everything after the first underscore in the file's base name, so
    ``logs/animal_code.google.com`` yields ``code.google.com``.

    Every request path that follows ``GET `` in the log and contains the
    text "puzzle" is combined with that hostname into
    ``http://<host><path>``. Lines without a ``GET`` request are skipped.

    The result is sorted in increasing order. URLs ending in
    ``-<word>-<word>.<ext>`` are ordered by the second word; all other
    URLs are ordered by the full URL.

    Raises:
        ValueError: if the file's base name contains no underscore.
        OSError: if the log file cannot be opened.
    """
    # Only look at the base name so underscores in directory names
    # cannot be mistaken for the hostname separator.
    base_name = os.path.basename(filename)
    _, separator, host = base_name.partition("_")
    if not separator:
        raise ValueError(
            f"log filename {filename!r} has no '_' before the hostname")

    pattern = re.compile(r"GET (\S+)")  # pre-compile our regex pattern
    urls = set()  # a set de-duplicates the URLs for us
    with open(filename) as f:
        for line in f:
            match = pattern.search(line)
            if match is None:
                # not a GET request line (blank, malformed, POST, ...)
                continue
            path = match.group(1)
            if "puzzle" in path:
                urls.add(f"http://{host}{path}")
    return sorted(urls, key=_puzzle_sort_key)


def download_images(img_urls, dest_dir):
    """Download each image into dest_dir and write an index.html for them.

    The URLs are assumed to already be in the correct order. The images
    are saved under the local filenames img0, img1, and so on, and
    ``index.html`` in the same directory gets one ``<img>`` tag per image,
    in order and with nothing between the tags.

    Args:
        img_urls: sequence of image URLs to fetch (must support ``len()``).
        dest_dir: directory to download into; created if necessary.

    Raises:
        OSError: if the directory or a file in it cannot be created.
        urllib.error.URLError: if a download fails.
    """
    # exist_ok avoids the check-then-create race of testing for the
    # directory first.
    os.makedirs(dest_dir, exist_ok=True)

    total = len(img_urls)
    index_path = os.path.join(dest_dir, "index.html")
    with open(index_path, "w", encoding="utf-8") as index_file:
        index_file.write("<html>\n<body>\n")
        for i, url in enumerate(img_urls):
            filename = f"img{i}"
            print(f"Retrieving {i+1} of {total}...")
            urllib.request.urlretrieve(url, os.path.join(dest_dir, filename))
            # tags are written back to back, with no separator
            index_file.write(f'<img src="{filename}">')
        index_file.write("\n</body>\n</html>\n")


def create_parser():
    """Build and return the command-line argument parser.

    The parser accepts an optional ``-d``/``--todir`` destination directory
    and one required positional ``logfile`` argument.
    """
    arg_parser = argparse.ArgumentParser()
    # (flags, help text) for each argument, in registration order
    argument_specs = (
        (('-d', '--todir'), 'destination directory for downloaded images'),
        (('logfile',), 'apache logfile to extract urls from'),
    )
    for flags, help_text in argument_specs:
        arg_parser.add_argument(*flags, help=help_text)
    return arg_parser


def main(args):
    """Entry point: parse args, collect puzzle URLs, then download or list.

    With no arguments, prints the usage line and exits with status 1.
    When a destination directory is given, the images are downloaded into
    it; otherwise the URLs are printed one per line.
    """
    arg_parser = create_parser()

    # bail out early when invoked with nothing at all
    if not args:
        arg_parser.print_usage()
        sys.exit(1)

    options = arg_parser.parse_args(args)
    urls = read_urls(options.logfile)
    dest_dir = options.todir

    if not dest_dir:
        # no destination requested: just list what we found
        print('\n'.join(urls))
        return

    download_images(urls, dest_dir)


# When run as a script (not imported), hand the command-line arguments,
# minus the program name, to main().
if __name__ == '__main__':
    main(sys.argv[1:])
