In [None]:
from pprint import pprint
from pyspark import SparkContext
# Spark entry point used by parseLogs() further down; getOrCreate() hands back
# an already-running context when there is one instead of building a second.
sc = SparkContext.getOrCreate()

In [None]:
import re
import datetime
from pyspark.sql import Row

# Three-letter month abbreviation -> calendar month number (Jan -> 1 ... Dec -> 12).
# apache_time() uses this to translate the month field of a log timestamp.
month_map = {
    abbrev: number
    for number, abbrev in enumerate(
        ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'),
        start=1)
}

def apache_time(s):
    """Turn a log timestamp string into a ``datetime.datetime``.

    The fields are read by fixed character offsets, i.e. the text is expected
    to look like ``dd/Mon/yyyy:HH:MM:SS`` followed by anything; characters
    from offset 20 onwards are never read.

    Args:
        s: Timestamp text, e.g. ``'03/Jul/1995:09:31:12 -0400'``.

    Returns:
        A ``datetime.datetime`` built from year, month, day, hour, minute and
        second only (no tzinfo is attached).

    Raises:
        KeyError: the month abbreviation is not a key of ``month_map``.
        ValueError: a numeric field is not a valid integer, or the values do
            not form a valid date/time.
    """
    # Fields are evaluated in constructor order (year first) so that malformed
    # input fails on the same field it always has.
    year = int(s[7:11])
    month = month_map[s[3:6]]
    day = int(s[0:2])
    hour = int(s[12:14])
    minute = int(s[15:17])
    second = int(s[18:20])
    return datetime.datetime(year, month, day, hour, minute, second)

In [None]:
# (1a) Parsing Each Log Line
# Regex for one access-log line. Capture groups, in order, are consumed by
# parse_apache_log_line() as: host, client identd, user id, timestamp,
# method, endpoint, protocol, 3-digit response code, content size.
# Written as a raw string so the regex escapes (\S, \[, \d, ...) are passed to
# `re` verbatim rather than relying on Python tolerating invalid string
# escapes (which newer interpreters warn about).
APACHE_ACCESS_LOG_PATTERN = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)\s*(\S*)" (\d{3}) (\S+)'
def parse_apache_log_line(logline):
    """Parse one access-log line using APACHE_ACCESS_LOG_PATTERN.

    Args:
        logline: One raw line of the log file.

    Returns:
        A 2-tuple whose second item is a success flag:
        ``(Row, 1)`` when the line matches the pattern, ``(logline, 0)`` when
        it does not. parseLogs() relies on that flag (``s[1]``) to separate
        parsed rows from rejected lines, so the success path must return the
        tuple as well, not a bare Row.

    Raises:
        ValueError: the size column is neither ``'-'`` nor an integer (the
            pattern only requires it to be non-whitespace).
    """
    match = re.search(APACHE_ACCESS_LOG_PATTERN, logline)
    if match is None:
        return (logline, 0)

    size_field = match.group(9)
    # A '-' in the size column is stored as 0 bytes.
    size = 0 if size_field == '-' else int(size_field)

    return (Row(
        host = match.group(1),
        client_identd = match.group(2),
        user_id = match.group(3),
        date_time = apache_time(match.group(4)),
        method = match.group(5),
        endpoint = match.group(6),
        protocol = match.group(7),
        response_code = int(match.group(8)),
        content_size = size
        ), 1)

In [None]:
# Run the parser over two sample lines and pretty-print what comes back.
# The first sample carries an extra token ("align=left") inside the request.
for subject in (
    'atl4-m52.ed.ac.uk - - [03/Jul/1995:09:31:12 -0400] "GET /images/NASA-logosmall.gif align=left HTTP/1.0" 200 786',
    'ip157.vivanet.com - - [02/Jul/1995:00:00:00 -0400] "GET /images/shuttle-patch-logo.gif HTTP/1.0" 200 891',
):
    pprint(parse_apache_log_line(subject))

Row(host='ip157.vivanet.com', client_identd='-', user_id='-', date_time=datetime.datetime(1995, 7, 2, 0, 0), method='GET', endpoint='/images/shuttle-patch-logo.gif', protocol='HTTP/1.0', response_code=200, content_size=891)


In [None]:
from google.colab import drive
# Mount Google Drive under /content/modules; the logFile path used by
# parseLogs() lives below this mount point.
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted — confirm against the Colab documentation.
drive.mount('/content/modules', force_remount=True)

In [None]:
# (1b) Configuration and Initial RDD Creation
# Path of the log file inside the mounted Drive folder; parseLogs() reads this
# module-level name rather than taking the path as an argument.
logFile = "/content/modules/My Drive/NASAlog.txt"
def parseLogs():
    """Read the log file, parse every line and split the results by outcome.

    Reads the file named by the module-level ``logFile`` through ``sc`` and
    maps each line through ``parse_apache_log_line``, which yields
    ``(value, flag)`` pairs. Prints a short report: the number of rejected
    lines plus up to 20 of them (only when there are any), followed by the
    read / parsed / failed totals.

    Returns:
        A 3-tuple of RDDs ``(parsed_logs, access_logs, failed_logs)``:
        every ``(value, flag)`` pair, the values whose flag is 1, and the
        values whose flag is 0.
    """
    parsed_logs = sc.textFile(logFile).map(parse_apache_log_line).cache()
    access_logs = parsed_logs.filter(lambda s: s[1] == 1).map(lambda s: s[0]).cache()
    failed_logs = parsed_logs.filter(lambda s: s[1] == 0).map(lambda s: s[0])

    # failed_logs is not cached, so each count() on it launches another job;
    # count once and reuse the number in both messages.
    failed_logs_count = failed_logs.count()
    if failed_logs_count > 0:
        print('Number of invalid loglines: %d' % failed_logs_count)
        for line in failed_logs.take(20):
            print('Invalid logline: %s' % line)
    print('Read %d lines, successfully parsed %d lines, failed to parse %d lines'
        % (parsed_logs.count(), access_logs.count(), failed_logs_count))
    return parsed_logs, access_logs, failed_logs
# Build the three RDDs with the parser defined in (1a) and keep them as
# notebook-level names; parseLogs() also prints the parse report.
parsed_logs, access_logs, failed_logs = parseLogs()

In [None]:
# (1c) Data Cleaning
# Replacement pattern with named groups. Compared with a purely
# whitespace-delimited request, the endpoint may contain spaces and the
# protocol is optional:
#   - method must be GET, HEAD or POST;
#   - endpoint is matched lazily, so it takes everything up to the protocol
#     (or up to the closing quote when there is none);
#   - protocol, when present, is " HTTP/1.0" or " HTTP/V1.0" and keeps its
#     leading space (parse_apache_log_line strips it);
#   - content_size is digits or a literal "-".
# The dot in "1\.0" is escaped so only a real "." is accepted there; anything
# else (e.g. "HTTP/1x0") stays part of the endpoint.
APACHE_ACCESS_LOG_PATTERN = (
    r"^(?P<host>[^ ]*)"
    r" (?P<client_id>[^ ]*)"
    r" (?P<user_id>[^ ]*)"
    r" \[(?P<date_time>[^]]*)\]"
    r" \"(?P<method>GET|HEAD|POST)"
    r" (?P<endpoint>.*?)"
    r"(?P<protocol> HTTP/V?1\.0|)"
    r"\" (?P<response_code>[0-9]+)"
    r" (?P<content_size>[0-9]+|-)"
    r"$"
)
def parse_apache_log_line(logline):
    """Parse one log line with the named-group APACHE_ACCESS_LOG_PATTERN.

    Args:
        logline: One raw line of the log file.

    Returns:
        ``(Row, 1)`` when the line matches, otherwise ``(logline, 0)``.
        In the Row, ``content_size`` is an int (``'-'`` becomes 0),
        ``date_time`` is whatever ``apache_time`` returns, and ``protocol``
        has its surrounding spaces removed.

    NOTE(review): ``response_code`` is stored exactly as matched, i.e. as a
    string of digits, not an int — confirm that downstream cells expect that.
    """
    found = re.search(APACHE_ACCESS_LOG_PATTERN, logline)
    if found is None:
        return (logline, 0)

    # Fetch the named groups once instead of rebuilding the dict per field.
    fields = found.groupdict()

    raw_size = fields['content_size']
    size = 0 if raw_size == '-' else int(raw_size)

    parsed = Row(
        host=fields['host'],
        client_id=fields['client_id'],
        user_id=fields['user_id'],
        date_time=apache_time(fields['date_time']),
        method=fields['method'],
        endpoint=fields['endpoint'],
        protocol=fields['protocol'].strip(' '),
        response_code=fields['response_code'],
        content_size=size,
    )
    return (parsed, 1)
# Rebuild the RDDs: parseLogs() looks up parse_apache_log_line when it is
# called, so this run uses the (1c) parser and pattern defined just above and
# rebinds the three notebook-level names.
parsed_logs, access_logs, failed_logs = parseLogs()

Read 314876 lines, successfully parsed 314876 lines, failed to parse 0 lines
