In [1]:
# Environment bootstrap for Spark.
# The two `!` lines are IPython shell escapes: they install a headless
# OpenJDK 8 (apt output discarded via -qq and the /dev/null redirect) and
# the pyspark package (pip in quiet mode).
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install -q pyspark
import os
# These variables are set before the SparkContext is created further down.
# PYSPARK_PYTHON names the interpreter to use for Python workers and
# JAVA_HOME points at the JDK installed by the apt-get line above.
os.environ["PYSPARK_PYTHON"]="python3"
os.environ["JAVA_HOME"]="/usr/lib/jvm/java-8-openjdk-amd64/"

import pyspark
from pyspark import SparkConf, SparkContext

# Create the local SparkContext, or reuse the one that is already running.
# getOrCreate makes this cell safe to re-run and guarantees that `sc` is bound
# afterwards. The previous try/except swallowed the "context already exists"
# ValueError, which could leave `sc` undefined and also hid unrelated
# ValueErrors. Note that `conf` only takes effect when a new context is
# actually created; an existing context keeps its own configuration.
conf = SparkConf().setMaster("local[*]").set("spark.executor.memory", "1g")
sc = SparkContext.getOrCreate(conf=conf)

def dbg(x):
  """Print a small, readable sample of an RDD, or any other value as-is.

  For an RDD, at most the first 100 elements are fetched with ``take`` and
  printed as one list. A tuple element that contains a ``ResultIterable`` is
  rebuilt as a plain tuple with each such value expanded into a list, so its
  contents are shown instead of the wrapper object's repr. All other elements
  are printed unchanged; this includes ``Row`` records and tuples of any
  length, which were previously cut down to their first two fields (or raised
  ``IndexError`` when shorter than two).

  Args:
      x: an RDD to sample, or any other object to print directly.
  """
  if not isinstance(x, pyspark.RDD):
    print(x)
    return

  iterable_type = pyspark.resultiterable.ResultIterable

  def readable(item):
    # Only rebuild tuples that actually hold a ResultIterable; anything else
    # keeps its own repr (e.g. Row keeps its field names).
    if isinstance(item, tuple) and any(isinstance(v, iterable_type) for v in item):
      return tuple(list(v) if isinstance(v, iterable_type) else v for v in item)
    return item

  print([readable(item) for item in x.take(100)])
    

# A bare TestCase instance bound to the name `Test`.
# NOTE(review): presumably kept so that its assert* methods can be called
# directly from later cells; no use of it is visible in this part of the file.
import unittest
Test = unittest.TestCase()

[K     |████████████████████████████████| 217.8MB 55kB/s 
[K     |████████████████████████████████| 204kB 57.6MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [0]:
import re
import datetime

from pyspark.sql import Row


def parseApacheLogLine(logline):
    """Parse one tab-separated access-log record.

    The line is split on tab characters and the first seven fields are read
    in order: host, log name, Unix timestamp (seconds), method, endpoint,
    response code and content size. Any further fields are ignored. A content
    size of ``'-'`` is stored as 0. The timestamp is converted with
    ``datetime.fromtimestamp``, i.e. to a naive datetime in the local timezone
    of the machine running the parse.

    Args:
        logline (str): one line of the tab-separated log file.
    Returns:
        tuple: ``(Row, 1)`` holding the parsed fields on success, or
               ``(logline, 0)`` with the untouched input when the line has too
               few fields or a numeric field cannot be converted (this is how
               the file's header line ends up rejected).
    """
    fields = logline.split('\t')
    try:
        raw_size = fields[6]
        size = 0 if raw_size == '-' else int(raw_size)
        return (Row(
            host          = fields[0],
            log_name      = fields[1],
            date_time     = datetime.datetime.fromtimestamp(int(fields[2])),
            method        = fields[3],
            endpoint      = fields[4],
            response_code = int(fields[5]),
            content_size  = size
        ), 1)
    except (IndexError, ValueError, OverflowError, OSError):
        # IndexError: fewer than seven fields. ValueError: non-numeric
        # timestamp/code/size. OverflowError/OSError: timestamp outside the
        # range the platform can convert. Anything else (KeyboardInterrupt,
        # genuine bugs) is deliberately allowed to propagate.
        return (logline, 0)


In [3]:
!wget -q http://indeedeng.github.io/imhotep/files/nasa_19950801.tsv
  


def parseLog(logFile):
    """Load a log file into Spark and split it into parsed and failed records.

    Every line of ``logFile`` is passed through ``parseApacheLogLine``, which
    tags it with 1 (parsed) or 0 (failed). The number of failures and up to 20
    failing lines are printed, followed by a one-line summary of the counts.

    Uses the module-level SparkContext ``sc``.

    Args:
        logFile (str): path of the log file, as accepted by ``sc.textFile``.
    Returns:
        tuple: ``(parsed_log, access_logs, failed_logs)`` where ``parsed_log``
               is the cached RDD of ``(value, flag)`` pairs, ``access_logs``
               is the cached RDD of successfully parsed records, and
               ``failed_logs`` is the RDD of raw lines that failed to parse.
    """
    parsed_log = (sc
                  .textFile(logFile)
                  .map(parseApacheLogLine)
                  .cache())

    access_logs = (parsed_log
                   .filter(lambda s: s[1] == 1)
                   .map(lambda s: s[0])
                   .cache())

    failed_logs = (parsed_log
                   .filter(lambda s: s[1] == 0)
                   .map(lambda s: s[0]))

    # failed_logs is not cached, so every count() on it is a separate Spark
    # job; count once and reuse the number for both messages below.
    failed_logs_count = failed_logs.count()
    if failed_logs_count > 0:
        print('Number of invalid logline: %d' % failed_logs_count)
        for line in failed_logs.take(20):
            print('Invalid logline: %s' % line)

    print('Read %d lines, successfully parsed %d lines, failed to parse %d lines'
          % (parsed_log.count(), access_logs.count(), failed_logs_count))
    return parsed_log, access_logs, failed_logs


# Parse the NASA log file (expected in the working directory) and bind the
# three resulting RDDs at notebook scope: all tagged lines, the successfully
# parsed records, and the raw lines that failed to parse.
parsed_logs, access_logs, failed_logs = parseLog('nasa_19950801.tsv')

Number of invalid logline: 1
Invalid logline: host	logname	time	method	url	response	bytes	referer	useragent
Read 30970 lines, successfully parsed 30969 lines, failed to parse 1 lines
