# Exercise 1: Schema on Read

In [1]:
import findspark
import os

findspark.init(os.environ['SPARK_HOME'])

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [3]:
spark = SparkSession.builder.getOrCreate()

# Load the dataset

Data source: http://ita.ee.lbl.gov/traces/NASA_access_log_Jul95.gz

In [5]:
log_df = spark.read.text('data/NASA_access_log_Jul95.txt')

# Quick inspection of  the data set

In [6]:
log_df.printSchema()

root
 |-- value: string (nullable = true)



In [7]:
log_df.count()

2000

In [8]:
log_df.show(5, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                  |
+-----------------------------------------------------------------------------------------------------------------------+
|199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245                                 |
|unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] "GET /shuttle/countdown/ HTTP/1.0" 200 3985                      |
|199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] "GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0" 200 4085   |
|burger.letters.com - - [01/Jul/1995:00:00:11 -0400] "GET /shuttle/countdown/liftoff.html HTTP/1.0" 304 0               |
|199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] "GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0" 200 4179|
+-----------------------

In [7]:
import pandas as pd

In [8]:
# show in Pandas dataframe
pd.set_option('max_colwidth', 400)
log_df.limit(5).toPandas()

Unnamed: 0,value
0,"199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] ""GET /history/apollo/ HTTP/1.0"" 200 6245"
1,"unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] ""GET /shuttle/countdown/ HTTP/1.0"" 200 3985"
2,"199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] ""GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0"" 200 4085"
3,"burger.letters.com - - [01/Jul/1995:00:00:11 -0400] ""GET /shuttle/countdown/liftoff.html HTTP/1.0"" 304 0"
4,"199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] ""GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0"" 200 4179"


# Let' try simple parsing with split

In [9]:
log_df_split = log_df.withColumn('tokenized', F.split(F.col('value'), ' '))
log_df_split.limit(10).toPandas()

Unnamed: 0,value,tokenized
0,"199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] ""GET /history/apollo/ HTTP/1.0"" 200 6245","[199.72.81.55, -, -, [01/Jul/1995:00:00:01, -0400], ""GET, /history/apollo/, HTTP/1.0"", 200, 6245]"
1,"unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] ""GET /shuttle/countdown/ HTTP/1.0"" 200 3985","[unicomp6.unicomp.net, -, -, [01/Jul/1995:00:00:06, -0400], ""GET, /shuttle/countdown/, HTTP/1.0"", 200, 3985]"
2,"199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] ""GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0"" 200 4085","[199.120.110.21, -, -, [01/Jul/1995:00:00:09, -0400], ""GET, /shuttle/missions/sts-73/mission-sts-73.html, HTTP/1.0"", 200, 4085]"
3,"burger.letters.com - - [01/Jul/1995:00:00:11 -0400] ""GET /shuttle/countdown/liftoff.html HTTP/1.0"" 304 0","[burger.letters.com, -, -, [01/Jul/1995:00:00:11, -0400], ""GET, /shuttle/countdown/liftoff.html, HTTP/1.0"", 304, 0]"
4,"199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] ""GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0"" 200 4179","[199.120.110.21, -, -, [01/Jul/1995:00:00:11, -0400], ""GET, /shuttle/missions/sts-73/sts-73-patch-small.gif, HTTP/1.0"", 200, 4179]"
5,"burger.letters.com - - [01/Jul/1995:00:00:12 -0400] ""GET /images/NASA-logosmall.gif HTTP/1.0"" 304 0","[burger.letters.com, -, -, [01/Jul/1995:00:00:12, -0400], ""GET, /images/NASA-logosmall.gif, HTTP/1.0"", 304, 0]"
6,"burger.letters.com - - [01/Jul/1995:00:00:12 -0400] ""GET /shuttle/countdown/video/livevideo.gif HTTP/1.0"" 200 0","[burger.letters.com, -, -, [01/Jul/1995:00:00:12, -0400], ""GET, /shuttle/countdown/video/livevideo.gif, HTTP/1.0"", 200, 0]"
7,"205.212.115.106 - - [01/Jul/1995:00:00:12 -0400] ""GET /shuttle/countdown/countdown.html HTTP/1.0"" 200 3985","[205.212.115.106, -, -, [01/Jul/1995:00:00:12, -0400], ""GET, /shuttle/countdown/countdown.html, HTTP/1.0"", 200, 3985]"
8,"d104.aa.net - - [01/Jul/1995:00:00:13 -0400] ""GET /shuttle/countdown/ HTTP/1.0"" 200 3985","[d104.aa.net, -, -, [01/Jul/1995:00:00:13, -0400], ""GET, /shuttle/countdown/, HTTP/1.0"", 200, 3985]"
9,"129.94.144.152 - - [01/Jul/1995:00:00:13 -0400] ""GET / HTTP/1.0"" 200 7074","[129.94.144.152, -, -, [01/Jul/1995:00:00:13, -0400], ""GET, /, HTTP/1.0"", 200, 7074]"


# Second attempt, let's build a custom parsing UDF 

In [10]:
import re

In [14]:
@F.udf
def parseUDF(line):
    PATTERN = '^(\S+) (\S+) (\S+) \[([\w/:]+ [+-]\d{4})\] \"(\S+) (\S+) (\S+)\" (\d{3}) (\S+)'
    # (): marked group  +: repeat
    # check more for python regular expression here: https://pythex.org/
    match = re.search(PATTERN, line)
    if match == None:
        return (line, 0)
    else:
        if match.group(9) == '-':
            size = 0
        else:
            size = match.group(9)
        return {
                'host': match.group(1),
                'client_id': match.group(2),
                'user_id': match.group(3),
                'date_time': match.group(4),
                'method': match.group(5),
                'endpoint': match.group(6),
                'protocol': match.group(7),
                'reponse_code': match.group(8),
                'content_size': size
                }

In [15]:
log_df_udf_1 = log_df.withColumn('tokenized', parseUDF(log_df.value))
log_df_udf_1.limit(10).toPandas()

Unnamed: 0,value,tokenized
0,"199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] ""GET /history/apollo/ HTTP/1.0"" 200 6245","{protocol=HTTP/1.0, endpoint=/history/apollo/, content_size=6245, method=GET, date_time=01/Jul/1995:00:00:01 -0400, user_id=-, host=199.72.81.55, reponse_code=200, client_id=-}"
1,"unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] ""GET /shuttle/countdown/ HTTP/1.0"" 200 3985","{protocol=HTTP/1.0, endpoint=/shuttle/countdown/, content_size=3985, method=GET, date_time=01/Jul/1995:00:00:06 -0400, user_id=-, host=unicomp6.unicomp.net, reponse_code=200, client_id=-}"
2,"199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] ""GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0"" 200 4085","{protocol=HTTP/1.0, endpoint=/shuttle/missions/sts-73/mission-sts-73.html, content_size=4085, method=GET, date_time=01/Jul/1995:00:00:09 -0400, user_id=-, host=199.120.110.21, reponse_code=200, cl..."
3,"burger.letters.com - - [01/Jul/1995:00:00:11 -0400] ""GET /shuttle/countdown/liftoff.html HTTP/1.0"" 304 0","{protocol=HTTP/1.0, endpoint=/shuttle/countdown/liftoff.html, content_size=0, method=GET, date_time=01/Jul/1995:00:00:11 -0400, user_id=-, host=burger.letters.com, reponse_code=304, client_id=-}"
4,"199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] ""GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0"" 200 4179","{protocol=HTTP/1.0, endpoint=/shuttle/missions/sts-73/sts-73-patch-small.gif, content_size=4179, method=GET, date_time=01/Jul/1995:00:00:11 -0400, user_id=-, host=199.120.110.21, reponse_code=200,..."
5,"burger.letters.com - - [01/Jul/1995:00:00:12 -0400] ""GET /images/NASA-logosmall.gif HTTP/1.0"" 304 0","{protocol=HTTP/1.0, endpoint=/images/NASA-logosmall.gif, content_size=0, method=GET, date_time=01/Jul/1995:00:00:12 -0400, user_id=-, host=burger.letters.com, reponse_code=304, client_id=-}"
6,"burger.letters.com - - [01/Jul/1995:00:00:12 -0400] ""GET /shuttle/countdown/video/livevideo.gif HTTP/1.0"" 200 0","{protocol=HTTP/1.0, endpoint=/shuttle/countdown/video/livevideo.gif, content_size=0, method=GET, date_time=01/Jul/1995:00:00:12 -0400, user_id=-, host=burger.letters.com, reponse_code=200, client_..."
7,"205.212.115.106 - - [01/Jul/1995:00:00:12 -0400] ""GET /shuttle/countdown/countdown.html HTTP/1.0"" 200 3985","{protocol=HTTP/1.0, endpoint=/shuttle/countdown/countdown.html, content_size=3985, method=GET, date_time=01/Jul/1995:00:00:12 -0400, user_id=-, host=205.212.115.106, reponse_code=200, client_id=-}"
8,"d104.aa.net - - [01/Jul/1995:00:00:13 -0400] ""GET /shuttle/countdown/ HTTP/1.0"" 200 3985","{protocol=HTTP/1.0, endpoint=/shuttle/countdown/, content_size=3985, method=GET, date_time=01/Jul/1995:00:00:13 -0400, user_id=-, host=d104.aa.net, reponse_code=200, client_id=-}"
9,"129.94.144.152 - - [01/Jul/1995:00:00:13 -0400] ""GET / HTTP/1.0"" 200 7074","{protocol=HTTP/1.0, endpoint=/, content_size=7074, method=GET, date_time=01/Jul/1995:00:00:13 -0400, user_id=-, host=129.94.144.152, reponse_code=200, client_id=-}"


In [16]:
log_df_udf_1.printSchema()

root
 |-- value: string (nullable = true)
 |-- tokenized: string (nullable = true)



Need to specify the data type for 'tokenized' as map

# Third attempt, let's fix our UDF

In [11]:
# Specify that the udf function returns MapType with key being StringType and value being StringType
@F.udf(T.MapType(T.StringType(), T.StringType()))
def parse_UDF_better(line):
    PATTERN = '^(\S+) (\S+) (\S+) \[([\w/:]+ [+-]\d{4})\] \"(\S+) (\S+) (\S+)\" (\d{3}) (\S+)'
    match = re.search(PATTERN, line)
    if match is None:
        return (line, 0)
    else:
        size = match.group(9)
        if match.group(9) == '-':
            size = 0
        return {
            'host': match.group(1),
            'client_id': match.group(2),
            'user_id': match.group(3),
            'date_time': match.group(4),
            'method': match.group(5),
            'endpoint': match.group(6),
            'protocol': match.group(7),
            'response_code': match.group(8),
            'content_size': size,
        }

In [12]:
log_df_udf_2 = log_df.withColumn('tokenized', parse_UDF_better(F.col('value')))
log_df_udf_2.limit(10).toPandas()

Unnamed: 0,value,tokenized
0,"199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] ""GET /history/apollo/ HTTP/1.0"" 200 6245","{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/history/apollo/', 'content_size': '6245', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:01 -0400', 'user_id': '-', 'host': '199.72.81.55', 'client_id': '-'}"
1,"unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] ""GET /shuttle/countdown/ HTTP/1.0"" 200 3985","{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:06 -0400', 'user_id': '-', 'host': 'unicomp6.unicomp.net', 'client_id': '-'}"
2,"199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] ""GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0"" 200 4085","{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/missions/sts-73/mission-sts-73.html', 'content_size': '4085', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:09 -0400', 'user_id': '-', 'host': '199.120.110.21', 'client_id': '-'}"
3,"burger.letters.com - - [01/Jul/1995:00:00:11 -0400] ""GET /shuttle/countdown/liftoff.html HTTP/1.0"" 304 0","{'response_code': '304', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/liftoff.html', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:11 -0400', 'user_id': '-', 'host': 'burger.letters.com', 'client_id': '-'}"
4,"199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] ""GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0"" 200 4179","{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/missions/sts-73/sts-73-patch-small.gif', 'content_size': '4179', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:11 -0400', 'user_id': '-', 'host': '199.120.110.21', 'client_id': '-'}"
5,"burger.letters.com - - [01/Jul/1995:00:00:12 -0400] ""GET /images/NASA-logosmall.gif HTTP/1.0"" 304 0","{'response_code': '304', 'protocol': 'HTTP/1.0', 'endpoint': '/images/NASA-logosmall.gif', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '-', 'host': 'burger.letters.com', 'client_id': '-'}"
6,"burger.letters.com - - [01/Jul/1995:00:00:12 -0400] ""GET /shuttle/countdown/video/livevideo.gif HTTP/1.0"" 200 0","{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/video/livevideo.gif', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '-', 'host': 'burger.letters.com', 'client_id': '-'}"
7,"205.212.115.106 - - [01/Jul/1995:00:00:12 -0400] ""GET /shuttle/countdown/countdown.html HTTP/1.0"" 200 3985","{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/countdown.html', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '-', 'host': '205.212.115.106', 'client_id': '-'}"
8,"d104.aa.net - - [01/Jul/1995:00:00:13 -0400] ""GET /shuttle/countdown/ HTTP/1.0"" 200 3985","{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:13 -0400', 'user_id': '-', 'host': 'd104.aa.net', 'client_id': '-'}"
9,"129.94.144.152 - - [01/Jul/1995:00:00:13 -0400] ""GET / HTTP/1.0"" 200 7074","{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/', 'content_size': '7074', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:13 -0400', 'user_id': '-', 'host': '129.94.144.152', 'client_id': '-'}"


In [23]:
log_df_udf_2.printSchema()

root
 |-- value: string (nullable = true)
 |-- tokenized: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [13]:
log_df_udf_2.select('tokenized').limit(10).toPandas()

Unnamed: 0,tokenized
0,"{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/history/apollo/', 'content_size': '6245', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:01 -0400', 'user_id': '-', 'host': '199.72.81.55', 'client_id': '-'}"
1,"{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:06 -0400', 'user_id': '-', 'host': 'unicomp6.unicomp.net', 'client_id': '-'}"
2,"{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/missions/sts-73/mission-sts-73.html', 'content_size': '4085', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:09 -0400', 'user_id': '-', 'host': '199.120.110.21', 'client_id': '-'}"
3,"{'response_code': '304', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/liftoff.html', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:11 -0400', 'user_id': '-', 'host': 'burger.letters.com', 'client_id': '-'}"
4,"{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/missions/sts-73/sts-73-patch-small.gif', 'content_size': '4179', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:11 -0400', 'user_id': '-', 'host': '199.120.110.21', 'client_id': '-'}"
5,"{'response_code': '304', 'protocol': 'HTTP/1.0', 'endpoint': '/images/NASA-logosmall.gif', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '-', 'host': 'burger.letters.com', 'client_id': '-'}"
6,"{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/video/livevideo.gif', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '-', 'host': 'burger.letters.com', 'client_id': '-'}"
7,"{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/countdown.html', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '-', 'host': '205.212.115.106', 'client_id': '-'}"
8,"{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:13 -0400', 'user_id': '-', 'host': 'd104.aa.net', 'client_id': '-'}"
9,"{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/', 'content_size': '7074', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:13 -0400', 'user_id': '-', 'host': '129.94.144.152', 'client_id': '-'}"


# Let's build separate columns

In [15]:
log_df_udf_2.selectExpr("tokenized['host'] AS host").show(10)

+--------------------+
|                host|
+--------------------+
|        199.72.81.55|
|unicomp6.unicomp.net|
|      199.120.110.21|
|  burger.letters.com|
|      199.120.110.21|
|  burger.letters.com|
|  burger.letters.com|
|     205.212.115.106|
|         d104.aa.net|
|      129.94.144.152|
+--------------------+
only showing top 10 rows



In [16]:
log_df_udf_2.selectExpr(
    ["tokenized['host'] AS host", "tokenized['date_time'] AS date_time"]
).show(10, truncate=False)

+--------------------+--------------------------+
|host                |date_time                 |
+--------------------+--------------------------+
|199.72.81.55        |01/Jul/1995:00:00:01 -0400|
|unicomp6.unicomp.net|01/Jul/1995:00:00:06 -0400|
|199.120.110.21      |01/Jul/1995:00:00:09 -0400|
|burger.letters.com  |01/Jul/1995:00:00:11 -0400|
|199.120.110.21      |01/Jul/1995:00:00:11 -0400|
|burger.letters.com  |01/Jul/1995:00:00:12 -0400|
|burger.letters.com  |01/Jul/1995:00:00:12 -0400|
|205.212.115.106     |01/Jul/1995:00:00:12 -0400|
|d104.aa.net         |01/Jul/1995:00:00:13 -0400|
|129.94.144.152      |01/Jul/1995:00:00:13 -0400|
+--------------------+--------------------------+
only showing top 10 rows



In [18]:
fields = ["host", "client_id","user_id", "date_time", "method", "endpoint", "protocol", "response_code", "content_size"]
exprs = [f"tokenized['{field}'] AS {field}" for field in fields]
exprs

["tokenized['host'] AS host",
 "tokenized['client_id'] AS client_id",
 "tokenized['user_id'] AS user_id",
 "tokenized['date_time'] AS date_time",
 "tokenized['method'] AS method",
 "tokenized['endpoint'] AS endpoint",
 "tokenized['protocol'] AS protocol",
 "tokenized['response_code'] AS response_code",
 "tokenized['content_size'] AS content_size"]

In [19]:
log_df_clean = log_df_udf_2.selectExpr(exprs)
log_df_clean.limit(5).toPandas()

Unnamed: 0,host,client_id,user_id,date_time,method,endpoint,protocol,response_code,content_size
0,199.72.81.55,-,-,01/Jul/1995:00:00:01 -0400,GET,/history/apollo/,HTTP/1.0,200,6245
1,unicomp6.unicomp.net,-,-,01/Jul/1995:00:00:06 -0400,GET,/shuttle/countdown/,HTTP/1.0,200,3985
2,199.120.110.21,-,-,01/Jul/1995:00:00:09 -0400,GET,/shuttle/missions/sts-73/mission-sts-73.html,HTTP/1.0,200,4085
3,burger.letters.com,-,-,01/Jul/1995:00:00:11 -0400,GET,/shuttle/countdown/liftoff.html,HTTP/1.0,304,0
4,199.120.110.21,-,-,01/Jul/1995:00:00:11 -0400,GET,/shuttle/missions/sts-73/sts-73-patch-small.gif,HTTP/1.0,200,4179


## Popular hosts

In [20]:
log_df_clean.groupBy('host').count().sort(F.desc('count')).limit(10).toPandas()

Unnamed: 0,host,count
0,uplherc.upl.com,66
1,129.188.154.200,41
2,in24.inetnebr.com,35
3,teleman.pr.mcs.net,32
4,haraway.ucet.ufl.edu,32
5,piweba3y.prodigy.com,31
6,piweba1y.prodigy.com,30
7,143.158.26.50,29
8,pm9.j51.com,28
9,133.43.96.45,27


## Popular content

In [21]:
log_df_clean.groupBy('endpoint').count().sort(F.desc('count')).limit(10).toPandas()

Unnamed: 0,endpoint,count
0,/images/KSC-logosmall.gif,110
1,/images/NASA-logosmall.gif,101
2,/images/MOSAIC-logosmall.gif,59
3,/images/WORLD-logosmall.gif,58
4,/images/USA-logosmall.gif,57
5,/shuttle/countdown/,55
6,/images/ksclogo-medium.gif,52
7,/images/launch-logo.gif,44
8,/,39
9,/shuttle/countdown/count.gif,38


## Large Files

In [22]:
log_df_clean.createOrReplaceTempView('log_sql_clean')

In [23]:
spark.sql(
    """SELECT endpoint, content_size
    FROM log_sql_clean
    ORDER BY content_size DESC
    """
).limit(10).toPandas()

Unnamed: 0,endpoint,content_size
0,/shuttle/missions/sts-56/sts-56-patch-small.gif,9978
1,/elv/elvhead3.gif,9925
2,/software/winvn/winvn.html,9867
3,/software/winvn/winvn.html,9867
4,/software/winvn/winvn.html,9867
5,/software/winvn/winvn.html,9867
6,/software/winvn/winvn.html,9867
7,/software/winvn/winvn.html,9867
8,/software/winvn/winvn.html,9867
9,/software/winvn/winvn.html,9867


In [24]:
# expr(): takes SQL expression in string
log_df_clean_typed = log_df_clean.withColumn('content_size_bytes', F.expr("CAST(content_size AS int)"))
# log_df_clean_typed = log_df_clean.withColumn('content_size_bytes', log_df_clean.content_size.cast('int'))

In [25]:
log_df_clean_typed.printSchema()

root
 |-- host: string (nullable = true)
 |-- client_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- date_time: string (nullable = true)
 |-- method: string (nullable = true)
 |-- endpoint: string (nullable = true)
 |-- protocol: string (nullable = true)
 |-- response_code: string (nullable = true)
 |-- content_size: string (nullable = true)
 |-- content_size_bytes: integer (nullable = true)



In [26]:
log_df_clean_typed.limit(5).toPandas()

Unnamed: 0,host,client_id,user_id,date_time,method,endpoint,protocol,response_code,content_size,content_size_bytes
0,199.72.81.55,-,-,01/Jul/1995:00:00:01 -0400,GET,/history/apollo/,HTTP/1.0,200,6245,6245
1,unicomp6.unicomp.net,-,-,01/Jul/1995:00:00:06 -0400,GET,/shuttle/countdown/,HTTP/1.0,200,3985,3985
2,199.120.110.21,-,-,01/Jul/1995:00:00:09 -0400,GET,/shuttle/missions/sts-73/mission-sts-73.html,HTTP/1.0,200,4085,4085
3,burger.letters.com,-,-,01/Jul/1995:00:00:11 -0400,GET,/shuttle/countdown/liftoff.html,HTTP/1.0,304,0,0
4,199.120.110.21,-,-,01/Jul/1995:00:00:11 -0400,GET,/shuttle/missions/sts-73/sts-73-patch-small.gif,HTTP/1.0,200,4179,4179


In [27]:
log_df_clean_typed.createOrReplaceTempView('cleantypedlog')

spark.sql("""SELECT endpoint, content_size
          FROM cleantypedlog
          ORDER BY content_size DESC
          """).limit(10).toPandas()

Unnamed: 0,endpoint,content_size
0,/shuttle/missions/sts-56/sts-56-patch-small.gif,9978
1,/elv/elvhead3.gif,9925
2,/software/winvn/winvn.html,9867
3,/software/winvn/winvn.html,9867
4,/software/winvn/winvn.html,9867
5,/software/winvn/winvn.html,9867
6,/software/winvn/winvn.html,9867
7,/software/winvn/winvn.html,9867
8,/software/winvn/winvn.html,9867
9,/software/winvn/winvn.html,9867


In [69]:
# Left for you, clean the date column :)
# 1- Create a udf that parses that weird format,
# 2- Create a new column with a data tiem string that spark would understand
# 3- Add a new date-time column properly typed
# 4- Print your schemac

In [28]:
log_df_clean.limit(5).toPandas()

Unnamed: 0,host,client_id,user_id,date_time,method,endpoint,protocol,response_code,content_size
0,199.72.81.55,-,-,01/Jul/1995:00:00:01 -0400,GET,/history/apollo/,HTTP/1.0,200,6245
1,unicomp6.unicomp.net,-,-,01/Jul/1995:00:00:06 -0400,GET,/shuttle/countdown/,HTTP/1.0,200,3985
2,199.120.110.21,-,-,01/Jul/1995:00:00:09 -0400,GET,/shuttle/missions/sts-73/mission-sts-73.html,HTTP/1.0,200,4085
3,burger.letters.com,-,-,01/Jul/1995:00:00:11 -0400,GET,/shuttle/countdown/liftoff.html,HTTP/1.0,304,0
4,199.120.110.21,-,-,01/Jul/1995:00:00:11 -0400,GET,/shuttle/missions/sts-73/sts-73-patch-small.gif,HTTP/1.0,200,4179


In [29]:
from datetime import datetime
import re

In [38]:
@F.udf(T.TimestampType())
def construct_datetime(dt_str):
    PATTERN = '(\d{2})/([a-zA-Z]{3})/(\d{4}):\d{2}:(\d{2}):(\d{2})\s*[+-](\d{2})\d{2}'
    match = re.search(PATTERN, dt_str)
    if match is None:
        return None
    else:
        day = match.group(1)
        month = match.group(2)
        year = match.group(3)
        minute = match.group(4)
        second = match.group(5)
        hour = match.group(6)

        dt = datetime.strptime(
            f'{day} {month} {year}, {hour}:{minute}:{second}',
            '%d %b %Y, %H:%M:%S'
        )

    return dt

In [43]:
df = log_df_clean.withColumn('date_time', construct_datetime(F.col('date_time')))

In [44]:
df.printSchema()

root
 |-- host: string (nullable = true)
 |-- client_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- date_time: timestamp (nullable = true)
 |-- method: string (nullable = true)
 |-- endpoint: string (nullable = true)
 |-- protocol: string (nullable = true)
 |-- response_code: string (nullable = true)
 |-- content_size: string (nullable = true)



In [45]:
df.limit(5).toPandas()

  series = series.astype(t, copy=False)


Unnamed: 0,host,client_id,user_id,date_time,method,endpoint,protocol,response_code,content_size
0,199.72.81.55,-,-,1995-07-01 04:00:01,GET,/history/apollo/,HTTP/1.0,200,6245
1,unicomp6.unicomp.net,-,-,1995-07-01 04:00:06,GET,/shuttle/countdown/,HTTP/1.0,200,3985
2,199.120.110.21,-,-,1995-07-01 04:00:09,GET,/shuttle/missions/sts-73/mission-sts-73.html,HTTP/1.0,200,4085
3,burger.letters.com,-,-,1995-07-01 04:00:11,GET,/shuttle/countdown/liftoff.html,HTTP/1.0,304,0
4,199.120.110.21,-,-,1995-07-01 04:00:11,GET,/shuttle/missions/sts-73/sts-73-patch-small.gif,HTTP/1.0,200,4179
