## Overview

This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
import glob
import os 
import pandas as pd
from pyspark.sql.functions import split
import re
import numpy as np

In [0]:
# File location and type
file_location = "/FileStore/tables/NASA_access_log_Jul95-1.gz"
my_list = []
# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.csv(file_location, sep=' ')
#df = df.select(split(df._c5, ' ').alias(['type','a1','a2'])).collect()
my_list = df.select('_c5').rdd.flatMap(lambda x: x).collect()
my_list2 = df.select('_c0').rdd.flatMap(lambda x: x).collect()
my_list3 = df.select('_c7').rdd.flatMap(lambda x: x).collect()
my_list4 = df.select('_c3').rdd.flatMap(lambda x: x).collect()
my_list5 = df.select('_c4').rdd.flatMap(lambda x: x).collect()
my_list6 = df.select('_c6').rdd.flatMap(lambda x: x).collect()

In [0]:
display(df.tail(1000))

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7
esdmac166.arlut.utexas.edu,-,-,[28/Jul/1995:13:16:12,-0400],GET /icons/menu.xbm HTTP/1.0,200.0,527
esdmac166.arlut.utexas.edu,-,-,[28/Jul/1995:13:16:12,-0400],GET /icons/image.xbm HTTP/1.0,200.0,509
esdmac166.arlut.utexas.edu,-,-,[28/Jul/1995:13:16:13,-0400],GET /icons/blank.xbm HTTP/1.0,200.0,509
131.182.170.137,-,-,[28/Jul/1995:13:16:16,-0400],GET /images/ksclogo-medium.gif HTTP/1.0,200.0,5866
128.203.26.245,-,-,[28/Jul/1995:13:16:20,-0400],GET /software/winvn/winvn.html HTTP/1.0,200.0,9866
198.240.108.240,-,-,[28/Jul/1995:13:16:21,-0400],GET /images/ HTTP/1.0,200.0,17688
128.203.26.245,-,-,[28/Jul/1995:13:16:21,-0400],GET /software/winvn/winvn.gif HTTP/1.0,304.0,0
128.203.26.245,-,-,[28/Jul/1995:13:16:21,-0400],GET /images/construct.gif HTTP/1.0,304.0,0
128.203.26.245,-,-,[28/Jul/1995:13:16:21,-0400],GET /software/winvn/bluemarb.gif HTTP/1.0,304.0,0
128.203.26.245,-,-,[28/Jul/1995:13:16:21,-0400],GET /software/winvn/wvsmall.gif HTTP/1.0,304.0,0


In [0]:
df_py = df[df['_c6']=='HTTP/1.0\"']
df_py.show()

In [0]:
get_list = []
request_list1 = []
request_list2 = []
invalid = []
a = ''
b = ''
c = ''

for element in my_list: 
    #print('element = ',element)
    if element != None:
        get = element.split()
        #print('get = ',get)
        if len(get) >= 4:
            print('get = ',get)
            invalid.append(get)
        if len(get) == 3:
            a = get[0]
            b = get[1]
            c = get[2]
            get_list.append(a)
            request_list1.append(b)
            request_list2.append(c)
        elif len(get) == 2:
            a = get[0]
            b = get[1]
            c = 'NaN'
            get_list.append(a)
            request_list1.append(b)
            request_list2.append(c)
        elif len(get) == 1:
            match = re.match('\d',get[0])
            if match == None:
                a = get[0]
                b = 'NaN'
                c = 'NaN'
                get_list.append(a)
                request_list1.append(b)
                request_list2.append(c)
            else:
                a = 'NaN'
                b = get[0]
                c = 'NaN'
                get_list.append(a)
                request_list1.append(b)
                request_list2.append(c)
        else:
            a = 'NaN'
            b = 'NaN'
            c = 'NaN'
            get_list.append(a)
            request_list1.append(b)
            request_list2.append(c)
    else:
        a = 'NaN'
        b = 'NaN'
        c = 'NaN'
        get_list.append(a)
        request_list1.append(b)
        request_list2.append(c)
        
        

#print(get_list)
#print(request_list1)


In [0]:
print(len(get_list))

In [0]:
my_new_list = []
for (i,j) in zip(my_list4,my_list5):
    my_new_list.append(str(i)+str(j))

print(my_new_list)

In [0]:
# convert lists into pyspark dataframe

s1 = pd.Series(my_list2)
s2 = pd.Series(my_new_list)
s3 = pd.Series(my_list5)
s4 = pd.Series(get_list)
s5 = pd.Series(request_list1)
s6 = pd.Series(request_list2)
s7 = pd.Series(my_list6)
s8 = pd.Series(my_list3)


# Pandas DataFrame formed using series

In [0]:
#new_df = pd.DataFrame(data = [s1,s2,s3], columns=['type','req1','req2'])
new_df = pd.DataFrame({'host': s1,'timestamp': s2, 'method': s4, 'endpoint': s5, 'protocol': s6,  'http_status': s7, 'content_size': s8})
print(new_df)

# Replacing the method type with the correct one

In [0]:
new_df['method'].replace(to_replace ="\"GET",
                 value ="GET", inplace=True)
#a_df = new_df[new_df['method'] == '"GET']

In [0]:
new_df['domain_name'] = new_df['host'].str.extract(r'(\.com|\.jp|\.net|\.gov|\.edu|\.ca|\.au|\.de|\.nz|\.uk|\.mil|\.sg|\.no|\.se|\.it|\.be|\.dk|\.org|\.ni|\.fr|\.fi|\.si|\.nl|\.arpa|\.ch|\.kr|\.za|\.su|\.at|\.lu|\.pl|\.il|\.es|\.cz|\.us|\.ar|\.mt|\.is|\.pt|\.my|\.ee|\.cl|\.gr|\.ie|\.mx|\.cr|\.br|\.ru|\.int|\.ve|\.cn|\.hu|\.gb|\.sk|\.pa|\.ge|\.bm)')
print(df_new.head(700))

In [0]:
new_df['domain_name'].replace(to_replace =np.NaN,
                 value ="IP Address", inplace=True)

new_df

Unnamed: 0,host,timestamp,method,endpoint,protocol,http_status,content_size,domain_name
0,199.72.81.55,[01/Jul/1995:00:00:01-0400],GET,/history/apollo/,HTTP/1.0,200,6245,IP Address
1,unicomp6.unicomp.net,[01/Jul/1995:00:00:06-0400],GET,/shuttle/countdown/,HTTP/1.0,200,3985,.net
2,199.120.110.21,[01/Jul/1995:00:00:09-0400],GET,/shuttle/missions/sts-73/mission-sts-73.html,HTTP/1.0,200,4085,IP Address
3,burger.letters.com,[01/Jul/1995:00:00:11-0400],GET,/shuttle/countdown/liftoff.html,HTTP/1.0,304,0,.com
4,199.120.110.21,[01/Jul/1995:00:00:11-0400],GET,/shuttle/missions/sts-73/sts-73-patch-small.gif,HTTP/1.0,200,4179,IP Address
...,...,...,...,...,...,...,...,...
1891710,163.205.53.14,[28/Jul/1995:13:32:23-0400],GET,/images/KSC-logosmall.gif,HTTP/1.0,200,1204,IP Address
1891711,tiger2.ocs.lsu.edu,[28/Jul/1995:13:32:23-0400],GET,/shuttle/missions/missions.html,HTTP/1.0,200,8677,.edu
1891712,199.0.2.27,[28/Jul/1995:13:32:23-0400],GET,/images/ksclogo-medium.gif,HTTP/1.0,200,5866,IP Address
1891713,tornado.umd.edu,[28/Jul/1995:13:32:25-0400],GET,/shuttle/missions/sts-74/sts-74-patch-small.gif,HTTP/1.0,200,5494,.edu


# Created the PySpark DataFrame

In [0]:
spark_df = spark.createDataFrame(new_df)
display(spark_df)

host,timestamp,method,endpoint,protocol,http_status,content_size,domain_name
199.72.81.55,[01/Jul/1995:00:00:01-0400],GET,/history/apollo/,HTTP/1.0,200,6245,IP Address
unicomp6.unicomp.net,[01/Jul/1995:00:00:06-0400],GET,/shuttle/countdown/,HTTP/1.0,200,3985,.net
199.120.110.21,[01/Jul/1995:00:00:09-0400],GET,/shuttle/missions/sts-73/mission-sts-73.html,HTTP/1.0,200,4085,IP Address
burger.letters.com,[01/Jul/1995:00:00:11-0400],GET,/shuttle/countdown/liftoff.html,HTTP/1.0,304,0,.com
199.120.110.21,[01/Jul/1995:00:00:11-0400],GET,/shuttle/missions/sts-73/sts-73-patch-small.gif,HTTP/1.0,200,4179,IP Address
burger.letters.com,[01/Jul/1995:00:00:12-0400],GET,/images/NASA-logosmall.gif,HTTP/1.0,304,0,.com
burger.letters.com,[01/Jul/1995:00:00:12-0400],GET,/shuttle/countdown/video/livevideo.gif,HTTP/1.0,200,0,.com
205.212.115.106,[01/Jul/1995:00:00:12-0400],GET,/shuttle/countdown/countdown.html,HTTP/1.0,200,3985,IP Address
d104.aa.net,[01/Jul/1995:00:00:13-0400],GET,/shuttle/countdown/,HTTP/1.0,200,3985,.net
129.94.144.152,[01/Jul/1995:00:00:13-0400],GET,/,HTTP/1.0,200,7074,IP Address


# Named the PySpark DataFrame to be used in SQL

In [0]:
spark_df.registerTempTable("df_final")

In [0]:
sqldf1 = spark.sql("SELECT host, timestamp, method, endpoint, protocol, http_status, content_size, domain_name FROM df_final WHERE http_status = 'HTTP/1.0\"'")
sqldf1.show()

# SQL used to create a table

In [0]:
%sql
SELECT *
FROM df_final
--WHERE (method != 'GET') and (method != 'HEAD') and (method != 'POST')
GROUP BY host, timestamp, method, endpoint, protocol, http_status, content_size, domain_name

host,timestamp,method,endpoint,protocol,http_status,content_size,domain_name
daddy-bock.tamu.edu,[02/Jul/1995:22:12:50-0400],GET,/images/NASA-logosmall.gif,HTTP/1.0,200,786,.edu
ts4-16.inforamp.net,[02/Jul/1995:22:17:24-0400],GET,/images/NASA-logosmall.gif,HTTP/1.0,304,0,.net
199.172.50.40,[02/Jul/1995:22:28:47-0400],GET,/shuttle/countdown/liftoff.html,HTTP/1.0,200,4538,IP Address
soc12.socsci.unc.edu,[02/Jul/1995:22:30:33-0400],GET,/shuttle/technology/images/sts_body_2-small.gif,HTTP/1.0,200,30067,.edu
knownet.cpbi.org,[02/Jul/1995:22:16:06-0400],GET,/shuttle/countdown/,HTTP/1.0,200,3985,.org
ts4-16.inforamp.net,[02/Jul/1995:22:18:18-0400],GET,"/cgi-bin/imagemap/countdown?316,276",HTTP/1.0,302,98,.net
ix-mvo-ca1-20.ix.netcom.com,[02/Jul/1995:22:19:01-0400],GET,"/cgi-bin/imagemap/countdown?103,168",HTTP/1.0,302,110,.net
ns26.moran.com,[02/Jul/1995:22:20:30-0400],GET,/shuttle/missions/sts-71/images/images.html,HTTP/1.0,200,7634,.com
rvr0149.deltanet.com,[02/Jul/1995:22:21:10-0400],GET,/shuttle/countdown/video/livevideo.gif,HTTP/1.0,200,57344,.de
mica.saglac.qc.ca,[02/Jul/1995:22:26:03-0400],GET,/history/apollo/apollo-6/images/,HTTP/1.0,200,514,.ca


# SQL used to count the number of http methods

In [0]:
%sql
SELECT method, COUNT(method) 
FROM df_final
GROUP BY method
ORDER BY method

method,count(method)
,1
GET,1886808
HEAD,3950
,843
POST,111
k��tx��tG��t̓�,2


In [0]:
%sql
SELECT *
FROM df_final
WHERE (method != 'GET') and (method != 'POST') and (method != 'HEAD')

host,timestamp,method,endpoint,protocol,http_status,content_size,domain_name
204.120.229.63,[01/Jul/1995:04:29:05-0400],,,,200.0,1502,IP Address
nccse.gsfc.nasa.gov,[01/Jul/1995:07:36:13-0400],,,,200.0,8677,.gov
ix-nbw-nj1-22.ix.netcom.com,[01/Jul/1995:10:42:09-0400],,,,404.0,-,.net
gpotterpc.llnl.gov,[01/Jul/1995:22:26:51-0400],,,,200.0,317,.gov
wxs6-7.worldaccess.nl,[02/Jul/1995:08:09:27-0400],,,,200.0,7074,.nl
wxs6-7.worldaccess.nl,[02/Jul/1995:08:11:20-0400],,,,200.0,7074,.nl
s29.abqslip.indirect.com,[02/Jul/1995:14:24:26-0400],,,,200.0,7020,.com
pipe3.nyc.pipeline.com,[02/Jul/1995:22:24:41-0400],,,,404.0,-,.com
pipe3.nyc.pipeline.com,[02/Jul/1995:22:25:48-0400],,,,404.0,-,.com
wxs6-4.worldaccess.nl,[03/Jul/1995:08:14:40-0400],,,,304.0,0,.nl


# count number of methods for invalid endpoints - '/'

In [0]:
%sql
SELECT method, endpoint, COUNT(endpoint)
FROM df_final
WHERE endpoint = '/' and (method = 'GET' or method = 'POST')
GROUP BY method, endpoint 

method,endpoint,count(endpoint)
GET,/,32709
POST,/,2


# Identify the HTTP status and assign response

In [0]:
%sql
--SELECT OrderID, Quantity,
--CASE
--    WHEN Quantity > 30 THEN 'The quantity is greater than 30'
--    WHEN Quantity = 30 THEN 'The quantity is 30'
--    ELSE 'The quantity is under 30'
--END AS QuantityText
--FROM OrderDetails;

SELECT http_status, COUNT(http_status) as number_of_http_status, 
CASE 
            WHEN http_status = '200' THEN 'OK'
            WHEN http_status = '302' THEN 'FOUND'
            WHEN http_status = '304' THEN 'NOT MODIFIED'
            WHEN http_status = '400' THEN 'BAD REQUEST'
            WHEN http_status = '403' THEN 'FORBIDDEN'
            WHEN http_status = '404' THEN 'NOT FOUND'
            WHEN http_status = '500' THEN 'INTERNAL SERVICE ERROR'
            WHEN http_status = '501' THEN 'NOT IMPLEMENTED'
            WHEN http_status = 'HTTP/1.0\"' THEN 'FORBIDDEN or INVALID'
            WHEN http_status = 'of'  THEN 'INVALID'
END as HTTP_RESPONSE
FROM df_final
GROUP BY http_status
ORDER BY http_status



http_status,number_of_http_status,HTTP_RESPONSE
,0,
200,1701534,OK
302,46573,FOUND
304,132627,NOT MODIFIED
400,5,BAD REQUEST
403,53,FORBIDDEN
404,10832,NOT FOUND
500,62,INTERNAL SERVICE ERROR
501,14,NOT IMPLEMENTED
"HTTP/1.0""",13,FORBIDDEN or INVALID


# Extract the http requests that has content size 0 or NULL

In [0]:
%sql
SELECT *
FROM df_final
WHERE (content_size = '0') or (content_size is NULL)

host,timestamp,method,endpoint,protocol,http_status,content_size,domain_name
burger.letters.com,[01/Jul/1995:00:00:11-0400],GET,/shuttle/countdown/liftoff.html,HTTP/1.0,304,0,.com
burger.letters.com,[01/Jul/1995:00:00:12-0400],GET,/images/NASA-logosmall.gif,HTTP/1.0,304,0,.com
burger.letters.com,[01/Jul/1995:00:00:12-0400],GET,/shuttle/countdown/video/livevideo.gif,HTTP/1.0,200,0,.com
129.94.144.152,[01/Jul/1995:00:00:17-0400],GET,/images/ksclogo-medium.gif,HTTP/1.0,304,0,IP Address
onyx.southwind.net,[01/Jul/1995:00:01:39-0400],GET,/images/KSC-logosmall.gif,HTTP/1.0,304,0,.net
netport-27.iu.net,[01/Jul/1995:00:02:00-0400],GET,/images/ksclogo-medium.gif,HTTP/1.0,304,0,.net
netport-27.iu.net,[01/Jul/1995:00:02:01-0400],GET,/images/NASA-logosmall.gif,HTTP/1.0,304,0,.net
netport-27.iu.net,[01/Jul/1995:00:02:01-0400],GET,/images/MOSAIC-logosmall.gif,HTTP/1.0,304,0,.net
netport-27.iu.net,[01/Jul/1995:00:02:01-0400],GET,/images/USA-logosmall.gif,HTTP/1.0,304,0,.net
netport-27.iu.net,[01/Jul/1995:00:02:04-0400],GET,/images/WORLD-logosmall.gif,HTTP/1.0,304,0,.net


# Extract the number of methods that has content = 0 or content = 'null'

In [0]:
%sql 
SELECT method, COUNT(method) as number_of_methods, http_status, content_size
FROM df_final
WHERE (content_size = '0') or (content_size is NULL)
GROUP BY method, http_status, content_size
ORDER BY method, http_status

method,number_of_methods,http_status,content_size
GET,2961,200.0,0.0
GET,20,302.0,0.0
GET,132626,304.0,0.0
GET,1,404.0,0.0
GET,62,500.0,0.0
HEAD,3941,200.0,0.0
HEAD,5,302.0,0.0
,1,,
,1,304.0,0.0


In [0]:
%sql 
SELECT method, COUNT(method) as number_of_methods, http_status, content_size
FROM df_final
WHERE (content_size != '0')
GROUP BY method, http_status, content_size
ORDER BY method, http_status, content_size

method,number_of_methods,http_status,content_size
,1,400,-
GET,83,200,-
GET,1,200,1001
GET,7,200,100203
GET,5,200,100223
GET,34,200,100268
GET,35,200,100269
GET,205,200,10032
GET,63,200,10036
GET,4,200,1004


# Extract the domain_name and count number of domain_names

In [0]:
%sql
SELECT domain_name, COUNT(domain_name) as number_of_domain_name
FROM df_final
GROUP BY domain_name
ORDER BY number_of_domain_name


domain_name,number_of_domain_name
.arpa,177
.my,643
.ve,740
.gb,917
.cz,951
.mx,1111
.bm,1240
.lu,1392
.ie,1802
.kr,1862


In [0]:
%sql
SELECT host, COUNT(host)
FROM df_final
WHERE domain_name LIKE "IP %"
GROUP BY host

host,count(host)
164.147.207.25,7
163.205.166.15,271
129.252.7.29,30
159.191.6.50,17
192.86.22.98,96
144.191.11.42,1
156.99.46.21,5
146.186.186.23,10
203.1.75.214,26
152.74.44.3,1
