## Overview

This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
import glob
import os 
import pandas as pd
from pyspark.sql.functions import split
import re
import numpy as np

In [0]:
# File location and type
file_location = "/FileStore/tables/NASA_access_log_Aug95-1.gz"
my_list = []
# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.csv(file_location, sep=' ')
#df = df.select(split(df._c5, ' ').alias(['type','a1','a2'])).collect()
my_list = df.select('_c5').rdd.flatMap(lambda x: x).collect()
my_list2 = df.select('_c0').rdd.flatMap(lambda x: x).collect()
my_list3 = df.select('_c7').rdd.flatMap(lambda x: x).collect()
my_list4 = df.select('_c3').rdd.flatMap(lambda x: x).collect()
my_list5 = df.select('_c4').rdd.flatMap(lambda x: x).collect()
my_list6 = df.select('_c6').rdd.flatMap(lambda x: x).collect()

In [0]:
display(df.tail(1000))

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7
kabuto.phys.wani.osaka-u.ac.jp,-,-,[31/Aug/1995:23:32:34,-0400],GET /shuttle/missions/sts-70/images/KSC-95EC-1019.jpg HTTP/1.0,200,69005
c30.eds.ecip.nagoya-u.ac.jp,-,-,[31/Aug/1995:23:32:34,-0400],GET /facilities/mlp.html HTTP/1.0,200,2653
c30.eds.ecip.nagoya-u.ac.jp,-,-,[31/Aug/1995:23:32:37,-0400],GET /images/mlp-logo.gif HTTP/1.0,200,28426
198.211.77.73,-,-,[31/Aug/1995:23:32:38,-0400],GET /shuttle/missions/61-c/61-c-patch-small.gif HTTP/1.0,200,11963
157.2.11.70,-,-,[31/Aug/1995:23:32:41,-0400],GET /images/launchmedium.gif HTTP/1.0,200,11853
c30.eds.ecip.nagoya-u.ac.jp,-,-,[31/Aug/1995:23:32:42,-0400],GET /images/kscmap-tiny.gif HTTP/1.0,200,2537
ix-dgr-il1-12.ix.netcom.com,-,-,[31/Aug/1995:23:32:44,-0400],GET /shuttle/missions/sts-69/images/KSC-95EC-1244.jpg HTTP/1.0,200,136166
157.2.11.70,-,-,[31/Aug/1995:23:32:46,-0400],GET /images/NASA-logosmall.gif HTTP/1.0,200,786
198.211.77.73,-,-,[31/Aug/1995:23:32:50,-0400],GET /shuttle/missions/51-c/mission-51-c.html HTTP/1.0,200,4556
198.211.77.73,-,-,[31/Aug/1995:23:32:51,-0400],GET /shuttle/missions/51-c/51-c-patch-small.gif HTTP/1.0,200,12632


In [0]:
df_py = df[df['_c6']=='HTTP/1.0\"']
df_py.show()

In [0]:
get_list = []
request_list1 = []
request_list2 = []
invalid = []
a = ''
b = ''
c = ''

for element in my_list: 
    #print('element = ',element)
    if element != None:
        get = element.split()
        #print('get = ',get)
        if len(get) >= 4:
            print('get = ',get)
            invalid.append(get)
        if len(get) == 3:
            a = get[0]
            b = get[1]
            c = get[2]
            get_list.append(a)
            request_list1.append(b)
            request_list2.append(c)
        elif len(get) == 2:
            a = get[0]
            b = get[1]
            c = 'NaN'
            get_list.append(a)
            request_list1.append(b)
            request_list2.append(c)
        elif len(get) == 1:
            match = re.match('\d',get[0])
            if match == None:
                a = get[0]
                b = 'NaN'
                c = 'NaN'
                get_list.append(a)
                request_list1.append(b)
                request_list2.append(c)
            else:
                a = 'NaN'
                b = get[0]
                c = 'NaN'
                get_list.append(a)
                request_list1.append(b)
                request_list2.append(c)
        else:
            a = 'NaN'
            b = 'NaN'
            c = 'NaN'
            get_list.append(a)
            request_list1.append(b)
            request_list2.append(c)
    else:
        a = 'NaN'
        b = 'NaN'
        c = 'NaN'
        get_list.append(a)
        request_list1.append(b)
        request_list2.append(c)
        
        

#print(get_list)
#print(request_list1)


In [0]:
print(len(get_list))

In [0]:
my_new_list = []
for (i,j) in zip(my_list4,my_list5):
    my_new_list.append(str(i)+str(j))

print(my_new_list)

In [0]:
# convert lists into pyspark dataframe

s1 = pd.Series(my_list2)
s2 = pd.Series(my_new_list)
s3 = pd.Series(my_list5)
s4 = pd.Series(get_list)
s5 = pd.Series(request_list1)
s6 = pd.Series(request_list2)
s7 = pd.Series(my_list6)
s8 = pd.Series(my_list3)


# Pandas DataFrame formed using series

In [0]:
#new_df = pd.DataFrame(data = [s1,s2,s3], columns=['type','req1','req2'])
new_df = pd.DataFrame({'host': s1,'timestamp': s2, 'method': s4, 'endpoint': s5, 'protocol': s6,  'http_status': s7, 'content_size': s8})
print(new_df)

# Replacing the method type with the correct one

In [0]:
new_df['method'].replace(to_replace ="\"GET",
                 value ="GET", inplace=True)
#a_df = new_df[new_df['method'] == '"GET']

In [0]:
new_df['domain_name'] = new_df['host'].str.extract(r'(\.com|\.jp|\.net|\.gov|\.edu|\.ca|\.au|\.de|\.nz|\.uk|\.mil|\.sg|\.no|\.se|\.it|\.be|\.dk|\.org|\.ni|\.fr|\.fi|\.si|\.nl|\.arpa|\.ch|\.kr|\.za|\.su|\.at|\.lu|\.pl|\.il|\.es|\.cz|\.us|\.ar|\.mt|\.is|\.pt|\.my|\.ee|\.cl|\.gr|\.ie|\.mx|\.cr|\.br|\.ru|\.int|\.ve|\.cn|\.hu|\.gb|\.sk|\.pa|\.ge|\.bm)')
print(df_new.head(700))

In [0]:
new_df['domain_name'].replace(to_replace =np.NaN,
                 value ="IP Address", inplace=True)

new_df

Unnamed: 0,host,timestamp,method,endpoint,protocol,http_status,content_size,domain_name
0,in24.inetnebr.com,[01/Aug/1995:00:00:01-0400],GET,/shuttle/missions/sts-68/news/sts-68-mcc-05.txt,HTTP/1.0,200,1839,.com
1,uplherc.upl.com,[01/Aug/1995:00:00:07-0400],GET,/,HTTP/1.0,304,0,.com
2,uplherc.upl.com,[01/Aug/1995:00:00:08-0400],GET,/images/ksclogo-medium.gif,HTTP/1.0,304,0,.com
3,uplherc.upl.com,[01/Aug/1995:00:00:08-0400],GET,/images/MOSAIC-logosmall.gif,HTTP/1.0,304,0,.com
4,uplherc.upl.com,[01/Aug/1995:00:00:08-0400],GET,/images/USA-logosmall.gif,HTTP/1.0,304,0,.com
...,...,...,...,...,...,...,...,...
1569893,gatekeeper.uccu.com,[31/Aug/1995:23:59:49-0400],GET,/images/ksclogosmall.gif,HTTP/1.0,304,0,.com
1569894,gatekeeper.uccu.com,[31/Aug/1995:23:59:49-0400],GET,/images/lc39a-logo.gif,HTTP/1.0,304,0,.com
1569895,cys-cap-9.wyoming.com,[31/Aug/1995:23:59:52-0400],GET,/shuttle/missions/sts-71/movies/sts-71-launch-...,HTTP/1.0,200,57344,.com
1569896,www-c8.proxy.aol.com,[31/Aug/1995:23:59:52-0400],GET,/icons/unknown.xbm,HTTP/1.0,200,515,.com


# Created the PySpark DataFrame

In [0]:
spark_df = spark.createDataFrame(new_df)
display(spark_df)

host,timestamp,method,endpoint,protocol,http_status,content_size,domain_name
in24.inetnebr.com,[01/Aug/1995:00:00:01-0400],GET,/shuttle/missions/sts-68/news/sts-68-mcc-05.txt,HTTP/1.0,200,1839,.com
uplherc.upl.com,[01/Aug/1995:00:00:07-0400],GET,/,HTTP/1.0,304,0,.com
uplherc.upl.com,[01/Aug/1995:00:00:08-0400],GET,/images/ksclogo-medium.gif,HTTP/1.0,304,0,.com
uplherc.upl.com,[01/Aug/1995:00:00:08-0400],GET,/images/MOSAIC-logosmall.gif,HTTP/1.0,304,0,.com
uplherc.upl.com,[01/Aug/1995:00:00:08-0400],GET,/images/USA-logosmall.gif,HTTP/1.0,304,0,.com
ix-esc-ca2-07.ix.netcom.com,[01/Aug/1995:00:00:09-0400],GET,/images/launch-logo.gif,HTTP/1.0,200,1713,.net
uplherc.upl.com,[01/Aug/1995:00:00:10-0400],GET,/images/WORLD-logosmall.gif,HTTP/1.0,304,0,.com
slppp6.intermind.net,[01/Aug/1995:00:00:10-0400],GET,/history/skylab/skylab.html,HTTP/1.0,200,1687,.int
piweba4y.prodigy.com,[01/Aug/1995:00:00:10-0400],GET,/images/launchmedium.gif,HTTP/1.0,200,11853,.com
slppp6.intermind.net,[01/Aug/1995:00:00:11-0400],GET,/history/skylab/skylab-small.gif,HTTP/1.0,200,9202,.int


# Named the PySpark DataFrame to be used in SQL

In [0]:
spark_df.registerTempTable("df_final")

In [0]:
sqldf1 = spark.sql("SELECT host, timestamp, method, endpoint, protocol, http_status, content_size, domain_name FROM df_final WHERE http_status = 'HTTP/1.0\"'")
sqldf1.show()

# SQL used to create a table

In [0]:
%sql
SELECT *
FROM df_final
--WHERE (method != 'GET') and (method != 'HEAD') and (method != 'POST')
GROUP BY host, timestamp, method, endpoint, protocol, http_status, content_size, domain_name

host,timestamp,method,endpoint,protocol,http_status,content_size,domain_name
kgtyk4.kj.yamagata-u.ac.jp,[01/Aug/1995:00:00:21-0400],GET,/images/NASA-logosmall.gif,HTTP/1.0,304,0,.jp
uplherc.upl.com,[01/Aug/1995:00:01:18-0400],GET,/history/apollo/apollo-17/apollo-17-patch-small.gif,HTTP/1.0,200,14977,.com
van10271.direct.ca,[01/Aug/1995:00:06:05-0400],GET,/images/KSC-logosmall.gif,HTTP/1.0,200,1204,.ca
dd09-012.compuserve.com,[01/Aug/1995:00:24:25-0400],GET,/images/launchpalms-small.gif,HTTP/1.0,200,11473,.com
ix-min1-02.ix.netcom.com,[01/Aug/1995:00:25:39-0400],GET,/images/ksclogosmall.gif,HTTP/1.0,304,0,.net
tia1.eskimo.com,[01/Aug/1995:00:26:48-0400],GET,/software/winvn/winvn.gif,HTTP/1.0,200,25218,.es
adam.tower.com.au,[01/Aug/1995:00:30:36-0400],GET,/shuttle/missions/sts-71/images/images.html,HTTP/1.0,200,8529,.com
www-d1.proxy.aol.com,[01/Aug/1995:00:35:51-0400],GET,/elv/ATLAS_CENTAUR/atlas.gif,HTTP/1.0,200,2286,.com
ix-cha-nc1-02.ix.netcom.com,[01/Aug/1995:00:35:59-0400],GET,/images/launch-logo.gif,HTTP/1.0,200,1713,.net
slppp6.intermind.net,[01/Aug/1995:00:00:39-0400],GET,/history/skylab/skylab-logo.gif,HTTP/1.0,200,3274,.int


# SQL used to count the number of http methods

In [0]:
%sql
SELECT method, COUNT(method) 
FROM df_final
GROUP BY method
ORDER BY method

method,count(method)
��6��T7�F��F,4
?,1
GET,1565095
HEAD,3965
,717
POST,111
huttle/countdown/,1
�|t�9ð'À|u,2
���.�,2


In [0]:
%sql
SELECT *
FROM df_final
WHERE (method != 'GET') and (method != 'POST') and (method != 'HEAD')

host,timestamp,method,endpoint,protocol,http_status,content_size,domain_name
ztm-13.dial.xs4all.nl,[04/Aug/1995:09:34:52-0400],,,,200,7034,.nl
pc32.cis.uoguelph.ca,[04/Aug/1995:10:57:21-0400],,,,200,7034,.ca
sgate08.st-and.ac.uk,[04/Aug/1995:17:52:59-0400],,,,200,6858,.uk
ppp-nyc-2-64.ios.com,[05/Aug/1995:20:45:33-0400],,,,200,2443,.com
ppp-nyc-2-64.ios.com,[05/Aug/1995:20:47:52-0400],,,,200,4347,.com
client-71-162.online.apple.com,[05/Aug/1995:22:53:19-0400],,,,404,-,.com
client-71-162.online.apple.com,[05/Aug/1995:22:53:47-0400],,,,404,-,.com
rt99-9.rotterdam.nl.net,[06/Aug/1995:11:35:40-0400],,,,200,7034,.nl
client-71-69.online.apple.com,[06/Aug/1995:12:21:18-0400],,,,404,-,.com
spark1.ecf.toronto.edu,[06/Aug/1995:16:13:52-0400],,,,200,7034,.edu


# count number of methods for invalid endpoints - '/'

In [0]:
%sql
SELECT method, endpoint, COUNT(endpoint)
FROM df_final
WHERE endpoint = '/' and (method = 'GET' or method = 'POST')
GROUP BY method, endpoint 

method,endpoint,count(endpoint)
GET,/,30101


# Identify the HTTP status and assign response

In [0]:
%sql
--SELECT OrderID, Quantity,
--CASE
--    WHEN Quantity > 30 THEN 'The quantity is greater than 30'
--    WHEN Quantity = 30 THEN 'The quantity is 30'
--    ELSE 'The quantity is under 30'
--END AS QuantityText
--FROM OrderDetails;

SELECT http_status, COUNT(http_status) as number_of_http_status, 
CASE 
            WHEN http_status = '200' THEN 'OK'
            WHEN http_status = '302' THEN 'FOUND'
            WHEN http_status = '304' THEN 'NOT MODIFIED'
            WHEN http_status = '400' THEN 'BAD REQUEST'
            WHEN http_status = '403' THEN 'FORBIDDEN'
            WHEN http_status = '404' THEN 'NOT FOUND'
            WHEN http_status = '500' THEN 'INTERNAL SERVICE ERROR'
            WHEN http_status = '501' THEN 'NOT IMPLEMENTED'
            WHEN http_status = 'HTTP/1.0\"' THEN 'FORBIDDEN or INVALID'
            WHEN http_status = 'of'  THEN 'INVALID'
END as HTTP_RESPONSE
FROM df_final
GROUP BY http_status
ORDER BY http_status



http_status,number_of_http_status,HTTP_RESPONSE
200,1398987,OK
302,26497,FOUND
304,134146,NOT MODIFIED
400,10,BAD REQUEST
403,171,FORBIDDEN
404,10039,NOT FOUND
500,3,INTERNAL SERVICE ERROR
501,27,NOT IMPLEMENTED
"HTTP/1.0""",17,FORBIDDEN or INVALID
images/ssbuv1.gif,1,


# Extract the http requests that has content size 0 or NULL

In [0]:
%sql
SELECT *
FROM df_final
WHERE (content_size = '0') or (content_size is NULL)

host,timestamp,method,endpoint,protocol,http_status,content_size,domain_name
uplherc.upl.com,[01/Aug/1995:00:00:07-0400],GET,/,HTTP/1.0,304,0,.com
uplherc.upl.com,[01/Aug/1995:00:00:08-0400],GET,/images/ksclogo-medium.gif,HTTP/1.0,304,0,.com
uplherc.upl.com,[01/Aug/1995:00:00:08-0400],GET,/images/MOSAIC-logosmall.gif,HTTP/1.0,304,0,.com
uplherc.upl.com,[01/Aug/1995:00:00:08-0400],GET,/images/USA-logosmall.gif,HTTP/1.0,304,0,.com
uplherc.upl.com,[01/Aug/1995:00:00:10-0400],GET,/images/WORLD-logosmall.gif,HTTP/1.0,304,0,.com
uplherc.upl.com,[01/Aug/1995:00:00:14-0400],GET,/images/NASA-logosmall.gif,HTTP/1.0,304,0,.com
kgtyk4.kj.yamagata-u.ac.jp,[01/Aug/1995:00:00:21-0400],GET,/images/NASA-logosmall.gif,HTTP/1.0,304,0,.jp
kgtyk4.kj.yamagata-u.ac.jp,[01/Aug/1995:00:00:21-0400],GET,/images/MOSAIC-logosmall.gif,HTTP/1.0,304,0,.jp
kgtyk4.kj.yamagata-u.ac.jp,[01/Aug/1995:00:00:22-0400],GET,/images/USA-logosmall.gif,HTTP/1.0,304,0,.jp
kgtyk4.kj.yamagata-u.ac.jp,[01/Aug/1995:00:00:22-0400],GET,/images/WORLD-logosmall.gif,HTTP/1.0,304,0,.jp


# Extract the number of methods that has content = 0 or content = 'null'

In [0]:
%sql 
SELECT method, COUNT(method) as number_of_methods, http_status, content_size
FROM df_final
WHERE (content_size = '0') or (content_size is NULL)
GROUP BY method, http_status, content_size
ORDER BY method, http_status

method,number_of_methods,http_status,content_size
GET,2287,200,0
GET,15,302,0
GET,134138,304,0
HEAD,3959,200,0
HEAD,3,302,0
,8,304,0


In [0]:
%sql 
SELECT method, COUNT(method) as number_of_methods, http_status, content_size
FROM df_final
WHERE (content_size != '0')
GROUP BY method, http_status, content_size
ORDER BY method, http_status, content_size

method,number_of_methods,http_status,content_size
��6��T7�F��F,4,400,-
?,1,400,-
GET,78,200,-
GET,30,200,1000
GET,4,200,10000
GET,122,200,10002
GET,1,200,1001
GET,24,200,10015
GET,11,200,100203
GET,2,200,100223


# Extract the domain_name and count number of domain_names

In [0]:
%sql
SELECT domain_name, COUNT(domain_name) as number_of_domain_name
FROM df_final
GROUP BY domain_name
ORDER BY number_of_domain_name


domain_name,number_of_domain_name
.arpa,302
.my,550
.gb,636
.cz,661
.mx,921
.ve,1102
.ie,1239
.bm,1303
.lu,1537
.gr,1862


In [0]:
%sql
SELECT host, COUNT(host)
FROM df_final
WHERE domain_name LIKE "IP %"
GROUP BY host

host,count(host)
163.205.166.15,335
199.3.230.80,9
128.159.63.129,12
132.170.244.49,12
128.159.143.43,101
163.205.80.44,248
192.195.243.61,7
164.116.78.80,29
158.114.228.10,110
140.251.205.85,5
