# Weblog Data Analysis Using PySpark

<div style="text-align: right">

2018.11.03 / 발제자 임지훈
</div>

In [1]:
# Display the SparkContext. `sc` is not created anywhere in this notebook, so it
# is presumably injected by the PySpark shell/kernel -- confirm for your setup.
sc

In [2]:
import collections

from pyspark import StorageLevel
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions
from pyspark.sql.types import *
from pyspark.sql.window import Window
import time
from pyspark.sql.functions import udf
from pyspark.sql.types import *

## 1. 데이터 불러오기

In [3]:
# Raw web-server log. It is read without a header option, so Spark assigns the
# default column names (_c0, _c1, ...) and treats every column as a string.
weblog_csv_path = "file:///home/ubuntu/18-2Engineering/Week05_181103/resources/webLog.csv"
weblog = spark.read.format("csv").load(weblog_csv_path)
weblog.show()

+----------+--------------------+--------------------+---+
|       _c0|                 _c1|                 _c2|_c3|
+----------+--------------------+--------------------+---+
|10.128.2.1|[29/Nov/2017:06:5...|GET /login.php HT...|200|
|10.128.2.1|[29/Nov/2017:06:5...|POST /process.php...|302|
|10.128.2.1|[29/Nov/2017:06:5...|GET /home.php HTT...|200|
|10.131.2.1|[29/Nov/2017:06:5...|GET /js/vendor/mo...|200|
|10.130.2.1|[29/Nov/2017:06:5...|GET /bootstrap-3....|200|
|10.130.2.1|[29/Nov/2017:06:5...|GET /profile.php?...|200|
|10.128.2.1|[29/Nov/2017:06:5...|GET /js/jquery.mi...|200|
|10.131.2.1|[29/Nov/2017:06:5...|GET /js/chart.min...|200|
|10.131.2.1|[29/Nov/2017:06:5...|GET /edit.php?nam...|200|
|10.131.2.1|[29/Nov/2017:06:5...|GET /logout.php H...|302|
|10.131.2.1|[29/Nov/2017:06:5...|GET /login.php HT...|200|
|10.130.2.1|[29/Nov/2017:07:0...|GET /login.php HT...|200|
|10.130.2.1|[29/Nov/2017:07:0...|GET /login.php HT...|200|
|10.130.2.1|[29/Nov/2017:13:3...|      GET / HTTP/1.1|30

## 2. column name 생성

In [4]:
# Give the four positional CSV columns meaningful names.
# toDF() renames by position, so this cell can be re-run safely; the earlier
# selectExpr("_c0 as IP", ...) form failed on a second run because the
# _c0.._c3 columns no longer existed after the first rename.
weblog = weblog.toDF("IP", "Time", "URL", "Status")
weblog.show()

+----------+--------------------+--------------------+------+
|        IP|                Time|                 URL|Status|
+----------+--------------------+--------------------+------+
|10.128.2.1|[29/Nov/2017:06:5...|GET /login.php HT...|   200|
|10.128.2.1|[29/Nov/2017:06:5...|POST /process.php...|   302|
|10.128.2.1|[29/Nov/2017:06:5...|GET /home.php HTT...|   200|
|10.131.2.1|[29/Nov/2017:06:5...|GET /js/vendor/mo...|   200|
|10.130.2.1|[29/Nov/2017:06:5...|GET /bootstrap-3....|   200|
|10.130.2.1|[29/Nov/2017:06:5...|GET /profile.php?...|   200|
|10.128.2.1|[29/Nov/2017:06:5...|GET /js/jquery.mi...|   200|
|10.131.2.1|[29/Nov/2017:06:5...|GET /js/chart.min...|   200|
|10.131.2.1|[29/Nov/2017:06:5...|GET /edit.php?nam...|   200|
|10.131.2.1|[29/Nov/2017:06:5...|GET /logout.php H...|   302|
|10.131.2.1|[29/Nov/2017:06:5...|GET /login.php HT...|   200|
|10.130.2.1|[29/Nov/2017:07:0...|GET /login.php HT...|   200|
|10.130.2.1|[29/Nov/2017:07:0...|GET /login.php HT...|   200|
|10.130.

## 3. dataframe size, IP 종류 개수

In [5]:
# Number of rows with an IP value next to the number of distinct IP values.
ip_summary = weblog.agg(functions.count("IP"), functions.countDistinct("IP"))
ip_summary.show()

+---------+------------------+
|count(IP)|count(DISTINCT IP)|
+---------+------------------+
|    10787|                 5|
+---------+------------------+



## 4. IP별 count

In [6]:
# Number of log rows recorded for each IP.
rows_per_ip = weblog.groupBy("IP").count()
rows_per_ip.show()

+----------+-----+
|        IP|count|
+----------+-----+
|10.131.2.1| 1626|
|10.128.2.1| 2416|
|10.130.2.1| 2493|
|10.131.0.1| 2600|
|10.129.2.1| 1652|
+----------+-----+



## 5. Status별 count

In [7]:
# Number of log rows recorded for each Status value.
rows_per_status = weblog.groupBy("Status").count()
rows_per_status.show()

+------+-----+
|Status|count|
+------+-----+
|   200| 8048|
|   206|   48|
|   302| 1859|
|   404|  213|
|   304|  619|
+------+-----+



## Q1. IP별 Status를 살펴보자

In [8]:
# Count rows for every (IP, Status) pair, ordered by IP and then Status.
# The sort is applied after the aggregation: sorting before groupBy() gives no
# guarantee about the order of the aggregated rows, because the aggregation can
# reshuffle them.
weblog.groupBy("IP", "Status").count().sort(asc("IP"), asc("Status")).show()

+----------+------+-----+
|        IP|Status|count|
+----------+------+-----+
|10.128.2.1|   200| 1745|
|10.128.2.1|   206|   11|
|10.128.2.1|   302|  479|
|10.128.2.1|   304|  130|
|10.128.2.1|   404|   51|
|10.129.2.1|   200| 1286|
|10.129.2.1|   206|    7|
|10.129.2.1|   302|  206|
|10.129.2.1|   304|  112|
|10.129.2.1|   404|   41|
|10.130.2.1|   200| 1861|
|10.130.2.1|   206|   12|
|10.130.2.1|   302|  444|
|10.130.2.1|   304|  137|
|10.130.2.1|   404|   39|
|10.131.0.1|   200| 1909|
|10.131.0.1|   206|   10|
|10.131.0.1|   302|  509|
|10.131.0.1|   304|  120|
|10.131.0.1|   404|   52|
+----------+------+-----+
only showing top 20 rows



## Q2. IP별 404 count를 출력해보자

In [9]:
# Keep only the rows whose Status equals 404, then count them per IP.
not_found = weblog.filter(functions.col("Status") == 404)
not_found.groupBy("IP").count().show()

+----------+-----+
|        IP|count|
+----------+-----+
|10.131.2.1|   30|
|10.128.2.1|   51|
|10.130.2.1|   39|
|10.131.0.1|   52|
|10.129.2.1|   41|
+----------+-----+



IP별 404 비율은 전체 요청의 약 1.6~2.5%로 서로 비슷해서, 특정 IP에 오류가 몰리는 경향은 보이지 않았다

## Q3. 시간대별 404 count를 출력해보자

우선 시간대(Hour) column을 만들자

In [10]:
def get_hour(Time):
    """Return a Column holding the two-character hour field of a log timestamp.

    Time is a column name or Column whose values start like
    "[29/Nov/2017:06:5..."; characters 14-15 (1-based) are the hour, e.g. "06".
    """
    return substring(Time, 14, 2)

# get_hour() already builds a native Column expression, so it is applied
# directly. The former `hour = udf(get_hour, StringType())` was removed: it was
# never used, could not run as a Python UDF (substring() is a Column-API
# function, not a per-row Python function), and shadowed
# pyspark.sql.functions.hour. lit() around a Column is a no-op and was dropped.
weblog = weblog.withColumn("Hour", get_hour("Time"))
weblog.show()

+----------+--------------------+--------------------+------+----+
|        IP|                Time|                 URL|Status|Hour|
+----------+--------------------+--------------------+------+----+
|10.128.2.1|[29/Nov/2017:06:5...|GET /login.php HT...|   200|  06|
|10.128.2.1|[29/Nov/2017:06:5...|POST /process.php...|   302|  06|
|10.128.2.1|[29/Nov/2017:06:5...|GET /home.php HTT...|   200|  06|
|10.131.2.1|[29/Nov/2017:06:5...|GET /js/vendor/mo...|   200|  06|
|10.130.2.1|[29/Nov/2017:06:5...|GET /bootstrap-3....|   200|  06|
|10.130.2.1|[29/Nov/2017:06:5...|GET /profile.php?...|   200|  06|
|10.128.2.1|[29/Nov/2017:06:5...|GET /js/jquery.mi...|   200|  06|
|10.131.2.1|[29/Nov/2017:06:5...|GET /js/chart.min...|   200|  06|
|10.131.2.1|[29/Nov/2017:06:5...|GET /edit.php?nam...|   200|  06|
|10.131.2.1|[29/Nov/2017:06:5...|GET /logout.php H...|   302|  06|
|10.131.2.1|[29/Nov/2017:06:5...|GET /login.php HT...|   200|  06|
|10.130.2.1|[29/Nov/2017:07:0...|GET /login.php HT...|   200| 

In [11]:
# Rows with Status 404 counted per Hour, largest count first.
not_found_by_hour = (
    weblog.filter(functions.col("Status") == 404)
    .groupBy("Hour")
    .count()
)
not_found_by_hour.orderBy(functions.desc("count")).show()

+----+-----+
|Hour|count|
+----+-----+
|  15|   32|
|  16|   14|
|  06|   14|
|  20|   13|
|  23|   12|
|  08|   12|
|  19|   12|
|  00|    9|
|  10|    9|
|  18|    9|
|  21|    9|
|  14|    8|
|  11|    8|
|  07|    7|
|  12|    6|
|  22|    6|
|  09|    5|
|  13|    5|
|  02|    5|
|  04|    4|
+----+-----+
only showing top 20 rows



15시(32건)가 가장 많지만 전체 404 건수(213건)가 적어, 시간대별로 뚜렷한 패턴이라고 보기는 어려웠다

## Q4. 시간대별 login 횟수를 count해보자

우선 URL이 "login.php"를 포함하는 경우를 필터링하자

In [12]:
# Number of rows whose URL contains "login.php".
weblog.where(functions.col("URL").like("%login.php%")).count()

1761

In [13]:
# Keep the rows whose URL contains "login.php" for the following cells, then
# count them per Hour with the largest count first.
login = weblog.where(functions.col("URL").like("%login.php%"))
logins_per_hour = login.groupBy("Hour").count()
logins_per_hour.orderBy(functions.desc("count")).show()

+----+-----+
|Hour|count|
+----+-----+
|  20|  615|
|  15|  105|
|  16|   97|
|  18|   93|
|  17|   92|
|  19|   77|
|  12|   74|
|  14|   73|
|  13|   63|
|  06|   47|
|  08|   44|
|  05|   43|
|  07|   42|
|  09|   38|
|  22|   34|
|  04|   33|
|  23|   29|
|  11|   29|
|  00|   28|
|  10|   27|
+----+-----+
only showing top 20 rows



드디어 유의미한 결과가 나왔다

20시에 압도적으로 로그인이 많았는데, 이것이

1. 특정 날짜의 20시에 로그인이 몰린 것인지 (ex. 수강신청)
2. 평상시에도 20시에 원래 로그인이 많은 것인지

아직은 알 수 없다

## Q5. 20시에 login한 데이터들의 date 분포를 살피자
### 특정 날짜에 몰렸는지, 고르게 분포하는지

Date column을 추가하여 login dataFrame을 새롭게 만들자

In [14]:
def get_date(Time):
    """Return a Column holding the date part of a log timestamp.

    Time is a column name or Column whose values start like
    "[29/Nov/2017:06:5..."; the 11 characters starting at position 2 (1-based)
    are the date, e.g. "29/Nov/2017".
    """
    return substring(Time, 2, 11)

# get_date() already builds a native Column expression, so it is applied
# directly. The former `date = udf(get_date, StringType())` was removed: it was
# never used and could not run as a Python UDF (substring() is a Column-API
# function, not a per-row Python function). lit() around a Column is a no-op
# and was dropped.
login = login.withColumn("Date", get_date("Time"))
login.show()

+----------+--------------------+--------------------+------+----+-----------+
|        IP|                Time|                 URL|Status|Hour|       Date|
+----------+--------------------+--------------------+------+----+-----------+
|10.128.2.1|[29/Nov/2017:06:5...|GET /login.php HT...|   200|  06|29/Nov/2017|
|10.131.2.1|[29/Nov/2017:06:5...|GET /login.php HT...|   200|  06|29/Nov/2017|
|10.130.2.1|[29/Nov/2017:07:0...|GET /login.php HT...|   200|  07|29/Nov/2017|
|10.130.2.1|[29/Nov/2017:07:0...|GET /login.php HT...|   200|  07|29/Nov/2017|
|10.130.2.1|[29/Nov/2017:13:3...|GET /login.php HT...|   200|  13|29/Nov/2017|
|10.131.2.1|[29/Nov/2017:13:3...|GET /login.php HT...|   200|  13|29/Nov/2017|
|10.131.2.1|[29/Nov/2017:13:4...|GET /login.php HT...|   200|  13|29/Nov/2017|
|10.131.2.1|[29/Nov/2017:13:4...|GET /login.php?va...|   200|  13|29/Nov/2017|
|10.129.2.1|[29/Nov/2017:13:5...|GET /login.php HT...|   200|  13|29/Nov/2017|
|10.129.2.1|[29/Nov/2017:14:5...|GET /login.php HT..

위에서 확인했지만, 다시 한 번 20시에 로그인한 횟수를 카운트 해보자

In [15]:
# Hour is a string column (it is produced by substring), so compare it with the
# string '20' -- as the other cells do -- instead of relying on an implicit
# string-to-number cast.
login.where(login['Hour'] == '20').count()

615

로그인 dataFrame의 Date 중 중복을 제외한 unique한 값의 개수를 세어보자

In [16]:
# How many different Date values appear among the login rows.
login.agg(functions.countDistinct("Date")).show()

+--------------------+
|count(DISTINCT Date)|
+--------------------+
|                  40|
+--------------------+



Hour가 20인 경우의 unique한 Date 값의 개수를 세어보자

In [17]:
# How many different Date values appear among the login rows whose Hour is '20'.
logins_at_20 = login.filter(functions.col("Hour") == '20')
logins_at_20.agg(functions.countDistinct("Date")).show()

+--------------------+
|count(DISTINCT Date)|
+--------------------+
|                  18|
+--------------------+



실망스럽다. 만약 값이 1, 2처럼 작은 수가 나왔더라면,
특정 일의 20시에 로그인이 몰렸다고 볼 수 있었을텐데

그럼 로그인한 횟수가 가장 많은 날짜가 언젠지 찾아보자

## Q6. 어느 날짜에 로그인을 가장 많이 했을까?

In [18]:
# Login rows counted per Date, largest count first.
logins_per_date = login.groupBy("Date").count()
logins_per_date.orderBy(functions.desc("count")).show()

+-----------+-----+
|       Date|count|
+-----------+-----+
|29/Jan/2018|  560|
|30/Nov/2017|  207|
|29/Nov/2017|   71|
|17/Nov/2017|   65|
|13/Nov/2017|   60|
|09/Nov/2017|   60|
|16/Nov/2017|   58|
|01/Dec/2017|   50|
|12/Nov/2017|   44|
|16/Dec/2017|   42|
|08/Nov/2017|   40|
|14/Dec/2017|   36|
|14/Nov/2017|   36|
|02/Dec/2017|   32|
|23/Nov/2017|   32|
|15/Nov/2017|   28|
|19/Nov/2017|   27|
|26/Nov/2017|   26|
|11/Nov/2017|   26|
|13/Dec/2017|   25|
+-----------+-----+
only showing top 20 rows



## Q7. 29/Jan/2018의 20시와 30/Nov/2017의 20시 살펴보기

In [19]:
# Login rows whose Hour is '20', counted per (Date, Hour), largest count first.
(
    login.filter(functions.col("Hour") == '20')
    .groupBy("Date", "Hour")
    .count()
    .orderBy(functions.desc("count"))
    .show()
)

+-----------+----+-----+
|       Date|Hour|count|
+-----------+----+-----+
|29/Jan/2018|  20|  560|
|09/Nov/2017|  20|    6|
|26/Nov/2017|  20|    6|
|16/Nov/2017|  20|    6|
|14/Dec/2017|  20|    6|
|19/Nov/2017|  20|    6|
|30/Nov/2017|  20|    6|
|12/Nov/2017|  20|    4|
|21/Nov/2017|  20|    2|
|01/Dec/2017|  20|    2|
|17/Jan/2018|  20|    2|
|08/Nov/2017|  20|    2|
|13/Dec/2017|  20|    2|
|16/Dec/2017|  20|    1|
|02/Dec/2017|  20|    1|
|17/Dec/2017|  20|    1|
|17/Nov/2017|  20|    1|
|19/Dec/2017|  20|    1|
+-----------+----+-----+



## 번외. 30/Nov/2017 살펴보기

In [20]:
# Login rows dated '30/Nov/2017', counted per (Date, Hour), largest count first.
(
    login.filter(functions.col("Date") == '30/Nov/2017')
    .groupBy("Date", "Hour")
    .count()
    .orderBy(functions.desc("count"))
    .show()
)

+-----------+----+-----+
|       Date|Hour|count|
+-----------+----+-----+
|30/Nov/2017|  12|   39|
|30/Nov/2017|  15|   32|
|30/Nov/2017|  16|   30|
|30/Nov/2017|  17|   23|
|30/Nov/2017|  14|   21|
|30/Nov/2017|  09|   13|
|30/Nov/2017|  13|   10|
|30/Nov/2017|  08|    9|
|30/Nov/2017|  07|    9|
|30/Nov/2017|  19|    6|
|30/Nov/2017|  20|    6|
|30/Nov/2017|  18|    3|
|30/Nov/2017|  06|    2|
|30/Nov/2017|  21|    2|
|30/Nov/2017|  11|    1|
|30/Nov/2017|  03|    1|
+-----------+----+-----+

