#  네트워크에 불법적으로 침입하는 사용자의 분석

## 스파크 기본설정

In [1]:
import os
import sys

# Point SPARK_HOME at the local Spark installation. The location is already an
# absolute path, so it is assigned directly: joining it onto the home directory
# is a no-op on Windows (os.path.join discards the earlier component) and
# produces an invalid path on other platforms.
os.environ["SPARK_HOME"] = r"C:\Users\user\spark-2.0.0-bin-hadoop2.7\spark-2.0.0-bin-hadoop2.7"
# PYLIB is the directory inside the installation that holds the bundled zips.
os.environ["PYLIB"] = os.path.join(os.environ["SPARK_HOME"], "python", "lib")
# Put the bundled py4j and pyspark archives at the front of the import path;
# pyspark.zip is inserted last and therefore ends up first.
sys.path.insert(0, os.path.join(os.environ["PYLIB"], "py4j-0.10.1-src.zip"))
sys.path.insert(0, os.path.join(os.environ["PYLIB"], "pyspark.zip"))

In [6]:
import pyspark
# Default Spark configuration object (it is not passed to the builder below).
myConf = pyspark.SparkConf()

# Create the session, or reuse one that already exists, running locally with
# the "local[*]" master and an explicit SQL warehouse directory.
spark = (
    pyspark.sql.SparkSession.builder
    .master("local[*]")
    .appName("a")
    .config("spark.sql.warehouse.dir", "C:/Users/jsl/myTemp")
    .getOrCreate()
)

## DataFrame 생성
0,1,2,3,4,5,41번째 열을 스키마로 정해서 RDD 생성

In [17]:
from pyspark.sql import Row

def _to_row(fields):
    """Build a Row from columns 0-5 and 41 of one comma-split record.

    Columns 0, 4 and 5 are converted to int; the others are kept as strings.
    """
    return Row(
        duration=int(fields[0]),
        protocol=fields[1],
        service=fields[2],
        flag=fields[3],
        src_bytes=int(fields[4]),
        dst_bytes=int(fields[5]),
        attack=fields[41],
    )


# Split every text line on commas, then keep only the selected columns.
_csv = _rdd.map(lambda line: line.split(","))
_csvRdd = _csv.map(_to_row)

In [18]:
# Convert the RDD of Row objects into a DataFrame; no schema is passed, so
# the column names and types come from the Row fields.
_df=spark.createDataFrame(_csvRdd)

In [19]:
# Print the DataFrame schema, then preview its first 5 rows.
_df.printSchema()
_df.show(5)

root
 |-- attack: string (nullable = true)
 |-- dst_bytes: long (nullable = true)
 |-- duration: long (nullable = true)
 |-- flag: string (nullable = true)
 |-- protocol: string (nullable = true)
 |-- service: string (nullable = true)
 |-- src_bytes: long (nullable = true)

+-------+---------+--------+----+--------+-------+---------+
| attack|dst_bytes|duration|flag|protocol|service|src_bytes|
+-------+---------+--------+----+--------+-------+---------+
|normal.|     5450|       0|  SF|     tcp|   http|      181|
|normal.|      486|       0|  SF|     tcp|   http|      239|
|normal.|     1337|       0|  SF|     tcp|   http|      235|
|normal.|     1337|       0|  SF|     tcp|   http|      219|
|normal.|     2032|       0|  SF|     tcp|   http|      217|
+-------+---------+--------+----+--------+-------+---------+
only showing top 5 rows



## attack 분류
'attack' or 'normal'로 분류하는 **udf 함수** 이용

In [20]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
def _binary_label(label):
    """Return "normal" for the raw label "normal.", otherwise "attack"."""
    if label == "normal.":
        return "normal"
    return "attack"


# Register the helper as a string-returning UDF and derive the attackB column.
attack_udf = udf(_binary_label, StringType())
myDf = _df.withColumn("attackB", attack_udf(_df["attack"]))

In [21]:
# Print the schema to confirm that the attackB column was appended.
myDf.printSchema()

root
 |-- attack: string (nullable = true)
 |-- dst_bytes: long (nullable = true)
 |-- duration: long (nullable = true)
 |-- flag: string (nullable = true)
 |-- protocol: string (nullable = true)
 |-- service: string (nullable = true)
 |-- src_bytes: long (nullable = true)
 |-- attackB: string (nullable = true)



네트워크 침입 attack을 'normal, dos, r2l, u2r, probing' 종류로 구분.

In [22]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
def classify41(s):
    """Map a raw attack label to one of five traffic categories.

    Args:
        s: Raw label including its trailing period, for example "smurf.".

    Returns:
        "normal", "dos", "r2l", "u2r" or "probing"; an empty string when
        the label matches none of the listed values.
    """
    # Each entry pairs a category with the exact labels that belong to it.
    categories = (
        ("normal", ("normal.",)),
        ("dos", ("back.", "land.", "neptune.", "pod.", "smurf.", "teardrop.")),
        ("r2l", ("ftp_write.", "guess_passwd.", "imap.", "multihop.", "phf.",
                 "spy.", "warezclient.", "warezmaster.")),
        ("u2r", ("buffer_overflow.", "loadmodule.", "perl.", "rootkit.")),
        ("probing", ("ipsweep.", "nmap.", "portsweep.", "satan.")),
    )
    for category, labels in categories:
        if s in labels:
            return category
    return ""

# Wrap classify41 as a Spark UDF whose declared return type is StringType.
attack5_udf = udf(classify41, StringType())

In [23]:
# Derive the five-way category column from the raw attack label.
myDf = myDf.withColumn("attack5", attack5_udf(myDf["attack"]))

In [24]:
# Print the schema to confirm that the attack5 column was appended.
myDf.printSchema()

root
 |-- attack: string (nullable = true)
 |-- dst_bytes: long (nullable = true)
 |-- duration: long (nullable = true)
 |-- flag: string (nullable = true)
 |-- protocol: string (nullable = true)
 |-- service: string (nullable = true)
 |-- src_bytes: long (nullable = true)
 |-- attackB: string (nullable = true)
 |-- attack5: string (nullable = true)



In [25]:
# Preview the first 5 rows, including the derived attackB and attack5 columns.
myDf.show(5)

+-------+---------+--------+----+--------+-------+---------+-------+-------+
| attack|dst_bytes|duration|flag|protocol|service|src_bytes|attackB|attack5|
+-------+---------+--------+----+--------+-------+---------+-------+-------+
|normal.|     5450|       0|  SF|     tcp|   http|      181| normal| normal|
|normal.|      486|       0|  SF|     tcp|   http|      239| normal| normal|
|normal.|     1337|       0|  SF|     tcp|   http|      235| normal| normal|
|normal.|     1337|       0|  SF|     tcp|   http|      219| normal| normal|
|normal.|     2032|       0|  SF|     tcp|   http|      217| normal| normal|
+-------+---------+--------+----+--------+-------+---------+-------+-------+
only showing top 5 rows



### attack, normal 특징 분석

In [26]:
# Number of records in each attack5 category.
(
    myDf
    .groupBy("attack5")
    .count()
    .show()
)

+-------+------+
|attack5| count|
+-------+------+
|probing|  4107|
|    u2r|    52|
| normal| 97278|
|    r2l|  1126|
|    dos|391458|
+-------+------+



In [27]:
# Number of records for every (attackB, protocol) combination.
(
    myDf
    .groupBy("attackB", "protocol")
    .count()
    .show()
)

+-------+--------+------+
|attackB|protocol| count|
+-------+--------+------+
| normal|     udp| 19177|
| normal|    icmp|  1288|
| normal|     tcp| 76813|
| attack|    icmp|282314|
| attack|     tcp|113252|
| attack|     udp|  1177|
+-------+--------+------+



In [28]:
# Cross-tabulate record counts: one row per attackB value, one column per
# protocol.
(
    myDf
    .groupBy("attackB")
    .pivot("protocol")
    .count()
    .show()
)

+-------+------+------+-----+
|attackB|  icmp|   tcp|  udp|
+-------+------+------+-----+
| normal|  1288| 76813|19177|
| attack|282314|113252| 1177|
+-------+------+------+-----+



In [29]:
# Mean src_bytes for each attack5 category, with one column per protocol.
(
    myDf
    .groupBy("attack5")
    .pivot("protocol")
    .avg("src_bytes")
    .show()
)

+-------+------------------+------------------+------------------+
|attack5|              icmp|               tcp|               udp|
+-------+------------------+------------------+------------------+
|probing|10.700793650793651| 261454.6003016591|25.235897435897435|
|    u2r|              null| 960.8979591836735|13.333333333333334|
| normal| 91.47049689440993|1439.3120305156679| 98.01220211711947|
|    r2l|              null|271972.57460035523|              null|
|    dos| 936.2672084368129| 1090.303422435458|              28.0|
+-------+------------------+------------------+------------------+

