In [1]:
import os
import sys

# Point this kernel at the Spark 2.0.0 distribution under the user's home
# directory. SPARK_HOME and PYLIB are exported first because the bundled
# py4j / pyspark archives are located relative to them.
os.environ["SPARK_HOME"]=os.path.join(os.path.expanduser("~"),'spark-2.0.0-bin-hadoop2.7')
os.environ["PYLIB"]=os.path.join(os.environ["SPARK_HOME"],'python','lib')
# Prepend (index 0) so these archives are searched before anything else on sys.path.
sys.path.insert(0,os.path.join(os.environ["PYLIB"],'py4j-0.10.1-src.zip'))
sys.path.insert(0,os.path.join(os.environ["PYLIB"],'pyspark.zip'))

# Imported only after the sys.path edits above, which make the bundled
# pyspark archive importable.
import pyspark
# NOTE(review): myConf is created but not handed to the builder below — confirm
# whether a later cell relies on it.
myConf=pyspark.SparkConf()
spark = (
    pyspark.sql.SparkSession.builder
    .master("local")
    .appName("myApp")
    # Raw string: in a normal literal "\U" begins a unicode escape on Python 3
    # and the cell fails with a SyntaxError before anything runs.
    # NOTE(review): machine-specific absolute path — consider making it configurable.
    .config('spark.sql.warehouse.dir', r'C:\Users\qorgk\code\spark')
    .getOrCreate()
)

from pyspark.sql import Row
from pyspark.sql.types import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests

#### 데이터 불러오기 & DataFrame 만들기

In [3]:
# Both reads below use the same input file, so build its path once.
policePath = os.path.join("data", "20191021_policeAddress.txt")

# Raw lines of the file, read without an explicit schema.
df = spark.read.text(policePath)

# Same file, with the single string column explicitly named "sent" so the
# downstream Tokenizer can refer to it by that name.
# The former header / delimiter / inferSchema options were dropped: they are
# CSV reader settings and have no effect on a .text() read.
police = spark.read\
    .schema(StructType([StructField("sent", StringType())]))\
    .text(policePath)

#### Tokenizer

In [4]:
from pyspark.ml.feature import Tokenizer

# Configure the tokenizer through its param setters: it reads the "sent"
# column and writes its result to a new "words" column.
tokenizer = Tokenizer().setInputCol("sent").setOutputCol("words")

# tokDf is `police` with the additional "words" array column.
tokDf = tokenizer.transform(police)

#### StopWords
한 글자 불용어 지우기

In [8]:
from pyspark.ml.feature import StopWordsRemover

# Filters the "words" token arrays into a new "stop" column.
stop = StopWordsRemover(inputCol="words", outputCol="stop")

# Custom stop-word list: single-character Korean tokens to drop.
_mystopwords=[u"참", u"큰",u"에", u"더",u"될",u"돌",u"그"]
# list(...) makes an independent copy in one step; the earlier append loop did
# the same work and left its loop variable behind in the notebook namespace.
stopwords=list(_mystopwords)

# Install the custom list on the remover, then apply it to the tokenized frame.
stop.setStopWords(stopwords)
stopDf=stop.transform(tokDf)

#### TF/IDF

In [9]:
from pyspark.ml.feature import HashingTF, IDF

# Term-frequency stage: hash the "stop" tokens into a 50-dimensional vector
# stored in the "hash" column.
# NOTE(review): 50 buckets is small, so distinct tokens may share a bucket —
# confirm this size is intentional.
hashTF = HashingTF(
    inputCol="stop",
    outputCol="hash",
    numFeatures=50,
)
hashDf = hashTF.transform(stopDf)

# IDF stage: fit on the hashed frame, then apply the fitted model to that
# same frame, writing the result to the "idf" column.
idf = IDF().setInputCol("hash").setOutputCol("idf")
idfModel = idf.fit(hashDf)
idfDf = idfModel.transform(hashDf)

#### VectorAssembler
features 추출하기

In [10]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Assemble the model input: the "idf" vector is the only input column and is
# emitted as the "features" column.
va = VectorAssembler().setInputCols(["idf"]).setOutputCol("features")
vaDf = va.transform(idfDf)

# Inspect the result: schema first, then the rows with untruncated cell contents.
vaDf.printSchema()
vaDf.show(truncate=False)

root
 |-- sent: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- stop: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hash: vector (nullable = true)
 |-- idf: vector (nullable = true)
 |-- features: vector (nullable = true)

+-------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------