# NLP using PySpark

1. Spark SQL로 데이터를 로드
2. document 열 생성
3. Spark NLP로 처리
4. 관심있는 애너테이션을 Spark SQL 데이터 타입으로 변환
5. 추가적인 MLlib stage를 실행

## SparkSession 연결

In [1]:
import findspark

# Initialise findspark before pyspark is imported in the next cell
# (presumably this is what makes the local Spark installation importable).
findspark.init()
# Display the Spark installation directory that findspark located.
findspark.find()

'C:\\Spark\\spark-3.1.2-bin-hadoop3.2'

In [2]:
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Maven coordinates of the extra JVM packages Spark has to fetch at startup.
# 'spark.jars.packages' takes a comma-separated list, hence the join.
packages = ','.join([
    "com.johnsnowlabs.nlp:spark-nlp_2.12:3.3.2"
])

# Session configuration. The master URL must be set with setMaster(); calling
# setAppName() a second time with a master-like string would only overwrite
# the application name and leave the master unconfigured.
spark_conf = SparkConf()\
    .setAppName('Spark NLP Pipeline')\
    .setMaster('local[*]')\
    .set('spark.jars.packages', packages)

# Reuse an already-running session if there is one, otherwise start a new one
# with the configuration above.
spark = SparkSession.builder\
    .config(conf=spark_conf)\
    .getOrCreate()

## Data Load (with Spark SQL)

In [3]:
import os

# Directory that holds the raw text files to analyse.
text_path = os.path.join('../Data', '3구간', '1시기', '1시기_ST')

# Both columns are plain strings: the file path and the text of that file.
schema = StructType(
    [StructField(column, StringType()) for column in ('path', 'text')]
)

# wholeTextFiles gives an RDD of (file path, text) pairs; it is converted
# straight into a DataFrame with the schema defined above.
texts = spark.createDataFrame(
    spark.sparkContext.wholeTextFiles(text_path),
    schema=schema,
)

In [4]:
# Preview the first five loaded rows as a pandas DataFrame.
texts.limit(5).toPandas()

Unnamed: 0,path,text
0,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,America's Germany\r\n\r\nAmid ruins the occupa...
1,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,ILYA EHRENBURG'S AMERICA\r\n\r\nTranslations o...
2,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,NO REST FOR THE\r\nWEARY RUSSIANS\r\n\r\nJOHN ...
3,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,"Our Ally, Russia\r\n\r\nON THURSDAY, September..."
4,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,Our record in Japan \r\n\r\nMaxwell Stewart \r...


## Document Assembler

텍스트를 문서 객체로 변환

- inputCol : 도큐먼트의 텍스트를 포함하는 열
- outputCol : 새로 생성된 도큐먼트를 포함하는 열 이름
- idCol : 식별자가 포함된 열 이름 (Optional)
- metadataCol : 도큐먼트 메타데이터를 나타내는 Map 타입의 열 이름(선택 사항)
- trimAndClearNewLines : 개행 문자와 문자열 공백을 제거할지를 결정(Optional, default=True)

In [5]:
from sparknlp import DocumentAssembler, Finisher

# Turn the raw 'text' column into a 'document' annotation column, using the
# file path as the document id.
document_assembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
    .setIdCol('path')
)
docs = document_assembler.transform(texts)

# Preview the first five assembled rows.
docs.limit(5).toPandas()

Unnamed: 0,path,text,document
0,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,America's Germany\r\n\r\nAmid ruins the occupa...,"[(document, 0, 30585, America's Germany\r\n\r\..."
1,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,ILYA EHRENBURG'S AMERICA\r\n\r\nTranslations o...,"[(document, 0, 61303, ILYA EHRENBURG'S AMERICA..."
2,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,NO REST FOR THE\r\nWEARY RUSSIANS\r\n\r\nJOHN ...,"[(document, 0, 41364, NO REST FOR THE\r\nWEARY..."
3,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,"Our Ally, Russia\r\n\r\nON THURSDAY, September...","[(document, 0, 58337, Our Ally, Russia\r\n\r\n..."
4,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,Our record in Japan \r\n\r\nMaxwell Stewart \r...,"[(document, 0, 16662, Our record in Japan \r\n..."


## 애너테이터

### SentenceDetector

In [7]:
from sparknlp.annotator import SentenceDetector

# Split each document into sentence annotations, with the useAbbreviations
# option switched on.
sent_detector = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentences')
    .setUseAbbreviations(True)
)

sentences = sent_detector.transform(docs)

# Preview the first five rows, now including the 'sentences' column.
sentences.limit(5).toPandas()

Unnamed: 0,path,text,document,sentences
0,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,America's Germany\r\n\r\nAmid ruins the occupa...,"[(document, 0, 30585, America's Germany\r\n\r\...","[(document, 0, 138, America's Germany\r\n\r\nA..."
1,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,ILYA EHRENBURG'S AMERICA\r\n\r\nTranslations o...,"[(document, 0, 61303, ILYA EHRENBURG'S AMERICA...","[(document, 0, 255, ILYA EHRENBURG'S AMERICA\r..."
2,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,NO REST FOR THE\r\nWEARY RUSSIANS\r\n\r\nJOHN ...,"[(document, 0, 41364, NO REST FOR THE\r\nWEARY...","[(document, 0, 117, NO REST FOR THE\r\nWEARY R..."
3,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,"Our Ally, Russia\r\n\r\nON THURSDAY, September...","[(document, 0, 58337, Our Ally, Russia\r\n\r\n...","[(document, 0, 202, Our Ally, Russia\r\n\r\nON..."
4,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,Our record in Japan \r\n\r\nMaxwell Stewart \r...,"[(document, 0, 16662, Our record in Japan \r\n...","[(document, 0, 140, Our record in Japan \r\n\r..."


### Tokenizer

In [8]:
from sparknlp.annotator import Tokenizer

# Tokenizer is fitted first; `tokenizer` therefore holds the fitted result
# that reads 'sentences' and writes 'tokens'.
tokenizer = (
    Tokenizer()
    .setInputCols(['sentences'])
    .setOutputCol('tokens')
    .fit(sentences)
)

tokens = tokenizer.transform(sentences)

# Preview the first five rows, now including the 'tokens' column.
tokens.limit(5).toPandas()

Unnamed: 0,path,text,document,sentences,tokens
0,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,America's Germany\r\n\r\nAmid ruins the occupa...,"[(document, 0, 30585, America's Germany\r\n\r\...","[(document, 0, 138, America's Germany\r\n\r\nA...","[(token, 0, 8, America's, {'sentence': '0'}, [..."
1,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,ILYA EHRENBURG'S AMERICA\r\n\r\nTranslations o...,"[(document, 0, 61303, ILYA EHRENBURG'S AMERICA...","[(document, 0, 255, ILYA EHRENBURG'S AMERICA\r...","[(token, 0, 3, ILYA, {'sentence': '0'}, []), (..."
2,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,NO REST FOR THE\r\nWEARY RUSSIANS\r\n\r\nJOHN ...,"[(document, 0, 41364, NO REST FOR THE\r\nWEARY...","[(document, 0, 117, NO REST FOR THE\r\nWEARY R...","[(token, 0, 1, NO, {'sentence': '0'}, []), (to..."
3,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,"Our Ally, Russia\r\n\r\nON THURSDAY, September...","[(document, 0, 58337, Our Ally, Russia\r\n\r\n...","[(document, 0, 202, Our Ally, Russia\r\n\r\nON...","[(token, 0, 2, Our, {'sentence': '0'}, []), (t..."
4,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,Our record in Japan \r\n\r\nMaxwell Stewart \r...,"[(document, 0, 16662, Our record in Japan \r\n...","[(document, 0, 140, Our record in Japan \r\n\r...","[(token, 0, 2, Our, {'sentence': '0'}, []), (t..."


### Lemmatizer

In [12]:
from sparknlp.annotator import Lemmatizer

# Dictionary-based lemmatizer over the 'tokens' column. The dictionary file
# 'en_lemmas.txt' is read with a tab and a comma as its two delimiters
# (NOTE(review): confirm these match the actual file layout).
lemmatizer = (
    Lemmatizer()
    .setInputCols(["tokens"])
    .setOutputCol("lemma")
    .setDictionary('en_lemmas.txt', '\t', ',')
    .fit(tokens)
)

lemmas = lemmatizer.transform(tokens)

# Preview the first five rows, now including the 'lemma' column.
lemmas.limit(5).toPandas()

Unnamed: 0,path,text,document,sentences,tokens,lemma
0,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,America's Germany\r\n\r\nAmid ruins the occupa...,"[(document, 0, 30585, America's Germany\r\n\r\...","[(document, 0, 138, America's Germany\r\n\r\nA...","[(token, 0, 8, America's, {'sentence': '0'}, [...","[(token, 0, 8, America's, {'sentence': '0'}, [..."
1,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,ILYA EHRENBURG'S AMERICA\r\n\r\nTranslations o...,"[(document, 0, 61303, ILYA EHRENBURG'S AMERICA...","[(document, 0, 255, ILYA EHRENBURG'S AMERICA\r...","[(token, 0, 3, ILYA, {'sentence': '0'}, []), (...","[(token, 0, 3, ILYA, {'sentence': '0'}, []), (..."
2,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,NO REST FOR THE\r\nWEARY RUSSIANS\r\n\r\nJOHN ...,"[(document, 0, 41364, NO REST FOR THE\r\nWEARY...","[(document, 0, 117, NO REST FOR THE\r\nWEARY R...","[(token, 0, 1, NO, {'sentence': '0'}, []), (to...","[(token, 0, 1, NO, {'sentence': '0'}, []), (to..."
3,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,"Our Ally, Russia\r\n\r\nON THURSDAY, September...","[(document, 0, 58337, Our Ally, Russia\r\n\r\n...","[(document, 0, 202, Our Ally, Russia\r\n\r\nON...","[(token, 0, 2, Our, {'sentence': '0'}, []), (t...","[(token, 0, 2, Our, {'sentence': '0'}, []), (t..."
4,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,Our record in Japan \r\n\r\nMaxwell Stewart \r...,"[(document, 0, 16662, Our record in Japan \r\n...","[(document, 0, 140, Our record in Japan \r\n\r...","[(token, 0, 2, Our, {'sentence': '0'}, []), (t...","[(token, 0, 2, Our, {'sentence': '0'}, []), (t..."


### POS tagger

In [13]:
from sparknlp.annotator import PerceptronModel

# Load the default pretrained perceptron POS model (downloaded on first use)
# and wire it to the token and sentence columns.
pos_tagger = (
    PerceptronModel.pretrained()
    .setInputCols(["tokens", "sentences"])
    .setOutputCol("pos")
)

# Tag on top of the lemmatized DataFrame so every earlier column is kept.
postags = pos_tagger.transform(lemmas)

# Preview the first five rows, now including the 'pos' column.
postags.limit(5).toPandas()

pos_anc download started this may take some time.
Approximate size to download 3.9 MB
[OK!]


Unnamed: 0,path,text,document,sentences,tokens,lemma,pos
0,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,America's Germany\r\n\r\nAmid ruins the occupa...,"[(document, 0, 30585, America's Germany\r\n\r\...","[(document, 0, 138, America's Germany\r\n\r\nA...","[(token, 0, 8, America's, {'sentence': '0'}, [...","[(token, 0, 8, America's, {'sentence': '0'}, [...","[(pos, 0, 8, NNP, {'sentence': '0', 'word': 'A..."
1,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,ILYA EHRENBURG'S AMERICA\r\n\r\nTranslations o...,"[(document, 0, 61303, ILYA EHRENBURG'S AMERICA...","[(document, 0, 255, ILYA EHRENBURG'S AMERICA\r...","[(token, 0, 3, ILYA, {'sentence': '0'}, []), (...","[(token, 0, 3, ILYA, {'sentence': '0'}, []), (...","[(pos, 0, 3, NNP, {'sentence': '0', 'word': 'I..."
2,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,NO REST FOR THE\r\nWEARY RUSSIANS\r\n\r\nJOHN ...,"[(document, 0, 41364, NO REST FOR THE\r\nWEARY...","[(document, 0, 117, NO REST FOR THE\r\nWEARY R...","[(token, 0, 1, NO, {'sentence': '0'}, []), (to...","[(token, 0, 1, NO, {'sentence': '0'}, []), (to...","[(pos, 0, 1, DT, {'sentence': '0', 'word': 'NO..."
3,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,"Our Ally, Russia\r\n\r\nON THURSDAY, September...","[(document, 0, 58337, Our Ally, Russia\r\n\r\n...","[(document, 0, 202, Our Ally, Russia\r\n\r\nON...","[(token, 0, 2, Our, {'sentence': '0'}, []), (t...","[(token, 0, 2, Our, {'sentence': '0'}, []), (t...","[(pos, 0, 2, PRP$, {'sentence': '0', 'word': '..."
4,file:/c:/Text-Mining-Project/Data/3구간/1시기/1시기_...,Our record in Japan \r\n\r\nMaxwell Stewart \r...,"[(document, 0, 16662, Our record in Japan \r\n...","[(document, 0, 140, Our record in Japan \r\n\r...","[(token, 0, 2, Our, {'sentence': '0'}, []), (t...","[(token, 0, 2, Our, {'sentence': '0'}, []), (t...","[(pos, 0, 2, PRP$, {'sentence': '0', 'word': '..."
