# English Text Preprocessing
- 영어 텍스트 정제

## Stemming
- `Stem` : 어간. 단어를 구성하는 기본 단위
  - `lovely`의 어간: `love`
  - `beautiful`의 어간: `beauti`

영어는 현재 진행형(`~ing`), 복수(`~s, es`), 과거형(`~ed`) 등으로 단어의 형태가 변하므로, 어간 추출(Stemming)로 이러한 어미를 떼어 내 같은 단어로 취급한다.

In [1]:
# nltk: a package for processing English text
from nltk.stem import LancasterStemmer
# Stemmer instance; the next cell calls stemmer.stem() on sample words
stemmer = LancasterStemmer()

In [2]:
samples = ['working', 'works', 'worked']

# Print each inflected form next to the stem the stemmer returns for it.
for word in samples:
    print(word, '==>', stemmer.stem(word))

working ==> work
works ==> work
worked ==> work


## Lemmatization
- 원형 찾기
  - `is, are, am` ==> `be`
  

In [3]:
import nltk

# NLTK data packages; the calls are commented out here — uncomment to
# download them on a first run (the recorded output shows them being fetched).
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [4]:
from nltk.stem.wordnet import WordNetLemmatizer
# Lemmatizer instance; the next cell calls lemmatizer.lemmatize(word, pos)
lemmatizer = WordNetLemmatizer()

In [8]:
# Lemmatize as verbs ("v"): each group is printed on one line, space-separated.
print(*(lemmatizer.lemmatize(word, "v") for word in ("working", "works", "worked")))
print(*(lemmatizer.lemmatize(word, "v") for word in ("am", "is", "are")))

# Lemmatize as nouns ("n").
print(*(lemmatizer.lemmatize(word, "n") for word in ("dance", "this")))

work work work
be be be
dance this


# spaCy 활용
- 서구권 언어 토큰화, 어간 찾기 등

In [9]:
import spacy

# Load the small English pipeline.
# NOTE(review): the original comment said "download language pack", but this
# line only loads the package — presumably `en_core_web_sm` was installed
# beforehand; confirm the environment setup.
nlp = spacy.load('en_core_web_sm')

In [10]:
# Sample English text (several sentences) analyzed by the cells below
eng_sent = "I'm at a payphone, trying to call home. All of my change I spent on you. Where have the times gone.. Baby, it's all wrong. Where are the plans we made for two."

In [11]:
# Pass the text to analyze into the nlp object.
doc = nlp(eng_sent)
# Last expression of the cell: the notebook displays the text held by `doc`
doc.text

"I'm at a payphone, trying to call home. All of my change I spent on you. Where have the times gone.. Baby, it's all wrong. Where are the plans we made for two."

In [12]:
# Word tokenization: show each token's text, part-of-speech tag and lemma.
row_template = "word : {}\t\tPOS : {}\t\tLemma : {}"
for token in doc:
    print(row_template.format(token.text, token.pos_, token.lemma_))

word : I		POS : PRON		Lemma : I
word : 'm		POS : AUX		Lemma : be
word : at		POS : ADP		Lemma : at
word : a		POS : DET		Lemma : a
word : payphone		POS : NOUN		Lemma : payphone
word : ,		POS : PUNCT		Lemma : ,
word : trying		POS : VERB		Lemma : try
word : to		POS : PART		Lemma : to
word : call		POS : VERB		Lemma : call
word : home		POS : NOUN		Lemma : home
word : .		POS : PUNCT		Lemma : .
word : All		POS : PRON		Lemma : all
word : of		POS : ADP		Lemma : of
word : my		POS : PRON		Lemma : my
word : change		POS : NOUN		Lemma : change
word : I		POS : PRON		Lemma : I
word : spent		POS : VERB		Lemma : spend
word : on		POS : ADP		Lemma : on
word : you		POS : PRON		Lemma : you
word : .		POS : PUNCT		Lemma : .
word : Where		POS : SCONJ		Lemma : where
word : have		POS : AUX		Lemma : have
word : the		POS : DET		Lemma : the
word : times		POS : NOUN		Lemma : time
word : gone		POS : VERB		Lemma : go
word : ..		POS : PUNCT		Lemma : ..
word : Baby		POS : PROPN		Lemma : Baby
word : ,		POS : PUNCT		Lemma 

# 한국어 텍스트 정제

In [1]:
# Raw Korean review text with irregular indentation and blank lines;
# the next cell cleans up this whitespace.
korean_text_sample="""
                    
                        1주째는 양호했는데 이번에받은건 스티로폼박스깨져서 일회성비닐팩으로 덧붙여서 보내왔네요
                        
                        
샐러드팩도 뜯어져있고..

먹어도되나싶을정도 실망스럽습니다.
                    
                """

In [2]:
# Clean up the raw text with regular expressions.
import re

# Drop every line break, then trim leading/trailing whitespace.
# Patterns are raw strings so that regex escapes such as \s are not treated
# as (invalid) Python string escapes.
korea_text_re = re.sub(r'\n', '', korean_text_sample)
korea_text_re = korea_text_re.strip()

# Collapse any run of two or more whitespace characters into a single space.
korea_text_re = re.sub(r'\s{2,}', ' ', korea_text_re)

print(korea_text_re)

1주째는 양호했는데 이번에받은건 스티로폼박스깨져서 일회성비닐팩으로 덧붙여서 보내왔네요 샐러드팩도 뜯어져있고..먹어도되나싶을정도 실망스럽습니다.


In [None]:
# Spelling cleanup
# * py-hanspell : cleans up Korean spelling and spacing
#   NOTE(review): the original note described it as deep-learning based; the
#   project presumably wraps the Naver spell checker service — confirm.
#!pip install git+https://github.com/ssut/py-hanspell.git

In [3]:
#from hanspell import spell_checker

# Spell-check test
#text = '맞춤뻡 틀리면 외 않되?'
# NOTE(review): the line below references `spell_checker.check` without calling
# it, so it binds the function object itself (the recorded output shows a
# function repr); it presumably should be `spell_checker.check(text)`.
#hanspell_text = spell_checker.check
#hanspell_text

<function hanspell.spell_checker.check(text)>

In [None]:
# Word-spacing correction package - PyKoSpacing
#!pip install git+https://github.com/haven-jeon/PyKoSpacing.git

In [4]:
from pykospacing import Spacing

# Build the spacing object once; it is called directly on a string.
spacing = Spacing()

# A sentence written without any spaces
text = '아버지가방에들어가신다'
spacing_text = spacing(text)
print(spacing_text)

아버지가 방에 들어가신다


In [5]:
# Apply spacing correction to the whitespace-cleaned review text and print it
korean_text_spacing = spacing(korea_text_re)
print(korean_text_spacing)

1주째는 양호했는데 이번에 받은 건 스티로폼 박스 깨져서 일회성 비닐팩으로 덧붙여서 보내왔네요 샐러드팩도 뜯어져 있고.. 먹어도 되나 싶을 정도 실망스럽습니다.


# 한국어 어간 추출
- Okt 형태소 분석기

In [22]:
#!pip install konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
Collecting JPype1>=0.7.0
  Downloading JPype1-1.4.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0


In [6]:
from konlpy.tag import Okt

# Okt analyzer instance used by the morphs()/pos() cells below
okt = Okt()

In [7]:
# morphs: splits a sentence into a list of morphemes
# pos: returns each morpheme together with its tag (e.g. Noun, Josa, Verb)
okt.morphs(korean_text_spacing)

['1',
 '주',
 '째',
 '는',
 '양호',
 '했는데',
 '이번',
 '에',
 '받은',
 '건',
 '스티로폼',
 '박스',
 '깨져서',
 '일',
 '회',
 '성',
 '비닐',
 '팩',
 '으로',
 '덧붙여서',
 '보내왔네요',
 '샐러드',
 '팩',
 '도',
 '뜯어져',
 '있고',
 '..',
 '먹어도',
 '되나',
 '싶을',
 '정도',
 '실망',
 '스럽습니다',
 '.']

In [8]:
# Same text as above, returned as (morpheme, tag) pairs
okt.pos(korean_text_spacing)

[('1', 'Number'),
 ('주', 'Noun'),
 ('째', 'Suffix'),
 ('는', 'Josa'),
 ('양호', 'Noun'),
 ('했는데', 'Verb'),
 ('이번', 'Noun'),
 ('에', 'Josa'),
 ('받은', 'Verb'),
 ('건', 'Noun'),
 ('스티로폼', 'Noun'),
 ('박스', 'Noun'),
 ('깨져서', 'Verb'),
 ('일', 'Modifier'),
 ('회', 'Noun'),
 ('성', 'Suffix'),
 ('비닐', 'Noun'),
 ('팩', 'Noun'),
 ('으로', 'Josa'),
 ('덧붙여서', 'Verb'),
 ('보내왔네요', 'Verb'),
 ('샐러드', 'Noun'),
 ('팩', 'Noun'),
 ('도', 'Josa'),
 ('뜯어져', 'Verb'),
 ('있고', 'Adjective'),
 ('..', 'Punctuation'),
 ('먹어도', 'Verb'),
 ('되나', 'Verb'),
 ('싶을', 'Verb'),
 ('정도', 'Noun'),
 ('실망', 'Noun'),
 ('스럽습니다', 'Adjective'),
 ('.', 'Punctuation')]

In [27]:
# Informal sentence ending in '봐욬' followed by repeated 'ㅋ' characters
text = '오늘 강남역에서 봐욬ㅋㅋㅋㅋ'

In [None]:
# Stemming: reduce verbs to their base form
okt.morphs(korean_text_spacing, stem=True)

In [9]:
# NOTE(review): the recorded output is for '아버지가방에들어가신다', not the
# `text` assigned in the In [27] cell — this cell (In [9]) was presumably run
# before `text` was reassigned. Re-run to refresh the output.
okt.pos(text)

[('아버지', 'Noun'), ('가방', 'Noun'), ('에', 'Josa'), ('들어가신다', 'Verb')]

In [28]:
# Normalization: cleans up informal spellings such as a stray final consonant
# (in the recorded output '봐욬' becomes '봐요' and 'ㅋㅋㅋㅋ' becomes 'ㅋㅋㅋ')
okt.pos(text, norm=True)

[('오늘', 'Noun'),
 ('강남역', 'Noun'),
 ('에서', 'Josa'),
 ('봐요', 'Verb'),
 ('ㅋㅋㅋ', 'KoreanParticle')]

In [10]:
# Normalization and stemming at the same time
# NOTE(review): the recorded output is for '아버지가방에들어가신다' — this cell
# (In [10]) was presumably run before `text` was reassigned in In [27].
okt.pos(text, stem=True, norm=True)

[('아버지', 'Noun'), ('가방', 'Noun'), ('에', 'Josa'), ('들어가다', 'Verb')]