# English Text Preprocessing

## Stemming (어간 추출과정)
- `Stem`: 어간. 단어를 구성하는 기본 단위
  - 'lovely'의 어간 : 'love'
  - 'Beautiful'의 어간: 'Beauty'

영어는 현재 진행형(`~ing`), 복수(`~s`,`es`),과거형(`ed`)

In [1]:
# nltk: 영어를 처리할 수 있는 패키지
from nltk.stem import LancasterStemmer

stemmer = LancasterStemmer()

In [2]:
samples = ['working','works','worked']

for sample in samples:
  sample_stem = stemmer.stem(sample)
  print(sample,'의 어간 ==>', sample_stem)

working 의 어간 ==> work
works 의 어간 ==> work
worked 의 어간 ==> work


## Lemmatization
- 원형 찾기
  `is`,`are` 의 원형 -> `be`

In [3]:
import nltk

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [4]:
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [5]:
print( lemmatizer.lemmatize("working", "v"), lemmatizer.lemmatize("works", "v"), lemmatizer.lemmatize("worked", "v") )
print( lemmatizer.lemmatize("am", "v"), lemmatizer.lemmatize("is", "v"), lemmatizer.lemmatize("are", "v"))

print( lemmatizer.lemmatize("dance", "n"), lemmatizer.lemmatize("this", "n"))

work work work
be be be
dance this


# spaCy 활용
- 서구권 언어들에 대한 토큰화, 어간 찾기 등을 손쉽게 수행해주는 라이브러리

In [6]:
import spacy

# 언어팩 다운로드
nlp = spacy.load('en_core_web_sm')

In [7]:
eng_sent = "I'm at a payphone, trying to call home. All of my change I spent on you. Where have the times gone.. Baby, it's all wrong. Where are the plans we made for two."

In [23]:
# 분석할 문장을 nlp 객체에 넣는다.
doc = nlp(eng_sent)
doc.text

"I'm at a payphone, trying to call home. All of my change I spent on you. Where have the times gone.. Baby, it's all wrong. Where are the plans we made for two."

In [9]:
#  단어 토큰화, 형태소(POS), Lemma 확인
for word_token in doc:
  print("word : {}\t\tPOS : {}\t\tLemma : {}".format(word_token.text, word_token.pos_, word_token.lemma_))

word : I		POS : PRON		Lemma : I
word : 'm		POS : AUX		Lemma : be
word : at		POS : ADP		Lemma : at
word : a		POS : DET		Lemma : a
word : payphone		POS : NOUN		Lemma : payphone
word : ,		POS : PUNCT		Lemma : ,
word : trying		POS : VERB		Lemma : try
word : to		POS : PART		Lemma : to
word : call		POS : VERB		Lemma : call
word : home		POS : NOUN		Lemma : home
word : .		POS : PUNCT		Lemma : .
word : All		POS : PRON		Lemma : all
word : of		POS : ADP		Lemma : of
word : my		POS : PRON		Lemma : my
word : change		POS : NOUN		Lemma : change
word : I		POS : PRON		Lemma : I
word : spent		POS : VERB		Lemma : spend
word : on		POS : ADP		Lemma : on
word : you		POS : PRON		Lemma : you
word : .		POS : PUNCT		Lemma : .
word : Where		POS : SCONJ		Lemma : where
word : have		POS : AUX		Lemma : have
word : the		POS : DET		Lemma : the
word : times		POS : NOUN		Lemma : time
word : gone		POS : VERB		Lemma : go
word : ..		POS : PUNCT		Lemma : ..
word : Baby		POS : PROPN		Lemma : Baby
word : ,		POS : PUNCT		Lemma 

In [10]:
anti_hero = '''
It's me, hi, I'm the problem, it's me.At tea time, everybody agrees

'''

In [11]:
doc1 = nlp(anti_hero)


In [12]:
dir(doc1)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '_bulk_merge',
 '_context',
 '_get_array_attrs',
 '_realloc',
 '_vector',
 '_vector_norm',
 'cats',
 'char_span',
 'copy',
 'count_by',
 'doc',
 'ents',
 'extend_tensor',
 'from_array',
 'from_bytes',
 'from_dict',
 'from_disk',
 'from_docs',
 'from_json',
 'get_extension',
 'get_lca_matrix',
 'has_annotation',
 'has_extension',
 'has_unknown_spaces',
 'has_vector',
 'is_nered',
 'is_parsed',
 'is_sentenced',
 'is_tagged',
 'lang',
 'lang_',
 'mem',
 'noun_chunks',
 'noun_chunks_iterator',
 'remove_extension',
 'retokenize',
 'sentiment',
 'sents',
 'set

In [13]:
#  단어 토큰화, 형태소(POS), Lemma 확인
for word_token in doc1:
  print("word : {}\t\tPOS : {}\t\tLemma : {}".format(word_token.text, word_token.pos_, word_token.lemma_))

word : 
		POS : SPACE		Lemma : 

word : It		POS : PRON		Lemma : it
word : 's		POS : AUX		Lemma : be
word : me		POS : PRON		Lemma : I
word : ,		POS : PUNCT		Lemma : ,
word : hi		POS : INTJ		Lemma : hi
word : ,		POS : PUNCT		Lemma : ,
word : I		POS : PRON		Lemma : I
word : 'm		POS : AUX		Lemma : be
word : the		POS : DET		Lemma : the
word : problem		POS : NOUN		Lemma : problem
word : ,		POS : PUNCT		Lemma : ,
word : it		POS : PRON		Lemma : it
word : 's		POS : AUX		Lemma : be
word : me		POS : PRON		Lemma : I
word : .		POS : PUNCT		Lemma : .
word : At		POS : ADP		Lemma : at
word : tea		POS : NOUN		Lemma : tea
word : time		POS : NOUN		Lemma : time
word : ,		POS : PUNCT		Lemma : ,
word : everybody		POS : PRON		Lemma : everybody
word : agrees		POS : VERB		Lemma : agree
word : 

		POS : SPACE		Lemma : 




# 한국어 텍스트 정제

In [14]:
korean_text_sample="""
                    
                        1주째는 양호했는데 이번에받은건 스티로폼박스깨져서 일회성비닐팩으로 덧붙여서 보내왔네요
                        
                        
샐러드팩도 뜯어져있고..

먹어도되나싶을정도 실망스럽습니다.
                    
                """

In [29]:
# 정규식을 이용해 정제
import re

korean_text_re = re.sub('\n','',korean_text_sample)
korean_text_re = korean_text_re.strip()

# # 공백 문자가 2번 이상 반복되면 공백 하나로 치환
korean_text_re = re.sub('\s{2,}',' ',korean_text_sample)

print(korean_text_re)

 1주째는 양호했는데 이번에받은건 스티로폼박스깨져서 일회성비닐팩으로 덧붙여서 보내왔네요 샐러드팩도 뜯어져있고.. 먹어도되나싶을정도 실망스럽습니다. 


## 맞춤법 정리 : 보류✂️
- py-hanspell: 딥러닝 기반으로 한국어 맞춤법 및 띄어쓰기를 정리리

In [16]:
!pip install git+https://github.com/ssut/py-hanspell.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/ssut/py-hanspell.git
  Cloning https://github.com/ssut/py-hanspell.git to /tmp/pip-req-build-674p7dak
  Running command git clone --filter=blob:none --quiet https://github.com/ssut/py-hanspell.git /tmp/pip-req-build-674p7dak
  Resolved https://github.com/ssut/py-hanspell.git to commit 8e993cf46f97f9d665c15633a0fc78ac1b727713
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: py-hanspell
  Building wheel for py-hanspell (setup.py) ... [?25l[?25hdone
  Created wheel for py-hanspell: filename=py_hanspell-1.1-py3-none-any.whl size=4865 sha256=f38aa77983422aac77aaf213bc26c7518787fa03c6e7bfc577b1fa24492ef552
  Stored in directory: /tmp/pip-ephem-wheel-cache-k41blxqz/wheels/94/bc/ef/2cb90c305d609f8086489e7e1bd69f21e955118f26938609b7
Successfully built py-hanspell
Installing collected packages: py-hanspell
Successfully 

In [17]:
from hanspell import spell_checker
!pip install --upgrade hanspell

# 맞춤법 검사 테스트
text ='맞춤뻡. 틀리면 외 않되?'

# hanspell_text = spell_checker.check(text).checked
hanspell_text = spell_checker.check(text).checked

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement hanspell (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for hanspell[0m[31m
[0m

JSONDecodeError: ignored

## 띄어쓰기 관리 패키지 - PyKoSpacing

In [31]:
!pip install git+https://github.com/haven-jeon/PyKoSpacing.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/haven-jeon/PyKoSpacing.git
  Cloning https://github.com/haven-jeon/PyKoSpacing.git to /tmp/pip-req-build-5hpqjeak
  Running command git clone --filter=blob:none --quiet https://github.com/haven-jeon/PyKoSpacing.git /tmp/pip-req-build-5hpqjeak
  Resolved https://github.com/haven-jeon/PyKoSpacing.git to commit a058e90c9de41889c63bf2ee454bf1de064d70ff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorflow==2.9.3
  Downloading tensorflow-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (511.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.8/511.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h5py==3.1.0
  Downloading h5py-3.1.0-cp39-cp39-manylinux1_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m


In [33]:
!pip install --upgrade tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow
  Downloading tensorflow-2.12.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (585.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m585.9/585.9 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras<2.13,>=2.12.0
  Downloading keras-2.12.0-py2.py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.13,>=2.12
  Downloading tensorboard-2.12.2-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3
  Downloading protobuf-4.22.3-cp37-abi3-manylinux2014_x86_64.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.

In [34]:
from pykospacing import Spacing


text = "아버지가방에들어가신다"

spacing = Spacing()
spacing_text = spacing(text)
print(spacing_text)

아버지가 방에 들어가신다


In [35]:
korean_text_spacing = spacing(korean_text_re)
print(korean_text_spacing)

1주째는 양호했는데 이번에 받은 건 스티로폼 박스 깨져서 일회성 비닐팩으로 덧붙여서 보내왔네요 샐러드팩도 뜯어져 있고.. 먹어도 되나 싶을 정도 실망스럽습니다.


# 한국어 어간 추출
- Okt 형태소 분석기

In [36]:
!pip install konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
Collecting JPype1>=0.7.0
  Downloading JPype1-1.4.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 kB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0


In [37]:
from konlpy.tag import Okt

okt = Okt()


In [38]:
# mophs: 문장을 형태소 단위로 분리 (단어만 나옴)
# pos: 형태소 내용까지 같이 나옴 (단어가 어떤 형태소인지까지 나옴)

okt.morphs(korean_text_spacing)

['1',
 '주',
 '째',
 '는',
 '양호',
 '했는데',
 '이번',
 '에',
 '받은',
 '건',
 '스티로폼',
 '박스',
 '깨져서',
 '일',
 '회',
 '성',
 '비닐',
 '팩',
 '으로',
 '덧붙여서',
 '보내왔네요',
 '샐러드',
 '팩',
 '도',
 '뜯어져',
 '있고',
 '..',
 '먹어도',
 '되나',
 '싶을',
 '정도',
 '실망',
 '스럽습니다',
 '.']

In [39]:
okt.pos(korean_text_spacing)

[('1', 'Number'),
 ('주', 'Noun'),
 ('째', 'Suffix'),
 ('는', 'Josa'),
 ('양호', 'Noun'),
 ('했는데', 'Verb'),
 ('이번', 'Noun'),
 ('에', 'Josa'),
 ('받은', 'Verb'),
 ('건', 'Noun'),
 ('스티로폼', 'Noun'),
 ('박스', 'Noun'),
 ('깨져서', 'Verb'),
 ('일', 'Modifier'),
 ('회', 'Noun'),
 ('성', 'Suffix'),
 ('비닐', 'Noun'),
 ('팩', 'Noun'),
 ('으로', 'Josa'),
 ('덧붙여서', 'Verb'),
 ('보내왔네요', 'Verb'),
 ('샐러드', 'Noun'),
 ('팩', 'Noun'),
 ('도', 'Josa'),
 ('뜯어져', 'Verb'),
 ('있고', 'Adjective'),
 ('..', 'Punctuation'),
 ('먹어도', 'Verb'),
 ('되나', 'Verb'),
 ('싶을', 'Verb'),
 ('정도', 'Noun'),
 ('실망', 'Noun'),
 ('스럽습니다', 'Adjective'),
 ('.', 'Punctuation')]

In [40]:
# 어간 추출
okt.morphs(korean_text_spacing, stem=True)

['1',
 '주',
 '째',
 '는',
 '양호',
 '하다',
 '이번',
 '에',
 '받다',
 '건',
 '스티로폼',
 '박스',
 '깨다',
 '일',
 '회',
 '성',
 '비닐',
 '팩',
 '으로',
 '덧붙이다',
 '보내오다',
 '샐러드',
 '팩',
 '도',
 '뜯다',
 '있다',
 '..',
 '먹다',
 '되다',
 '싶다',
 '정도',
 '실망',
 '스럽다',
 '.']

In [41]:
text='오늘 강남역에서 봅시닿ㅎㅎㅎㅎㅎ'
okt.pos(text)

[('오늘', 'Noun'),
 ('강남역', 'Noun'),
 ('에서', 'Josa'),
 ('봅시', 'Verb'),
 ('닿', 'Verb'),
 ('ㅎㅎㅎㅎㅎ', 'KoreanParticle')]

In [42]:
# 정규화
okt.pos(text, norm=True)

[('오늘', 'Noun'),
 ('강남역', 'Noun'),
 ('에서', 'Josa'),
 ('봅시다', 'Verb'),
 ('ㅎㅎㅎ', 'KoreanParticle')]

In [43]:
# 정규화, 어간 추출을 동시에 
okt.pos(text, stem=True, norm=True)

[('오늘', 'Noun'),
 ('강남역', 'Noun'),
 ('에서', 'Josa'),
 ('보다', 'Verb'),
 ('ㅎㅎㅎ', 'KoreanParticle')]