In [2]:
import re
import nltk

# English Preprocessing
[NLTK로 데이터 전처리(Preprocessing) 하기](https://m.blog.naver.com/PostView.naver?isHttpsRedirect=true&blogId=qbxlvnf11&logNo=221434157182)  
[딥 러닝을 이용한 자연어 처리 입문](https://wikidocs.net/21694)  

# 표제어 추출(Lemmatization)
단어의 기본 사전형(표제어)을 찾는 작업. 예) am, are, is => be  
단어의 형태가 적절히 보존됨. 하지만 의미를 알 수 없는 단어가 나오기도 함  
표제어 추출기(lemmatizer)가 본래 단어의 품사 정보를 알아야 정확한 결과를 얻을 수 있음

In [8]:
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance; the following cells reuse `n`.
n=WordNetLemmatizer()
words=['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']
# No part-of-speech tag is passed here, and some results come out wrong:
# dies => dy, has => ha, watched => watched
print([n.lemmatize(w) for w in words])

['policy', 'doing', 'organization', 'have', 'going', 'love', 'life', 'fly', 'dy', 'watched', 'ha', 'starting']


In [7]:
# Passing 'v' (verb) as the second argument yields the expected lemma for 'dies'.
n.lemmatize('dies', 'v')

'die'

In [9]:
# Same check for 'watched', which the untagged call above left unchanged.
n.lemmatize('watched', 'v')

'watch'

# 어간 추출(Stemming)
형태학적 분석을 단순화한 버전. 정해진 규칙만 보고 단어의 어미를 자르는 작업

In [12]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# Stemmer instance; the next cell reuses `s` and `words`.
s = PorterStemmer()
text="This was not the map we found in Billy Bones's chest, but an accurate copy, complete in all things--names and heights and soundings--with the single exception of the red crosses and the written notes."
# Split the sentence into word and punctuation tokens before stemming.
words=word_tokenize(text)
print(words)

['This', 'was', 'not', 'the', 'map', 'we', 'found', 'in', 'Billy', 'Bones', "'s", 'chest', ',', 'but', 'an', 'accurate', 'copy', ',', 'complete', 'in', 'all', 'things', '--', 'names', 'and', 'heights', 'and', 'soundings', '--', 'with', 'the', 'single', 'exception', 'of', 'the', 'red', 'crosses', 'and', 'the', 'written', 'notes', '.']


In [14]:
# Stem every token from the previous cell. The printed stems are lowercased
# and are not necessarily dictionary words (e.g. 'thi', 'wa', 'copi').
print([s.stem(w) for w in words])

['thi', 'wa', 'not', 'the', 'map', 'we', 'found', 'in', 'billi', 'bone', "'s", 'chest', ',', 'but', 'an', 'accur', 'copi', ',', 'complet', 'in', 'all', 'thing', '--', 'name', 'and', 'height', 'and', 'sound', '--', 'with', 'the', 'singl', 'except', 'of', 'the', 'red', 'cross', 'and', 'the', 'written', 'note', '.']


In [17]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Sample English paragraph; presumably the input for the preprocessing example
# defined below (no call is visible in this notebook -- confirm).
text = "The meaning is well known, even if it is not in complete accord with the reality. The restored stream evokes the environment but is not environmental, evokes history but is not historical, and evokes tradition without being traditional. The reality is conflicted. The restoration was huge in scale and high in cost. The cost overruns alone amounted to $34 million out of a total of about $351 million. Annual maintenance costs have been increasing while the overall number of visitors has declined."

# Example
1. tokenize
2. remove stopwords
3. remove words shorter than three letters
4. lowercase
5. lemmatization
6. stemming


In [23]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Data Preprocessing
def preprocessing(text, verbose=True):
    """Run the six-step English preprocessing pipeline on ``text``.

    Steps, in order: tokenize into words, remove stop words, drop tokens
    shorter than three characters, lowercase, lemmatize (first with the
    default call, then again with the 'v' tag) and Porter-stem.

    Args:
        text: Raw English text.
        verbose: When true (the default) the token list is printed after
            every step, each preceded by a title line and followed by a
            blank line.

    Returns:
        The surviving tokens joined by single spaces.
    """
    def _report(title, current):
        # Show the intermediate token list for one pipeline step.
        if verbose:
            print(title)
            print(current)
            print()

    # 1. tokenize into words (sentence split first, then word split)
    tokens = [word
              for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    _report("- tokenize into words -", tokens)

    # 2. remove stopwords
    # This step runs before lowercasing and the NLTK English list holds
    # lowercase entries, so compare case-insensitively; otherwise a
    # capitalised stop word such as "The" would slip through. A set keeps
    # each membership test constant-time.
    stop = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop]
    _report("- remove stopwords -", tokens)

    # 3. remove words less than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    _report("- remove words less than three letters -", tokens)

    # 4. lower capitalization
    tokens = [word.lower() for word in tokens]
    _report("- lower capitalization -", tokens)

    # 5. lemmatization: default call first, then a second pass with 'v'
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    _report("- lemmatization -", tokens)

    tokens = [lemmatizer.lemmatize(word, 'v') for word in tokens]
    _report("- lemmatization/verb -", tokens)

    # 6. stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    _report("- stemming -", tokens)

    return ' '.join(tokens)

# Example : expanding contractions (영어 줄임말 풀어 쓰기)

In [26]:
def abbreviation_handler(text):
    """Expand common English contraction suffixes in ``text``.

    Each apostrophe suffix listed below is replaced by plain substring
    substitution, in order, and every apostrophe that remains afterwards is
    turned into a space.

    Args:
        text: The string to process.

    Returns:
        A new string with the substitutions applied.

    Note:
        Matching is purely textual. ``'s`` always becomes `` is`` (so a
        possessive such as ``Billy's`` turns into ``Billy is``), and ``'t``
        becomes `` not`` without touching the preceding ``n`` (``don't``
        becomes ``don not``).
    """
    # (suffix, expansion) pairs, applied in this order.
    replacements = (
        ("'t", " not"),
        ("'s", " is"),
        ("'ll", " will"),
        ("'ve", " have"),
        ("'re", " are"),
        ("'m", " am"),
    )

    ln = text
    for suffix, expansion in replacements:
        ln = ln.replace(suffix, expansion)

    # Replace any apostrophe that was not part of a handled suffix.
    ln = ln.replace("'", " ")

    # Hand the result back; without this the function returns None and
    # callers that assign its result lose their text.
    return ln

# Example : wiki corpus preprocessing

In [27]:
# Pre-compiled patterns used by tokenize() below.
# Raw string literals are used so that regex escapes such as \. \( \) and \s
# are not parsed as (invalid) Python string escapes.

# Spans replaced by a space: runs of apostrophes, "== heading ==" style
# markers, __TOC__, Japanese "ファイル:" (file) links, language-code links
# such as ":en:", and newlines.
WIKI_REMOVE_CHARS = re.compile(r"'+|(=+.{2,30}=+)|__TOC__|(ファイル:).+|:(en|de|it|fr|es|kr|zh|no|fi):|\n", re.UNICODE)
# Runs of whitespace-like characters that collapse to a single space.
WIKI_SPACE_CHARS = re.compile(r"(\s|゙|゚|　)+", re.UNICODE)
# NOTE(review): anchored with ^ and $ and compiled without re.MULTILINE, so
# this only matches when the whole input is one e-mail address; addresses
# embedded in longer text are not matched. Confirm whether that is intended.
EMAIL_PATTERN = re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)", re.UNICODE)
# "://" with an optional ftp/http/https scheme, followed by URL characters.
URL_PATTERN = re.compile(r"(ftp|http|https)?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", re.UNICODE)
# Per-token cleanup: trailing "*" or ":", a leading ";", or a whole token
# starting with the Korean "파일:" (file) prefix.
WIKI_REMOVE_TOKEN_CHARS = re.compile(r"(\*$|:$|^파일:.+|^;)", re.UNICODE)
# One or more plain spaces.
MULTIPLE_SPACES = re.compile(r' +', re.UNICODE)


def tokenize(content, token_min_len=2, token_max_len=100, lower=True):
    """Clean wiki-style text and split it into space-separated tokens.

    The module-level patterns are applied in a fixed order (e-mail, URL,
    wiki markup, whitespace-like characters, repeated spaces), each match
    being replaced by a single space. The literal ", )" is then removed and
    the text is split on spaces. Tokens starting with an underscore are
    dropped; the remaining tokens have WIKI_REMOVE_TOKEN_CHARS matches
    stripped and are kept only if something is left.

    Args:
        content: Text to tokenize (a ``str``).
        token_min_len: Accepted but not used by this implementation; no
            length filtering is performed.
        token_max_len: Accepted but not used by this implementation.
        lower: Accepted but not used by this implementation; tokens keep
            their original case.

    Returns:
        A list of non-empty token strings.
    """
    cleanup_order = (
        EMAIL_PATTERN,      # remove email pattern
        URL_PATTERN,        # remove url pattern
        WIKI_REMOVE_CHARS,  # remove unnecessary chars
        WIKI_SPACE_CHARS,
        MULTIPLE_SPACES,
    )
    for pattern in cleanup_order:
        content = pattern.sub(' ', content)

    result = []
    for token in content.replace(", )", "").split(" "):
        # Tokens with a leading underscore are discarded entirely.
        if token.startswith('_'):
            continue
        # Substitution on a str already yields a str, so no extra unicode
        # conversion step is needed here.
        token_candidate = WIKI_REMOVE_TOKEN_CHARS.sub('', token)
        if token_candidate:
            result.append(token_candidate)
    return result

# Example : corpus preprocessing with emoji removal

In [28]:
def cleansing(text):
    """Strip markup, links, addresses and symbols from ``text``.

    Steps, in order: remove bracketed text, links and tags; expand
    contractions via ``abbreviation_handler``; replace e-mail addresses,
    URLs, standalone Hangul consonants/vowels, HTML tags, line breaks and
    special characters with spaces; remove emoji; collapse whitespace runs
    into a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. It is not stripped, so a single leading or
        trailing space may remain.
    """
    # Remove bracketed text such as "[note]".
    text = re.sub(r'\[.*?\]', '', text)

    # Remove links.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove <...> spans (tags); this pattern does not touch punctuation.
    text = re.sub(r'<.*?>+', '', text)

    # Expand contractions. Guard against a handler that returns None (for
    # example one that lacks a return statement) so that the re.sub calls
    # below always receive a string.
    expanded = abbreviation_handler(text)
    if expanded is not None:
        text = expanded

    # Patterns whose matches are replaced by a single space, in this order.
    space_patterns = (
        # E-mail addresses anywhere in the text. The character classes use
        # plain brackets; with escaped brackets the pattern would only match
        # literal "[" and "]" characters and never a real address.
        r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)',
        # URLs, including %XX percent-escapes.
        r'(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+',
        # Input consisting solely of one e-mail address (anchored form).
        r'(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)',
        # URLs with an optional scheme.
        r'(ftp|http|https)?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        # Standalone Hangul consonants and vowels.
        r'([ㄱ-ㅎㅏ-ㅣ])+',
        # HTML tags.
        r'<[^>]*>',
        # Carriage returns and newlines. A "|" inside a character class is a
        # literal pipe, not alternation, so it is not listed here; pipes are
        # still replaced by the special-character pattern that follows.
        r'[\r\n]',
        # Special characters: everything except word characters, whitespace,
        # "+" and ".".
        r'[^\w\s+.]',
    )
    for pattern in space_patterns:
        text = re.sub(pattern, ' ', text)

    # Remove emoji. The special-character pattern above has normally already
    # replaced these code points, so this acts as a final safety net.
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    # Collapse runs of whitespace into a single space.
    text = re.sub(r'\s+', ' ', text)

    return text