# 문장 전처리 함수

KoNLPy 설치 방법 (Windows)
1) Java 23 (Windows 64-bit) 설치: https://www.oracle.com/java/technologies/downloads/?er=221886
2) 환경 변수 설정: https://www.codeit.kr/tutorials/43/KoNLPy-%EC%84%A4%EC%B9%98%ED%95%98%EA%B8%B0-Windows
3) pip install jpype1
4) pip install konlpy

In [1]:
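# Reference list of POS tag names (usable as values for the `pos_tags` filter).
# pos_tags = [
#     "Noun",       # noun
#     "Verb",       # verb
#     "Adjective",  # adjective
#     "Adverb",     # adverb
#     "Determiner", # determiner
#     "Exclamation",# exclamation / interjection
#     "Josa",       # postposition (josa)
#     "PreEomi",    # pre-final ending
#     "Eomi",       # ending
#     "Conjunction",# conjunction
#     "Noun",       # noun (duplicate of the first entry)
#     "Suffix",     # suffix
#     "VerbPrefix", # verb prefix
#     "Alpha",      # alphabetic characters
#     "Number",     # number
#     "Foreign",    # foreign-language text
#     "Punctuation",# punctuation
#     "Hashtag",    # hashtag
#     "KoreanParticle", # standalone Korean jamo (e.g. ㅋㅋ), not a grammatical particle
#     "ScreenName", # screen name (@mention)
#     "Email",      # e-mail address
#     "URL"         # URL
# ]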

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

def keras_tokenizer(sentences, num_words, padding, truncating, oov_token="<OOV>", max_len=None):
    """
    Fit a Keras Tokenizer on the sentences and return them in several encodings.

    Parameters:
    sentences (list of str): Sentences used both to fit the tokenizer and to encode.
    num_words (int): Passed to the Tokenizer as its vocabulary limit and to
        to_categorical as the number of one-hot classes.
    padding (str): Padding side handed to pad_sequences ('pre' or 'post').
    truncating (str): Truncation side handed to pad_sequences ('pre' or 'post').
    oov_token (str): Token the Tokenizer uses for out-of-vocabulary words.
    max_len (int, optional): Target sequence length. When None, the length of
        the longest encoded sentence is used.

    Returns:
    tokenizer (Tokenizer): The tokenizer fitted on the input sentences.
    sequences (list of list of int): Integer-encoded sentences.
    padded (ndarray): The sequences after padding/truncating.
    one_hot_padded (ndarray): One-hot encoding of the padded sequences.
    word_index (dict): The tokenizer's word -> index mapping.
    index_word (dict): The tokenizer's index -> word mapping.
    """
    # Build the vocabulary from the very sentences that are encoded below.
    fitted = Tokenizer(num_words=num_words, oov_token=oov_token)
    fitted.fit_on_texts(sentences)

    # Sentences as lists of integer word ids.
    encoded = fitted.texts_to_sequences(sentences)

    # Fall back to the longest encoded sentence when no length was requested.
    target_len = max_len if max_len is not None else max(map(len, encoded))

    # Bring every sequence to the same length.
    padded_matrix = pad_sequences(
        encoded,
        maxlen=target_len,
        padding=padding,
        truncating=truncating,
    )

    # One class per possible word id.
    one_hot = to_categorical(padded_matrix, num_classes=num_words)

    return (
        fitted,
        encoded,
        padded_matrix,
        one_hot,
        fitted.word_index,
        fitted.index_word,
    )

2024-11-14 08:46:20.430541: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-14 08:46:20.468949: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
import numpy as np
import re
from konlpy.tag import Okt

def text_preprocessing(corpus, lang, stop_words=None, tokenizer=None, pos_tags=None):
    """
    Clean a list of texts and optionally drop stop words / filter by POS tag.

    English texts are stripped of everything except ASCII letters and spaces,
    lower-cased, and (if stop words are given) re-joined without those words.
    Korean texts are stripped of everything except Hangul syllables and spaces,
    split into morphemes by the tokenizer, optionally filtered, and re-joined
    with single spaces.

    Any whitespace character (tab, newline, ...) is converted to a plain space
    before cleaning so that words on either side of it are not glued together.

    Parameters:
    - corpus (list of str): Texts to preprocess.
    - lang (str): Language of the texts. Must be either 'eng' or 'kor'.
    - stop_words (list of str, optional): Words/morphemes to remove. Default is None.
    - tokenizer (Okt, optional): Object providing `pos(text)` and `morphs(text)`.
        - Not used when 'lang' is 'eng'.
        - If 'lang' is 'kor' and no tokenizer is provided, an Okt object is created.
    - pos_tags (list of str, optional): POS tags to keep for Korean texts.
        - If None, morphemes are not filtered by tag. Default is None.

    Returns:
    - list of str: One preprocessed text per input text.

    Raises:
    - ValueError: If 'lang' is neither 'eng' nor 'kor'.
    """
    if lang not in ('eng', 'kor'):
        raise ValueError("The 'lang' parameter must be either 'eng' or 'kor'.")

    # Built once so the per-token membership tests below are O(1).
    stop_set = frozenset(stop_words) if stop_words else frozenset()

    # Turn tabs/newlines/etc. into spaces first; the character filters below
    # would otherwise delete them and fuse the neighbouring words.
    spaced_corpus = [re.sub(r'\s', ' ', text) for text in corpus]

    if lang == 'eng':
        cleaned_corpus = [re.sub(r'[^a-zA-Z ]', '', text).lower() for text in spaced_corpus]
        if not stop_set:
            return cleaned_corpus
        return [
            ' '.join(word for word in text.split() if word not in stop_set)
            for text in cleaned_corpus
        ]

    # lang == 'kor'
    if tokenizer is None:
        tokenizer = Okt()

    # Only Hangul syllables and spaces survive, so no case folding is needed.
    cleaned_corpus = [re.sub(r'[^가-힣 ]', '', text) for text in spaced_corpus]

    if not stop_set and not pos_tags:
        # No filtering requested: just split into morphemes.
        return [' '.join(tokenizer.morphs(text)) for text in cleaned_corpus]

    # None means "keep every tag"; an explicit (even empty) list is honoured as given.
    allowed_tags = None if pos_tags is None else frozenset(pos_tags)

    results = []
    for text in cleaned_corpus:
        kept_morphs = [
            word for word, pos in tokenizer.pos(text)
            if (allowed_tags is None or pos in allowed_tags) and word not in stop_set
        ]
        results.append(' '.join(kept_morphs))
    return results

# Small Korean demo corpus that is run through the preprocessing pipeline.
corpus = ['왕은 매우 완강기 호랑이 강한 남자이다',
          '여왕은 현명한 예쁜 여자이다',
          '소년은 젊은 남자이다',
          '소녀는 젊은 예쁜 수학을 공부하면서 즐기는 여자이다',
          '왕자는 젊고 현명한 왕이 될 것이다',
          '공주는 젊고 예쁜 현명한 여왕이 될 것이다',
          '남자는 강하다',
          '여자는 예쁘다',
          '왕자는 왕이 될 소년이다',
          '공주는 왕비가 될 소녀이다']

# Preprocessing settings: language, morphemes to drop, and POS tags to keep.
lang = 'kor'
stop_words = ['은', '가', '이다', '는', '이', '될']
pos_tags = ["Noun", "Verb", "Adjective", "Adverb"]
# Pass the `lang` variable rather than repeating the literal so the setting
# above is the single place that controls the language.
cleaned_corpus = text_preprocessing(corpus=corpus, lang=lang, stop_words=stop_words, pos_tags=pos_tags)

# Tokenizer / padding settings for the Keras encoding step.
num_words = 100
padding = 'post'
truncating = "post"
oov_token = "<OOV>"

tokenizer, sequences, padded, one_hot_padded, word_index, index_word = keras_tokenizer(
    cleaned_corpus, num_words, padding, truncating, oov_token=oov_token
)

In [4]:
cleaned_corpus

['왕 매우 완강 기 호랑이 강한 남자',
 '여왕 현명한 예쁜 여자',
 '소년 젊은 남자',
 '소녀 젊은 예쁜 수학 공부 하면서 즐기는 여자',
 '왕자 젊고 현명한 왕 것',
 '공주 젊고 예쁜 현명한 여왕 것',
 '남자 강하다',
 '여자 예쁘다',
 '왕자 왕 소년',
 '공주 왕비 소녀']