# Load packages and data

In [None]:
import csv
import re
import os
import pandas as pd
import numpy as np
from ckonlpy.tag import Twitter
from pykospacing import Spacing
from tqdm import tqdm
from hanspell import spell_checker

# Shared helper instances created once and reused by the later cells:
# `spacing` is applied to every document in pre_processing, and `tagpos` is the
# tagger that receives the custom nouns and is called by my_tokenizer.
spacing = Spacing()
tagpos= Twitter()

In [None]:
# Download the Korean stop-word list. Only the first column of each row is kept,
# so the file is assumed to hold one word per line.
# header=None makes pandas treat the first line as data; with the default
# header inference that line became the column name and the first stop word
# was silently dropped from the list.
stopwords_csv = pd.read_csv(
    "https://raw.githubusercontent.com/yoonkt200/FastCampusDataset/master/korean_stopwords.txt",
    header=None,
).values.tolist()
stopwords = [i[0] for i in stopwords_csv]

# Register domain-specific terms with the tagger under the 'Noun' tag.
tagpos.add_dictionary(['단원','차시','교과서','지도서','학습지','아이들','꾸미기','여러가지','아꿈선','프레파라트', '가운데','테이프','움직임','가운데','프레젠테이션', '하이클래스','실관','실험관찰'], 'Noun')


In [None]:
def my_tokenizer(doc):
    """Tokenize *doc* and keep the nouns and adverbs longer than one character.

    The text is tagged with the module-level ``tagpos`` tagger using
    normalization and stemming.

    Args:
        doc: The text to tokenize.

    Returns:
        A list of token strings, in the order the tagger produced them.
    """
    # The tagger's POS labels are capitalised ('Noun', 'Adverb', ...). The
    # previous lowercase 'adverb' never matched, so adverbs were always dropped.
    keep_pos = ('Noun', 'Adverb')
    return [
        token
        for token, pos in tagpos.pos(doc, norm=True, stem=True)
        if pos in keep_pos and len(token) > 1
    ]

In [None]:
def extract_num_reply(test_list):
    """Collect the slice ``[3:4]`` of every element of *test_list*.

    For string elements this is the single character at index 3, or an empty
    string when the element has fewer than four characters.

    Args:
        test_list: An iterable of sliceable items (e.g. strings).

    Returns:
        A list holding one slice per input element, in input order.
    """
    sliced = []
    for entry in test_list:
        sliced.append(entry[3:4])
    return sliced

In [None]:
def _spell_check_text(text):
    """Return *text* corrected by hanspell, or *text* unchanged if the check fails."""
    try:
        return spell_checker.check(text).as_dict()['checked']
    except Exception:
        # Best effort, as in the original loop: a failed check must not abort
        # the whole run, so the uncorrected text is kept for this row.
        return text


def pre_processing(filename, rules=None):
    """Load posts from an Excel file and add cleaned, tokenized text columns.

    Pipeline:
      1. Read the workbook and derive 'numreply' from the 'reply' column.
      2. Join 'title' and 'content' into 'document'.
      3. Replace every character outside Hangul, ';', '.', '!', '?' and space
         with a space, collapse whitespace runs, and drop rows whose document
         ends up empty (or just ".").
      4. Apply the module-level ``spacing`` corrector and the hanspell checker.
      5. Tokenize with ``my_tokenizer`` and remove the module-level ``stopwords``.

    Args:
        filename: Path to an Excel workbook readable with the openpyxl engine.
            It must contain 'title', 'content' and 'reply' columns.
        rules: Unused; kept so existing callers remain compatible.

    Returns:
        A DataFrame with the original columns plus 'numreply', 'document',
        'tokenized' (a list of tokens per row) and 'list_token' (the same
        tokens joined with ','). Rows containing any missing value are
        dropped; the surviving rows keep their original index labels.
    """
    indimath = pd.read_excel(filename, engine='openpyxl')

    indimath['numreply'] = extract_num_reply(indimath['reply'])

    # Combine 'title' and 'content' into a single 'document' column.
    indimath['document'] = indimath['title'].map(str) + " " + indimath['content']

    # Clean text. regex=True is required: newer pandas treats the pattern as a
    # literal string by default, which made both replacements no-ops.
    cleaned = indimath['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣;.!? ]", " ", regex=True)
    cleaned = cleaned.str.replace(r'[\s]{2,}', " ", regex=True).str.strip()

    # Mark empty documents as missing so dropna removes them. The previous
    # code assigned the None returned by replace(..., inplace=True) to the
    # column, which wiped every document.
    indimath['document'] = cleaned.mask(cleaned.isin(["", "."]))
    # copy() gives an independent frame so the column assignments below
    # do not write into a view of the unfiltered data.
    indimath = indimath.dropna(axis=0).copy()

    # Spacing correction.
    indimath['document'] = indimath['document'].apply(spacing)

    # Spell checking. Iterate over the values rather than integer labels:
    # after dropna the index has gaps, so label-based lookup skipped rows.
    indimath['document'] = [
        _spell_check_text(doc) for doc in tqdm(indimath['document'])
    ]

    # Tokenization of the cleaned 'document' column, with stop words removed.
    # A set makes each membership test O(1) instead of scanning the list.
    stopword_set = set(stopwords)
    inditext = [
        [word for word in my_tokenizer(sentence) if word not in stopword_set]
        for sentence in tqdm(indimath['document'])
    ]

    indimath['tokenized'] = inditext
    indimath['list_token'] = [','.join(map(str, tokens)) for tokens in inditext]

    return indimath



In [None]:
# Run the preprocessing pipeline on the workbook and keep the resulting
# DataFrame in `pp_inditext` for the following cells.
filename = 'indimath.xlsx'

pp_inditext = pre_processing(filename)