In [1]:
import pandas as pd
from tqdm import tqdm
from konlpy.tag import Mecab

# Load the raw news article list into the working DataFrame.
# NOTE(review): this is an absolute, machine-specific path while later cells
# read/write under /content/drive — confirm which environment is intended.
df = pd.read_csv('/Users/stillssi/Desktop/ASSJ/data/news_article_list.csv')

In [2]:
import re
import string

# Patterns and the translation table are built once at definition time instead
# of on every call. All regexes are raw strings so that sequences such as
# "\d" or "\@" are not treated as (invalid) Python string escapes.

# A run of digits followed by a dot and optional digits (e.g. "3.5", "2020.").
_DECIMAL_NUMBER_RE = re.compile(r'\d+\.\d*')
# Tokens of the form "<alnum>@<alnum/dots>" or "<alnum>.<alnum/dots>".
_EMAIL_URL_RE = re.compile(r"([a-zA-Z0-9-]+(\@|\.)[a-zA-Z0-9-.]+)")
# Any remaining ASCII letter or digit.
_ASCII_ALNUM_RE = re.compile(r"[a-zA-Z0-9]")
# Two or more consecutive dots.
_DOT_RUN_RE = re.compile(r"\.+\.")
# Text enclosed in [], (), <>, 【】 or ＜＞ (brackets included).
_BRACKETED_RE = re.compile(r"\[[^]]*\]|\([^)]*\)|\<[^>]*\>|\【[^\】]*\】|\＜[^\＞]*\＞")
# Two to five Hangul syllables followed by "기자", with or without a space.
_REPORTER_RE = re.compile(r"([가-힣]{2,5} 기자)|([가-힣]{2,5}기자)")

# Characters deleted outright. "." is kept out of the set.
# NOTE(review): "?" and "!" are removed from string.punctuation but appear
# again in the appended literal, so they ARE stripped — confirm the intent.
_SYMBOLS = string.punctuation.replace(".", "").replace("?", "").replace("!", "") + "·ㆍ■◆△▷▶▼�""''…※↑↓▲☞ⓒ⅔①②③④⑤⑥⑦⑧⑨⑩○●〈〉‘´´“”?★◇’◇㎡!©→㎞∼ℓ━"
_SYMBOL_TABLE = str.maketrans("", "", _SYMBOLS)


def preprocessing(text, writer=None, press=None):
    '''
    Clean the body text of a news article.

    Parameters
    ----------
    text : str
        Raw article text. Any non-string value (e.g. ``None`` or a NaN read
        from a CSV) yields an empty string instead of raising ``TypeError``.
    writer : str, optional
        Reporter name; every occurrence is removed from the cleaned text.
    press : str, optional
        News outlet name; every occurrence is removed from the cleaned text.

    Returns
    -------
    str
        The cleaned text with whitespace collapsed to single spaces and no
        leading or trailing whitespace.
    '''
    if not isinstance(text, str):
        return ''

    # Remove numbers written with a decimal point. The pattern also matches
    # "digits." with nothing after the dot, so a period directly following a
    # number is removed as well.
    text = _DECIMAL_NUMBER_RE.sub('', text)

    # Remove e-mail addresses / URLs, then every remaining ASCII letter or digit.
    text = _EMAIL_URL_RE.sub('', text)
    text = _ASCII_ALNUM_RE.sub('', text)

    # Replace runs of dots (e.g. "...") with a single dot.
    text = _DOT_RUN_RE.sub('.', text)

    # Remove bracketed / parenthesized segments.
    text = _BRACKETED_RE.sub('', text).strip()

    # Remove reporter bylines such as "<name> 기자".
    text = _REPORTER_RE.sub('', text)

    # Remove the remaining punctuation and special symbols.
    text = text.translate(_SYMBOL_TABLE)

    # Collapse whitespace so the multi-word phrases below can match.
    text = ' '.join(text.split())

    # Stopwords: caller-supplied reporter / outlet names.
    if isinstance(writer, str) and writer:
        text = text.replace(writer, '')
    if isinstance(press, str) and press:
        text = text.replace(press, '')

    # Copyright boilerplate. No separate removal of 'Copyrights' is needed:
    # all ASCII letters have already been stripped above.
    text = text.replace('무단 전재 및 재배포 금지', '')

    # The removals above can leave doubled or trailing spaces; collapse again.
    return ' '.join(text.split())

In [3]:
# Shared tagger, created on first use so the analyzer is not re-initialised
# for every call.
_MECAB_TAGGER = None


def mecab_morphs(data, tagger=None):
    """Return the unique NNG/NNP-tagged tokens of ``data`` as one string.

    The text is first split into morphemes, the morphemes are re-joined with
    single spaces, and that string is POS-tagged; only tokens whose tag is
    'NNG' or 'NNP' (presumably general and proper nouns — confirm against the
    tagger's tag set) are kept.

    Parameters
    ----------
    data : str
        Text to analyse.
    tagger : object, optional
        Any object exposing ``morphs(str)`` and ``pos(str)`` like
        ``konlpy.tag.Mecab``. When omitted, one shared ``Mecab()`` instance
        is created lazily and reused across calls.

    Returns
    -------
    str
        Space-separated kept tokens, de-duplicated in first-occurrence order,
        so the result is identical from run to run (a plain ``set`` gives an
        arbitrary order that varies with string-hash randomisation).
    """
    global _MECAB_TAGGER
    if tagger is None:
        if _MECAB_TAGGER is None:
            _MECAB_TAGGER = Mecab()
        tagger = _MECAB_TAGGER

    pos_list = {'NNG', 'NNP'}

    # NOTE(review): tagging the re-joined morphemes rather than the original
    # text is kept as-is; it may tag differently from tagger.pos(data).
    morpheme_text = ' '.join(tagger.morphs(data))
    kept_tokens = [word for word, tag in tagger.pos(morpheme_text) if tag in pos_list]

    # dict.fromkeys removes duplicates while preserving insertion order.
    return ' '.join(dict.fromkeys(kept_tokens))

In [4]:
# Add a 'prep' column initialised to the empty string for every row.
df['prep'] = ''

In [6]:
# Drop the 'Unnamed: 0' / 'Unnamed: 0.1' columns (presumably index columns left
# over from earlier CSV round-trips). errors='ignore' keeps this cell safe to
# re-run and tolerant of inputs where only some of these columns exist.
df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore', inplace=True)

In [None]:
# Clean every article and keep only its noun tokens.
# Results are collected in a list and assigned as a whole column: writing via
# df['prep'].iloc[i] = ... is chained assignment, which raises
# SettingWithCopyWarning and does not update df under pandas Copy-on-Write.
prep_values = []
for raw_text in tqdm(df['content'], total=len(df)):
    cleaned = preprocessing(raw_text)
    # Drop whatever follows the last '.'; a text without any '.' becomes ''.
    body = ' '.join(cleaned.split('.')[:-1])
    prep_values.append(mecab_morphs(body))
df['prep'] = prep_values


100%|██████████| 222726/222726 [51:56<00:00, 71.48it/s]


In [None]:
# The raw article text is no longer needed; remove the 'content' column in place.
df.drop(columns=['content'], inplace=True)

In [None]:
# Load the stopword dictionary, one entry per line.
# The context manager closes the file even if reading fails, and the explicit
# encoding keeps decoding independent of the platform's default locale.
# NOTE(review): assumes the file is UTF-8 — confirm.
with open('/content/drive/MyDrive/ko-dic.txt', 'r', encoding='utf-8') as f:
    # Only newline characters are removed; other whitespace and blank entries
    # are kept exactly as they appear in the file.
    stopwords_list = [line.replace('\n', '') for line in f]

In [7]:
# Add an 'ngram' column initialised to the empty string for every row.
df['ngram'] = ''

In [None]:
from konlpy.tag import Mecab

mecab = Mecab()

# Set membership is O(1); testing against the list scans it for every token.
stopword_set = set(stopwords_list)

# Morpheme splits that are merged back into a single word, applied in order.
merge_fixes = (
    ('불 확실', '불확실'),
    ('불 균형', '불균형'),
    ('불 호황', '불호황'),
    ('입 주량', '입주량'),
    ('종 부세', '종부세'),
)

ngram_values = []
# Every removed stopword is appended to the log file, one per line. The context
# manager guarantees the file is flushed and closed when the loop ends or fails.
# NOTE(review): mode 'a+' appends, so re-running this cell adds to the same file.
with open('/content/drive/MyDrive/stopword.txt', 'a+', encoding='utf-8') as w:
    for prep_text in tqdm(df['prep'], total=len(df)):
        kept_tokens = []
        for token in mecab.morphs(prep_text):
            if token in stopword_set:
                w.write(token + '\n')
            else:
                kept_tokens.append(token)

        token_result = ' '.join(kept_tokens)
        for split_form, merged_form in merge_fixes:
            token_result = token_result.replace(split_form, merged_form)
        ngram_values.append(token_result)

# Assign the whole column at once: df['ngram'].iloc[i] = ... is chained
# assignment and does not update df under pandas Copy-on-Write.
df['ngram'] = ngram_values

100%|██████████| 222726/222726 [23:30<00:00, 157.93it/s]


In [None]:
# Discard the intermediate 'prep' column, then give the stopword-filtered
# 'ngram' column the name 'prep'.
df.drop(columns=['prep'], inplace=True)
df.rename(columns={'ngram': 'prep'}, inplace=True)

In [None]:
# Write the preprocessed DataFrame to CSV.
# NOTE(review): no index argument is passed, so pandas also writes the row
# index as an extra unnamed first column — confirm downstream readers expect it.
df.to_csv('/content/drive/MyDrive/news_preprocessed.csv')