In [1]:
import sys, os
from gensim.corpora import WikiCorpus
from tqdm import tqdm
import pickle
import pandas as pd
import numpy as np
import re

### Wikipedia 데이터는 pre-trained 모델 사용(메모리, 스토리지 문제)

In [2]:
# output = open('../data/wiki_eng.data', 'w')
# wiki = WikiCorpus('../data/enwiki-latest-pages-articles.xml.bz2')
# i = 0
# for text in tqdm(wiki.get_texts()) :
#     output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
#     i = i + 1
# output.close()
# print('Processing complete!')

### SMS_Spam_Collection

In [3]:
def clean_sms(s):
    # only include alphanumerics
    s = re.sub(r"[^A-Za-z0-9(),.!?\'\`&]", " ", s)
    
    # insert spaces in special characters
    s = re.sub(r",", " , ", s)
    s = re.sub(r"!", " ! ", s)
    s = re.sub(r"\(", " \( ", s)
    s = re.sub(r"\)", " \) ", s)
    s = re.sub(r"\?", " \? ", s)
    s = re.sub(r"\&", " \& ", s)
    
    # only include alphanumerics again
    s = re.sub(r"[^A-Za-z0-9(),.!?\'\`&]", " ", s)
    
    # chagne all numbers to <num> tag
    s = re.sub(r"[0-9]{1,}", "<num> ", s)
    
    # reduce multiple spaces to single spaces
    s = re.sub(r"\s{2,}", " ", s)
    
    # reduce multiple dots to single dot
    s = re.sub(r"\.{2,}", ".", s)
    s = re.sub(r"\.", " . ", s)
    return s.strip().lower()

In [4]:
sms_data = pd.read_csv('../data/raw/sms-spam.csv', encoding = 'ISO-8859-1')

In [5]:
sms_text = sms_data.v2.values

In [6]:
sms_text_cleaned = np.vectorize(clean_sms)(sms_text)

In [7]:
# add <s> </s> tag for all sentence
sms_text_cleaned = np.vectorize(lambda x : '<s> ' + x + ' </s>')(sms_text_cleaned)

In [8]:
with open('../data/eng/sms.pickle', 'wb') as f :
    pickle.dump(sms_text_cleaned, f)

In [9]:
del sms_data, sms_text, sms_text_cleaned, clean_sms

### Twitter data

In [10]:
twitter_data = pd.read_csv('../data/raw/sentiment140.csv', encoding = 'ISO-8859-1', header = None)

In [11]:
twitter_text = twitter_data[5].values

In [12]:
def clean_twits(s):
    # del url and tag(@)
    s = re.sub(r'http\S+', '', s)
    s = re.sub('\@\S+', '', s)
    
    # only include alphanumerics
    s = re.sub(r"[^A-Za-z0-9(),.!?;:\'\`&]", " ", s)   # added ; and : because of emoji
    
    # insert spaces in special characters
    s = re.sub(r",", " , ", s)
    s = re.sub(r"!", " ! ", s)
    s = re.sub(r"\?", " \? ", s)
    s = re.sub(r"\&", " \& ", s)
    
    # only include alphanumerics again
    s = re.sub(r"[^A-Za-z0-9(),.!?;:\'\`&]", " ", s)
    
    # chagne all numbers to <num> tag
    s = re.sub(r"[0-9]{1,}", "<num> ", s)
    
    # reduce multiple spaces to single spaces
    s = re.sub(r"\s{2,}", " ", s)
    
    # reduce multiple dots to single dot
    s = re.sub(r"\.{2,}", ".", s)
    s = re.sub(r"\.", " . ", s)
    return s.strip().lower()

In [13]:
twitter_text_cleaned = np.vectorize(clean_twits)(twitter_text)

In [14]:
# add <s> </s> tag for all sentence
twitter_text_cleaned = np.vectorize(lambda x : '<s> ' + x + ' </s>')(twitter_text_cleaned)

In [15]:
with open('../data/eng/twitter.pickle', 'wb') as f :
    pickle.dump(twitter_text_cleaned, f)

In [16]:
del twitter_data, twitter_text, twitter_text_cleaned, clean_twits

### 한국어 Wikidata

In [17]:
# wiki = WikiCorpus('../data/raw/kowiki-20181101-pages-articles.xml.bz2')

In [18]:
# output = open('../data/kor/wiki_kor.data', 'w')
# i = 0
# for text in tqdm(wiki.get_texts()) :
#     output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
#     i = i + 1
# output.close()
# print('Processing complete!')

In [19]:
# del wiki

### Naver Kin

In [20]:
kin_data = pd.read_csv('../data/raw/raw_kin_data.csv')

In [21]:
kin_text = kin_data.text.values.astype(np.str).tolist()

In [22]:
def clean_kin(s) :
    # del url
    s = re.sub(r'http\S+', '', s)
    
    # only include alphanumerics
    s = re.sub(r"[^A-Za-z0-9가-힣(),.!?\'\`&]", " ", s)
    
    # insert spaces in special characters
    s = re.sub(r",", " , ", s)
    s = re.sub(r"!", " ! ", s)
    s = re.sub(r"\?", " \? ", s)
    s = re.sub(r"\&", " \& ", s)
        
    # replace \xa0 tag
    s = re.sub(r"\xa0", " ", s)
    
    # only include alphanumerics again
    s = re.sub(r"[^A-Za-z0-9가-힣(),.!?\'\`&]", " ", s)
    
    # chagne all numbers to <num> tag
    s = re.sub(r"[0-9]{1,}", "<num> ", s)
    
    # reduce multiple spaces to single spaces
    s = re.sub(r"\s{2,}", " ", s)
    
    # reduce multiple dots to single dot
    s = re.sub(r"\.{2,}", ".", s)
    s = re.sub(r"\.", " . ", s)
    
    return s.strip().lower()

In [23]:
def sentence_slicer(text) :
    result = list()
    splited_text = [a + '다.' for a in text.split('다.')]
    if len(splited_text) > 1 :
        del splited_text[-1]
    else :
        splited_text[0] = splited_text[0][:-2]
    for sentence in splited_text : 
        if '요.' in sentence :
            splited_sentence = [a + '요.' for a in sentence.split('요.')]
            splited_sentence[-1] = splited_sentence[-1][:-2]
            result.extend(splited_sentence)
        else :
            result.extend([sentence])
    return result

In [24]:
kin_text_cleaned = list()
for sentence in kin_text :
    kin_text_cleaned.extend(sentence_slicer(clean_kin(sentence)))

In [25]:
# add <s> </s> tag for all sentence
kin_text_cleaned = np.vectorize(lambda x : '<s> ' + x + ' </s>')(kin_text_cleaned)

In [26]:
with open('../data/kor/kin.pickle', 'wb') as f :
    pickle.dump(kin_text_cleaned, f)

### Naver movie reply

In [28]:
def clean_movie(s) :
    # del url
    s = re.sub(r'http\S+', '', s)
    
    # only include alphanumerics
    s = re.sub(r"[^A-Za-z0-9가-힣(),.!?\'\`&]", " ", s)
    
    # insert spaces in special characters
    s = re.sub(r",", " , ", s)
    s = re.sub(r"!", " ! ", s)
    s = re.sub(r"\?", " \? ", s)
    s = re.sub(r"\&", " \& ", s)

    # only include alphanumerics again
    s = re.sub(r"[^A-Za-z0-9가-힣(),.!?\'\`&]", " ", s)
    
    # chagne all numbers to <num> tag
    s = re.sub(r"[0-9]{1,}", "<num> ", s)
    
    # reduce multiple spaces to single spaces
    s = re.sub(r"\s{2,}", " ", s)
    
    # reduce multiple dots to single dot
    s = re.sub(r"\.{2,}", ".", s)
    s = re.sub(r"\.", " . ", s)
    
    return s.strip().lower()

In [29]:
reply_data = pd.read_csv('../data/raw/naver_movie.txt', delimiter='\t')

In [30]:
reply_text = reply_data.document.astype(np.str).values

In [31]:
reply_text_cleaned = np.vectorize(clean_movie)(reply_text)

In [32]:
reply_text_cleaned = np.vectorize(lambda x : '<s> ' + x + ' </s>')(reply_text_cleaned)

In [33]:
reply_text_cleaned

array(['<s> 어릴때보고 지금다시봐도 재밌어요 </s>',
       '<s> 디자인을 배우는 학생으로 , 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산업이 부러웠는데 .  사실 우리나라에서도 그 어려운시절에 끝까지 열정을 지킨 노라노 같은 전통이있어 저와 같은 사람들이 꿈을 꾸고 이뤄나갈 수 있다는 것에 감사합니다 . </s>',
       '<s> 폴리스스토리 시리즈는 <num> 부터 뉴까지 버릴께 하나도 없음 .  최고 . </s>', ...,
       '<s> 완전 사이코영화 .  마지막은 더욱더 이 영화의질을 떨어트린다 . </s>',
       '<s> 왜난 재미없었지 라따뚜이 보고나서 스머프 봐서 그런가 </s>', '<s> 포풍저그가나가신다영차영차영차 </s>'], 
      dtype='<U218')

In [34]:
with open('../data/kor/movie.pickle', 'wb') as f :
    pickle.dump(reply_text_cleaned, f)

In [35]:
del reply_data, reply_text, reply_text_cleaned, clean_movie