In [4]:
import sys, os
from gensim.corpora import WikiCorpus
from tqdm import tqdm
import pickle
import pandas as pd
import numpy as np
import re

### Wikipedia 데이터는 pre-trained 모델 사용(메모리, 스토리지 문제)

In [97]:
# output = open('../data/wiki_eng.data', 'w')
# wiki = WikiCorpus('../data/enwiki-latest-pages-articles.xml.bz2')
# i = 0
# for text in tqdm(wiki.get_texts()) :
#     output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
#     i = i + 1
# output.close()
# print('Processing complete!')

### SMS_Spam_Collection

In [98]:
def clean_sms(s):
    # only include alphanumerics
    s = re.sub(r"[^A-Za-z0-9(),.!?\'\`&]", " ", s)
    
    # insert spaces in special characters
    s = re.sub(r",", " , ", s)
    s = re.sub(r"!", " ! ", s)
    s = re.sub(r"\(", " \( ", s)
    s = re.sub(r"\)", " \) ", s)
    s = re.sub(r"\?", " \? ", s)
    s = re.sub(r"\&", " \& ", s)
    
    # only include alphanumerics again
    s = re.sub(r"[^A-Za-z0-9(),.!?\'\`&]", " ", s)
    
    # reduce multiple spaces to single spaces
    s = re.sub(r"\s{2,}", " ", s)
    
    # reduce multiple dots to single dot
    s = re.sub(r"\.{2,}", ".", s)
    s = re.sub(r"\.", " . ", s)
    return s.strip().lower()

In [99]:
sms_data = pd.read_csv('../data/raw/sms-spam.csv', encoding = 'ISO-8859-1')

In [100]:
sms_text = sms_data.v2.values

In [101]:
sms_text_cleaned = np.vectorize(clean_sms)(sms_text)

In [102]:
sms_text_cleaned

array([ 'go until jurong point , crazy .  available only in bugis n great world la e buffet .  cine there got amore wat .',
       'ok lar .  joking wif u oni .',
       "free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 .  text fa to 87121 to receive entry question ( std txt rate ) t & c's apply 08452810075over18's",
       ..., 'pity , was in mood for that .  so . any other suggestions ?',
       "the guy did some bitching but i acted like i'd be interested in buying something else next week and he gave it to us for free",
       'rofl .  its true to its name'], 
      dtype='<U945')

In [48]:
with open('../data/eng/sms.pickle', 'wb') as f :
    pickle.dump(sms_text_cleaned, f)

In [103]:
del sms_data, sms_text, sms_text_cleaned, clean_sms

### Twitter data

In [104]:
twitter_data = pd.read_csv('../data/raw/sentiment140.csv', encoding = 'ISO-8859-1', header = None)

In [105]:
twitter_text = twitter_data[5].values

In [106]:
def clean_twits(s):
    # del url and tag(@)
    s = re.sub(r'http\S+', '', s)
    s = re.sub('\@\S+', '', s)
    
    # only include alphanumerics
    s = re.sub(r"[^A-Za-z0-9(),.!?;:\'\`&]", " ", s)   # added ; and : because of emoji
    
    # insert spaces in special characters
    s = re.sub(r",", " , ", s)
    s = re.sub(r"!", " ! ", s)
    s = re.sub(r"\?", " \? ", s)
    s = re.sub(r"\&", " \& ", s)
    
    # only include alphanumerics again
    s = re.sub(r"[^A-Za-z0-9(),.!?;:\'\`&]", " ", s) 
    
    # reduce multiple spaces to single spaces
    s = re.sub(r"\s{2,}", " ", s)
    
    # reduce multiple dots to single dot
    s = re.sub(r"\.{2,}", ".", s)
    s = re.sub(r"\.", " . ", s)
    return s.strip().lower()

In [107]:
twitter_text_cleaned = np.vectorize(clean_twits)(twitter_text)

In [108]:
twitter_text_cleaned

array([ "awww , that's a bummer .  you shoulda got david carr of third day to do it .  ;d",
       "is upset that he can't update his facebook by texting it .  and might cry as a result school today also .  blah !",
       'i dived many times for the ball .  managed to save 50 the rest go out of bounds',
       ..., 'are you ready for your mojo makeover ? ask me for details',
       'happy 38th birthday to my boo of alll time ! ! ! tupac amaru shakur',
       'happy charitytuesday'], 
      dtype='<U322')

In [109]:
with open('../data/eng/twitter.pickle', 'wb') as f :
    pickle.dump(twitter_text_cleaned, f)

In [110]:
del twitter_data, twitter_text, twitter_text_cleaned, clean_twits

### 한국어 Wikidata

In [83]:
wiki = WikiCorpus('../data/kor/kowiki-20181101-pages-articles.xml.bz2')

In [84]:
output = open('../data/kor/wiki_kor.data', 'w')
i = 0
for text in tqdm(wiki.get_texts()) :
    output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
    i = i + 1
output.close()
print('Processing complete!')

288672it [04:17, 1121.32it/s]

Processing complete!





In [85]:
del wiki

### Naver Kin

In [8]:
kin_data = pd.read_csv('../data/raw/raw_kin_data.csv')

In [9]:
kin_text = kin_data.text.values.astype(np.str).tolist()

In [10]:
def clean_kin(s) :
    # del url
    s = re.sub(r'http\S+', '', s)
    
    # only include alphanumerics
    s = re.sub(r"[^A-Za-z0-9가-힣(),.!?\'\`&]", " ", s)
    
    # insert spaces in special characters
    s = re.sub(r",", " , ", s)
    s = re.sub(r"!", " ! ", s)
    s = re.sub(r"\?", " \? ", s)
    s = re.sub(r"\&", " \& ", s)
        
    # replace \xa0 tag
    s = re.sub(r"\xa0", " ", s)
    
    # only include alphanumerics again
    s = re.sub(r"[^A-Za-z0-9가-힣(),.!?\'\`&]", " ", s)
    
    # reduce multiple spaces to single spaces
    s = re.sub(r"\s{2,}", " ", s)
    
    # reduce multiple emotional character to single caracter
    s = re.sub(r"ㅋ{2,}", " ", s)
    s = re.sub(r"ㅎ{2,}", " ", s)
    s = re.sub(r"ㅜ{2,}", " ", s)
    s = re.sub(r"ㅠ{2,}", " ", s)
    
    # reduce multiple dots to single dot
    s = re.sub(r"\.{2,}", ".", s)
    s = re.sub(r"\.", " . ", s)
    
    return s.strip().lower()

In [11]:
def sentence_slicer(text) :
    result = list()
    splited_text = [a + '다.' for a in text.split('다.')]
    if len(splited_text) > 1 :
        del splited_text[-1]
    else :
        splited_text[0] = splited_text[0][:-2]
    for sentence in splited_text : 
        if '요.' in sentence :
            splited_sentence = [a + '요.' for a in sentence.split('요.')]
            splited_sentence[-1] = splited_sentence[-1][:-2]
            result.extend(splited_sentence)
        else :
            result.extend([sentence])
    return result

In [12]:
kin_text_cleaned = list()
for sentence in kin_text :
    kin_text_cleaned.extend(sentence_slicer(sentence))

In [13]:
with open('../data/kor/naver_kin.data', 'w') as f :
    for sentence in kin_text_cleaned :
        f.write(sentence + '\t')

In [14]:
del kin_data, kin_text, kin_text_cleaned, clean_kin, sentence_slicer

### Naver movie reply

In [15]:
def clean_movie(s) :
    # del url
    s = re.sub(r'http\S+', '', s)
    
    # only include alphanumerics
    s = re.sub(r"[^A-Za-z0-9가-힣(),.!?\'\`&]", " ", s)
    
    # insert spaces in special characters
    s = re.sub(r",", " , ", s)
    s = re.sub(r"!", " ! ", s)
    s = re.sub(r"\?", " \? ", s)
    s = re.sub(r"\&", " \& ", s)

    # only include alphanumerics again
    s = re.sub(r"[^A-Za-z0-9가-힣(),.!?\'\`&]", " ", s)
    
    # reduce multiple spaces to single spaces
    s = re.sub(r"\s{2,}", " ", s)
    
    # reduce multiple emotional character to single caracter
    s = re.sub(r"ㅋ{2,}", " ", s)
    s = re.sub(r"ㅎ{2,}", " ", s)
    s = re.sub(r"ㅜ{2,}", " ", s)
    s = re.sub(r"ㅠ{2,}", " ", s)
    
    # reduce multiple dots to single dot
    s = re.sub(r"\.{2,}", ".", s)
    s = re.sub(r"\.", " . ", s)
    
    return s.strip().lower()

In [39]:
reply_data = pd.read_csv('../data/raw/naver_movie.txt', delimiter='\t')

In [45]:
reply_text = reply_data.document.astype(np.str).values

In [46]:
reply_text_cleaned = np.vectorize(clean_movie)(reply_text)

In [49]:
with open('../data/kor/movie.pickle', 'wb') as f :
    pickle.dump(reply_text_cleaned, f)

In [50]:
del reply_data, reply_text, reply_text_cleaned, clean_movie