# 3w preparing

## 작업 순서

### 1. 정규화

1. 문장 부호 제거 : 단어 단위에서
2. 소문자/대문자 변환 : 전체에서
3. 불용어 처리 : 단어 단위에서
4. 약어 대체 : 전체에서 
5. 반복되는 문자 처리 : 단어 단위에서

### 2. 토큰화

1. 텍스트 -> 문장
2. 문장 -> 단어

In [1]:
import pandas as pd
import numpy as np
import os, sys, warnings
from tqdm import tqdm_notebook as tqdm
import re, string
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

warnings.filterwarnings('ignore')
sys.path.append(os.path.abspath(os.path.dirname('../../modules/')))
from replacers import RegexpReplacer, RepeatReplacer, WordReplacer

In [2]:
# Relative path of the directory that holds the GAP dataset files.
path_github = '../../dataset/github/'
# List the directory contents to see which files are available.
os.listdir(path_github)

['constants.py',
 'CONTRIBUTING.md',
 'gap-development.tsv',
 'gap-test.tsv',
 'gap-validation.tsv',
 'gap_scorer.py',
 'LICENSE',
 'README.md']

In [3]:
# Load the three tab-separated GAP splits. The paths are built with
# os.path.join so they stay valid even if path_github is written without a
# trailing separator.
# NOTE: 'gap-development.tsv' is the split used as training data here.
test = pd.read_csv(os.path.join(path_github, 'gap-test.tsv'), sep='\t')
validation = pd.read_csv(os.path.join(path_github, 'gap-validation.tsv'), sep='\t')
train = pd.read_csv(os.path.join(path_github, 'gap-development.tsv'), sep='\t')

In [4]:
# Preview the first rows of the validation split.
validation.head()

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL
0,validation-1,He admitted making four trips to China and pla...,him,256,Jose de Venecia Jr,208,False,Abalos,241,False,http://en.wikipedia.org/wiki/Commission_on_Ele...
1,validation-2,"Kathleen Nott was born in Camberwell, London. ...",She,185,Ellen,110,False,Kathleen,150,True,http://en.wikipedia.org/wiki/Kathleen_Nott
2,validation-3,"When she returns to her hotel room, a Liberian...",his,435,Jason Scott Lee,383,False,Danny,406,True,http://en.wikipedia.org/wiki/Hawaii_Five-0_(20...
3,validation-4,"On 19 March 2007, during a campaign appearance...",he,333,Reucassel,300,True,Debnam,325,False,http://en.wikipedia.org/wiki/Craig_Reucassel
4,validation-5,"By this time, Karen Blixen had separated from ...",she,427,Finch Hatton,290,False,Beryl Markham,328,True,http://en.wikipedia.org/wiki/Denys_Finch_Hatton


In [5]:
# Upper/lower case normalization
def lower_casing(x):
    """Return ``x`` with its ``lower()`` method applied.

    Args:
        x: The text to normalize.

    Returns:
        The lowercased text.
    """
    lowered = x.lower()
    return lowered

# Abbreviation replacement (applied to the whole text).
# RegexpReplacer comes from the project-local `replacers` module.
reg_exp_replacer = RegexpReplacer()

# Word tokenization
def word_tokenizing(doc):
    """Split every sentence of one document into word tokens.

    Args:
        doc: Iterable of sentence strings belonging to a single document.

    Returns:
        A list with one entry per sentence, each entry being the result of
        ``nltk.word_tokenize`` for that sentence.
    """
    # Build a real list instead of returning a bare map(): in Python 3 map()
    # is a one-shot iterator, so the result would be empty after it has been
    # iterated (or inspected) once.
    return [nltk.word_tokenize(sent) for sent in doc]

# Punctuation removal
punctuation_replacer = re.compile('[%s]'%re.escape(string.punctuation))
def punctuation_replacing(doc):
    """Strip punctuation characters from every word of a document.

    Args:
        doc: Iterable of sentences, each an iterable of word strings.

    Returns:
        A list of sentences (lists of words) where every character listed in
        ``string.punctuation`` has been removed. Words that become empty
        after stripping are dropped.
    """
    cleaned_doc = []
    for sentence in doc:
        # Strip first, then filter, so tokens made only of punctuation vanish.
        stripped_words = (punctuation_replacer.sub(u'', token) for token in sentence)
        cleaned_doc.append([token for token in stripped_words if token != u''])
    return cleaned_doc
    
# Stop-word removal
stop_words = set(stopwords.words('english'))
def stop_words_replacing(doc):
    """Drop every word that appears in ``stop_words``.

    Args:
        doc: Iterable of sentences, each an iterable of word strings.

    Returns:
        A list of sentences (lists of words) without the stop words.
    """
    return [
        [token for token in sentence if token not in stop_words]
        for sentence in doc
    ]

# Repeated-character handling
repeat_replacer = RepeatReplacer()
def repeat_replacing(doc):
    """Pass every word of a document through ``repeat_replacer.replace``.

    Args:
        doc: Iterable of sentences, each an iterable of word strings.

    Returns:
        A list of sentences (lists of words), each word replaced by the value
        returned from ``repeat_replacer.replace``.
    """
    return [
        [repeat_replacer.replace(token) for token in sentence]
        for sentence in doc
    ]

In [12]:
%%time
# Preprocessing steps in the order they are applied. Each step is mapped over
# every element of the previous step's output.
_PIPELINE = (
    lower_casing,
    reg_exp_replacer.replace,
    sent_tokenize,
    word_tokenizing,
    punctuation_replacing,
    stop_words_replacing,
    repeat_replacing,
)


def _run_pipeline(texts):
    """Apply every step of ``_PIPELINE`` to each element of ``texts``.

    Args:
        texts: List of raw text strings (one per document).

    Returns:
        The list produced after the last step has been mapped over the data.
    """
    for step in _PIPELINE:
        texts = list(map(step, texts))
    return texts


# Explicit assignments instead of writing into globals(): the resulting names
# are visible to readers and tooling, and no variables are created dynamically.
train_text = _run_pipeline(train['Text'].values.tolist())
test_text = _run_pipeline(test['Text'].values.tolist())
validation_text = _run_pipeline(validation['Text'].values.tolist())

Wall time: 13.8 s


In [13]:
# Inspect the preprocessed training texts
# (documents -> sentences -> word tokens).
train_text

[[['zoe',
   'telford',
   'played',
   'police',
   'officer',
   'girlfriend',
   'simon',
   'magie'],
  ['dumped',
   'simon',
   'final',
   'episode',
   'series',
   '1',
   'slept',
   'jenny',
   'seen'],
  ['phoebe',
   'thomas',
   'played',
   'cheryl',
   'casidy',
   'pauline',
   'friend',
   'also',
   'year',
   '11',
   'pupil',
   'simon',
   'class'],
  ['dumped',
   'boyfriend',
   'following',
   'simon',
   'advice',
   'would',
   'sex',
   'later',
   'realised',
   'due',
   'catching',
   'crabs',
   'friend',
   'pauline']],
 [['grew',
   'evanston',
   'illinois',
   'second',
   'oldest',
   'five',
   'children',
   'including',
   'brothers',
   'fred',
   'gordon',
   'sisters',
   'marge',
   'peppy',
   'marilyn'],
  ['high',
   'school',
   'days',
   'spent',
   'new',
   'trier',
   'high',
   'school',
   'winetka',
   'illinois'],
  ['mackenzie',
   'studied',
   'bernard',
   'leach',
   '1949',
   '1952',
   'simple',
   'whelthrown',
   'funct