### デバッグ用

In [19]:
from config import PATH_TO_SOURCE, PATH_TO_OUTPUT

In [51]:
import pandas as pd
import os
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
import itertools
import re

class PathFiles(object):
    """Iterable over the tabular files found at a path, plus preprocessing.

    Iterating over an instance yields one ``pandas.DataFrame`` per input file,
    in sorted filename order. ``all_preprocess`` applies every preprocessing
    step to each file and writes the result to the output location.
    """

    def __init__(self, source, limit=None):
        """
        `source` should be a path to a directory or to a single file (as a string).
        If `limit` is set, at most `limit` files will be processed by
        `all_preprocess`.

        Example::
            files = PathFiles(os.getcwd() + '\\corpus\\', 1000)

        The files in the directory should be either .tsv files or .csv files.

        Raises:
            ValueError: if `source` is neither an existing file nor a directory.
        """
        self.source = source
        self.limit = limit

        if os.path.isfile(self.source):
            # Wrap the single file in a list so the rest of the code can
            # always treat the input as a list of files.
            self.input_files = [self.source]
        elif os.path.isdir(self.source):
            # Joining with '' guarantees an OS-specific separator at the end.
            self.source = os.path.join(self.source, '')
            # Full paths, sorted so files are processed in filename order.
            self.input_files = sorted(
                self.source + name for name in os.listdir(self.source))
        else:
            # Not a file or a directory, so there is nothing to work with.
            raise ValueError('input is neither a file nor a path')

    def __iter__(self):
        """Yield the DataFrame of each input file, logging the file name first."""
        for path in self.input_files:
            print("[INFO]:Loading %s" % path)
            yield self.load(path)

    def load(self, file):
        """Read one .tsv or .csv file into a header-less DataFrame.

        Args:
            file: path of the file to read; the delimiter is chosen from its
                extension (tab for ``.tsv``, comma for ``.csv``).

        Returns:
            The file contents as a ``pandas.DataFrame`` with integer column
            labels (``header=None``).

        Raises:
            ValueError: if the extension is neither ``.tsv`` nor ``.csv``.
        """
        if file.endswith(".tsv"):
            delimiter = "\t"
        elif file.endswith(".csv"):
            delimiter = ","
        else:
            print("[ERROR]:Failed to load %s" % file)
            # Same exception type as __init__ uses for unusable input; unlike
            # an assert it is not stripped when Python runs with -O.
            raise ValueError("ファイルの拡張子がtsvでもcsvでもありません。")

        return pd.read_csv(file, delimiter=delimiter, header=None)

    def pos_tag(self, df, column=7):
        """Add a ``"pos"`` column holding the POS-tagged text of `column`.

        Each value of ``df[column]`` is tokenized, tagged with NLTK's universal
        tagset and rendered as space-separated ``token/TAG`` pairs.

        Args:
            df: DataFrame to annotate; it is modified in place.
            column: label of the column to tag. Defaults to 7.
                NOTE(review): column 7 presumably holds the sentence text --
                confirm against the corpus layout.

        Returns:
            The same DataFrame, with the ``"pos"`` column added.
        """
        df["pos"] = df[column].apply(
            lambda sentence: self.tupple2str(
                pos_tag(word_tokenize(sentence), tagset='universal')))
        return df

    def tupple2str(self, x):
        """
        Assume that x is a list of tuples such as (token, tag).

        Returns the pairs rendered as ``token/tag`` and joined by single spaces.
        """
        return ' '.join(pair[0] + '/' + pair[1] for pair in x)

    def all_preprocess(self):
        """Apply every preprocessing step to each file and write the results.

        At most ``self.limit`` files are processed (all of them when it is
        ``None``). The output path is the input path with ``PATH_TO_SOURCE``
        replaced by ``PATH_TO_OUTPUT`` and ``.prep`` inserted before the
        extension.
        """
        print("[INFO]:Start preprocess")
        # Add further preprocessing functions to this list as they are written.
        todo_list = [self.pos_tag]
        print("[INFO]:Applying %s to each file." % ", ".join(
            step.__qualname__ for step in todo_list))

        for df, file_name in itertools.islice(zip(self, self.input_files), self.limit):
            # Apply every function in todo_list to this file's DataFrame.
            for do in todo_list:
                df = do(df)

            # Literal replacement: the path must not be interpreted as a regex.
            output_name = file_name.replace(PATH_TO_SOURCE, PATH_TO_OUTPUT)
            output_dir = os.path.dirname(output_name)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            # NOTE(review): the output is tab-separated even when the
            # extension is .csv -- confirm this is intended.
            df.to_csv(output_name[:-4] + ".prep" + output_name[-4:], sep='\t')
        print("[INFO]:Finished preprocess")

In [52]:
# Build the file iterator over PATH_TO_SOURCE; limit=1 restricts all_preprocess() to a single file.
files = PathFiles(PATH_TO_SOURCE, limit = 1)

In [53]:
# Run every preprocessing step on the selected files and write the results to disk.
files.all_preprocess()

[INFO]:Start preprocess
[INFO]:Applying PathFiles.pos_tag to each file.
[INFO]:Loading /works/csisv12/akiko/acl_anthology/SENT.tsv.out.selected/A00-1019.sent.tsv
[INFO]:Finished preprocess


In [44]:
# Sanity check of the name-extraction expression used in all_preprocess():
# the third whitespace-separated token of the bound method's repr.
str(files.pos_tag).split()[2]

'PathFiles.pos_tag'

In [37]:
# Check the source -> output path mapping. A plain str.replace matches the path
# literally instead of interpreting it as a regular-expression pattern.
'/works/csisv12/akiko/acl_anthology/SENT.tsv.out.selected/'.replace(PATH_TO_SOURCE, PATH_TO_OUTPUT)

'/works/csisv12/akiko/acl_anthology/SENT.tsv.out.selected.preprocess/'

In [29]:
# Inspect a single value: element 0 of `d`, then column 7, then row 10.
# NOTE(review): `d` is not defined in any visible cell -- this depends on leftover
# kernel state; presumably it is a list of loaded DataFrames. Confirm or remove.
d[0][7][10]

'Thus the translator remains in control of the translation process and the machine must continually adapt its suggestions in response to his or her input.'

In [1]:
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

In [8]:
def tupple2str(x):
    """
    Render a list of (token, tag) tuples as one string.

    Each pair becomes ``token/tag`` and the pairs are joined by single spaces.
    """
    rendered = []
    for pair in x:
        rendered.append(pair[0] + '/' + pair[1])
    return ' '.join(rendered)

In [12]:
# Smoke test: tokenize one sentence, tag it with the universal tagset, and render it as token/TAG pairs.
tupple2str(pos_tag(word_tokenize("Thus the translator remains in control of the translation process and the machine must continually adapt its suggestions in response to his or her input."), tagset='universal'))

'Thus/ADV the/DET translator/NOUN remains/VERB in/ADP control/NOUN of/ADP the/DET translation/NOUN process/NOUN and/CONJ the/DET machine/NOUN must/VERB continually/ADV adapt/VERB its/PRON suggestions/NOUN in/ADP response/NOUN to/PRT his/PRON or/CONJ her/PRON input/NOUN ./.'

In [11]:
# Opens NLTK's interactive downloader, which waits for typed commands.
# NOTE(review): this blocks a Restart & Run All; consider downloading the required
# packages by name instead -- confirm which packages this notebook needs.
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> l

Packages:
  [ ] abc................. Australian Broadcasting Commission 2006
  [ ] alpino.............. Alpino Dutch Treebank
  [*] averaged_perceptron_tagger Averaged Perceptron Tagger
  [ ] averaged_perceptron_tagger_ru Averaged Perceptron Tagger (Russian)
  [ ] basque_grammars..... Grammars for Basque
  [ ] biocreative_ppi..... BioCreAtIvE (Critical Assessment of Information
                           Extraction Systems in Biology)
  [ ] bllip_wsj_no_aux.... BLLIP Parser: WSJ Model
  [ ] book_grammars....... Grammars from NLTK Book
  [ ] brown............... Brown Corpus
  [ ] brown_tei........... Brown Corpus (TEI XML Version)
  [ ] cess_cat............ CESS-CAT Treebank
  [ ] cess_esp............ CESS-ESP Treebank
  [ ] chat80.....

Hit Enter to continue: 
  [ ] conll2007........... Dependency Treebanks from CoNLL 2007 (Catalan
                           and Basque Subset)
  [ ] crubadan............ Crubadan Corpus
  [ ] dependency_treebank. Dependency Parsed Treebank
  [ ] dolch............... Dolch Word List
  [ ] europarl_raw........ Sample European Parliament Proceedings Parallel
                           Corpus
  [ ] floresta............ Portuguese Treebank
  [ ] framenet_v15........ FrameNet 1.5
  [ ] framenet_v17........ FrameNet 1.7
  [ ] gazetteers.......... Gazeteer Lists
  [ ] genesis............. Genesis Corpus
  [ ] gutenberg........... Project Gutenberg Selections
  [ ] hmm_treebank_pos_tagger Treebank Part of Speech Tagger (HMM)
  [ ] ieer................ NIST IE-ER DATA SAMPLE
  [ ] inaugural........... C-Span Inaugural Address Corpus
  [ ] indian.............. Indian Language POS-Tagged Corpus
  [ ] jeita............... JEITA Public Morphologically Tagged Corpus (in
                           Cha

True

In [13]:
import re

In [14]:
# Tab-separated reference table of the universal POS tags (tag, meaning, examples),
# kept as a raw string so it can be converted to a pipe-delimited table below.
text = """
Tag	Meaning	English Examples
ADJ	adjective	new, good, high, special, big, local
ADP	adposition	on, of, at, with, by, into, under
ADV	adverb	really, already, still, early, now
CONJ	conjunction	and, or, but, if, while, although
DET	determiner, article	the, a, some, most, every, no, which
NOUN	noun	year, home, costs, time, Africa
NUM	numeral	twenty-four, fourth, 1991, 14:24
PRT	particle	at, on, out, over per, that, up, with
PRON	pronoun	he, their, her, its, my, I, us
VERB	verb	is, say, told, given, playing, would
.	punctuation marks	. , ; !
X	other	ersatz, esprit, dunno, gr8, univeristy
"""

In [17]:
# Turn the tab-separated table into pipe-delimited rows: every tab becomes a
# column separator and every newline closes one row and opens the next.
# Note: this rewrites `text` in place, so re-running the cell adds more pipes.
text = text.replace("\t", "|")
text = text.replace("\n", "|\n|")

In [18]:
# Print (rather than display the repr) so the newlines in `text` render as separate rows.
print(text)

|
|Tag|Meaning|English Examples|
|ADJ|adjective|new, good, high, special, big, local|
|ADP|adposition|on, of, at, with, by, into, under|
|ADV|adverb|really, already, still, early, now|
|CONJ|conjunction|and, or, but, if, while, although|
|DET|determiner, article|the, a, some, most, every, no, which|
|NOUN|noun|year, home, costs, time, Africa|
|NUM|numeral|twenty-four, fourth, 1991, 14:24|
|PRT|particle|at, on, out, over per, that, up, with|
|PRON|pronoun|he, their, her, its, my, I, us|
|VERB|verb|is, say, told, given, playing, would|
|.|punctuation marks|. , ; !|
|X|other|ersatz, esprit, dunno, gr8, univeristy|
|
