Skip to content

Commit

Permalink
update reader
Browse files Browse the repository at this point in the history
  • Loading branch information
Hironsan committed Jun 27, 2017
1 parent 9486a44 commit 7035682
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 8 deletions.
80 changes: 72 additions & 8 deletions anago/data/reader.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,69 @@
import os
import re


class Reader(object):
    """Reader for named-entity annotations stored in ``.KNP`` files.

    The class collects every file below a directory, keeps the ones whose
    name ends in ``.KNP`` and offers helpers that pull the ``type`` and
    ``target`` attributes out of a ``<ne .../>`` tag found in a line of text.
    """

    def __init__(self):
        pass

    def read_entity(self, path):
        """Read every ``.KNP`` file found below *path*.

        Each matching file is opened and consumed line by line. The lines
        are not processed yet, so nothing is collected.

        :param path: directory that is searched recursively
        :return: None
        """
        for file_path in self._traverse(path):
            if not file_path.endswith('.KNP'):
                continue
            for line in self._read_lines(file_path):
                pass

    def _filter(self, line):
        """Tell whether *line* is an ``EOS`` marker.

        :param line: one line of text; may be empty or blank
        :return: ``True`` if the line starts with ``EOS``, otherwise ``None``
        """
        # The previous implementation indexed ``line[0]`` and
        # ``line.split()[0]``, which raised IndexError for empty and
        # whitespace-only lines.  The values it derived that way (the named
        # entity of lines starting with '#', '*' or '+', and the first token
        # of every other line) were never used, so only the EOS test remains.
        # TODO: use _extract_named_entity() on '#', '*' and '+' lines once the
        # result is actually consumed.
        if line.startswith('EOS'):
            return True
        return None

    def _traverse(self, path):
        """Return the paths of all files below *path*.

        Sub-directories are descended into recursively; directories
        themselves are not part of the result.

        :param path: directory to list
        :return: list of file paths, each prefixed with *path*
        """
        file_paths = []
        for file_or_dir in os.listdir(path):
            _path = os.path.join(path, file_or_dir)
            if os.path.isdir(_path):
                file_paths.extend(self._traverse(_path))
            else:
                file_paths.append(_path)
        return file_paths

    def _read_lines(self, file_path):
        """Yield the lines of *file_path* one at a time.

        Lines keep their trailing newline. The file is closed when the
        generator is exhausted or garbage-collected.

        :param file_path: path of the file to read
        :return: generator of lines
        """
        # NOTE(review): the file is opened with the platform default
        # encoding -- confirm that this matches the corpus encoding.
        with open(file_path) as f:
            for line in f:
                yield line

    def _extract_named_entity(self, line):
        """Extract the type and the target of the first ``<ne .../>`` tag.

        :param line: text that may contain a ``<ne .../>`` tag
        :return: tuple ``(ne_type, ne_word)``; a part that cannot be found
            is returned as an empty string
        """
        name_tag = self._extract_name_tag(line)
        ne_type = self._extract_type(name_tag)
        ne_word = self._extract_target(name_tag)
        return ne_type, ne_word

    def _extract_substring(self, text, substring):
        """Return group 1 of the first match of a pattern in *text*.

        :param text: text to search
        :param substring: regular expression with at least one group
        :return: the text captured by group 1, or ``''`` without a match
        """
        m = re.search(substring, text)
        return m.group(1) if m else ''

    def _extract_name_tag(self, text):
        """Return the attribute part of the first ``<ne .../>`` tag."""
        return self._extract_substring(text, '<ne (.+?)/>')

    def _extract_type(self, text):
        """Return the value of the first ``type="..."`` attribute."""
        return self._extract_substring(text, 'type="(.+?)"')

    def _extract_target(self, text):
        """Return the value of the first ``target="..."`` attribute."""
        return self._extract_substring(text, 'target="(.+?)"')


def doc_read(file):
Expand All @@ -14,18 +79,17 @@ def doc_read(file):
sent.append(word)
return sent




# NOTE(review): this span is a rendered diff hunk, not runnable code. Removed
# and added lines are interleaved without +/- markers, which is why
# `def reader` appears twice. Presumably the first variant of each repeated
# statement is the removed line and the one(s) after it the replacement.
# Read that way, the new version lists the entries of `path`, joins each entry
# onto `path` before the isdir() test (the old lines tested the bare entry
# name), reads every file ending in '.KNP' inside each sub-directory with
# doc_read() and returns the collected documents as a list.
def reader(file_or_dir):
def reader(path):
docs = []
for file_or_dir in os.listdir(file_or_dir):
if not os.path.isdir(file_or_dir):
for file_or_dir in os.listdir(path):
dir_path = os.path.join(path, file_or_dir)
if not os.path.isdir(dir_path):
continue
for file in os.listdir(file_or_dir):
for file in os.listdir(dir_path):
if file.endswith('.KNP'):
path = os.path.join(file_or_dir, file)
doc = doc_read(path)
file_path = os.path.join(dir_path, file)
doc = doc_read(file_path)
docs.append(doc)
return docs

Expand Down
Empty file added tests/__init__.py
Empty file.
22 changes: 22 additions & 0 deletions tests/reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import os
import unittest

from anago.data.reader import reader, Reader


class TestReader(unittest.TestCase):
    """Smoke test that runs ``Reader`` over the KWDLC-1.0 directory."""

    def setUp(self):
        # The corpus directory is resolved relative to this test module.
        here = os.path.dirname(__file__)
        self.dir_path = os.path.join(here, '../data/raw/KWDLC-1.0')

    def test_traverse(self):
        """Call ``read_entity`` on the corpus directory; nothing is asserted."""
        Reader().read_entity(self.dir_path)

    """
    def test_reader(self):
        dir_path = os.path.join(os.path.dirname(__file__), '../data/raw/KWDLC-1.0/dat/rel/')
        print(dir_path)
        docs = reader(dir_path)
        print(docs[0])
    """

0 comments on commit 7035682

Please sign in to comment.