In [1]:
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import codecs

In [2]:
# Read the Reuters corpus as UTF-8 text and parse it with BeautifulSoup's
# "html.parser" backend; the resulting tree is kept in `soup`.
with codecs.open("./reuters.xml", mode="r", encoding="utf-8") as xml_file:
    soup = bs(xml_file, "html.parser")

In [3]:
# Build `docs`: one list of (word, label) pairs per <document> element.
# label is 'N' for words inside a named-entity tag and 'I' for every other word.
docs = []
for document in soup.find_all('document'):
    annotated = document.find('textwithnamedentities')
    if annotated is None:
        # find() returns None when the element is missing; skip such a
        # document instead of failing with AttributeError on `.children`.
        continue

    tokens = []
    for child in annotated.children:
        # Only Tag children are labelled; any other child node is ignored.
        if not isinstance(child, Tag):
            continue

        label = 'N' if child.name == 'namedentityintext' else 'I'

        # Split on single spaces and drop the empty strings produced by
        # consecutive spaces.
        tokens.extend((word, label) for word in child.text.split(' ') if word)

    docs.append(tokens)

print(docs[0][:5])

[('Paxar', 'N'), ('Corp', 'N'), ('said', 'I'), ('it', 'I'), ('has', 'I')]


In [4]:
# Add part-of-speech tags: `data` mirrors `docs`, but every entry becomes a
# (word, POS tag, label) triple.
import nltk

nltk.download('averaged_perceptron_tagger')

data = []

for doc in docs:
    # The tagger is given the bare words of the document.
    words = [pair[0] for pair in doc]
    pos_pairs = nltk.pos_tag(words)

    # Pair each original (word, label) entry with the tag found for it.
    triples = []
    for (w, label), (_, pos) in zip(doc, pos_pairs):
        triples.append((w, pos, label))
    data.append(triples)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HTS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [5]:
# Peek at the first five (word, POS tag, label) triples of the first document.
print(data[0][:5])

[('Paxar', 'NNP', 'N'), ('Corp', 'NNP', 'N'), ('said', 'VBD', 'I'), ('it', 'PRP', 'I'), ('has', 'VBZ', 'I')]


In [6]:
##### Feature generation
def word2features(doc, i):
    """Return the list of string features for the token at index ``i`` of ``doc``.

    Each entry of ``doc`` is read as ``entry[0]`` (the word) and ``entry[1]``
    (its POS tag); any further fields are not used here.

    The list holds, in order:
      * the features of the token itself (bias, lowercased word, last three
        and last two characters, upper/title/digit flags, POS tag);
      * the features of the previous token when ``i > 0``, otherwise the
        single marker ``'BOS'``;
      * the features of the next token when ``i < len(doc) - 1``, otherwise
        the single marker ``'EOS'``.
    """
    token, tag = doc[i][0], doc[i][1]

    def neighbour_features(prefix, j):
        # Features of the adjacent token at index j, namespaced by prefix
        # ('-1' for the previous token, '+1' for the next one).
        other, other_tag = doc[j][0], doc[j][1]
        return [
            prefix + ':word.lower=' + other.lower(),
            prefix + ':word.istitle=%s' % other.istitle(),
            prefix + ':word.isupper=%s' % other.isupper(),
            prefix + ':word.isdigit=%s' % other.isdigit(),
            prefix + ':pos_tag=' + other_tag,
        ]

    # Features every token has, regardless of position.
    feats = ['bias', 'word.lower=' + token.lower()]
    feats.append('word[-3:]=' + token[-3:])
    feats.append('word[-2:]=' + token[-2:])
    feats.append('word.isupper=%s' % token.isupper())
    feats.append('word.istitle=%s' % token.istitle())
    feats.append('word.isdigit=%s' % token.isdigit())
    feats.append('postag=' + tag)

    # Left context, or the beginning-of-sequence marker.
    feats += neighbour_features('-1', i - 1) if i > 0 else ['BOS']

    # Right context, or the end-of-sequence marker.
    feats += neighbour_features('+1', i + 1) if i < len(doc) - 1 else ['EOS']

    return feats

In [7]:
#### Prepare features and labels, then split into train and test sets
from sklearn.model_selection import train_test_split

# Turn one document into its per-token feature lists.
def extract_features(doc):
    """Return ``word2features(doc, i)`` for every index ``i`` of ``doc``, in order."""
    per_token = []
    for position in range(len(doc)):
        per_token.append(word2features(doc, position))
    return per_token

# Collect the label of every token in one document.
def get_labels(doc):
    """Return the third field of each (token, POS tag, label) triple in ``doc``."""
    collected = []
    for _token, _postag, tag in doc:
        collected.append(tag)
    return collected

# One feature sequence and one label sequence per document.
X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]

# Hold out 20% of the documents for evaluation. random_state is fixed so the
# same split comes out on every Restart & Run All; without it each run
# shuffles the documents differently and the reported scores drift.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [8]:
#### Train a CRF model
import pycrfsuite

trainer = pycrfsuite.Trainer(verbose=False)

# Register every training document: its feature sequence with its label sequence.
for feature_seq, label_seq in zip(X_train, y_train):
    trainer.append(feature_seq, label_seq)

# Training hyper-parameters.
trainer.set_params({
    'c1': 0.1,                # coefficient of the L1 penalty
    'c2': 0.01,               # coefficient of the L2 penalty
    'max_iterations': 200,    # maximum number of iterations
    'feature.possible_transitions': True,
})

# Train, then save the fitted model to the file "crf.model".
trainer.train('crf.model')

In [9]:
#### Inspect the results
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Print one document of the test set as "word (predicted label)" lines.
i = 8

# Every feature list carries the token as its second entry,
# 'word.lower=<token>' (see word2features), so the words shown are lowercased.
# Split on the first '=' only, so a token that itself contains '=' is not
# truncated.
sample_words = [feats[1].split("=", 1)[1] for feats in X_test[i]]

# The loop names are deliberately not `x` / `y`: a module-level `for` loop
# binding `y` would overwrite the label list `y` built in the split cell.
for word, predicted in zip(sample_words, y_pred[i]):
    print("%s (%s)" % (word, predicted))

investment (N)
technologies (N)
inc (N)
said (I)
it (I)
will (I)
make (I)
available (I)
its (I)
online (I)
investment (I)
advisory (I)
service (I)
to (I)
spear (N)
securities (N)
inc (N)
customers. (I)
investment (N)
technologies (N)
said (I)
that (I)
through (I)
a (I)
spear (I)
rebate (I)
program, (I)
purchasers (I)
of (I)
the (I)
investment (I)
advisory (I)
service, (I)
vestor, (I)
can (I)
receive (I)
a (I)
cash (I)
rebate (I)
of (I)
up (I)
to (I)
the (I)
full (I)
subscription (I)
price (I)
of (I)
the (I)
investment (I)
advisory (I)
service (I)
from (I)
spear (N)
securities (N)
. (I)
spears (I)
brokerage (I)
commission (I)
rebate (I)
program (I)
will (I)
allow (I)
money (I)
managers (I)
or (I)
individual (I)
investors (I)
who (I)
purchase (I)
vestor (I)
from (I)
investment (N)
technologies (N)
, (I)
or (I)
from (I)
other (I)
authorized (I)
distributors, (I)
to (I)
recive (I)
a (I)
rebate (I)
of (I)
the (I)
brokerage (I)
commissions, (I)
the (I)
company (I)
said. (I)
each (I)
month (I

In [10]:
#### Summarise performance with scikit-learn
import numpy as np
from sklearn.metrics import classification_report

# Integer code for each label string.
labels = {"N" : 1, "I" : 0}

# Flatten the per-document label sequences into 1-D integer arrays:
# first the gold labels, then the predicted ones.
truths = np.array([labels[tag] for seq in y_test for tag in seq])
predictions = np.array([labels[tag] for seq in y_pred for tag in seq])

# target_names is listed in code order: 0 -> "I", 1 -> "N".
print(classification_report(truths, predictions, target_names=["I", "N"]))

              precision    recall  f1-score   support

           I       0.98      0.99      0.98      2579
           N       0.92      0.84      0.88       386

   micro avg       0.97      0.97      0.97      2965
   macro avg       0.95      0.92      0.93      2965
weighted avg       0.97      0.97      0.97      2965

