<a href="https://colab.research.google.com/github/Hyeon-Kang/NLP/blob/master/week05_2%20Sequence%20Labeling%20with%20CRFsuite/Sequence%20Labeling%20with%20CRFsuite.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#데이터 처리

from bs4 import BeautifulSoup as bs #XML, HTML등에서 데이터 추출
from bs4.element import Tag #데이터 태그 저장
import codecs #유니코드 파일 읽기

Collecting bs4
  Downloading https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Collecting beautifulsoup4 (from bs4)
  Downloading https://files.pythonhosted.org/packages/1a/b7/34eec2fe5a49718944e215fde81288eec1fa04638aa3fb57c1c6cd0f98c3/beautifulsoup4-4.8.0-py3-none-any.whl (97kB)
Collecting soupsieve>=1.2 (from beautifulsoup4->bs4)
  Downloading https://files.pythonhosted.org/packages/0b/44/0474f2207fdd601bb25787671c81076333d2c80e6f97e92790f8887cf682/soupsieve-1.9.3-py2.py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4, bs4
  Running setup.py install for bs4: started
    Running setup.py install for bs4: finished with status 'done'
Successfully installed beautifulsoup4-4.8.0 bs4-0.0.1 soupsieve-1.9.3


In [0]:
# Open the corpus as UTF-8 text and hand the file object to BeautifulSoup.
with codecs.open("./reuters.xml", mode="r", encoding="utf-8") as corpus_file:
    soup = bs(corpus_file, 'html.parser')

In [0]:
# Preview only the beginning of the parsed corpus; printing the whole document
# floods the cell output and bloats the saved notebook.
print(str(soup)[:1000])

<?xml version="1.0" encoding="UTF-8"?>
<corpus xmlns="http://semweb.unister.de/xml-corpus-schema-2013">
<document id="0">
<documenturi>http://www.research.att.com/~lewis/Reuters-21578/15001</documenturi>
<documentsource>Reuters-21578</documentsource>
<textwithnamedentities>
<namedentityintext uri="http://dbpedia.org/resource/Avery_Dennison">Paxar Corp</namedentityintext>
<simpletextpart> said it has acquired </simpletextpart>
<namedentityintext uri="http://aksw.org/notInWiki/Thermo-Print_GmbH">Thermo-Print GmbH</namedentityintext>
<simpletextpart> of </simpletextpart>
<namedentityintext uri="http://de.dbpedia.org/resource/Lohn_(Eschweiler)">Lohn</namedentityintext>
<simpletextpart>, </simpletextpart>
<namedentityintext uri="http://dbpedia.org/resource/West_Germany">West Germany</namedentityintext>
<simpletextpart>, a distributor of </simpletextpart>
<namedentityintext uri="http://dbpedia.org/resource/Avery_Dennison">Paxar</namedentityintext>
<simpletextpart> products, for undisclosed t

In [0]:
# Data processing: turn every <document> element into a list of (word, label) pairs.
docs = []
for elem in soup.find_all('document'):
    texts = []

    container = elem.find('textwithnamedentities')
    if container is None:
        # find() returns None when the element is missing; skip such a document
        # instead of failing with AttributeError on `.children`.
        continue

    # Walk over the direct children of the text container.
    for c in container.children:
        # Children that are not Tag instances are ignored.
        if not isinstance(c, Tag):
            continue

        # Every word inside a <namedentityintext> element is labelled 'N';
        # words inside any other element are labelled 'I'.
        label = 'N' if c.name == 'namedentityintext' else 'I'

        # Split on single spaces and drop the empty strings that produces.
        texts.extend((w, label) for w in c.text.split(' ') if len(w) > 0)
    docs.append(texts)

print(docs[0][:5])  # first five (word, label) pairs of the first document

[('Paxar', 'N'), ('Corp', 'N'), ('said', 'I'), ('it', 'I'), ('has', 'I')]


In [0]:
# Generate part-of-speech tags
import nltk

nltk.download('averaged_perceptron_tagger')

data = []

for doc in docs:
    # Collect the tokens (words) of the document
    tokens = [token for token, _ in doc]

    # Look up the part-of-speech tag of every token
    tagged = nltk.pos_tag(tokens)

    # Store (word, POS tag, label) triples; the word returned by the tagger is
    # not needed because `doc` and `tagged` are zipped position by position.
    data.append([(w, pos, label) for (w, label), (_, pos) in zip(doc, tagged)])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Hyeon\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [0]:
print(data[0][:5]) # Check the first five (word, POS tag, entity label) triples

[('Paxar', 'NNP', 'N'), ('Corp', 'NNP', 'N'), ('said', 'VBD', 'I'), ('it', 'PRP', 'I'), ('has', 'VBZ', 'I')]


In [0]:
#### Feature generation (the characteristics that describe each word)

def word2features(doc, i):
    """Build the list of feature strings for the token at position ``i``.

    Args:
        doc: Indexable sequence of token records; element 0 of each record is
            the word (a string) and element 1 is its POS tag (a string).
        i: Index of the token in ``doc`` to describe.

    Returns:
        A list of feature strings: features of the token itself, followed by
        features of the previous token (or ``'BOS'`` when ``i`` is 0), followed
        by features of the next token (or ``'EOS'`` when ``i`` is the last index).
    """
    word = doc[i][0]
    pos_tag = doc[i][1]

    # Features computed for every token, regardless of its position.
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],  # last three characters
        'word[-2:]=' + word[-2:],  # last two characters
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + pos_tag
    ]

    # Context from the previous token, when there is one.
    if i > 0:
        prev_word = doc[i-1][0]
        prev_pos_tag = doc[i-1][1]
        features.extend([
            '-1:word.lower=' + prev_word.lower(),
            '-1:word.istitle=%s' % prev_word.istitle(),
            '-1:word.isupper=%s' % prev_word.isupper(),
            '-1:word.isdigit=%s' % prev_word.isdigit(),
            # Plain concatenation: the former '%s' placeholder was never
            # formatted and ended up literally inside the feature name.
            '-1:pos_tag=' + prev_pos_tag
        ])

    else:
        # First token: mark the beginning of the document.
        features.append('BOS')

    # Context from the next token, when there is one.
    if i < len(doc)-1:
        next_word = doc[i+1][0]
        next_pos_tag = doc[i+1][1]
        features.extend([
            '+1:word.lower=' + next_word.lower(),
            '+1:word.istitle=%s' % next_word.istitle(),
            '+1:word.isupper=%s' % next_word.isupper(),
            '+1:word.isdigit=%s' % next_word.isdigit(),
            '+1:pos_tag=' + next_pos_tag
        ])

    else:
        # Last token: mark the end of the document.
        features.append('EOS')

    return features

In [0]:
# CRF 모델의 라이브러리
!pip install python-crfsuite

Collecting python-crfsuite
  Downloading https://files.pythonhosted.org/packages/29/c9/b206fa75d5978a631b5e6914a051139d99ff4624f96eac1bec6486413944/python_crfsuite-0.9.6-cp36-cp36m-win_amd64.whl (154kB)
Installing collected packages: python-crfsuite
Successfully installed python-crfsuite-0.9.6


In [0]:
### 모델에 학습 진행
from sklearn.model_selection import train_test_split

def extract_features(doc):
    """Return one feature list per token of ``doc``, in token order."""
    per_token_features = []
    for position in range(len(doc)):
        per_token_features.append(word2features(doc, position))
    return per_token_features

def get_labels(doc):
    """Return the label of every (token, postag, label) triple in ``doc``."""
    collected = []
    for _token, _postag, label in doc:
        collected.append(label)
    return collected

# One feature sequence and one label sequence per document.
X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]

# Hold out 20% of the documents for testing. A fixed random_state makes the
# split, and therefore every metric computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [0]:
#### CRF 모델 사용
import pycrfsuite

trainer = pycrfsuite.Trainer(verbose=True)

# Feed every training document (feature sequence + label sequence) to the trainer.
for feature_seq, label_seq in zip(X_train, y_train):
    trainer.append(feature_seq, label_seq)

# Model hyper-parameters.
crf_params = {
    'c1' : 0.1,                # coefficient of the L1 penalty
    'c2' : 0.01,               # coefficient of the L2 penalty
    'max_iterations' : 200,    # maximum number of iterations
    # NOTE(review): presumably also generates transition features that never
    # occur in the training data; confirm against the CRFsuite documentation.
    'feature.possible_transitions' : True
}
trainer.set_params(crf_params)


# Train the model; when training finishes it is saved to the file "crf.model".
trainer.train('crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 13011
Seconds required: 0.127

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 5329.880748
Feature norm: 1.000000
Error norm: 5820.219488
Active features: 12595
Line search trials: 1
Line search step: 0.000045
Seconds required for this iteration: 0.022

***** Iteration #2 *****
Loss: 4292.692724
Feature norm: 0.844046
Error norm: 5215.205290
Active features: 12649
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.016

***** Iteration #3 *****
Loss: 3884.347162
Feature norm: 0.808360
Error norm: 11895.677995
Active features: 8495
Line search trials: 2
Line search step: 0.500000
Seconds required for this 

In [0]:
### Inspect the training result
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Print one sample document from the test set as "word (predicted label)".
i = 12

# Feature 1 of every token is 'word.lower=<token>'. Split only on the first '='
# so that a token which itself contains '=' is not truncated.
sample_words = [token_features[1].split("=", 1)[1] for token_features in X_test[i]]

# Dedicated loop names: the former `x, y` pair overwrote the global label list `y`.
for sample_word, predicted_label in zip(sample_words, y_pred[i]):
    print("%s (%s)" % (sample_word, predicted_label))

great (N)
lakes (N)
chemical (N)
corp (N)
said (I)
it (I)
registered (I)
with (I)
the (I)
securities (N)
and (N)
exchange (N)
commission (N)
a (I)
proposed (I)
public (I)
offering (I)
of (I)
two (I)
mln (I)
shares (I)
of (I)
its (I)
common (I)
stock. (I)
proceeds (I)
from (I)
the (I)
offering (I)
will (I)
be (I)
used (I)
to (I)
repay (I)
debt (I)
incurred (I)
in (I)
the (I)
acquisition (I)
of (I)
two (I)
chemical (I)
companies, (I)
to (I)
increase (I)
great (N)
lakes (N)
equity (I)
investment (I)
in (I)
huntsman (N)
chemical (N)
corp (N)
and (I)
for (I)
general (I)
corporate (I)
purposes, (I)
it (I)
said. (I)
underwriters (I)
are (I)
led (I)
by (I)
first (N)
boston (N)
corp (N)
, (I)
goldman, (N)
sachs (N)
and (N)
co (N)
and (N)
eberstadt (N)
fleming (N)
inc (N)
. (I)


In [0]:
#### Sklearn 으로 보기 쉽게 성능을 확인

import numpy as np
from sklearn.metrics import classification_report

# Mapping from string label to integer class id.
labels = {"N" : 1, "I" : 0}

# Flatten the per-document tag sequences into one 1-D array of class ids.
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Pass the class ids explicitly and derive the display names from the same
# mapping, so names and ids stay aligned even if a class is missing from the
# test split.
label_ids = sorted(labels.values())
target_names = sorted(labels, key=labels.get)

# Print the classification performance.
print(classification_report(truths, predictions, labels=label_ids, target_names=target_names))

              precision    recall  f1-score   support

           I       0.99      0.99      0.99      3621
           N       0.90      0.90      0.90       450

    accuracy                           0.98      4071
   macro avg       0.94      0.94      0.94      4071
weighted avg       0.98      0.98      0.98      4071

