## POS-tagger и NER

#### Задание 1. Написать теггер на данных с русским языком
- проверить UnigramTagger, BigramTagger, TrigramTagger и их комбмнации
- написать свой теггер как на занятии, попробовать разные векторайзеры, добавить знание не только букв но и слов
- сравнить все реализованные методы сделать выводы


In [1]:
# Импорт библиотек

import corus
import pandas as pd

import pyconll

import nltk
# nltk.download()
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.tag import RegexpTagger

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Загрузка данных

train_data = pyconll.load_from_file('./ru_syntagrus-ud-train-a.conllu')
test_data = pyconll.load_from_file('./ru_syntagrus-ud-dev.conllu')

In [3]:
fdata_train = []
for sent in train_data[:]:
    fdata_train.append([(token.form, token.upos) for token in sent])
    
fdata_test = []
for sent in test_data[:]:
    fdata_test.append([(token.form, token.upos) for token in sent])
    
fdata_sent_test = []
for sent in test_data[:]:
    fdata_sent_test.append([token.form for token in sent])

In [4]:
results = []

In [5]:
# Default tagger

default_tagger = DefaultTagger('NOUN')

# display(default_tagger.tag(fdata_sent_test[100]), default_tagger.evaluate(fdata_test))
default_tagger.evaluate(fdata_test)

results.append(('default_tagger', default_tagger.evaluate(fdata_test)))

In [6]:
# Unigram Tagger

unigram_tagger = UnigramTagger(fdata_train)
# display(unigram_tagger.tag(fdata_sent_test[100]), unigram_tagger.evaluate(fdata_test))
unigram_tagger.evaluate(fdata_test)

results.append(('unigram_tagger', unigram_tagger.evaluate(fdata_test)))

In [7]:
# Bigram Tagger

bigram_tagger = BigramTagger(fdata_train, backoff=unigram_tagger)
# display(bigram_tagger.tag(fdata_sent_test[100]), bigram_tagger.evaluate(fdata_test))
bigram_tagger.evaluate(fdata_test)

results.append(('bigram_tagger', bigram_tagger.evaluate(fdata_test)))

In [8]:
# Trigram Tagger

trigram_tagger = TrigramTagger(fdata_train, backoff=bigram_tagger)
# display(trigram_tagger.tag(fdata_sent_test[100]), trigram_tagger.evaluate(fdata_test))
trigram_tagger.evaluate(fdata_test)

results.append(('trigram_tagger', trigram_tagger.evaluate(fdata_test)))

In [9]:
# Комбинация тэггеров

def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff

backoff = DefaultTagger('NOUN')

tag = backoff_tagger(fdata_train,  
                     [UnigramTagger, BigramTagger, TrigramTagger],  
                     backoff = backoff) 
  
tag.evaluate(fdata_test) 

results.append(('comb_tagger', tag.evaluate(fdata_test)))

In [10]:
train_tok = []
train_label = []
for sent in fdata_train[:]:
    for tok in sent:
        train_tok.append(tok[0])
        train_label.append('NO_TAG' if tok[1] is None else tok[1])
        
test_tok = []
test_label = []
for sent in fdata_test[:]:
    for tok in sent:
        test_tok.append(' ' if tok[1] is None else tok[1])
        test_label.append('NO_TAG' if tok[1] is None else tok[1])

In [11]:
le = LabelEncoder()
train_enc_labels = le.fit_transform(train_label)

In [12]:
test_enc_labels = le.transform(test_label)

In [13]:
# Hashing Vectorizer

hvectorizer = HashingVectorizer(ngram_range=(1, 3), analyzer='char', n_features=50)

X_train = hvectorizer.fit_transform(train_tok)

X_test = hvectorizer.transform(test_tok)

lr = LogisticRegression(random_state=0, max_iter=10)

lr.fit(X_train, train_enc_labels)

pred = lr.predict(X_test)

accuracy_score(test_enc_labels, pred)

results.append(('hashing_vectorizer', accuracy_score(test_enc_labels, pred)))

In [14]:
# Count Vectorizer

cvectorizer = CountVectorizer(ngram_range=(1, 3), analyzer='char')

X_train = cvectorizer.fit_transform(train_tok)

X_test = cvectorizer.transform(test_tok)

lr = LogisticRegression(random_state=0, max_iter=10)

lr.fit(X_train, train_enc_labels)

pred = lr.predict(X_test)

accuracy_score(test_enc_labels, pred)

results.append(('count_vectorizer', accuracy_score(test_enc_labels, pred)))

In [15]:
# Tfidf Vectorizer

tvectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char')

X_train = tvectorizer.fit_transform(train_tok)

X_test = tvectorizer.transform(test_tok)

lr = LogisticRegression(random_state=0, max_iter=10)

lr.fit(X_train, train_enc_labels)

pred = lr.predict(X_test)

accuracy_score(test_enc_labels, pred)

results.append(('tfidf_vectorizer', accuracy_score(test_enc_labels, pred)))

In [16]:
# Общие результаты

res_df = pd.DataFrame(results, columns=['approach', 'accuracy'])

res_df.sort_values(by='accuracy', ascending=False)

Unnamed: 0,approach,accuracy
4,comb_tagger,0.878775
2,bigram_tagger,0.829279
3,trigram_tagger,0.829143
1,unigram_tagger,0.823732
5,hashing_vectorizer,0.34734
0,default_tagger,0.23594
7,tfidf_vectorizer,0.23594
6,count_vectorizer,0.190025


Самый высокий результат показала комбинация тэггеров, при использовании векторайзеров качество ниже в разы

#### Задание 2. Проверить насколько хорошо работает NER
данные брать из http://www.labinform.ru/pub/named_entities/
- проверить NER из nltk/spacy/deeppavlov
- написать свой нер попробовать разные подходы
-- передаём в сетку токен и его соседей
-- передаём в сетку только токен
- сделать выводы по вашим экспериментам какой из подходов успешнее справляется


In [17]:
!wget http://www.labinform.ru/pub/named_entities/collection5.zip

--2022-05-23 23:01:05--  http://www.labinform.ru/pub/named_entities/collection5.zip
Resolving www.labinform.ru... 95.181.230.181
Connecting to www.labinform.ru|95.181.230.181|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1899530 (1.8M) [application/zip]
Saving to: 'collection5.zip'


2022-05-23 23:01:05 (7.56 MB/s) - 'collection5.zip' saved [1899530/1899530]



In [18]:
!unzip collection5.zip

Archive:  collection5.zip
   creating: Collection5/
  inflating: Collection5/001.ann     
  inflating: Collection5/001.txt     
  inflating: Collection5/002.ann     
  inflating: Collection5/002.txt     
  inflating: Collection5/003.ann     
  inflating: Collection5/003.txt     
  inflating: Collection5/004.ann     
  inflating: Collection5/004.txt     
  inflating: Collection5/005.ann     
  inflating: Collection5/005.txt     
  inflating: Collection5/006.ann     
  inflating: Collection5/006.txt     
  inflating: Collection5/007.ann     
  inflating: Collection5/007.txt     
  inflating: Collection5/008.ann     
  inflating: Collection5/008.txt     
  inflating: Collection5/009.ann     
  inflating: Collection5/009.txt     
  inflating: Collection5/010.ann     
  inflating: Collection5/010.txt     
  inflating: Collection5/011.ann     
  inflating: Collection5/011.txt     
  inflating: Collection5/012.ann     
  inflating: Collection5/012.txt     
  inflating: Collection5/013.ann    

  inflating: Collection5/114.txt     
  inflating: Collection5/1140.ann    
  inflating: Collection5/1140.txt    
  inflating: Collection5/1141.ann    
  inflating: Collection5/1141.txt    
  inflating: Collection5/1142.ann    
  inflating: Collection5/1142.txt    
  inflating: Collection5/1143.ann    
  inflating: Collection5/1143.txt    
  inflating: Collection5/1144.ann    
  inflating: Collection5/1144.txt    
  inflating: Collection5/1145.ann    
  inflating: Collection5/1145.txt    
  inflating: Collection5/1146.ann    
  inflating: Collection5/1146.txt    
  inflating: Collection5/1147.ann    
  inflating: Collection5/1147.txt    
  inflating: Collection5/1148.ann    
  inflating: Collection5/1148.txt    
  inflating: Collection5/1149.ann    
  inflating: Collection5/1149.txt    
  inflating: Collection5/115.ann     
  inflating: Collection5/115.txt     
  inflating: Collection5/1150.ann    
  inflating: Collection5/1150.txt    
  inflating: Collection5/

  inflating: Collection5/226.txt     
  inflating: Collection5/227.ann     
  inflating: Collection5/227.txt     
  inflating: Collection5/228.ann     
  inflating: Collection5/228.txt     
  inflating: Collection5/229.ann     
  inflating: Collection5/229.txt     
  inflating: Collection5/22_11_12a.ann  
  inflating: Collection5/22_11_12a.txt  
  inflating: Collection5/22_11_12c.ann  
  inflating: Collection5/22_11_12c.txt  
  inflating: Collection5/22_11_12d.ann  
  inflating: Collection5/22_11_12d.txt  
  inflating: Collection5/22_11_12g.ann  
  inflating: Collection5/22_11_12g.txt  
  inflating: Collection5/22_11_12h.ann  
  inflating: Collection5/22_11_12h.txt  
  inflating: Collection5/22_11_12i.ann  
  inflating: Collection5/22_11_12i.txt  
  inflating: Collection5/22_11_12j.ann  
  inflating: Collection5/22_11_12j.txt  
  inflating: Collection5/230.ann     
  inflating: Collection5/230.txt     
  inflating: Collection5/231.ann     
  inflating: Collectio

  inflating: Collection5/321.txt     
  inflating: Collection5/322.ann     
  inflating: Collection5/322.txt     
  inflating: Collection5/323.ann     
  inflating: Collection5/323.txt     
  inflating: Collection5/324.ann     
  inflating: Collection5/324.txt     
  inflating: Collection5/325.ann     
  inflating: Collection5/325.txt     
  inflating: Collection5/326.ann     
  inflating: Collection5/326.txt     
  inflating: Collection5/327.ann     
  inflating: Collection5/327.txt     
  inflating: Collection5/328.ann     
  inflating: Collection5/328.txt     
  inflating: Collection5/329.ann     
  inflating: Collection5/329.txt     
  inflating: Collection5/330.ann     
  inflating: Collection5/330.txt     
  inflating: Collection5/331.ann     
  inflating: Collection5/331.txt     
  inflating: Collection5/332.ann     
  inflating: Collection5/332.txt     
  inflating: Collection5/333.ann     
  inflating: Collection5/333.txt     
  inflating: Collection5/

  inflating: Collection5/551.txt     
  inflating: Collection5/552.ann     
  inflating: Collection5/552.txt     
  inflating: Collection5/553.ann     
  inflating: Collection5/553.txt     
  inflating: Collection5/554.ann     
  inflating: Collection5/554.txt     
  inflating: Collection5/555 (!).ann  
  inflating: Collection5/555 (!).txt  
  inflating: Collection5/556.ann     
  inflating: Collection5/556.txt     
  inflating: Collection5/557.ann     
  inflating: Collection5/557.txt     
  inflating: Collection5/558.ann     
  inflating: Collection5/558.txt     
  inflating: Collection5/559.ann     
  inflating: Collection5/559.txt     
  inflating: Collection5/560.ann     
  inflating: Collection5/560.txt     
  inflating: Collection5/561.ann     
  inflating: Collection5/561.txt     
  inflating: Collection5/562.ann     
  inflating: Collection5/562.txt     
  inflating: Collection5/563.ann     
  inflating: Collection5/563.txt     
  inflating: Collection

In [85]:
# Загрузка данных

from corus import load_ne5

dir = 'Collection5/'
records = load_ne5(dir)

In [24]:
words_docs = []
for ix, rec in enumerate(records):
    words = []
    for token in tokenize(rec.text):
        type_ent = 'OUT'
        for ent in rec.spans:
            if (token.start >= ent.start) and (token.stop <= ent.stop):
                type_ent = ent.type
                break
        words.append([token.text, type_ent])
    words_docs.extend(words)

In [107]:
df_words = pd.DataFrame(words_docs, columns=['word', 'tag'])

In [108]:
df_words

Unnamed: 0,word,tag
0,Д,PER
1,.,PER
2,Медведев,PER
3,назначил,OUT
4,ряд,OUT
...,...,...
265222,ВТБ,ORG
265223,на,OUT
265224,новый,OUT
265225,срок,OUT
