In [199]:
import nltk
import warnings
warnings.filterwarnings('ignore')

In [26]:
sent = "Trying to learn Part of Speech Tagging using NLTK and spaCy, Kamalam is being productive."
nltk_pos_tagged = nltk.pos_tag(nltk.word_tokenize(sent))
nltk_pos_tagged

[('Trying', 'VBG'),
 ('to', 'TO'),
 ('learn', 'VB'),
 ('Part', 'NNP'),
 ('of', 'IN'),
 ('Speech', 'NNP'),
 ('Tagging', 'NNP'),
 ('using', 'VBG'),
 ('NLTK', 'NNP'),
 ('and', 'CC'),
 ('spaCy', 'NN'),
 (',', ','),
 ('Kamalam', 'NNP'),
 ('is', 'VBZ'),
 ('being', 'VBG'),
 ('productive', 'JJ'),
 ('.', '.')]

In [27]:
import pandas as pd
pd.DataFrame(nltk_pos_tagged, 
             columns=['Word', 'POS tag']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
Word,Trying,to,learn,Part,of,Speech,Tagging,using,NLTK,and,spaCy,",",Kamalam,is,being,productive,.
POS tag,VBG,TO,VB,NNP,IN,NNP,NNP,VBG,NNP,CC,NN,",",NNP,VBZ,VBG,JJ,.


In [28]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [29]:
spacy_sent = nlp(sent)
spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in spacy_sent]
spacy_pos_tagged

[(Trying, 'VBG', 'VERB'),
 (to, 'TO', 'PART'),
 (learn, 'VB', 'VERB'),
 (Part, 'NN', 'NOUN'),
 (of, 'IN', 'ADP'),
 (Speech, 'NNP', 'PROPN'),
 (Tagging, 'NNP', 'PROPN'),
 (using, 'VBG', 'VERB'),
 (NLTK, 'NNP', 'PROPN'),
 (and, 'CC', 'CCONJ'),
 (spaCy, 'VBN', 'VERB'),
 (,, ',', 'PUNCT'),
 (Kamalam, 'NNP', 'PROPN'),
 (is, 'VBZ', 'AUX'),
 (being, 'VBG', 'AUX'),
 (productive, 'JJ', 'ADJ'),
 (., '.', 'PUNCT')]

In [30]:
pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
Word,Trying,to,learn,Part,of,Speech,Tagging,using,NLTK,and,spaCy,",",Kamalam,is,being,productive,.
POS tag,VBG,TO,VB,NN,IN,NNP,NNP,VBG,NNP,CC,VBN,",",NNP,VBZ,VBG,JJ,.
Tag type,VERB,PART,VERB,NOUN,ADP,PROPN,PROPN,VERB,PROPN,CCONJ,VERB,PUNCT,PROPN,AUX,AUX,ADJ,PUNCT


In [31]:
#Chunking
from nltk.corpus import conll2000
data = conll2000.chunked_sents()
train_data = data[:10900]
test_data = data[10900:] 

print(len(train_data), len(test_data))
print(train_data[1]) 

10900 48
(S
  Chancellor/NNP
  (PP of/IN)
  (NP the/DT Exchequer/NNP)
  (NP Nigel/NNP Lawson/NNP)
  (NP 's/POS restated/VBN commitment/NN)
  (PP to/TO)
  (NP a/DT firm/NN monetary/JJ policy/NN)
  (VP has/VBZ helped/VBN to/TO prevent/VB)
  (NP a/DT freefall/NN)
  (PP in/IN)
  (NP sterling/NN)
  (PP over/IN)
  (NP the/DT past/JJ week/NN)
  ./.)


In [32]:
from nltk.chunk.util import tree2conlltags, conlltags2tree
wtc = tree2conlltags(train_data[1])
wtc

[('Chancellor', 'NNP', 'O'),
 ('of', 'IN', 'B-PP'),
 ('the', 'DT', 'B-NP'),
 ('Exchequer', 'NNP', 'I-NP'),
 ('Nigel', 'NNP', 'B-NP'),
 ('Lawson', 'NNP', 'I-NP'),
 ("'s", 'POS', 'B-NP'),
 ('restated', 'VBN', 'I-NP'),
 ('commitment', 'NN', 'I-NP'),
 ('to', 'TO', 'B-PP'),
 ('a', 'DT', 'B-NP'),
 ('firm', 'NN', 'I-NP'),
 ('monetary', 'JJ', 'I-NP'),
 ('policy', 'NN', 'I-NP'),
 ('has', 'VBZ', 'B-VP'),
 ('helped', 'VBN', 'I-VP'),
 ('to', 'TO', 'I-VP'),
 ('prevent', 'VB', 'I-VP'),
 ('a', 'DT', 'B-NP'),
 ('freefall', 'NN', 'I-NP'),
 ('in', 'IN', 'B-PP'),
 ('sterling', 'NN', 'B-NP'),
 ('over', 'IN', 'B-PP'),
 ('the', 'DT', 'B-NP'),
 ('past', 'JJ', 'I-NP'),
 ('week', 'NN', 'I-NP'),
 ('.', '.', 'O')]

In [33]:
def conll_tag_chunks(chunk_sents):
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
    
def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff 

In [34]:
from nltk.tag import UnigramTagger, BigramTagger
from nltk.chunk import ChunkParserI #abstract class, can be used to implement as wanted

# define the chunker class
class NGramTagChunker(ChunkParserI):
    def __init__(self, train_sentences, tagger_classes=[UnigramTagger, BigramTagger]):
        train_sent_tags = conll_tag_chunks(train_sentences)
        self.chunk_tagger = combined_tagger(train_sent_tags, tagger_classes)

    def parse(self, tagged_sentence):
        if not tagged_sentence: 
            return None
        pos_tags = [tag for word, tag in tagged_sentence]
        chunk_pos_tags = self.chunk_tagger.tag(pos_tags)
        chunk_tags = [chunk_tag for (pos_tag, chunk_tag) in chunk_pos_tags]
        wpc_tags = [(word, pos_tag, chunk_tag) for ((word, pos_tag), chunk_tag) in zip(tagged_sentence, chunk_tags)]
        return conlltags2tree(wpc_tags)

In [39]:
# train chunker model  
ntc = NGramTagChunker(train_data)
# evaluate chunker model performance
print(ntc.accuracy(test_data))

ChunkParse score:
    IOB Accuracy:  90.0%%
    Precision:     82.1%%
    Recall:        86.3%%
    F-Measure:     84.1%%


In [74]:
sentence = 'Testing the chunker model, hopefully the result comes out well. Kamalam would be happy then to go for lunch.'
sentence

'Testing the chunker model, hopefully the result comes out well. Kamalam would be happy then to go for lunch.'

In [75]:
nltk_pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
chunk_tree = ntc.parse(nltk_pos_tagged)
print(chunk_tree)

(S
  (VP Testing/VBG)
  (NP the/DT chunker/NN model/NN)
  ,/,
  hopefully/RB
  (NP the/DT result/NN)
  (VP comes/VBZ)
  out/RP
  well/RB
  ./.
  (NP Kamalam/NNP)
  (VP would/MD be/VB)
  (NP happy/JJ)
  then/RB
  (VP to/TO go/VB)
  (PP for/IN)
  (NP lunch/NN)
  ./.)


In [76]:
#Couldn't install ghostscripts/standford nlp parser for viewing the hierarchy in the chunks, but that could be done. 

In [77]:
from spacy import displacy
displacy.render(spacy_sent, jupyter=True, 
                options={'distance': 110,
                         'arrow_stroke': 2,
                         'arrow_width': 8})

In [78]:
#NER using spacy [Can also be done with nltk using StanfordNERtagger, but need to download it in the system and then use it along with nltk

In [79]:
sentence = 'Three more countries have joined an “international grand committee” of parliaments, adding to calls for Facebook’s boss, Mark Zuckerberg, to give evidence on misinformation to the coalition. Brazil, Latvia and Singapore bring the total to eight different parliaments across the world, with plans to send representatives to London on 27 November with the intention of hearing from Zuckerberg. Since the Cambridge Analytica scandal broke, the Facebook chief has only appeared in front of two legislatures: the American Senate and House of Representatives, and the European parliament. Facebook has consistently rebuffed attempts from others, including the UK and Canadian parliaments, to hear from Zuckerberg. He added that an article in the New York Times on Thursday, in which the paper alleged a pattern of behaviour from Facebook to “delay, deny and deflect” negative news stories, “raises further questions about how recent data breaches were allegedly dealt with within Facebook.”'

In [80]:
spacy_review = nlp(sentence)
spacy_review

Three more countries have joined an “international grand committee” of parliaments, adding to calls for Facebook’s boss, Mark Zuckerberg, to give evidence on misinformation to the coalition. Brazil, Latvia and Singapore bring the total to eight different parliaments across the world, with plans to send representatives to London on 27 November with the intention of hearing from Zuckerberg. Since the Cambridge Analytica scandal broke, the Facebook chief has only appeared in front of two legislatures: the American Senate and House of Representatives, and the European parliament. Facebook has consistently rebuffed attempts from others, including the UK and Canadian parliaments, to hear from Zuckerberg. He added that an article in the New York Times on Thursday, in which the paper alleged a pattern of behaviour from Facebook to “delay, deny and deflect” negative news stories, “raises further questions about how recent data breaches were allegedly dealt with within Facebook.”

In [81]:
ner_tagged = [(word.text, word.ent_type_) for word in spacy_review]
print(ner_tagged)

[('Three', 'CARDINAL'), ('more', ''), ('countries', ''), ('have', ''), ('joined', ''), ('an', ''), ('“', ''), ('international', ''), ('grand', ''), ('committee', ''), ('”', ''), ('of', ''), ('parliaments', ''), (',', ''), ('adding', ''), ('to', ''), ('calls', ''), ('for', ''), ('Facebook', 'ORG'), ('’s', ''), ('boss', ''), (',', ''), ('Mark', 'PERSON'), ('Zuckerberg', 'PERSON'), (',', ''), ('to', ''), ('give', ''), ('evidence', ''), ('on', ''), ('misinformation', ''), ('to', ''), ('the', ''), ('coalition', ''), ('.', ''), ('Brazil', 'GPE'), (',', ''), ('Latvia', 'GPE'), ('and', ''), ('Singapore', 'GPE'), ('bring', ''), ('the', ''), ('total', ''), ('to', ''), ('eight', 'CARDINAL'), ('different', ''), ('parliaments', ''), ('across', ''), ('the', ''), ('world', ''), (',', ''), ('with', ''), ('plans', ''), ('to', ''), ('send', ''), ('representatives', ''), ('to', ''), ('London', 'GPE'), ('on', ''), ('27', 'DATE'), ('November', 'DATE'), ('with', ''), ('the', ''), ('intention', ''), ('of', '

In [82]:
# visualize named entities
displacy.render(spacy_review, style='ent', jupyter=True)

In [83]:
named_entities = []
temp_entity_name = ''
temp_named_entity = None
for term, tag in ner_tagged:
    if tag:
        temp_entity_name = ' '.join([temp_entity_name, term]).strip()
        temp_named_entity = (temp_entity_name, tag)
    else:
        if temp_named_entity:
            named_entities.append(temp_named_entity)
            temp_entity_name = ''
            temp_named_entity = None

In [84]:
named_entities

[('Three', 'CARDINAL'),
 ('Facebook', 'ORG'),
 ('Mark Zuckerberg', 'PERSON'),
 ('Brazil', 'GPE'),
 ('Latvia', 'GPE'),
 ('Singapore', 'GPE'),
 ('eight', 'CARDINAL'),
 ('London', 'GPE'),
 ('27 November', 'DATE'),
 ('Zuckerberg', 'PERSON'),
 ('Cambridge Analytica', 'GPE'),
 ('Facebook', 'ORG'),
 ('two', 'CARDINAL'),
 ('American Senate', 'ORG'),
 ('House of Representatives', 'ORG'),
 ('European', 'NORP'),
 ('UK', 'GPE'),
 ('Canadian', 'NORP'),
 ('Zuckerberg', 'PERSON'),
 ('the New York Times', 'ORG'),
 ('Thursday', 'DATE'),
 ('Facebook', 'PERSON')]

In [72]:
from collections import Counter
c = Counter([item[1] for item in named_entities])
c.most_common()

[('GPE', 6),
 ('ORG', 5),
 ('PERSON', 4),
 ('CARDINAL', 3),
 ('DATE', 2),
 ('NORP', 2)]

In [145]:
#Tweaking available POS Taggers

In [89]:
sentence = "Since the Cambridge Analytica scandal broke, Mark Zuckerberg has only appeared in front of two legislatures."
sentence

'Since the Cambridge Analytica scandal broke, Mark Zuckerberg has only appeared in front of two legislatures.'

In [90]:
sentence_tokens = nltk.word_tokenize(sentence)
nltk_pos_tagged = nltk.pos_tag(sentence_tokens)
pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
Word,Since,the,Cambridge,Analytica,scandal,broke,",",Mark,Zuckerberg,has,only,appeared,in,front,of,two,legislatures,.
POS tag,IN,DT,NNP,NNP,NN,VBD,",",NNP,NNP,VBZ,RB,VBN,IN,NN,IN,CD,NNS,.


In [91]:
from nltk.corpus import treebank
data = treebank.tagged_sents()
train_data = data[:3500]
test_data = data[3500:]
len(train_data), len(test_data)

(3500, 414)

In [111]:
treebank.tagged_sents()

[[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]

In [114]:
tags = [tag for (word, tag) in treebank.tagged_words("wsj_0003.mrg")]
nltk.FreqDist(tags).max()

'IN'

In [116]:
# default tagger
from nltk.tag import DefaultTagger #Tags every word with the same mentioned POS
dt = DefaultTagger('NN')
dt.accuracy(test_data)

0.1454158195372253

In [102]:
pos_tagged = dt.tag(nltk.word_tokenize(sentence))
pd.DataFrame(pos_tagged, columns=['Word', 'POS tag']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
Word,Since,the,Cambridge,Analytica,scandal,broke,",",Mark,Zuckerberg,has,only,appeared,in,front,of,two,legislatures,.
POS tag,NN,NN,NN,NN,NN,NN,NN,NN,NN,NN,NN,NN,NN,NN,NN,NN,NN,NN


In [117]:
# regex tagger
from nltk.tag import RegexpTagger
# define regex tag patterns
patterns = [
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # simple past
        (r'.*es$', 'VBZ'),                # 3rd singular present
        (r'.*ould$', 'MD'),               # modals
        (r'.*\'s$', 'NN$'),               # possessive nouns
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                     # nouns 
]
rt = RegexpTagger(patterns)
rt

<Regexp Tagger: size=8>

In [136]:
rt.accuracy(test_data) #all accuracies are based on the accepted gold standard test data

0.24039113176493368

In [119]:
pos_tagged = rt.tag(nltk.word_tokenize(sentence))
pd.DataFrame(pos_tagged, columns=['Word', 'POS tag']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
Word,Since,the,Cambridge,Analytica,scandal,broke,",",Mark,Zuckerberg,has,only,appeared,in,front,of,two,legislatures,.
POS tag,NN,NN,NN,NN,NN,NN,NN,NN,NN,NNS,NN,VBD,NN,NN,NN,NN,VBZ,NN


In [131]:
#Lookup Taggers (Unigram Tagger)
#Most likely tag from the most frequent words are stored in a lookup table
fd = nltk.FreqDist(treebank.words("wsj_0003.mrg"))
cfd = nltk.ConditionalFreqDist(treebank.tagged_words("wsj_0003.mrg"))
most_freq_words = fd.most_common(100)

#finding the most likely tags
likely_tags = dict((word, cfd[word].max()) for (word, num) in most_freq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
baseline_tagger.accuracy(treebank.tagged_sents())

0.3862489570503397

In [134]:
sample = treebank.sents("wsj_0003.mrg")
baseline_tagger.tag(sample[0])

[('A', 'DT'),
 ('form', None),
 ('of', 'IN'),
 ('asbestos', 'NN'),
 ('once', 'RB'),
 ('used', 'VBN'),
 ('*', '-NONE-'),
 ('*', '-NONE-'),
 ('to', 'TO'),
 ('make', 'VB'),
 ('Kent', 'NNP'),
 ('cigarette', 'NN'),
 ('filters', 'NNS'),
 ('has', 'VBZ'),
 ('caused', None),
 ('a', 'DT'),
 ('high', None),
 ('percentage', 'NN'),
 ('of', 'IN'),
 ('cancer', 'NN'),
 ('deaths', 'NNS'),
 ('among', 'IN'),
 ('a', 'DT'),
 ('group', None),
 ('of', 'IN'),
 ('workers', 'NNS'),
 ('exposed', 'VBN'),
 ('*', '-NONE-'),
 ('to', 'TO'),
 ('it', 'PRP'),
 ('more', 'RBR'),
 ('than', 'IN'),
 ('30', None),
 ('years', 'NNS'),
 ('ago', 'IN'),
 (',', ','),
 ('researchers', 'NNS'),
 ('reported', 'VBD'),
 ('0', '-NONE-'),
 ('*T*-1', '-NONE-'),
 ('.', '.')]

In [135]:
#backoff is nothing but when most of the words, don't have a tag, it's assigned None, when with backoff mentioned, the backup method is used.
baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger('NN')) #In this case "NN" is assigned when it can't find any corresponding word to assign a tag.
baseline_tagger.tag(sample[0])

[('A', 'DT'),
 ('form', 'NN'),
 ('of', 'IN'),
 ('asbestos', 'NN'),
 ('once', 'RB'),
 ('used', 'VBN'),
 ('*', '-NONE-'),
 ('*', '-NONE-'),
 ('to', 'TO'),
 ('make', 'VB'),
 ('Kent', 'NNP'),
 ('cigarette', 'NN'),
 ('filters', 'NNS'),
 ('has', 'VBZ'),
 ('caused', 'NN'),
 ('a', 'DT'),
 ('high', 'NN'),
 ('percentage', 'NN'),
 ('of', 'IN'),
 ('cancer', 'NN'),
 ('deaths', 'NNS'),
 ('among', 'IN'),
 ('a', 'DT'),
 ('group', 'NN'),
 ('of', 'IN'),
 ('workers', 'NNS'),
 ('exposed', 'VBN'),
 ('*', '-NONE-'),
 ('to', 'TO'),
 ('it', 'PRP'),
 ('more', 'RBR'),
 ('than', 'IN'),
 ('30', 'NN'),
 ('years', 'NNS'),
 ('ago', 'IN'),
 (',', ','),
 ('researchers', 'NNS'),
 ('reported', 'VBD'),
 ('0', '-NONE-'),
 ('*T*-1', '-NONE-'),
 ('.', '.')]

In [120]:
## N gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)
tt

<TrigramTagger: size=41616>

In [122]:
print("Performance:\nUnigramTagger: {}\nBigramTagger: {}\nTrigramTagger: {}".format(ut.accuracy(test_data), bt.accuracy(test_data), tt.accuracy(test_data)))

Performance:
UnigramTagger: 0.8607803272340013
BigramTagger: 0.13466937748087907
TrigramTagger: 0.08064672281924679


In [137]:
t = nltk.BigramTagger(train_data, backoff=ut)
t.accuracy(test_data) #Comparitively the combined tagger works well because of backoff.

0.8701713621841417

In [138]:
t1 = UnigramTagger(train_data, backoff=nltk.DefaultTagger('NN'))
t2 = BigramTagger(train_data, backoff=t1)
t3 = TrigramTagger(train_data, backoff=t2)

In [139]:
t3.accuracy(test_data) #Even better results when defined backing down from trigram to default

0.8874043953916159

In [140]:
#Supervised Learning for POS Tagging

In [141]:
from nltk.classify import NaiveBayesClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger
nbt = ClassifierBasedPOSTagger(train=train_data, classifier_builder=NaiveBayesClassifier.train)
nbt

<ClassifierBasedTagger: <nltk.classify.naivebayes.NaiveBayesClassifier object at 0x000001B7471FE750>>

In [143]:
nbt.accuracy(test_data) #better results than NGram Taggers

0.9306806079969019

In [158]:
#Customising NER
#NER - Sequence modelling problem
#Trying out CRF --> Conditional Random Fields

In [151]:
df = pd.read_csv(r"C:\Users\kamalam.s\Desktop\kamalam's\nlp dev\data\ner\ner_dataset.csv", encoding='ISO-8859-1')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Sentence #  47959 non-null    object
 1   Word        1048565 non-null  object
 2   POS         1048575 non-null  object
 3   Tag         1048575 non-null  object
dtypes: object(4)
memory usage: 32.0+ MB


In [152]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1048565,1048566,1048567,1048568,1048569,1048570,1048571,1048572,1048573,1048574
Sentence #,Sentence: 1,,,,,,,,,,...,,,Sentence: 47959,,,,,,,
Word,Thousands,of,demonstrators,have,marched,through,London,to,protest,the,...,impact,.,Indian,forces,said,they,responded,to,the,attack
POS,NNS,IN,NNS,VBP,VBN,IN,NNP,TO,VB,DT,...,NN,.,JJ,NNS,VBD,PRP,VBD,TO,DT,NN
Tag,O,O,O,O,O,O,B-geo,O,O,O,...,O,O,B-gpe,O,O,O,O,O,O,O


In [153]:
df['Sentence #'].nunique(), df.Word.nunique(), df.POS.nunique(), df.Tag.nunique()

(47959, 35177, 42, 17)

In [154]:
# geo = Geographical Entity
# org = Organization
# per = Person
# gpe = Geopolitical Entity
# tim = Time indicator
# art = Artifact
# eve = Event
# nat = Natural Phenomenon
# Anything other than the above has been tagged O
# IOB is used

In [157]:
df.Tag.value_counts()
#It can be seen that, the distribution of values in different tags is unbalanced.

Tag
O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: count, dtype: int64

In [206]:
def word2features(sent, i):
    word = str(sent[i][0])
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

In [207]:
agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), 
                                                   s['POS'].values.tolist(), 
                                                   s['Tag'].values.tolist())]

In [208]:
grouped_df = df.groupby('Sentence #').apply(agg_func)

In [209]:
print(grouped_df[grouped_df.index == 'Sentence: 1'].values)

[list([('Thousands', 'NNS', 'O')])]


In [210]:
sentences = [s for s in grouped_df]
sentences[0]

[('Thousands', 'NNS', 'O')]

In [211]:
from sklearn.model_selection import train_test_split
import numpy as np

X = np.array([sent2features(s) for s in sentences])
y = np.array([sent2labels(s) for s in sentences])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train.shape, X_test.shape

((35969, 1), (11990, 1))

In [213]:
import sklearn_crfsuite
crf = sklearn_crfsuite.CRF(c1=0.1,
                           c2=0.1,
                           max_iterations=100,
                           all_possible_transitions=True,
                           verbose=True)

In [215]:
try:
    crf.fit(X_train, y_train) #instead of downgrading sklearn for mitigating this issue, gave a try-catch block
except AttributeError:
    pass

loading training data to CRFsuite: 100%|██████████████████████████████████████| 35969/35969 [00:00<00:00, 78037.55it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 6894
Seconds required: 0.051

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.06  loss=42206.99 active=6796  feature_norm=1.00
Iter 2   time=0.03  loss=34905.36 active=6538  feature_norm=1.62
Iter 3   time=0.02  loss=30297.87 active=6681  feature_norm=1.92
Iter 4   time=0.02  loss=21137.73 active=6638  feature_norm=3.81
Iter 5   time=0.03  loss=18360.41 active=6699  feature_norm=4.48
Iter 6   time=0.04  loss=16672.71 active=6720  feature_norm=5.32
Iter 7   time=0.03  loss=15431.20 active=6736  feature_norm=6.37
Iter 8   time=0.03  loss=14549.78 active=6750  feature_norm=7.62
Iter 9   time=0.04  loss=13560.74 active=6735  feature_norm=8.80
Iter 10  time=0

In [216]:
y_pred = crf.predict(X_test)
print(y_pred[0])

['O']


In [217]:
print(y_test[0])

['O']


In [218]:
from sklearn_crfsuite import metrics as crf_metrics

labels = list(crf.classes_)
labels.remove('O') #Intentionally removing 'O' to understand how well the model classifies other classes

In [226]:
from sklearn_crfsuite import metrics
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

0.823409066315601