# Imports

In [2]:
import nltk
from nltk import word_tokenize
from nltk.corpus import treebank
import pprint
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

# Download datasets

In [3]:
# Fetch the NLTK resources this notebook relies on: the 'treebank' corpus
# (tagged sentences used for training/testing) and the 'punkt' tokenizer data.
# NOTE(review): presumably 'punkt' is what word_tokenize needs; newer NLTK
# releases may additionally require 'punkt_tab' — confirm against the
# installed NLTK version.
nltk.download('treebank')
nltk.download('punkt')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Tagged sentences

In [4]:
# Load the treebank corpus as a sequence of sentences, where each sentence is
# a list of (word, tag) pairs, and print the first one as a quick sanity check.
tagged_sentences = treebank.tagged_sents()
print(tagged_sentences[0])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


# Define features

In [5]:
def features(sentence, index):
    """Build the feature dictionary for one token of an untagged sentence.

    Parameters
    ----------
    sentence : sequence of str
        The words of the sentence, without tags.
    index : int
        Position in ``sentence`` of the token to describe.

    Returns
    -------
    dict
        Maps feature name to value (str or bool): the word itself, its
        position flags, casing flags, 1- and 2-character prefixes/suffixes,
        the neighbouring words ('' at a sentence boundary), the last word of
        the sentence, and simple shape flags.

    Raises
    ------
    IndexError
        If ``index`` is outside ``sentence``.

    Notes
    -----
    * ``is_capitalized`` compares the first character with its upper-cased
      form, so tokens starting with a digit or punctuation also yield True.
    * Empty-string tokens are tolerated: character features are computed with
      slices, so they yield '' instead of raising IndexError.
    """
    word = sentence[index]
    is_first = index == 0
    is_last = index == len(sentence) - 1

    # Neighbours are only looked up when they exist; '' marks a boundary.
    prev_word = '' if is_first else sentence[index - 1]
    next_word = '' if is_last else sentence[index + 1]

    # Slices (not word[0] / word[-1]) so that an empty token cannot raise.
    first_char = word[:1]
    rest = word[1:]

    return {
        'word': word,
        'is_first': is_first,
        'is_last': is_last,
        'is_before_period': (not is_last) and next_word == '.',
        'is_capitalized': first_char.upper() == first_char,
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        'prefix-1': first_char,
        'prefix-2': word[:2],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'prev_word': prev_word,
        'next_word': next_word,
        'last_word_in_sentence': sentence[-1],
        'is_numeric': word.isdigit(),
        'capitals_inside': rest.lower() != rest
    }

# Untag sentences

In [6]:
def untag(tagged_sentence):
    """Return only the words of a tagged sentence, in their original order.

    ``tagged_sentence`` is an iterable of (word, tag) pairs; the tags are
    discarded.
    """
    words = []
    for token, _tag in tagged_sentence:
        words.append(token)
    return words

# Split data into training and test sets

In [7]:
# Sequential 80/20 split: the first 80% of the sentences (in corpus order,
# no shuffling) form the training set and the remaining 20% the test set.
cutoff = int(.80 * len(tagged_sentences))
training_sentences = tagged_sentences[:cutoff]
test_sentences = tagged_sentences[cutoff:]

# Transform tagged sentences into a feature dataset

In [8]:
def transform_to_dataset(tagged_sentences):
    """Flatten tagged sentences into per-token training examples.

    Parameters
    ----------
    tagged_sentences : iterable of sequences of (word, tag) pairs

    Returns
    -------
    (X, y) : tuple of two lists of equal length
        ``X[i]`` is the feature dict produced by ``features`` for one token
        and ``y[i]`` is that token's tag. Tokens appear in sentence order.
    """
    X, y = [], []

    for tagged in tagged_sentences:
        # Strip the tags once per sentence; doing it per token would rebuild
        # the same word list len(sentence) times.
        words = untag(tagged)
        for index, pair in enumerate(tagged):
            X.append(features(words, index))
            y.append(pair[1])

    return X, y
 
# Build the per-token feature dicts (X) and their tags (y) for the training split.
X, y = transform_to_dataset(training_sentences)

# Define pipeline for training

In [9]:
# Number of training tokens actually used for fitting. The vectorizer below
# produces a dense matrix (sparse=False); presumably the cap keeps memory and
# training time manageable — raise it if resources allow.
TRAIN_SAMPLE_LIMIT = 10000

# Pipeline: one-hot encode the feature dicts, then fit a decision tree.
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    # random_state is fixed so the fitted tree, and therefore the reported
    # accuracy, is reproducible from one run to the next.
    ('classifier', DecisionTreeClassifier(criterion='entropy', random_state=0))
])

clf.fit(X[:TRAIN_SAMPLE_LIMIT], y[:TRAIN_SAMPLE_LIMIT])

print('Training completed')

# Evaluate on the held-out sentences (token-level accuracy).
X_test, y_test = transform_to_dataset(test_sentences)

print("Accuracy:", clf.score(X_test, y_test))

Training completed
Accuracy: 0.8947552273067518


In [11]:
def pos_tag(sentence):
    """Tag every word of a tokenized sentence with the trained pipeline.

    Parameters
    ----------
    sentence : sequence of str
        The words of the sentence, already tokenized.

    Returns
    -------
    list of (word, tag) tuples
        One pair per input word, in order. An empty sentence yields ``[]``.
    """
    # The fitted pipeline cannot predict on zero samples (scikit-learn raises
    # ValueError for an empty sample sequence), so short-circuit here.
    if len(sentence) == 0:
        return []

    token_features = [features(sentence, index) for index in range(len(sentence))]
    tags = clf.predict(token_features)
    return list(zip(sentence, tags))
  
# Smoke test: tokenize a raw sentence and tag it with the trained pipeline.
pos_tag(word_tokenize('This is a test sentence!'))

[('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('test', 'NN'),
 ('sentence', 'NN'),
 ('!', 'CD')]