In [22]:
import nltk
nltk . download ( 'treebank' )
from nltk import word_tokenize
from nltk.corpus import brown as cb
from nltk.corpus import treebank as tb
import pprint
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\ranas\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [23]:
raw_text = nltk.Text(cb.words('ca01'))
print (raw_text)

<Text: The Fulton County Grand Jury said Friday an...>


In [24]:
print (cb.words()[0:20])

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that']


In [25]:
print (cb.tagged_words()[0:10])

[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN')]


In [26]:
tagged_sentences_brown_corpus = nltk.corpus.brown.tagged_sents()
pprint.pprint(tagged_sentences_brown_corpus[0])

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN'),
 ("Atlanta's", 'NP$'),
 ('recent', 'JJ'),
 ('primary', 'NN'),
 ('election', 'NN'),
 ('produced', 'VBD'),
 ('``', '``'),
 ('no', 'AT'),
 ('evidence', 'NN'),
 ("''", "''"),
 ('that', 'CS'),
 ('any', 'DTI'),
 ('irregularities', 'NNS'),
 ('took', 'VBD'),
 ('place', 'NN'),
 ('.', '.')]


In [27]:
raw_text = nltk.Text(tb.words()[0:10])
print (raw_text)

<Text: Pierre Vinken , 61 years old , will...>


In [28]:
print (tb.words()[0:10])

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the']


In [29]:
tagged_sentences_treebank_corpus = nltk.corpus.treebank.tagged_sents()
pprint.pprint (tagged_sentences_treebank_corpus[0])

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]


In [30]:
print ("Tagged sentences: ", len(tagged_sentences_treebank_corpus))
print ("Tagged words:", len(nltk.corpus.treebank.tagged_words()))

Tagged sentences:  3914
Tagged words: 100676


In [31]:
def features(sentence, index):
    # "sentence: [w1, w2, ...], index: the index of the word"
    return {
    'word': sentence[index],
    'is_first': index == 0,
    'is_last': index == len(sentence) - 1,
    'is_capitalized': sentence[index][0].upper() == sentence[index][0],
    'is_all_caps': sentence[index].upper() == sentence[index],
    'is_all_lower': sentence[index].lower() == sentence[index],
    'prefix-1': sentence[index][0],
    'prefix-2': sentence[index][:2],
    'prefix-3': sentence[index][:3],
    'suffix-1': sentence[index][-1],
    'suffix-2': sentence[index][-2:],
    'suffix-3': sentence[index][-3:],
    'prev_word': '' if index == 0 else sentence[index - 1],
    'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
    'has_hyphen': '-' in sentence[index],
    'is_numeric': sentence[index].isdigit(),
    'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }

3.3 Generate features
Function for generating features form tagged corpus

In [32]:
pprint.pprint(features(['This', 'is', 'a', 'sentence'], 0))

{'capitals_inside': False,
 'has_hyphen': False,
 'is_all_caps': False,
 'is_all_lower': False,
 'is_capitalized': True,
 'is_first': True,
 'is_last': False,
 'is_numeric': False,
 'next_word': 'is',
 'prefix-1': 'T',
 'prefix-2': 'Th',
 'prefix-3': 'Thi',
 'prev_word': '',
 'suffix-1': 's',
 'suffix-2': 'is',
 'suffix-3': 'his',
 'word': 'This'}


In [33]:
pprint.pprint(features(['හෙට', 'යන්නේ', 'කීයටද', ], 1))


{'capitals_inside': False,
 'has_hyphen': False,
 'is_all_caps': True,
 'is_all_lower': True,
 'is_capitalized': True,
 'is_first': False,
 'is_last': False,
 'is_numeric': False,
 'next_word': 'කීයටද',
 'prefix-1': 'ය',
 'prefix-2': 'යන',
 'prefix-3': 'යන්',
 'prev_word': 'හෙට',
 'suffix-1': 'ේ',
 'suffix-2': 'නේ',
 'suffix-3': '්නේ',
 'word': 'යන්නේ'}


In [170]:
import csv

f5 = open('poss_sentence.csv', 'r')
reader = csv.reader(f5)

senetences = []

for row in reader:
    if row != []:
        newRow = []
        for w in row:
            s = eval(w)
            newRow.append(s)
        senetences.append(newRow)

print (senetences[0])


[('ඔව්', 'NIP'), ('අද', 'NNC'), ('හරිම', 'NNC'), ('ලස්සනයි', 'VP')]


3.4 Transform Dataset
Extract words form tagged sentences using 'untag' function

In [180]:
def untag(tagged_sentence):
    
    return [w for w, t in tagged_sentence]

In [187]:
sentence = [('ඔව්', 'NIP'), ('අද', 'NNC'), ('හරිම', 'NNC'), ('ලස්සනයි', 'VP')]
sen2 =tagged_sentences_treebank_corpus[0]
untag(sentence)


['ඔව්', 'අද', 'හරිම', 'ලස්සනයි']

Transform dataset into X, y pairs where X = Features Y = POS lables¶

In [186]:
def transform_to_dataset(tagged_sentences):
    X, y = [], []
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features(untag(tagged), index))
            y.append(tagged[index][1])
            #pprint.pprint(" original word: "+ str(tagged) + " Word: "+ str(untag(tagged))+ "Y: " + y[index])
    return X, y

In [190]:
sentence = [[('ඔව්', 'NIP'), ('අද', 'NNC'), ('හරිම', 'NNC'), ('ලස්සනයි', 'VP')]]
sentence2 = [[('By', 'IN'), ('1997', 'CD'), (',', ','), ('almost', 'RB'), ('all', 'DT'), ('remaining', 'VBG'), ('uses', 'NNS'), ('of', 'IN'), ('cancer-causing', 'JJ'), ('asbestos', 'NN'), ('will', 'MD'), ('be', 'VB'), ('outlawed', 'VBN'), ('*-6', '-NONE-'), ('.', '.')]]
# print(sentence2)
transform_to_dataset(sentence)


([{'word': 'ඔව්',
   'is_first': True,
   'is_last': False,
   'is_capitalized': True,
   'is_all_caps': True,
   'is_all_lower': True,
   'prefix-1': 'ඔ',
   'prefix-2': 'ඔව',
   'prefix-3': 'ඔව්',
   'suffix-1': '්',
   'suffix-2': 'ව්',
   'suffix-3': 'ඔව්',
   'prev_word': '',
   'next_word': 'අද',
   'has_hyphen': False,
   'is_numeric': False,
   'capitals_inside': False},
  {'word': 'අද',
   'is_first': False,
   'is_last': False,
   'is_capitalized': True,
   'is_all_caps': True,
   'is_all_lower': True,
   'prefix-1': 'අ',
   'prefix-2': 'අද',
   'prefix-3': 'අද',
   'suffix-1': 'ද',
   'suffix-2': 'අද',
   'suffix-3': 'අද',
   'prev_word': 'ඔව්',
   'next_word': 'හරිම',
   'has_hyphen': False,
   'is_numeric': False,
   'capitals_inside': False},
  {'word': 'හරිම',
   'is_first': False,
   'is_last': False,
   'is_capitalized': True,
   'is_all_caps': True,
   'is_all_lower': True,
   'prefix-1': 'හ',
   'prefix-2': 'හර',
   'prefix-3': 'හරි',
   'suffix-1': 'ම',
   'suffix-2

3.5 Build training and testing dataset

In [191]:
cutoff = int(.75 * len(senetences))
training_sentences = senetences[:cutoff]
test_sentences = senetences[cutoff:]

In [192]:
training_sentences[0]


[('ඔව්', 'NIP'), ('අද', 'NNC'), ('හරිම', 'NNC'), ('ලස්සනයි', 'VP')]

In [193]:
print (len(training_sentences))
print (len(test_sentences))

126
42


In [194]:
X, y = transform_to_dataset(training_sentences)

In [195]:
print(len(X)) 
print(len(y))

482
482


3.6 Train model
Initialize the classifier

In [196]:
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy'))
])

In [197]:
clf.fit(X[:482],y[:482]) 

3.7 Measure Accuracy

In [198]:
X_test, y_test = transform_to_dataset(test_sentences)

In [199]:
print ("Accuracy:{:.3%}".format(clf.score(X_test, y_test)))

Accuracy:75.510%


3.8 Generate POS tags for given sentence

def pos_tag(sentence):
    tagged_sentence = []
    tags = clf.predict([features(sentence, index) for index in range(len(sentence))])
    return zip(sentence, tags)

In [200]:
def pos_tag(sentence):
    tagged_sentence = []
    tags = clf.predict([features(sentence, index) for index in range(len(sentence))])
    return zip(sentence, tags)

In [203]:
senetence = 'මිනිත්තුවකට තත්පර කීයක් තිබේද'

POS_list = list(pos_tag(sentence=senetence.split()))
for t in POS_list:
    print(u"{:<16}{:>2}".format(str(t[0]),str(t[1])))   

මිනිත්තුවකට     PRP
තත්පර           RP
කීයක්           NNC
තිබේද           QBE


In [132]:
POS_list = list(pos_tag(word_tokenize("We will meet at eight o'clock on Thursday morning.")))
for t in POS_list:
    print(u"{:<16}{:>2}".format(str(t[0]),str(t[1])))  

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\ranas/nltk_data'
    - 'c:\\Users\\ranas\\anaconda3\\envs\\my_init\\nltk_data'
    - 'c:\\Users\\ranas\\anaconda3\\envs\\my_init\\share\\nltk_data'
    - 'c:\\Users\\ranas\\anaconda3\\envs\\my_init\\lib\\nltk_data'
    - 'C:\\Users\\ranas\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [None]:
POS_list = list(pos_tag(word_tokenize('Alexander, the great...!')))
for t in POS_list:
    print(u"{:<16}{:>2}".format(str(t[0]),str(t[1])))   