In [34]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jahna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [35]:
import re

import numpy as np
from nltk import sent_tokenize, word_tokenize

from nltk.cluster.util import cosine_distance

MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE)

In [37]:
def normalize_whitespace(text):
    return MULTIPLE_WHITESPACE_PATTERN.sub(_replace_whitespace, text)
def _replace_whitespace(match):
    text = match.group()

    if "\n" in text or "\r" in text:
        return "\n"
    else:
        return " "
def is_blank(string):
    return not string or string.isspace()

In [38]:
def get_symmetric_matrix(matrix):
    
    return matrix + matrix.T - np.diag(matrix.diagonal())

In [39]:
def core_cosine_similarity(vector1, vector2):
   
    return 1 - cosine_distance(vector1, vector2)

In [46]:
class TextRank4Sentences():
    def __init__(self):
        self.damping = 0.85  # damping coefficient, usually is .85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 100  # iteration steps
        self.text_str = None
        self.sentences = None
        self.pr_vector = None

    def _sentence_similarity(self, sent1, sent2, stopwords=None):
        if stopwords is None:
            stopwords = []

        sent1 = [w.lower() for w in sent1]
        sent2 = [w.lower() for w in sent2]

        all_words = list(set(sent1 + sent2))

        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)

        # build the vector for the first sentence
        for w in sent1:
            if w in stopwords:
                continue
            vector1[all_words.index(w)] += 1

        # build the vector for the second sentence
        for w in sent2:
            if w in stopwords:
                continue
            vector2[all_words.index(w)] += 1

        return core_cosine_similarity(vector1, vector2)

    def _build_similarity_matrix(self, sentences, stopwords=None):
        # create an empty similarity matrix
        sm = np.zeros([len(sentences), len(sentences)])

        for idx1 in range(len(sentences)):
            for idx2 in range(len(sentences)):
                if idx1 == idx2:
                    continue

                sm[idx1][idx2] = self._sentence_similarity(sentences[idx1], sentences[idx2], stopwords=stopwords)

        # Get Symmeric matrix
        sm = get_symmetric_matrix(sm)

        # Normalize matrix by column
        norm = np.sum(sm, axis=0)
        sm_norm = np.divide(sm, norm, where=norm != 0)  # this is ignore the 0 element in norm

        return sm_norm

    def _run_page_rank(self, similarity_matrix):

        pr_vector = np.array([1] * len(similarity_matrix))

        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr_vector = (1 - self.damping) + self.damping * np.matmul(similarity_matrix, pr_vector)
            if abs(previous_pr - sum(pr_vector)) < self.min_diff:
                break
            else:
                previous_pr = sum(pr_vector)

        return pr_vector

    def _get_sentence(self, index):

        try:
            return self.sentences[index]
        except IndexError:
            return ""

    def get_top_sentences(self, number=5):

        top_sentences = []

        if self.pr_vector is not None:

            sorted_pr = np.argsort(self.pr_vector)
            sorted_pr = list(sorted_pr)
            sorted_pr.reverse()

            index = 0
            for epoch in range(number):
                sent = self.sentences[sorted_pr[index]]
                sent = normalize_whitespace(sent)
                top_sentences.append(sent)
                index += 1

        return top_sentences

    def analyze(self, text, stop_words=None):
        self.text_str = text
        self.sentences = sent_tokenize(self.text_str)

        tokenized_sentences = [word_tokenize(sent) for sent in self.sentences]

        similarity_matrix = self._build_similarity_matrix(tokenized_sentences, stop_words)

        self.pr_vector = self._run_page_rank(similarity_matrix)

In [47]:
text_str = '''
    Mr. President, I offer you our congratulations on your election as the President of the current session of the General Assembly.
You represent Norway, a country which can take pride in its reputation as peaceful, just and progressive.
Your personal qualifications and your family's dedication to international effort are well known.
I should also like to express our appreciation of the services of your distinguished predecessor, Mrs. Angie Brooks Randolph.
I would also repeat our admiration for U Thant, whose skill and dedication have won him our respect.

41.	
Today is Mahatma Gandhi's one-hundred-first birthday, and we in India will take a fresh pledge to dedicate ourselves once again to the ideals for which the Mahatma lived and died, peace and nonviolence being the foremost among them.
We may not fully succeed in living up to his ideals but we must continue to try.

42.	
There are many developments in India which give us satisfaction.
Our people are expecting a better life through our development plans.
We have had a sizable increase in agricultural and industrial production.
Our trade is also showing signs of improvement.
India has once again demonstrated its faith in full-fledged democracy.
Alongside this there is a growing desire of the common man to share more equitably in the distribution of national wealth.

43.	
twenty-two days ago a great conference ended at Lusaka, and in 22 days from now we shall be celebrating the signing of the Charter of the United Nations.
The Conference of Lusaka owes much of its success to the efforts and organization undertaken by the Government and people of Zambia, and once again we should like to thank them.
The final declarations and resolution s of that Conference are being circulated as United Nations documents.
They represent the consensus of 53 Members of the United Nations, representing about half the human race.
I would urge that everyone read them.

44.	
The Conference at Lusaka highlighted several key points.
These are: international peace and security, peaceful coexistence and friendly relations, solution of international problems by negotiations, the value of the United Nations as a universal forum, decolonization, development, disarmament and the pursuit of the principles of nonalignment.
In order to fulfill the objectives we subscribed to at Lusaka, we seek the widest support and cooperation of the Members of the United Nations.

45.	
In assessing the achievements of the United Nations over the past 25 years, the major factor that strikes us is that, while another world war has been avoided, insecurity still prevails and that, while tension between the great Powers has eased and negotiations between them in certain fields have begun developments which are welcome armed conflicts are still raging in many parts of the world.
This situation requires attention from the world community.
A positive step in that direction that has been taken is the adoption by the Sixth Committee last week [1184th meeting] of the Declaration on Principles of international law concerning Friendly Relations and Cooperation among States in Accordance with the Charter of the United Nations
[A 18082, para.
8].
A further step to strengthen the edifice of peace that this Assembly could take is to adopt a d

    '''


In [9]:
import spacy

In [10]:
nlp = spacy.load('en_core_web_sm')
doc = nlp(text_str)

In [11]:
for token in doc:
    print(token.text,'->',token.pos_)


     -> SPACE
Mr. -> PROPN
President -> PROPN
, -> PUNCT
I -> PRON
offer -> VERB
you -> PRON
our -> PRON
congratulations -> NOUN
on -> ADP
your -> PRON
election -> NOUN
as -> ADP
the -> DET
President -> PROPN
of -> ADP
the -> DET
current -> ADJ
session -> NOUN
of -> ADP
the -> DET
General -> PROPN
Assembly -> PROPN
. -> PUNCT

 -> SPACE
You -> PRON
represent -> VERB
Norway -> PROPN
, -> PUNCT
a -> DET
country -> NOUN
which -> DET
can -> AUX
take -> VERB
pride -> NOUN
in -> ADP
its -> PRON
reputation -> NOUN
as -> ADP
peaceful -> ADJ
, -> PUNCT
just -> ADV
and -> CCONJ
progressive -> ADJ
. -> PUNCT

 -> SPACE
Your -> PRON
personal -> ADJ
qualifications -> NOUN
and -> CCONJ
your -> PRON
family -> NOUN
's -> PART
dedication -> NOUN
to -> ADP
international -> ADJ
effort -> NOUN
are -> AUX
well -> ADV
known -> ADJ
. -> PUNCT

 -> SPACE
I -> PRON
should -> AUX
also -> ADV
like -> VERB
to -> PART
express -> VERB
our -> PRON
appreciation -> NOUN
of -> ADP
the -> DET
services -> NOUN
of -> ADP

In [48]:
from spacy import displacy 
displacy.render(doc, style='dep',jupyter=True)

# HMM Implementation

In [1]:
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time
nltk.download('treebank')
nltk.download('universal_tagset')
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))
print(nltk_data[:5])

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\jahna\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\jahna\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')], [('Rudolph', 'NOUN'), ('Agnew', 'NOUN'), (',', '.'), ('55', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), ('and', 'CONJ'), ('former', 'ADJ'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Consolidated', 'NOUN'), ('Gold', 'NOUN'), ('Fields', 'NOUN'), ('PLC', 'NOUN'), (',', '.'), ('was', 'VERB'), ('named', 'VERB'), ('*-1', 'X'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('of', 'ADP'), ('this', 'DET'), ('British', 'ADJ'), ('industrial', 'ADJ'), ('

In [13]:
for sent in nltk_data[:5]:
  for tuple in sent:
    print(tuple)

('Pierre', 'NOUN')
('Vinken', 'NOUN')
(',', '.')
('61', 'NUM')
('years', 'NOUN')
('old', 'ADJ')
(',', '.')
('will', 'VERB')
('join', 'VERB')
('the', 'DET')
('board', 'NOUN')
('as', 'ADP')
('a', 'DET')
('nonexecutive', 'ADJ')
('director', 'NOUN')
('Nov.', 'NOUN')
('29', 'NUM')
('.', '.')
('Mr.', 'NOUN')
('Vinken', 'NOUN')
('is', 'VERB')
('chairman', 'NOUN')
('of', 'ADP')
('Elsevier', 'NOUN')
('N.V.', 'NOUN')
(',', '.')
('the', 'DET')
('Dutch', 'NOUN')
('publishing', 'VERB')
('group', 'NOUN')
('.', '.')
('Rudolph', 'NOUN')
('Agnew', 'NOUN')
(',', '.')
('55', 'NUM')
('years', 'NOUN')
('old', 'ADJ')
('and', 'CONJ')
('former', 'ADJ')
('chairman', 'NOUN')
('of', 'ADP')
('Consolidated', 'NOUN')
('Gold', 'NOUN')
('Fields', 'NOUN')
('PLC', 'NOUN')
(',', '.')
('was', 'VERB')
('named', 'VERB')
('*-1', 'X')
('a', 'DET')
('nonexecutive', 'ADJ')
('director', 'NOUN')
('of', 'ADP')
('this', 'DET')
('British', 'ADJ')
('industrial', 'ADJ')
('conglomerate', 'NOUN')
('.', '.')
('A', 'DET')
('form', 'NOUN'

In [14]:
train_set,test_set =train_test_split(nltk_data,train_size=0.80,test_size=0.20,random_state = 101)

In [15]:
train_tagged_words = [ tup for sent in train_set for tup in sent ]
test_tagged_words = [ tup for sent in test_set for tup in sent ]
print(len(train_tagged_words))
print(len(test_tagged_words))

80310
20366


In [16]:
train_tagged_words[:15]

[('Drink', 'NOUN'),
 ('Carrier', 'NOUN'),
 ('Competes', 'VERB'),
 ('With', 'ADP'),
 ('Cartons', 'NOUN'),
 ('At', 'ADP'),
 ('last', 'ADJ'),
 ('count', 'NOUN'),
 (',', '.'),
 ('Candela', 'NOUN'),
 ('had', 'VERB'),
 ('sold', 'VERB'),
 ('$', '.'),
 ('4', 'NUM'),
 ('million', 'NUM')]

In [17]:
tags = {tag for word,tag in train_tagged_words}
print(len(tags))
print(tags)
vocab = {word for word,tag in train_tagged_words}

12
{'NOUN', 'PRT', 'VERB', 'ADV', 'CONJ', 'ADJ', 'DET', 'PRON', '.', 'NUM', 'X', 'ADP'}


In [18]:
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    return (count_w_given_tag, count_tag)

In [19]:
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [20]:
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]
 
print(tags_matrix)

[[2.62344331e-01 4.39345129e-02 1.49133503e-01 1.68945398e-02
  4.24540639e-02 1.25838192e-02 1.31063312e-02 4.65906132e-03
  2.40094051e-01 9.14395228e-03 2.88252197e-02 1.76826611e-01]
 [2.50489235e-01 1.17416831e-03 4.01174158e-01 9.39334650e-03
  2.34833662e-03 8.29745606e-02 1.01369865e-01 1.76125243e-02
  4.50097844e-02 5.67514673e-02 1.21330721e-02 1.95694715e-02]
 [1.10589318e-01 3.06629837e-02 1.67955801e-01 8.38858187e-02
  5.43278083e-03 6.63904250e-02 1.33609578e-01 3.55432779e-02
  3.48066315e-02 2.28360966e-02 2.15930015e-01 9.23572779e-02]
 [3.21955010e-02 1.47401085e-02 3.39022487e-01 8.14584941e-02
  6.98215654e-03 1.30721495e-01 7.13731572e-02 1.20248254e-02
  1.39255241e-01 2.98681147e-02 2.28859577e-02 1.19472459e-01]
 [3.49066973e-01 4.39077942e-03 1.50384188e-01 5.70801310e-02
  5.48847427e-04 1.13611415e-01 1.23490669e-01 6.03732169e-02
  3.51262353e-02 4.06147093e-02 9.33040585e-03 5.59824370e-02]
 [6.96893215e-01 1.14563107e-02 1.14563107e-02 5.24271838e-03
  1

In [21]:
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))
display(tags_df)

Unnamed: 0,NOUN,PRT,VERB,ADV,CONJ,ADJ,DET,PRON,.,NUM,X,ADP
NOUN,0.262344,0.043935,0.149134,0.016895,0.042454,0.012584,0.013106,0.004659,0.240094,0.009144,0.028825,0.176827
PRT,0.250489,0.001174,0.401174,0.009393,0.002348,0.082975,0.10137,0.017613,0.04501,0.056751,0.012133,0.019569
VERB,0.110589,0.030663,0.167956,0.083886,0.005433,0.06639,0.13361,0.035543,0.034807,0.022836,0.21593,0.092357
ADV,0.032196,0.01474,0.339022,0.081458,0.006982,0.130721,0.071373,0.012025,0.139255,0.029868,0.022886,0.119472
CONJ,0.349067,0.004391,0.150384,0.05708,0.000549,0.113611,0.123491,0.060373,0.035126,0.040615,0.00933,0.055982
ADJ,0.696893,0.011456,0.011456,0.005243,0.016893,0.063301,0.005243,0.000194,0.066019,0.021748,0.020971,0.080583
DET,0.635906,0.000287,0.040247,0.012074,0.000431,0.206411,0.006037,0.003306,0.017393,0.022855,0.045134,0.009918
PRON,0.212756,0.014123,0.484738,0.036902,0.005011,0.070615,0.009567,0.006834,0.041913,0.006834,0.088383,0.022323
.,0.218539,0.002789,0.08969,0.052569,0.060079,0.046132,0.172192,0.068769,0.092372,0.07821,0.025641,0.092908
NUM,0.35166,0.026062,0.020707,0.00357,0.014281,0.035345,0.00357,0.001428,0.119243,0.18422,0.202428,0.037487


# Viterbi Algorithm for optimizaton
    

In [25]:
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    for key, word in enumerate(words):
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
        pmax = max(p)
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [26]:
random.seed(1234) 
rndom = [random.randint(1,len(test_set)) for x in range(10)]
test_run = [test_set[i] for i in rndom]
test_run_base = [tup for sent in test_run for tup in sent]
test_tagged_words = [tup[0] for sent in test_run for tup in sent]

In [27]:
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end-start 
print("Time taken in seconds: ", difference)
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 
accuracy = len(check)/len(tagged_seq)
print('Viterbi Algorithm Accuracy: ',accuracy*100)

Time taken in seconds:  71.95472812652588
Viterbi Algorithm Accuracy:  95.69377990430623


In [28]:
patterns = [
    (r'.*ing$', 'VERB'),              # gerund
    (r'.*ed$', 'VERB'),               # past tense 
    (r'.*es$', 'VERB'),               # verb    
    (r'.*\'s$', 'NOUN'),              # possessive nouns
    (r'.*s$', 'NOUN'),                # plural nouns
    (r'\*T?\*?-[0-9]+$', 'X'),        # X
    (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'), # cardinal numbers
    (r'.*', 'NOUN')                   # nouns
]
rule_based_tagger = nltk.RegexpTagger(patterns)

In [29]:
def Viterbi_rule_based(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    for key, word in enumerate(words):
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
        pmax = max(p)
        state_max = rule_based_tagger.tag([word])[0][1]       
        if(pmax==0):
            state_max = rule_based_tagger.tag([word])[0][1] # assign based on rule based tagger
        else:
            if state_max != 'X':
                state_max = T[p.index(pmax)]                
        state.append(state_max)
    return list(zip(words, state))

In [None]:
*