In [54]:
# -*- coding: utf-8 -*-
"""
Created on Sun Aug  6 11:50:44 2017
@ Main author: Abdul-Maalik"""
"@ editing & cahnging Junaid Ghauri
"""
********************************** Start DESCRIPTION *********************************************************************************************************

STEP ONE: I have attach the english language corpus and python file and data_files folder must be in same place to run this program
STEP TWO: I have created the NamedEntityRecognition class and define __init__ function.
STEP THREE: I have set the corpus unzip correct path.
STEP FOUR: I have read all files from corpus that have extension of .tags
STEP FIVE: For every sentence in all files, every word is separated by 1 newline character. For every word, each annotation is separated by a tab character. 
STEP SIXTH: Then, IOB Tagging system contains tags of the form:
    B-{CHUNK_TYPE} – for the word in the Beginning chunk
    I-{CHUNK_TYPE} – for words Inside the chunk
    O – Outside any chunk
STEP SEVEN: I have extracted the features from .tags files
STEP EIGHT: Then,I have defined conll_iob function that assign the IOB tags
STEP NINE: I have created training sample and testing samples.
STEP TEN: I have taken some datasets for training and then I have given input a sentence of english langauge 
STEP ELEVEN: input sentence is tokenized and then passed in a pos tagger built-in NLTK.
STEP TWELTH: After that I have passed the input sentence to parse function and this function return a decision tree.   
STEP THIRTEEN: Then I have passed decision tree to tree2conlltags built in funcion to get collocaion
STEP FOURTEEN:After that I have display the result 
Step FIFTEEN: Finally,I have calculated the accuracy of my result
STEP SIXTEEN : OUTPUT
                        #training samples = 55809
                        #test samples = 6102

                        IOB tagging of the given sentence is::::::::::::::::::::::::
                        [('The', 'DT', 'O'), ('United', 'NNP', 'B-org'), ('Nations', 'NNP', 'I-org'), ('has', 'VBZ', 'I-org'), ('administered', 'VBN', 'O'), ('Kosovo', 'NNP', 'B-tim'), ('since', 'IN', 'I-tim'), ('1999', 'CD', 'B-tim'), (',', ',', 'O'), ('since', 'IN', 'O'), ('NATO', 'NNP', 'B-org'), ('air', 'NN', 'O'), ('strikes', 'NNS', 'O'), ('against', 'IN', 'O'), ('Yugoslavia', 'NNP', 'B-gpe'), ('forced', 'VBD', 'I-gpe'), ('Yugoslav', 'NNP', 'B-tim'), ('and', 'CC', 'O'), ('Serbian', 'JJ', 'B-gpe'), ('security', 'NN', 'O'), ('forces', 'NNS', 'O'), ('from', 'IN', 'O'), ('the', 'DT', 'O'), ('area', 'NN', 'O'), ('.', '.', 'O')]

                        Entity Recognition of the Given sentence is ::::::::::::::::::
                        (S
                          The/DT
                          (org United/NNP Nations/NNP has/VBZ)
                          administered/VBN
                          (tim Kosovo/NNP since/IN)
                          (tim 1999/CD)
                          ,/,
                          since/IN
                          (org NATO/NNP)
                          air/NN
                          strikes/NNS
                          against/IN
                          (gpe Yugoslavia/NNP forced/VBD)
                          (tim Yugoslav/NNP)
                          and/CC
                          (gpe Serbian/JJ)
                          security/NN
                          forces/NNS
                          from/IN
                          the/DT
                          area/NN
                          ./.)

                        Evaluating the accuracy of the result is :  0.9156785243741765
                
STEP SEVENTEEN:In the above output:
    First is length of training dataset and testing datasets
    Second is Output of IOB function of input sentence like The is  "O", United is "B-org" and Nation is "I-org"
    THird is Entity Recognition like United Nation and NATO is organization and Yugoslav is geo location means country 
    Fourth is evaluation of my results and output in line 55
    
*************************************************** END DESCRIPTION **************************************************
"""

import os,string
import codecs
import collections
from collections import Iterable

import nltk
from nltk import pos_tag, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.chunk import conlltags2tree,tree2conlltags,ChunkParserI
from nltk.tag import ClassifierBasedTagger

ner_tags = collections.Counter() 
corpus_files = "data"

class NamedEntityRecognition(ChunkParserI):
    
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)
 
        self.feature_detector = NamedEntityRecognition.features
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=NamedEntityRecognition.features,
            **kwargs)
 
    def parse(self, tagged_sent):
        
        chunks = self.tagger.tag(tagged_sent)
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
        return conlltags2tree(iob_triplets)
    
    def features(tokens, index, history):

        stemmer = SnowballStemmer('english')
        tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
        history = ['[START2]', '[START1]'] + list(history)

        index += 2
     
        word, pos = tokens[index]
        prevword, prevpos = tokens[index - 1]
        prevprevword, prevprevpos = tokens[index - 2]
        nextword, nextpos = tokens[index + 1]
        nextnextword, nextnextpos = tokens[index + 2]
        previob = history[index - 1]
        contains_dash = '-' in word
        contains_dot = '.' in word
        allascii = all([True for c in word if c in string.ascii_lowercase])
     
        allcaps = word == word.capitalize()
        capitalized = word[0] in string.ascii_uppercase
     
        prevallcaps = prevword == prevword.capitalize()
        prevcapitalized = prevword[0] in string.ascii_uppercase
     
        nextallcaps = prevword == prevword.capitalize()
        nextcapitalized = prevword[0] in string.ascii_uppercase
     
        return {
            'word': word,
            'lemma': stemmer.stem(word),
            'pos': pos,
            'all-ascii': allascii,
     
            'next-word': nextword,
            'next-lemma': stemmer.stem(nextword),
            'next-pos': nextpos,
     
            'next-next-word': nextnextword,
            'nextnextpos': nextnextpos,
     
            'prev-word': prevword,
            'prev-lemma': stemmer.stem(prevword),
            'prev-pos': prevpos,
     
            'prev-prev-word': prevprevword,
            'prev-prev-pos': prevprevpos,
     
            'prev-iob': previob,
     
            'contains-dash': contains_dash,
            'contains-dot': contains_dot,
     
            'all-caps': allcaps,
            'capitalized': capitalized,
     
            'prev-all-caps': prevallcaps,
            'prev-capitalized': prevcapitalized,
     
            'next-all-caps': nextallcaps,
            'next-capitalized': nextcapitalized,
        }
    
    
    def to_conll_iob(annotated_sentence):

        global iob_tokens
        iob_tokens = []
        for idx, annotated_token in enumerate(annotated_sentence):
            tag, word, ner = annotated_token
         
            if ner != 'O':
                if idx == 0:
                    ner = "B-" + ner
                elif annotated_sentence[idx - 1][2] == ner:
                    ner = "I-" + ner
                else:
                    ner = "B-" + ner
            iob_tokens.append((tag, word, ner))
        return iob_tokens
                    
    def read_tag_files(corpus_files):
        
        for root, dirs, files in os.walk(corpus_files):
            for filename in files:
                if filename.endswith(".tags"):
                    
                    with open(os.path.join(root, filename), 'rb') as file_handle:
                        file_content = file_handle.read().decode('utf-8').strip()
                        annotated_sentences = file_content.split('\n\n')
                        for annotated_sentence in annotated_sentences:
                            annotated_tokens = [seq for seq in annotated_sentence.split('\n') if seq]
     
                            standard_form_tokens = []
        
                            for idx, annotated_token in enumerate(annotated_tokens):
                                annotations = annotated_token.split('\t')
                                word, tag, ner = annotations[0], annotations[1], annotations[3]
                                            
                                if ner != 'O':
                                    ner = ner.split('-')[0]
     
                                if tag in ('LQU', 'RQU'):
                                    tag = "``"
     
                                standard_form_tokens.append((word, tag, ner))
        
                            conll_tokens = NamedEntityRecognition.to_conll_iob(standard_form_tokens)
                        
                            yield [((w, t), iob) for w, t, iob in conll_tokens]


    def Main():
        
        reader = NamedEntityRecognition.read_tag_files(corpus_files)
        data = list(reader)
        training_samples = data[:int(len(data) * 0.9)]
        test_samples = data[int(len(data) * 0.9):]
        
        print ("#training samples = %s" % len(training_samples))
        print ("#test samples = %s" % len(test_samples))
        
        print ("\nIOB tagging of the given sentence is::::::::::::::::::::::::")
        chunker = NamedEntityRecognition(training_samples[:500]) 
        sentance = "The United Nations has administered Kosovo since 1999, since NATO air strikes against Yugoslavia forced Yugoslav and Serbian security forces from the area" 

        tagged_sentance = pos_tag(word_tokenize(sentance))
        NER_tree = chunker.parse (tagged_sentance)
        iob_tagged = tree2conlltags(NER_tree)
        print (iob_tagged)
        
        print ("\nEntity Recognition using IOB of the Given sentence is ::::::::::::::::::")
        print (NER_tree)
        
        score = chunker.evaluate([conlltags2tree([(w, t, iob) for (w, t), iob in iobs]) for iobs in test_samples[:50]])
        print ("\nEvaluating the accuracy of the IOB taggs is :  "+str(score.accuracy()))
        
        
NamedEntityRecognition.Main()


#training samples = 55809
#test samples = 6201

IOB tagging of the given sentence is::::::::::::::::::::::::
[('The', 'DT', 'O'), ('United', 'NNP', 'B-org'), ('Nations', 'NNP', 'I-org'), ('has', 'VBZ', 'I-org'), ('administered', 'VBN', 'O'), ('Kosovo', 'NNP', 'B-tim'), ('since', 'IN', 'I-tim'), ('1999', 'CD', 'B-tim'), (',', ',', 'O'), ('since', 'IN', 'O'), ('NATO', 'NNP', 'B-org'), ('air', 'NN', 'O'), ('strikes', 'NNS', 'O'), ('against', 'IN', 'O'), ('Yugoslavia', 'NNP', 'B-gpe'), ('forced', 'VBD', 'I-gpe'), ('Yugoslav', 'NNP', 'B-tim'), ('and', 'CC', 'O'), ('Serbian', 'JJ', 'B-gpe'), ('security', 'NN', 'O'), ('forces', 'NNS', 'O'), ('from', 'IN', 'O'), ('the', 'DT', 'O'), ('area', 'NN', 'O')]

Entity Recognition using IOB of the Given sentence is ::::::::::::::::::
(S
  The/DT
  (org United/NNP Nations/NNP has/VBZ)
  administered/VBN
  (tim Kosovo/NNP since/IN)
  (tim 1999/CD)
  ,/,
  since/IN
  (org NATO/NNP)
  air/NN
  strikes/NNS
  against/IN
  (gpe Yugoslavia/NNP forced/VBD)
  (t

In [2]:
        from nltk import pos_tag, word_tokenize
        import nltk
        print ("\nContext free grammer Entity Recognition ::::::::::::::::::::::")
        print ("\nStarted..................\n")
        ######################### Part of Speech tagging using nltk library #######################################
        
        sentance = "The United Nations has administered Kosovo since 1999, since NATO air strikes against Yugoslavia forced Yugoslav and Serbian security forces from the area."
        sent = "The Turkey banned the Facebook due to waste of time" 
        tagged_sentance = pos_tag(word_tokenize(sentance))
       
        ############################# List of Countries ans Organization ###########################################
        
        org = ["United Nations","UNESCO","African Union","UNICEF","World Trade Organization","Serbian security forces","Google","Microsoft","Yugoslav","Facebook","NATO"]
        country = ["Ukraine","United Arab Emirates","United Kingdom","United States","Turkey","Uzbekistan","Kosovo","Venezuela","Vietnam","Virgin Islands","Wallis and Futuna","West Bank","Yugoslavia","Yemen","Zambia","Zimbabwe"]
        WeekDays = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
        ########################### Regular Expression are defined #################################################
        
        grammar = r"""
            NBAR:
                {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

            NP:
                {<NBAR>}
                {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
        """
        sentence_re = r'''(?x)      # set flag to allow verbose regexps
              ([A-Z])(\.[A-Z])+\.?  # abbreviations, e.g. U.S.A.
            | \w+(-\w+)*            # words with optional internal hyphens
            | \$?\d+(\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
            | \.\.\.                # ellipsis
            | [][.,;"'?():-_`]      # these are separate tokens
        '''
        ################################# Apply Regular Expression on tagged sentance ##############################
        
        print ("Named Phrase as a NP shown where appropriated ..................\n")
        cp = nltk.RegexpParser(grammar)
        result = cp.parse(tagged_sentance)
        print(result)
        
        ################################### Separate the Noun phrase from tagged data ##########################
        
        NER_NN  = list()
        NER_NNP = list()
        
        for tag in result:
            if isinstance (tag, nltk.tree.Tree):
               
                if tag.label()== 'NP':
                    tag_list = tag.leaves()
                    temp_NNP = ""
                    temp_NN = ""
                    
                    for k in range (len(tag_list)): 
                        if 'NN' or 'NNS' in tag_list[k]:
                            if temp_NN == "":
                                temp_NN = tag_list[k][0]
                            else:
                                temp_NN = temp_NN+" "+tag_list[k][0]
                            if tag_list[k][0] == "time":
                                tag.set_label("tim")

                        if 'NNP' in tag_list[k]:
                            if temp_NNP == "":
                                temp_NNP = tag_list[k][0]
                            else:
                                temp_NNP = temp_NNP+" "+tag_list[k][0]
                        if 'NNPS' in tag_list[k]:
                            if temp_NNP == "":
                                temp_NNP = tag_list[k][0]
                            else:
                                temp_NNP = temp_NNP+" "+tag_list[k][0]
                                
                        if 'JJ' in tag_list[k] and k == 0 and 'NN' in tag_list[k+1]:
                            if tag_list[k+2]:
                                temp_NNP= tag_list[k][0]+" "+tag_list[k+1][0]+" "+tag_list[k+2][0]
                            else:
                                temp_NNP= tag_list[k][0]+" "+tag_list[k+1][0]
                                print (temp_NNP)
                                    
                                
                    
                    if temp_NN != "":
                        if temp_NN != temp_NNP:
                            NER_NN.append(temp_NN)
                            
        ####################### Noun phrase are match with different lists according to Data set like list of country and organization
        
                    if temp_NNP != "":
                        for j in range(len(org)):
                            if org[j] == temp_NNP:
                                tag.set_label("org")
                                NER_NNP.append(temp_NNP)
                        
                        for k in range (len(country)):
                            if country[k] == temp_NNP:
                                tag.set_label("geo-loc")
                                NER_NNP.append(temp_NNP)
                                
        print ("\nNamed Entity using context free grammer after matching with different list of named entities..................\n")            
        print (result)
        
        ############################ Noun phrase are printed #####################################
        
        print ("\n Common Nouns are "+ str(NER_NN))
        print ("\n Named Phrases "+ str(NER_NNP)+"\n")
        


Context free grammer Entity Recognition ::::::::::::::::::::::

Started..................

Named Phrase as a NP shown where appropriated ..................

(S
  The/DT
  (NP (NBAR United/NNP Nations/NNP))
  has/VBZ
  administered/VBN
  (NP (NBAR Kosovo/NNP))
  since/IN
  1999/CD
  ,/,
  since/IN
  (NP (NBAR NATO/NNP air/NN strikes/NNS))
  against/IN
  (NP (NBAR Yugoslavia/NNP))
  forced/VBD
  (NP (NBAR Yugoslav/NNP))
  and/CC
  (NP (NBAR Serbian/JJ security/NN forces/NNS))
  from/IN
  the/DT
  (NP (NBAR area/NN))
  ./.)

Named Entity using context free grammer after matching with different list of named entities..................

(S
  The/DT
  (org (NBAR United/NNP Nations/NNP))
  has/VBZ
  administered/VBN
  (geo-loc (NBAR Kosovo/NNP))
  since/IN
  1999/CD
  ,/,
  since/IN
  (org (NBAR NATO/NNP air/NN strikes/NNS))
  against/IN
  (geo-loc (NBAR Yugoslavia/NNP))
  forced/VBD
  (org (NBAR Yugoslav/NNP))
  and/CC
  (org (NBAR Serbian/JJ security/NN forces/NNS))
  from/IN
  the/DT
  (N