count_freqs

In [1]:
#! /usr/bin/python

import sys
from collections import defaultdict
import math

"""
Count n-gram frequencies in a data file and write counts to
stdout. 
"""

def simple_conll_corpus_iterator(corpus_file):
    """
    Get an iterator object over the corpus file. The elements of the
    iterator contain (word, ne_tag) tuples. Blank lines, indicating
    sentence boundaries return (None, None).
    """
    l = corpus_file.readline()
    while l:
        line = l.strip()
        if line: # Nonempty line
            # Extract information from line.
            # Each line has the format
            # word pos_tag phrase_tag ne_tag
            fields = line.split(" ")
            ne_tag = fields[-1]
            #phrase_tag = fields[-2] #Unused
            #pos_tag = fields[-3] #Unused
            word = " ".join(fields[:-1])
            yield word, ne_tag
        else: # Empty line
            yield (None, None)                        
        l = corpus_file.readline()

def sentence_iterator(corpus_iterator):
    """
    Return an iterator object that yields one sentence at a time.
    Sentences are represented as lists of (word, ne_tag) tuples.
    """
    current_sentence = [] #Buffer for the current sentence
    for l in corpus_iterator:        
            if l==(None, None):
                if current_sentence:  #Reached the end of a sentence
                    yield current_sentence
                    current_sentence = [] #Reset buffer
                else: # Got empty input stream
                    sys.stderr.write("WARNING: Got empty input file/stream.\n")
                    raise StopIteration
            else:
                current_sentence.append(l) #Add token to the buffer

    if current_sentence: # If the last line was blank, we're done
        yield current_sentence  #Otherwise when there is no more token
                                # in the stream return the last sentence.

def get_ngrams(sent_iterator, n):
    """
    Get a generator that returns n-grams over the entire corpus,
    respecting sentence boundaries and inserting boundary tokens.
    Sent_iterator is a generator object whose elements are lists
    of tokens.
    """
    for sent in sent_iterator:
         #Add boundary symbols to the sentence
         w_boundary = (n-1) * [(None, "*")]
         w_boundary.extend(sent)
         w_boundary.append((None, "STOP"))
         #Then extract n-grams
         ngrams = (tuple(w_boundary[i:i+n]) for i in range(len(w_boundary)-n+1))
         for n_gram in ngrams: #Return one n-gram at a time
            yield n_gram        


class Hmm(object):
    """
    Stores counts for n-grams and emissions. 
    """

    def __init__(self, n=3):
        assert n>=2, "Expecting n>=2."
        self.n = n
        self.emission_counts = defaultdict(int)
        self.ngram_counts = [defaultdict(int) for i in range(self.n)]
        self.all_states = set()

    def train(self, corpus_file):
        """
        Count n-gram frequencies and emission probabilities from a corpus file.
        """
        ngram_iterator = \
            get_ngrams(sentence_iterator(simple_conll_corpus_iterator(corpus_file)), self.n)

        for ngram in ngram_iterator:
            #Sanity check: n-gram we get from the corpus stream needs to have the right length
            assert len(ngram) == self.n, "ngram in stream is %i, expected %i" % (len(ngram, self.n))

            tagsonly = tuple([ne_tag for word, ne_tag in ngram]) #retrieve only the tags            
            for i in range(2, self.n+1): #Count NE-tag 2-grams..n-grams
                self.ngram_counts[i-1][tagsonly[-i:]] += 1
            
            if ngram[-1][0] is not None: # If this is not the last word in a sentence
                self.ngram_counts[0][tagsonly[-1:]] += 1 # count 1-gram
                self.emission_counts[ngram[-1]] += 1 # and emission frequencies

            # Need to count a single n-1-gram of sentence start symbols per sentence
            if ngram[-2][0] is None: # this is the first n-gram in a sentence
                self.ngram_counts[self.n - 2][tuple((self.n - 1) * ["*"])] += 1

    def write_counts(self, output, printngrams=[1,2,3]):
        """
        Writes counts to the output file object.
        Format:

        """
        # First write counts for emissions
        for word, ne_tag in self.emission_counts:            
            output.write("%i WORDTAG %s %s\n" % (self.emission_counts[(word, ne_tag)], ne_tag, word))


        # Then write counts for all ngrams
        for n in printngrams:            
            for ngram in self.ngram_counts[n-1]:
                ngramstr = " ".join(ngram)
                output.write("%i %i-GRAM %s\n" %(self.ngram_counts[n-1][ngram], n, ngramstr))

    def read_counts(self, corpusfile):

        self.n = 3
        self.emission_counts = defaultdict(int)
        self.ngram_counts = [defaultdict(int) for i in xrange(self.n)]
        self.all_states = set()

        for line in corpusfile:
            parts = line.strip().split(" ")
            count = float(parts[0])
            if parts[1] == "WORDTAG":
                ne_tag = parts[2]
                word = parts[3]
                self.emission_counts[(word, ne_tag)] = count
                self.all_states.add(ne_tag)
            elif parts[1].endswith("GRAM"):
                n = int(parts[1].replace("-GRAM",""))
                ngram = tuple(parts[2:])
                self.ngram_counts[n-1][ngram] = count
                

eval_gene_tagger

In [142]:
#! /usr/bin/python

import sys


"""
Evaluate gene tagger output by comparing it to a gold standard file.

Running the script on your tagger output like this

    python eval_gene_tagger.py gene_dev.key your_tagger_output.dat

will generate a table of results like this:

    Found 14071 GENES. Expected 5942 GENES; Correct: 3120.

		 precision 	recall 		F1-Score
    GENE:	 0.433367	0.231270	0.301593

Adopted from original named entity evaluation.

"""

def corpus_iterator(corpus_file, with_logprob = False):
    """
    Get an iterator object over the corpus file. The elements of the
    iterator contain (word, ne_tag) tuples. Blank lines, indicating
    sentence boundaries return (None, None).
    """
    l = corpus_file.readline()    
    tagfield = with_logprob and -2 or -1

    try:
        while l:
            line = l.strip()
            if line: # Nonempty line
                # Extract information from line.
                # Each line has the format
                # word ne_tag [log_prob]
                fields = line.split(" ")
                ne_tag = fields[tagfield]
                word = " ".join(fields[:tagfield])
                yield word, ne_tag
            else: # Empty line
                yield (None, None)
            l = corpus_file.readline()
    except IndexError:
        sys.stderr.write("Could not read line: \n")
        sys.stderr.write("\n%s" % line)
        if with_logprob:
            sys.stderr.write("Did you forget to output log probabilities in the prediction file?\n")
        sys.exit(1)


class NeTypeCounts(object):
    """
    Stores true/false positive/negative counts for each NE type.
    """

    def __init__(self):
        self.tp = 0
        self.fp = 0
        self.tn = 0
        self.fn = 0 

    def get_precision(self):
        return self.tp / float(self.tp + self.fp)

    def get_recall(self):
        return self.tp / float(self.tp + self.fn)

    def get_accuracy(self):
        return (self.tp + self.tn) / float(self.tp + self.tn + self.fp + self.fn)


class Evaluator(object):
    """
    Stores global true/false positive/negative counts. 
    """


    ne_classes = ["GENE"]

    def __init__(self):        
        self.tp = 0
        self.tn = 0
        self.fp = 0        
        self.fn = 0

        # Initialize an object that counts true/false positives/negatives
        # for each NE class
        self.class_counts = {}
        for c in self.ne_classes:
            self.class_counts[c] = NeTypeCounts()

    def compare(self, gold_standard, prediction):
        """
        Compare the prediction against a gold standard. Both objects must be
        generator or iterator objects that return a (word, ne_tag) tuple at a
        time.
        """

        # Define a couple of tags indicating the status of each stream
        curr_pred_type = None # prediction stream was previously in a named entity
        curr_pred_start = None # a new prediction starts at the current token
        curr_gs_type = None   # prediction stream was previously in a named entity
        curr_gs_start = None # a new prediction starts at the current token

        total = 0
        for gs_word, gs_tag in gold_standard: # Move through the gold standard stream
            #print(gs_word,gs_tag)
            pred_word, pred_tag = next(prediction) # Get the corresponding item from the prediction stream
            #print(pred_word,pred_tag)
            # Make sure words in both files match up
            if gs_word != pred_word:
                sys.stderr.write("Could not align gold standard and predictions in line %i.\n" % (total+1))
                sys.stderr.write("Gold standard: %s  Prediction file: %s\n" % (gs_word, pred_word))
                sys.exit(1)        

            # Split off the I and B tags
            gs_type = gs_tag==None and "O" or gs_tag.split("-")[-1]
            pred_type = pred_tag==None and "O" or pred_tag.split("-")[-1]                        

            # Check if a named entity ends here in either stream.
            # This is the case if we are currently in an entity and either
            #   - end of sentence
            #   - current word is marked O
            #   - new entity starts (B - or I with different NE type)
            pred_ends = curr_pred_type!=None and ((pred_tag==None or pred_tag[0] in "OB") or (curr_pred_type!=pred_type and pred_tag[0]=="I"))
            gs_ends = curr_gs_type!=None and ((gs_tag==None or gs_tag[0] in "OB") or (curr_gs_type!=gs_type and gs_tag[0]=="I"))
            

            # Check if a named entity starts here in either stream.
            # This is tha case if this is not the end of a sentence and
            #   - This is not the end of a sentence
            #   - New entity starts (B, I after O or at begining of sentence or
            #       I with different NE type) 
            if pred_word!=None:
                pred_start = (pred_tag!=None and pred_tag[0] == "B") or (curr_pred_type==None and pred_tag[0]=="I") or \
                    (curr_pred_type!=None and curr_pred_type!=pred_type and pred_tag.startswith("I"))
                gs_starts = (gs_tag!=None and gs_tag[0] == "B") or (curr_gs_type==None and gs_tag[0]=="I") or \
                    (curr_gs_type!=None and curr_gs_type!=gs_type and gs_tag.startswith("I"))
            else:
                pred_start = False
                gs_starts = False            

            #For debugging:
            #print pred_word, gs_tag, pred_tag, pred_ends, gs_ends, pred_start, gs_starts


            # Now try to match up named entities that end here

            if gs_ends and pred_ends: # GS and prediction contain a named entity that ends in the same place

                #If both named entities start at the same place and are of the same type
                if curr_gs_start == curr_pred_start and curr_gs_type == curr_pred_type:
                    # Count true positives
                    self.tp += 1
                    self.class_counts[curr_pred_type].tp += 1
                else: #span matches, but label doesn't match: count both a true positive and a false negative
                    self.fp += 1
                    self.fn += 1
                    self.class_counts[curr_pred_type].fp += 1
                    self.class_counts[curr_gs_type].fn += 1
            elif gs_ends: #Didn't find the named entity in the gold standard, count false negative
                self.fn += 1
                self.class_counts[curr_gs_type].fn += 1
            elif pred_ends: #Named entity in the prediction doesn't match one int he gold_standard, count false positive
                self.fp += 1
                self.class_counts[curr_pred_type].fp += 1
            elif curr_pred_type==None and curr_pred_type==None: #matching O tag or end of sentence, count true negative
                self.tn += 1
                for c in self.ne_classes:
                    self.class_counts[c].tn += 1

            # Remember that we are no longer in a named entity
            if gs_ends:
                curr_gs_type = None
            if pred_ends:
                curr_pred_type = None

            # If a named entity starts here, remember it's type and this position
            if gs_starts:
                curr_gs_start = total
                curr_gs_type = gs_type
            if pred_start:
                curr_pred_start = total
                curr_pred_type = pred_type
            total += 1

    def print_scores(self):
        """
        Output a table with accuracy, precision, recall and F1 score. 
        """

        print ("Found %i GENEs. Expected %i GENEs; Correct: %i.\n" % (self.tp + self.fp, self.tp + self.fn, self.tp))


        if self.tp + self.tn + self.fp + self.fn == 0: # There was nothing to do.
            acc = 1
        else:
            acc = (self.tp + self.tn) / float(self.tp + self.tn + self.fp + self.fn)

        if self.tp+self.fp == 0:   # Prediction didn't annotate any NEs
            prec = 1
            
        else:
            prec = self.tp / float(self.tp + self.fp)
            

        if self.tp+self.fn == 0: # Prediction marked everything as a NE of the wrong type.
            rec = 1
        else:
            rec = self.tp / float(self.tp + self.fn)

        print ("\t precision \trecall \t\tF1-Score")
        fscore = (2*prec*rec)/(prec+rec)
        #print "Total:\t %f\t%f\t%f" % (prec, rec, fscore)
        for c in self.ne_classes:
            c_tp = self.class_counts[c].tp
            c_tn = self.class_counts[c].tn
            c_fp = self.class_counts[c].fp
            c_fn = self.class_counts[c].fn
            #print c
            #print c_tp
            #print c_tn
            #print c_fp
            #print c_fn
            if (c_tp + c_tn + c_fp + c_fn) == 0:                
                c_acc = 1
            else:
                c_acc = (c_tp + c_tn) / float(c_tp + c_tn + c_fp + c_fn)
            
            if (c_tp + c_fn) == 0:
                sys.stderr.write("Warning: no instances for entity type %s in gold standard.\n" % c)
                c_rec = 1
            else:
                c_rec = c_tp / float(c_tp + c_fn)
            if (c_tp + c_fp) == 0:
                sys.stderr.write("Warning: prediction file does not contain any instances of entity type %s.\n" % c)
                c_prec =1
            else:
                c_prec = c_tp / float(c_tp + c_fp)

            if c_prec + c_rec == 0:
                fscore = 0
            else:    
                fscore = (2*c_prec * c_rec)/(c_prec + c_rec)
            print ("%s:\t %f\t%f\t%f" % (c, c_prec, c_rec, fscore))


Part1: baseline

In [107]:
def emission(x,y):
    xx = x
    if (x in rarelist) or (x not in uni):
    #if (x in rarelist4) or (x not in uni):
    #if (x in rarelist3) or (x not in uni):
    #if (x in rarelist2) or (x not in uni):
        xx = '_RARE_'
    if (y,xx) in counttag:
        return (1.0*counttag[(y,xx)])/(1.0*unigram[y])
    else:
        return 0

    
def emissionum(x,y):
    if (y,x) in counttag:
        return (1.0*counttag[(y,x)])/(1.0*unigram[y])
    elif any(str.isdigit(c) for c in x):
        return (1.0*counttag[(y,'_NUM_')])/(1.0*unigram[y])
    else:
        return 0
    
def emissionprefix(x,y):
    if (y,x) in counttag:
        return (1.0*counttag[(y,x)])/(1.0*unigram[y])
    else:
        for a in preaf:
            if x[:len(a)] == a:
                if (y,a) in counttag:
                    return (1.0*counttag[(y,a)])/(1.0*unigram[y])
        return 0

def emissioncombined(x,y):
    tmp = x
    for a in preaf:
            if x[:len(a)] == a:
                x=a
                break
    if tmp == x:
        if any(str.isdigit(c) for c in x):
            x = '_NUM_'
    if (y,x) in counttag:
        return (1.0*counttag[(y,x)])/(1.0*unigram[y])
    else:
        return 0

def predbase(w):
    eO = emission(w,'O')
    eI = emission(w,'I-GENE')
    if( eO > eI):
        return 'O'
    else:
        return 'I-GENE'
        
def prednum(w):
    eO = emissionum(w,'O')
    eI = emissionum(w,'I-GENE')
    if( eO > eI):
        return 'O'
    else:
        return 'I-GENE'

def predprefix(w):
    eO = emissionprefix(w,'O')
    eI = emissionprefix(w,'I-GENE')
    if( eO > eI):
        return 'O'
    else:
        return 'I-GENE'
    
def predcombined(w):
    eO = emissioncombined(w,'O')
    eI = emissioncombined(w,'I-GENE')
    if( eO > eI):
        return 'O'
    else:
        return 'I-GENE'

In [6]:
inputf = open("gene.train","r")
uni = dict()
for l in inputf:
    l = l.split()
    if(len(l) != 0):
        if (l[0] in uni) == True:
            uni[l[0]] += 1
        else:
            uni[l[0]] = 1
rarelist = []
rarelist4 = []
rarelist3 = []
rarelist2 = []
for k in uni:
    if uni[k] < 5:
        rarelist.append(k)
    if uni[k] < 4:
        rarelist4.append(k)
    if uni[k] < 3:
        rarelist3.append(k)
    if uni[k] < 2:
        rarelist2.append(k)
print(len(uni))
print(len(rarelist),len(rarelist4),len(rarelist3),len(rarelist2))

31328
25074 23830 21756 17197


In [10]:
rareout = open("generare.train","w")
rareout4 = open("generare4.train","w")
rareout3 = open("generare3.train","w")
rareout2 = open("generare2.train","w")
inputf = open("gene.train","r")
for l in inputf:
    ls = l.split()
    if(len(ls) != 0):
        if (ls[0] in rarelist):
            rareout.write('_RARE_'+' '+ls[1]+'\n')
        else:
            rareout.write(ls[0]+' '+ls[1]+'\n')
        if (ls[0] in rarelist4):
            rareout4.write('_RARE_'+' '+ls[1]+'\n')
        else:
            rareout4.write(ls[0]+' '+ls[1]+'\n')
        if (ls[0] in rarelist3):
            rareout3.write('_RARE_'+' '+ls[1]+'\n')
        else:
            rareout3.write(ls[0]+' '+ls[1]+'\n')
        if (ls[0] in rarelist2):
            rareout2.write('_RARE_'+' '+ls[1]+'\n')
        else:
            rareout2.write(ls[0]+' '+ls[1]+'\n')
        
    else:
        rareout.write(l)
        rareout4.write(l)
        rareout3.write(l)
        rareout2.write(l)
rareout.close()
rareout4.close()
rareout3.close()
rareout2.close()

In [89]:
rareout = open("genenum.train","w")
inputf = open("gene.train","r")
for l in inputf:
    ls = l.split()
    if(len(ls) != 0):
        if any(str.isdigit(c) for c in ls[0]):
            ls[0] = '_NUM_'
        rareout.write(ls[0]+' '+ls[1]+'\n')
    else:
        rareout.write(l)
rareout.close()

In [90]:
rareout = open("geneprefix.train","w")
inputf = open("gene.train","r")
for l in inputf:
    ls = l.split()
    if(len(ls) != 0):
        for a in preaf:
            if ls[0][:len(a)] == a:
                ls[0] = a
        rareout.write(ls[0]+' '+ls[1]+'\n')
    else:
        rareout.write(l)
rareout.close()

In [104]:
rareout = open("genecombined.train","w")
inputf = open("gene.train","r")
for l in inputf:
    ls = l.split()
    if(len(ls) != 0):
        tmp = ls[0]
        for a in preaf:
            if ls[0][:len(a)] == a:
                ls[0] = a
        if tmp == ls[0]:
            if any(str.isdigit(c) for c in ls[0]):
                ls[0] = '_NUM_'
        rareout.write(ls[0]+' '+ls[1]+'\n')
    else:
        rareout.write(l)
rareout.close()

In [105]:
# Initialize a trigram counter without count<5
inputf = open("gene.train","r")
counter = Hmm(3)
# Collect counts
counter.train(inputf)
# Write the counts
outputf = open("gene.counts","w") 
counter.write_counts(outputf)
outputf.close()

In [11]:
# Initialize a trigram counter with count<5
inputf = open("generare.train","r")
inputf4 = open("generare4.train","r")
inputf3 = open("generare3.train","r")
inputf2 = open("generare2.train","r")
counter = Hmm(3)
counter.train(inputf)
outputf = open("generare.counts","w") 
counter.write_counts(outputf)
outputf.close()
counter4 = Hmm(3)
counter4.train(inputf4)
outputf4 = open("generare4.counts","w") 
counter4.write_counts(outputf4)
outputf4.close()
counter3 = Hmm(3)
counter3.train(inputf3)
outputf3 = open("generare3.counts","w") 
counter3.write_counts(outputf3)
outputf3.close()
counter2 = Hmm(3)
counter2.train(inputf2)
outputf2 = open("generare2.counts","w") 
counter2.write_counts(outputf2)
outputf2.close()

In [91]:
# Initialize a trigram counter with count<5
inputf = open("genenum.train","r")
counter = Hmm(3)
# Collect counts
counter.train(inputf)
# Write the counts
outputf = open("genenum.counts","w") 
counter.write_counts(outputf)
outputf.close()

In [92]:
# Initialize a trigram counter with count<5
inputf = open("geneprefix.train","r")
counter = Hmm(3)
# Collect counts
counter.train(inputf)
# Write the counts
outputf = open("geneprefix.counts","w") 
counter.write_counts(outputf)
outputf.close()

In [105]:
# Initialize a trigram counter with count<5
inputf = open("genecombined.train","r")
counter = Hmm(3)
# Collect counts
counter.train(inputf)
# Write the counts
outputf = open("genecombined.counts","w") 
counter.write_counts(outputf)
outputf.close()

In [207]:
#co = open("geneprefix.counts", "r")
#co = open("genenum.counts", "r")
co = open("generare.counts", "r")
#co = open("generare4.counts", "r")
#co = open("generare3.counts", "r")
#co = open("generare2.counts", "r")
#co = open("gene.counts","r")
#co = open("genecombined.counts",'r')

counttag = dict()
unigram = dict()
bigram = dict()
trigram = dict()
for l in co:
    l = l.split()
    if(l[1] == 'WORDTAG'):
        counttag[(l[2],l[3])] = int(l[0])
    elif(len(l) == 3):
        unigram[l[2]] = int(l[0])
    elif(len(l) == 4):
        bigram[(l[2],l[3])] = int(l[0])
    elif(len(l) == 5):
        trigram[(l[2],l[3],l[4])] = int(l[0])

In [85]:
dev = open("gene.dev", "r")
devout = open("gene.dev.p1.out",'w')
#devout = open("gene.dev.p6.out",'w')
#devout = open("gene.dev.p7.out",'w')
#devout = open("gene.dev.p8.out",'w')

In [86]:
for l in dev:
    ls = l.split()
    if (len(ls) != 0):
        pre = predbase(ls[0])
        devout.write(ls[0]+' '+pre+'\n')
    else:
        devout.write(l)
devout.close()

Results for baseline mataches the result in writeup

In [87]:
gs_iterator = corpus_iterator(open("gene.key"))
pred_iterator = corpus_iterator(open("gene.dev.p1.out"), with_logprob = False)
#pred_iterator = corpus_iterator(open("gene.dev.p6.out"), with_logprob = False)
#pred_iterator = corpus_iterator(open("gene.dev.p7.out"), with_logprob = False)
#pred_iterator = corpus_iterator(open("gene.dev.p8.out"), with_logprob = False)
evaluator = Evaluator()
evaluator.compare(gs_iterator, pred_iterator)
evaluator.print_scores()

Found 2669 GENEs. Expected 642 GENEs; Correct: 424.

	 precision 	recall 		F1-Score
GENE:	 0.158861	0.660436	0.256116


In [88]:
train = open("gene.train", "r")
devout = open("gene.train.p1.out",'w')
for l in train:
    ls = l.split()
    if (len(ls) != 0):
        pre = predbase(ls[0])
        devout.write(ls[0]+' '+pre+'\n')
    else:
        devout.write(l)
devout.close()
gs_iterator = corpus_iterator(open("gene.train"))
pred_iterator = corpus_iterator(open("gene.train.p1.out"), with_logprob = False)
evaluator = Evaluator()
evaluator.compare(gs_iterator, pred_iterator)
evaluator.print_scores()

Found 67658 GENEs. Expected 16637 GENEs; Correct: 11669.

	 precision 	recall 		F1-Score
GENE:	 0.172470	0.701388	0.276861


Try to observe indicative feature

In [165]:
import operator
sortedcounttag = sorted(counttag.items(), key=operator.itemgetter(0))

In [166]:
for k,v in sortedcounttag:
    if(k[0] == 'I-GENE') :
        print(k[1] + ', '+str(v))

", 7
#, 1
', 39
'), 1
',, 1
'-, 17
'-', 1
(, 247
(+), 4
(+)-, 4
(-, 2
(-), 1
(-/-), 2
), 214
)(, 6
),, 1
)-, 25
)-,, 1
)--, 1
)/, 4
*, 4
+, 42
+)-, 3
++-, 1
+,, 2
+-, 1
+/, 4
,, 79
-, 4395
-(, 3
-), 1
-,, 4
--, 3
----, 2
-->, 2
-/-, 1
-[, 2
., 165
.", 2
/, 227
/*, 1
/+, 2
0, 4
000, 2
0201, 1
08L, 2
1, 879
10, 12
100, 2
100beta, 1
101, 1
105, 3
109, 1
10q24, 1
11, 15
110, 1
113, 2
114, 1
115, 2
11B, 2
12, 12
121, 1
124, 1
125, 3
128, 1
12K, 1
12Q229L, 1
12S, 4
13, 2
130, 2
131, 1
133, 1
13Q226L, 1
13S, 1
14, 10
140, 1
143, 1
145, 1
14DM, 2
15, 3
150, 1
156, 1
16, 10
160, 1
16K, 1
16S, 2
17, 4
170, 1
17S, 1
18, 5
184, 1
185, 1
186, 1
18K, 2
18S, 4
19, 6
192, 1
19K, 2
19S, 1
1A, 3
1A1, 1
1B, 1
1C, 2
1D, 2
1H, 1
1N, 1
1R, 2
1Ra, 1
1a, 10
1alpha, 7
1b, 6
1beta, 11
1gamma, 2
1long, 1
1p, 1
2, 373
20, 6
200, 1
201, 1
2022, 1
20S, 7
2109, 1
212, 1
22, 1
23, 2
233, 1
235, 1
2368, 1
23S, 3
24, 3
240F, 1
248trp, 1
24K, 1
24p3, 1
25, 4
250, 3
259, 1
26, 1
26S, 3
27, 4
279, 1
27S, 1
28, 2
28S, 5
2A

Egfr, 1
Egr, 4
Egr1, 4
Elav, 2
Elavl1, 2
Elk, 9
Elk1, 2
Elongation, 1
EmBP, 1
Emericella, 1
EnA, 1
Endo16, 1
Endonuclease, 1
Endorphins, 2
Engrailed, 2
Engrailed2, 1
Enh4, 1
Enhancer, 1
Ent, 1
Env, 7
Enzyme, 1
Ep, 1
Epb7, 1
Epc1, 1
Eph, 1
EphA8, 1
EphB1, 2
EpiQ, 3
Epidermal, 1
Epo, 10
Epo1, 1
EpoR, 1
Eps8, 1
Epstein, 8
Erb, 1
ErbA, 1
ErbB, 12
ErbB1, 1
ErbB2, 4
ErbB3, 2
ErbB4, 3
ErbBs, 1
Erg24p, 1
Erk, 4
Erk2, 2
Erwinia, 2
Erythrocyte, 1
EsaR, 1
Escherichia, 30
Esigma54, 2
Estrogen, 1
Ets, 30
Ets1, 1
Ets2, 3
Euglena, 1
Eukaryotic, 2
Expansins, 1
Extended, 1
Extracellular, 1
F, 20
F0, 1
F1, 4
F10, 1
F11, 1
F1F0, 1
F2771, 1
F3R, 1
F42A, 1
F6, 1
FAB, 1
FABP, 3
FAD7, 1
FAK, 18
FANCC, 1
FAP, 1
FAS, 3
FAS1, 1
FAX, 2
FBN1, 1
FBNYV, 1
FBP, 1
FBP1, 1
FBR, 2
FC3, 1
FCR1, 1
FDP, 2
FEF, 1
FG, 3
FGD1, 1
FGF, 13
FGF1, 1
FGF2, 1
FGF3, 1
FGF3s, 1
FGF4, 1
FGF8a, 1
FGF8b, 1
FGF8e, 1
FGFR, 8
FGFR1, 2
FGFR3, 3
FGFRs, 1
FHR2, 1
FIII, 1
FIRE1, 1
FKBP, 3
FKBP12, 1
FKHR, 4
FL, 3
FLAG, 1
FLAME, 1
FLAP, 1
FLI, 6

LANA2, 1
LAP, 3
LAR, 2
LAT, 3
LAZ, 1
LAZ3, 2
LB1, 1
LBDs, 1
LBP, 1
LCAT, 2
LCDV, 1
LCR, 3
LD78, 3
LDC, 1
LDH, 4
LDL, 15
LDLR, 1
LDLr, 1
LE6, 2
LEF, 2
LEF1, 1
LEM, 1
LEU, 1
LEU2, 5
LF, 2
LFA, 3
LG, 1
LGR, 2
LH, 14
LHA2, 1
LHA4, 1
LHR, 6
LHRH, 3
LHbeta, 2
LIF, 1
LIFR, 2
LILRE, 1
LIM, 5
LIM2, 1
LIMK, 2
LIMK1, 1
LIMK2, 1
LIOR, 1
LIP, 1
LIYV, 1
LL2, 2
LMP, 5
LMP1, 6
LMP2, 1
LMP2A, 3
LMP2B, 1
LMP7, 1
LMW, 1
LNA, 1
LNX, 2
LP, 1
LPA, 1
LPAM, 1
LPD1, 1
LPH, 1
LPL, 4
LPS, 2
LR, 1
LRH, 1
LRP, 2
LRR, 3
LRU1, 1
LSU, 1
LT, 3
LTBP, 4
LTC, 1
LTG19, 1
LTG9, 1
LTR, 25
LTTRs, 1
LU, 2
LUC, 2
LUV1, 1
LYS, 1
LYS2, 1
LZ, 2
La, 3
Lac, 1
LacZ, 2
Lace, 1
LamB, 1
Lap18, 1
Large, 1
Latexin, 1
Lb, 1
Lck, 7
Ld, 1
Ldb1, 1
LeMT, 2
Lef, 1
Leishmania, 1
Lens, 1
Lepore, 1
Leu, 2
Leukemia, 1
LexA, 7
Lg, 1
Lhcb1, 1
Lian, 2
Light, 1
Limulus, 1
Lipoprotein, 1
Lmo2, 2
Lnk, 1
Lp, 2
LpS1, 2
Lrp, 3
Lsp, 1
Ltk, 1
Ltp4, 2
Lu, 2
Luc, 3
Luman, 1
Luteinizing, 1
LvUSF1, 1
LvUSF2, 1
Ly, 3
LyF, 1
Lyn, 6
Lyp, 1
Lyp2, 1
Lys, 1
LysR, 4
Ly

Pokeweed, 2
Pol, 14
Pol32Delta, 1
Pole1, 1
Polycomb, 1
Polyhomeotic, 1
Pop1, 1
Pop2, 1
Posterior, 1
PpG7, 1
Pph21p, 1
PrB, 3
PrKX, 1
Pra, 5
Pre, 1
Preimmune, 1
Prep, 1
PrfA, 4
PrgX, 1
Pri2, 1
Pro, 3
ProW, 1
Products, 1
Prolactin, 1
Prostate, 2
Protamine, 1
Protein, 10
Proteus, 1
Prp16, 1
Prp3p, 2
Prp42p, 1
Prp44p, 2
Prp4p, 1
PrpB, 1
PrpC, 1
PrpD, 1
PrpE, 1
Prx, 1
Psc2, 1
Pseudomonas, 4
Psi, 1
Pt, 1
Ptc, 1
PtdIns, 3
Pto, 3
Ptx1, 1
Punta, 1
Pur, 3
Purified, 2
PvirE, 1
Px, 3
PyRo1, 1
Pyk2, 2
Pyp3, 1
PyrR, 2
Q, 2
Q18, 1
Q209L, 2
QBP, 2
QDE, 1
QUTA, 1
Qa, 1
R, 42
R1, 4
R122A, 1
R2, 4
R206S, 1
R4A, 1
R7G, 1
RA, 1
RAB, 1
RACK1, 1
RAD, 1
RAD16, 1
RAD17, 1
RAD18, 1
RAD23, 1
RAD24, 1
RAD30, 1
RAD30B, 1
RAD5, 1
RAD52, 1
RAD54, 1
RAD6, 1
RAD7, 1
RAF1, 1
RAFTK, 2
RAG, 2
RAG1, 1
RAM23, 1
RAMLG, 1
RAP, 2
RAP1, 4
RAP30, 1
RAP74, 3
RAR, 17
RARE, 1
RAREoct, 1
RARalpha, 1
RARs, 1
RAS, 4
RAS2val19, 1
RAT3, 1
RB, 8
RB2, 1
RB6, 1
RBCC, 2
RBE, 1
RBP, 3
RBP1, 1
RBP56, 1
RBPJk, 1
RC3, 3
RCA, 1
RCC1, 4
RCC1p, 1

Thiobacillus, 1
Thioredoxin, 1
ThlA, 1
ThlB, 1
Thr, 2
ThrRS, 1
Thromboplastin, 1
Thyreoliberin, 1
Thyrotropin, 2
Ti, 1
Tiam1, 1
Tie2, 1
Tih1, 3
Tin, 1
Tinman, 1
Tip60, 1
Tissue, 1
Tk1, 1
Tn1, 2
Tn3411, 1
Tn3HoHo1, 1
Tn4451, 1
Tn5, 1
Tn5422, 1
Tn5mob, 1
Tn903, 1
TnC, 3
TnI, 6
TnIcardiac, 3
TnT, 2
TnrA, 1
Tob, 1
Tob2, 1
Topoisomerase, 1
Toro, 1
ToxR, 2
ToxT, 1
Toxocara, 1
Tpl, 1
Tpo, 1
Tpr, 1
TraD, 1
TraI, 2
TraM, 2
TraR, 4
Transforming, 3
Trdpm1, 1
Treponema, 1
Trident, 1
Tristetraprolin, 1
Trithorax, 1
Trk, 2
Trp53, 2
Tru9I, 1
Truncated, 1
Trx, 1
TrxR2, 1
Trycophytin, 1
Tssc3, 1
Ttk, 1
Tto1, 1
TtrA, 1
TtrB, 1
TtrC, 1
Tub4p, 1
Tuberculin, 1
Tumor, 5
Tumour, 1
Tup1, 1
Ty, 1
Ty1, 5
Ty5, 2
TyA, 3
Tyk, 1
Tyk2, 1
Type, 4
Tyr, 3
Tyr0, 1
TyrR, 1
TyrRS, 1
TyrTS, 3
Tyrosine, 1
U, 2
U1, 19
U11, 1
U1102, 1
U12, 1
U14, 6
U16, 2
U17, 1
U1A, 2
U2, 19
U21, 1
U24, 1
U266, 1
U2AF, 1
U2AF65, 1
U3, 3
U4, 9
U5, 8
U57, 1
U6, 17
U7, 1
U73, 1
UAC, 1
UAS2, 1
UBC, 1
UBC2, 1
UBC9, 1
UBI4, 1
UBP41, 1
UBP43, 1
UBS

controlled, 1
converting, 9
cop, 1
copia, 1
copper, 2
core, 21
corepressor, 1
cornified, 1
corpuscular, 1
cortactin, 1
corticosteroid, 1
corticotrophin, 1
corticotropin, 4
costimulatory, 1
cot, 1
cotE, 1
cotJ, 1
cotS, 2
counterpart, 1
coupled, 11
couples, 1
cox, 1
cox1, 2
cox3, 3
coxI, 2
cpc, 3
cph, 2
cpm7, 1
cpxA, 1
crassa, 6
crb2, 1
creatine, 14
creatinine, 1
crk, 2
crl, 1
crm1, 1
crosslinking, 1
crp, 1
crsA, 1
crsA1, 1
crsA4, 1
crsA47, 1
cryoglobulin, 1
cryptococcal, 1
crystal, 1
crystallin, 1
csbA, 1
csk, 1
csrA, 1
csrR, 1
csrS, 1
cstA, 1
ctf, 1
cucumisin, 1
cullin, 1
curli, 1
cut, 5
cut5, 1
cuticle, 1
cvaA, 2
cvi, 1
cwg2, 1
cwg2p, 1
cyanmethemoglobin, 1
cyanobacterial, 2
cyc, 1
cyc07, 1
cycH, 2
cycMs4, 1
cyclase, 10
cycle, 3
cyclic, 8
cyclin, 78
cyclins, 4
cyclo, 4
cycloheximide, 1
cyclooxygenase, 3
cyclophilin, 2
cyclops, 1
cyclosome, 1
cysB, 1
cystatin, 1
cysteine, 8
cystic, 2
cystyl, 1
cyt, 4
cytR, 1
cytadherence, 1
cytochrome, 24
cytochromes, 2
cytokeratin, 2
cytokine, 2
cytok

latent, 7
laying, 1
lck, 1
leader, 1
lecithin, 1
lectin, 3
lectins, 1
lef2, 1
left, 1
lemdr1, 1
length, 19
lentil, 1
leprae, 1
leptin, 7
leu1, 1
leuA, 2
leuB, 1
leucine, 11
leucoagglutinin, 1
leukemia, 6
leukocyte, 2
leukotrienase, 1
leukotriene, 1
levodopa, 2
lexA, 1
ligand, 4
ligase, 7
ligases, 1
light, 14
like, 88
lin, 1
linear, 1
link, 2
linked, 4
linker, 2
linkers, 1
lipA, 1
lipase, 16
lipid, 1
lipoamide, 1
lipocalin, 1
lipolytica, 1
lipoprotein, 12
lipoproteins, 2
lipoxygenase, 1
liver, 10
localization, 2
located, 2
loci, 13
locus, 49
long, 10
loop, 9
loss, 1
low, 6
lrn, 1
lts, 1
luc, 1
luc3, 1
luciferase, 28
lucigenin, 1
lumican, 1
lung, 1
luteal, 1
luteinizing, 4
luteus, 1
lyase, 9
lyases, 1
lymphocyte, 7
lymphoid, 3
lymphotactin, 1
lymphotropic, 1
lyn, 2
lysine, 1
lysosomal, 5
lysozyme, 8
lysyl, 2
m, 7
m1R, 1
m6F6, 1
m8, 1
m9, 1
mAb, 7
mAb1C2, 1
mAhr, 1
mBMAL1, 2
mCD22, 2
mDAP, 2
mDia2, 1
mE6, 1
mEmBP, 1
mHIF, 1
mHuA, 3
mIRS3, 2
mIg, 1
mIgG, 1
mIgY, 2
mLAT, 1
mLL2, 1
mMIWC, 1


qTBP42, 1
quinone, 3
qutA, 1
r, 3
rAC, 1
rAMPK, 1
rCCK, 1
rDNA, 1
rE12, 1
rENK, 1
rGH, 3
rGST, 1
rHSA, 1
rHb1, 1
rHu, 2
rIFN, 2
rIL, 3
rLHR, 4
rMTM, 1
rNFIL, 1
rOC, 1
rRNA, 14
rRNAs, 1
rabbit, 4
rac1, 2
rad1, 2
rad16, 1
rad18, 1
rad52, 1
rad54, 1
rad6, 2
rad9, 1
raf, 5
rainbow, 1
ram, 1
rap1, 1
rap1GAPII, 1
rap1s, 1
rap1t, 2
ras, 35
ras4BVal, 1
rasVal12, 1
rat, 66
rbSec1, 1
rbcL, 2
rbcS, 3
rbp1, 1
rbsA, 1
rbsC, 1
rbsD, 1
rci, 1
rd22, 1
rd22BP1, 1
reactive, 5
reading, 5
rearranged, 4
rearrangements, 1
recJ, 1
recN, 1
receptor, 280
receptors, 72
recognition, 12
recognized, 1
recombinant, 35
recombinants, 1
recombinase, 1
recombination, 3
recruiting, 2
red, 2
redox, 1
reduced, 3
reducing, 1
reductase, 24
reductases, 2
ref, 1
reg1, 1
reg2, 1
region, 65
regions, 10
regulated, 11
regulating, 1
regulation, 1
regulator, 8
regulators, 2
regulatory, 37
regulon, 3
rehCG, 1
rel, 7
relA, 1
related, 52
relaxase, 1
relaxin, 1
releasing, 10
renal, 2
renin, 19
rep, 2
rep3, 1
repA, 4
repY, 1
repZ, 1
rep

Try to grouping words in having numbers in it or not

In [170]:
genum = 0
gew = 0
Onum = 0
Oew = 0
for t,w in counttag:
    if any(str.isdigit(c) for c in w) == True:
        #print(w)
        if t == 'O':
            Onum += 1
        else:
            genum +=1
    else:
        if t == 'O':
            Oew +=1 
        else:
            gew +=1 

print(Onum, genum, Oew, gew)
print(Onum/(Onum+Oew),genum/(genum+gew))

2413 3023 22682 5388
0.09615461247260411 0.3594102960408988


In [96]:
dev = open("gene.dev", "r")
devout = open("gene.dev.p2.out",'w')
for l in dev:
    ls = l.split()
    if (len(ls) != 0):
        pre = prednum(ls[0])
        devout.write(ls[0]+' '+pre+'\n')
    else:
        devout.write(l)
devout.close()
gs_iterator = corpus_iterator(open("gene.key"))
pred_iterator = corpus_iterator(open("gene.dev.p2.out"), with_logprob = False)
evaluator = Evaluator()
evaluator.compare(gs_iterator, pred_iterator)
evaluator.print_scores()

Found 2454 GENEs. Expected 642 GENEs; Correct: 426.

	 precision 	recall 		F1-Score
GENE:	 0.173594	0.663551	0.275194


In [97]:
train = open("gene.train", "r")
devout = open("gene.train.p2.out",'w')
for l in train:
    ls = l.split()
    if (len(ls) != 0):
        pre = prednum(ls[0])
        devout.write(ls[0]+' '+pre+'\n')
    else:
        devout.write(l)
devout.close()
gs_iterator = corpus_iterator(open("gene.train"))
pred_iterator = corpus_iterator(open("gene.train.p2.out"), with_logprob = False)
evaluator = Evaluator()
evaluator.compare(gs_iterator, pred_iterator)
evaluator.print_scores()

Found 56184 GENEs. Expected 16637 GENEs; Correct: 12184.

	 precision 	recall 		F1-Score
GENE:	 0.216859	0.732344	0.334629


Get informative prefix

In [39]:
#affix = ['syl','transferase','pheno','fer','kary','hapl','hetero','homo','zyg','mut','poly','gam','gene','nuc','photo','sis']
prefix = ['AT','Glu','II','Lys','NI','PA','PK','PT','Prp','RA','TF','acetyl','activ','adreno',
         'amin','anti','beta','calc','calp','carboxy','chim','chloro','comple','delta','dna','ferr','fibril','fibro',
         'galactos','germ','gluco','glutam','glyco','hetero','homo','hydrox','immuno','initiat','inter','lact','leuk',
         'lipo','lympho','metallop','methyl','micro','mini','mitochondria','mono','multi','neuro','nucleo',
         'oncoge','phospha','phospho','polyp','proteas','protein','recombin','regulat','retino','ribonucle','ribosyl',
          'somato','stimulat','synthe','thrombo','transcri','transfer','transduc',]
preaf = []
for a in prefix:
    geup = 0
    gew = 0
    Oup = 0
    Oew = 0
    for t,w in counttag:
        if w[:len(a)] == a:
            if t == 'O':
                Oup += 1
            else:
                geup +=1
        else:
            if t == 'O':
                Oew +=1 
            else:
                gew +=1 
    if geup != 0:
        if (Oup/geup <= 1):
            preaf.append(a)
print(preaf)

['AT', 'Lys', 'PA', 'PK', 'PT', 'Prp', 'RA', 'TF', 'acetyl', 'adreno', 'beta', 'calp', 'carboxy', 'delta', 'dna', 'ferr', 'galactos', 'germ', 'glutam', 'metallop', 'mitochondria', 'ribonucle', 'ribosyl', 'transcri']


In [171]:
len(preaf)

24

In [175]:
dev = open("gene.dev", "r")
devout = open("gene.dev.p1.out",'w')
for l in dev:
    ls = l.split()
    if (len(ls) != 0):
        pre = predprefix(ls[0])
        devout.write(ls[0]+' '+pre+'\n')
    else:
        devout.write(l)
devout.close()
gs_iterator = corpus_iterator(open("gene.key"))
pred_iterator = corpus_iterator(open("gene.dev.p1.out"), with_logprob = False)
evaluator = Evaluator()
evaluator.compare(gs_iterator, pred_iterator)
evaluator.print_scores()

Found 2247 GENEs. Expected 642 GENEs; Correct: 423.

	 precision 	recall 		F1-Score
GENE:	 0.188251	0.658879	0.292835


In [174]:
train = open("gene.train", "r")
devout = open("gene.train.p1.out",'w')
for l in train:
    ls = l.split()
    if (len(ls) != 0):
        pre = predprefix(ls[0])
        devout.write(ls[0]+' '+pre+'\n')
    else:
        devout.write(l)
devout.close()
gs_iterator = corpus_iterator(open("gene.train"))
pred_iterator = corpus_iterator(open("gene.train.p1.out"), with_logprob = False)
evaluator = Evaluator()
evaluator.compare(gs_iterator, pred_iterator)
evaluator.print_scores()

Found 49043 GENEs. Expected 16637 GENEs; Correct: 12053.

	 precision 	recall 		F1-Score
GENE:	 0.245764	0.724470	0.367022


In [173]:
dev = open("gene.test", "r")
devout = open("gene.test.p1.out",'w')
for l in dev:
    ls = l.split()
    if (len(ls) != 0):
        pre = predprefix(ls[0])
        devout.write(ls[0]+' '+pre+'\n')
    else:
        devout.write(l)
devout.close()

In [108]:
dev = open("gene.dev", "r")
devout = open("gene.dev.p5.out",'w')
for l in dev:
    ls = l.split()
    if (len(ls) != 0):
        pre = predcombined(ls[0])
        devout.write(ls[0]+' '+pre+'\n')
    else:
        devout.write(l)
devout.close()
gs_iterator = corpus_iterator(open("gene.key"))
pred_iterator = corpus_iterator(open("gene.dev.p5.out"), with_logprob = False)
evaluator = Evaluator()
evaluator.compare(gs_iterator, pred_iterator)
evaluator.print_scores()

Found 2465 GENEs. Expected 642 GENEs; Correct: 425.

	 precision 	recall 		F1-Score
GENE:	 0.172414	0.661994	0.273576


In [109]:
train = open("gene.train", "r")
devout = open("gene.train.p4.out",'w')
for l in train:
    ls = l.split()
    if (len(ls) != 0):
        pre = predcombined(ls[0])
        devout.write(ls[0]+' '+pre+'\n')
    else:
        devout.write(l)
devout.close()
gs_iterator = corpus_iterator(open("gene.train"))
pred_iterator = corpus_iterator(open("gene.train.p4.out"), with_logprob = False)
evaluator = Evaluator()
evaluator.compare(gs_iterator, pred_iterator)
evaluator.print_scores()

Found 56494 GENEs. Expected 16637 GENEs; Correct: 12175.

	 precision 	recall 		F1-Score
GENE:	 0.215510	0.731803	0.332964


PART 2 : Trigram HMM

In [46]:
def prob(yi2,yi1,yi):    
    return (trigram[(yi2,yi1,yi)]*1.0)/(bigram[(yi1,yi)]*1.0)

In [202]:
datadev = open("gene.dev", "r")
sentences = []
s = []
for l in datadev:
    l = l.split()
    if len(l) != 0:
        s.append(l[0])
    else:
        sentences.append(s)
        s = []

In [206]:
def Viterbi(s):
    pi = dict()
    bp = dict()
    pi[(0,'*','*')] = 1
    tag = [0]*len(s)
    pi[(1,'*','O')] = pi[(0,'*','*')]*prob('*','*','O')*emission(s[0],'O')
    pi[(1,'*','I-GENE')] = pi[(0,'*','*')]*prob('*','*','I-GENE')*emission(s[0],'I-GENE')
    bp[(1,'*','O')] = '*'
    bp[(1,'*','I-GENE')] = '*'
    for k in range(2,len(s)+1):
        for u in ['O','I-GENE']:
            for v in ['O','I-GENE']:
                if k == 2:
                    pi[(k,u,v)] = pi[(1,'*',u)]*prob('*',u,v)*emission(s[1],v)
                    bp[(k,u,v)] = '*'
                else:
                    pi1 = pi[(k-1,'O',u)]*prob('O',u,v)*emission(s[k-1],v)
                    pi2 = pi[(k-1,'I-GENE',u)]*prob('I-GENE',u,v)*emission(s[k-1],v)
                    if(pi1>=pi2):
                        pi[(k,u,v)] = pi1
                        bp[(k,u,v)] = 'O'
                    else:
                        pi[(k,u,v)] = pi2
                        bp[(k,u,v)] = 'I-GENE'
    maxuv = -1
    print(pi)
    #print(bp)
    if(len(s)>=2):
        for u in ['O','I-GENE']:
                for v in ['O','I-GENE']:
                    pi1 = pi[(len(s),u,v)]*prob(u,v,'STOP')
                    if pi1>maxuv:
                        maxuv = pi1
                        (yn1,yn) = (u,v)
        tag[len(s)-1] = yn
        tag[len(s)-2] = yn1
        if len(s)>2:
            for k in reversed(range(1,len(s)-1)):
                tag[k-1] = bp[(k+2,tag[k],tag[k+1])]
    elif len(s) == 1:
        tag[0] = 'O'
        return tag
    return tag
        

In [187]:
devout = open("gene.dev.p2.out",'w')
for l in sentences:
    pre = Viterbi(l)
    for i in range(0,len(l)):
        devout.write(l[i]+' '+pre[i]+'\n')
    else:
        devout.write('\n')
devout.close()
gs_iterator = corpus_iterator(open("gene.key"))
pred_iterator = corpus_iterator(open("gene.dev.p2.out"), with_logprob = False)
evaluator = Evaluator()
evaluator.compare(gs_iterator, pred_iterator)
evaluator.print_scores()

Found 374 GENEs. Expected 642 GENEs; Correct: 202.

	 precision 	recall 		F1-Score
GENE:	 0.540107	0.314642	0.397638


In [140]:
data =open("gene.train", "r")
sentrain = []
s = []
for l in data:
    if len(sentrain) == 13795:
        if l == '. O\n':
            sentrain.append(s)
    l = l.split()
    if len(l) != 0:
        s.append(l[0])
    else:
        sentrain.append(s)
        s = []

In [179]:
devout = open("gene.train.p2.out",'w')
for l in sentrain:
    pre = Viterbi(l)
    #print(len(l),l)
    for i in range(0,len(l)):
        devout.write(l[i]+' '+pre[i]+'\n')
    else:
        devout.write('\n')
devout.close()
gs_iterator = corpus_iterator(open("gene.train"))
pred_iterator = corpus_iterator(open("gene.train.p2.out"), with_logprob = False)
evaluator = Evaluator()
evaluator.compare(gs_iterator, pred_iterator)
evaluator.print_scores()

Found 10438 GENEs. Expected 16637 GENEs; Correct: 5835.

	 precision 	recall 		F1-Score
GENE:	 0.559015	0.350724	0.431025


In [183]:
datatest = open("gene.test", "r")
sentences = []
s = []
for l in datatest:
    l = l.split()
    if len(l) != 0:
        s.append(l[0])
    else:
        sentences.append(s)
        s = []

In [184]:
devout = open("gene.test.p2.out",'w')
for l in sentences:
    pre = Viterbi(l)
    #print(len(l),l)
    for i in range(0,len(l)):
        devout.write(l[i]+' '+pre[i]+'\n')
    else:
        devout.write('\n')
devout.close()

In [203]:
sentences[5]

['Therefore',
 ',',
 'we',
 'suggested',
 'that',
 'both',
 'proteins',
 'might',
 'belong',
 'to',
 'the',
 'PLTP',
 'family',
 '.']

In [204]:
Viterbi(sentences[5])

{(0, '*', '*'): 1, (1, '*', 'O'): 0.00010720660160867851, (1, '*', 'I-GENE'): 0.0, (2, 'O', 'O'): 1.4848596405018943e-07, (2, 'O', 'I-GENE'): 7.696410643338835e-09, (2, 'I-GENE', 'O'): 0.0, (2, 'I-GENE', 'I-GENE'): 0.0, (3, 'O', 'O'): 1.897575232826306e-10, (3, 'O', 'I-GENE'): 0.0, (3, 'I-GENE', 'O'): 4.29095968973283e-12, (3, 'I-GENE', 'I-GENE'): 0.0, (4, 'O', 'O'): 4.016255338835847e-14, (4, 'O', 'I-GENE'): 0.0, (4, 'I-GENE', 'O'): 0.0, (4, 'I-GENE', 'I-GENE'): 0.0, (5, 'O', 'O'): 3.1053665020140405e-16, (5, 'O', 'I-GENE'): 0.0, (5, 'I-GENE', 'O'): 0.0, (5, 'I-GENE', 'I-GENE'): 0.0, (6, 'O', 'O'): 4.683995617178435e-19, (6, 'O', 'I-GENE'): 0.0, (6, 'I-GENE', 'O'): 0.0, (6, 'I-GENE', 'I-GENE'): 0.0, (7, 'O', 'O'): 4.0408020236605833e-22, (7, 'O', 'I-GENE'): 1.974178961563237e-21, (7, 'I-GENE', 'O'): 0.0, (7, 'I-GENE', 'I-GENE'): 0.0, (8, 'O', 'O'): 3.89731307990885e-26, (8, 'O', 'I-GENE'): 0.0, (8, 'I-GENE', 'O'): 8.306859104625818e-26, (8, 'I-GENE', 'I-GENE'): 0.0, (9, 'O', 'O'): 8.3

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

In [208]:
Viterbi(sentences[5])

{(0, '*', '*'): 1, (1, '*', 'O'): 0.00010720660160867851, (1, '*', 'I-GENE'): 0.0, (2, 'O', 'O'): 1.4848596405018943e-07, (2, 'O', 'I-GENE'): 7.696410643338835e-09, (2, 'I-GENE', 'O'): 0.0, (2, 'I-GENE', 'I-GENE'): 0.0, (3, 'O', 'O'): 1.897575232826306e-10, (3, 'O', 'I-GENE'): 0.0, (3, 'I-GENE', 'O'): 4.29095968973283e-12, (3, 'I-GENE', 'I-GENE'): 0.0, (4, 'O', 'O'): 4.016255338835847e-14, (4, 'O', 'I-GENE'): 0.0, (4, 'I-GENE', 'O'): 0.0, (4, 'I-GENE', 'I-GENE'): 0.0, (5, 'O', 'O'): 3.1053665020140405e-16, (5, 'O', 'I-GENE'): 0.0, (5, 'I-GENE', 'O'): 0.0, (5, 'I-GENE', 'I-GENE'): 0.0, (6, 'O', 'O'): 4.683995617178435e-19, (6, 'O', 'I-GENE'): 0.0, (6, 'I-GENE', 'O'): 0.0, (6, 'I-GENE', 'I-GENE'): 0.0, (7, 'O', 'O'): 4.0408020236605833e-22, (7, 'O', 'I-GENE'): 1.974178961563237e-21, (7, 'I-GENE', 'O'): 0.0, (7, 'I-GENE', 'I-GENE'): 0.0, (8, 'O', 'O'): 3.89731307990885e-26, (8, 'O', 'I-GENE'): 0.0, (8, 'I-GENE', 'O'): 8.306859104625818e-26, (8, 'I-GENE', 'I-GENE'): 0.0, (9, 'O', 'O'): 8.3

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']