In [1]:
import numpy as np
import re
from sklearn import linear_model

In [3]:
# Create the per-sentence word arrays and the matching POS-tag arrays
def createSentense(filename):
    """Read a column-formatted corpus file into per-sentence lists.

    A line that splits into exactly three whitespace-separated columns is a
    token line: column 0 is stored as the word and column 1 as its POS tag
    (column 2 -- presumably the chunking tag -- is not used here). Any other
    line, typically a blank one, ends the current sentence.

    Empty sentences are never emitted, so consecutive separator lines, a
    trailing separator line, or an empty file do not produce spurious ``[]``
    entries in the result.

    Parameters
    ----------
    filename : str
        Path of the corpus file to read.

    Returns
    -------
    (sentenses, pos_sentenses) : tuple of two lists
        ``sentenses[i]`` is the list of words of sentence ``i`` and
        ``pos_sentenses[i]`` the list of POS tags, aligned token by token.
    """
    sentenses = list()
    pos_sentenses = list()
    sentense = list()
    pos_sentense = list()
    with open(filename, 'r') as ins:
        for line in ins:
            columns = line.split()
            if len(columns) == 3:
                sentense.append(columns[0])
                pos_sentense.append(columns[1])
            elif sentense:
                # Separator line: close the sentence only if it has tokens,
                # so repeated separators do not create empty sentences.
                sentenses.append(sentense)
                pos_sentenses.append(pos_sentense)
                sentense = list()
                pos_sentense = list()
    # Flush the last sentence when the file does not end with a separator.
    if sentense:
        sentenses.append(sentense)
        pos_sentenses.append(pos_sentense)
    return sentenses,pos_sentenses

# Load the training corpus: one list of words and one list of POS tags per sentence.
train_sentenses,train_pos_sentenses = createSentense('train.txt')

In [5]:
def createIndividualList(sentenses,pos_sentenses):
    """Flatten per-sentence lists into two corpus-wide lists.

    Returns a tuple ``(words, posTags)`` where ``words`` is every word of
    every sentence in order, and ``posTags`` is every tag of every tag
    sequence in order. The two inputs are flattened independently.
    """
    flat_words = [token for one_sentence in sentenses for token in one_sentence]
    flat_tags = [tag for tag_sequence in pos_sentenses for tag in tag_sequence]
    return flat_words, flat_tags

# Flatten the per-sentence lists into one corpus-wide word list and one tag list.
train_words,train_POS_tags = createIndividualList(train_sentenses,train_pos_sentenses)

In [6]:
# Tag vocabulary: the sorted distinct POS tags seen in training. It defines
# the width and the column order of the one-hot tag encoding.
train_POS_tags_np = np.array(train_POS_tags)
unique_tags = np.unique(train_POS_tags_np)
# Drop the '-1' entry from the vocabulary if it is present.
index = np.argwhere(unique_tags=='-1')
unique_tags = np.delete(unique_tags,index)
print(unique_tags)

# Two-way lookup between a tag and its integer position in unique_tags.
tag_to_int = {tag: position for position, tag in enumerate(unique_tags)}
int_to_tag = {position: tag for position, tag in enumerate(unique_tags)}

# Build the one-hot encoding of a single POS tag
def getOneHotTagEncoding(tag):
    """Return a list of ``len(unique_tags)`` zeros with a 1 at the tag's index.

    The index is looked up in ``tag_to_int``; a tag that is not in that
    mapping raises ``KeyError``.
    """
    encoding = [0 for _ in range(len(unique_tags))]
    hot_position = tag_to_int[tag]
    encoding[hot_position] = 1
    return encoding

#getOneHotTagEncoding('NNP')

['#' '$' "''" '(' ')' ',' '.' ':' 'CC' 'CD' 'DT' 'EX' 'FW' 'IN' 'JJ' 'JJR'
 'JJS' 'MD' 'NN' 'NNP' 'NNPS' 'NNS' 'PDT' 'POS' 'PRP' 'PRP$' 'RB' 'RBR'
 'RBS' 'RP' 'SYM' 'TO' 'UH' 'VB' 'VBD' 'VBG' 'VBN' 'VBP' 'VBZ' 'WDT' 'WP'
 'WP$' 'WRB' '``']


In [7]:
# Creates the hand-crafted feature vector for a particular word
def getWordFeatures(words,index):
    """Return the 12 word-level features of ``words[index]``.

    Parameters
    ----------
    words : sequence of str
        The tokens of one sentence.
    index : int
        Position of the word to describe.

    Returns
    -------
    list
        Twelve values, in this order:
         1. 1 if the word is 'to', else 0
         2. 1 if the word is one of a fixed set of determiners, else 0.1
         3. 1 if the previous word is 'a', 'an' or 'the', else 0.5
         4. 1 if the word is ',', else 0
         5. 1 if the word is '.', else 0
         6. 1 if the word is '``', else 0
         7. length of the word
         8. True if the first character is upper case
         9. True if the word ends in "ing"
        10. True if the word ends in "ly"
        11. True if the word contains a digit
        12. True if the word contains a hyphen
    """
    word = words[index]

    # Only look at the previous word when there is one. Without this guard
    # covering every comparison, index 0 would read words[-1], i.e. the last
    # word of the sentence, and could wrongly fire feature 3.
    previous_word = words[index-1] if index > 0 else None

    features = list()

    #1 : the word is 'to'
    features.append(1 if word == 'to' else 0)

    #2 : the word is a determiner (DT)
    features.append(1 if word in ('a', 'an', 'the', 'another', 'both', 'each') else 0.1)

    #3 : the previous word is an article, which is usually followed by a noun phrase
    features.append(1 if previous_word in ('a', 'an', 'the') else 0.5)

    #4 : comma
    features.append(1 if word == ',' else 0)

    #5 : full stop
    features.append(1 if word == '.' else 0)

    #6 : opening double quotes
    features.append(1 if word == '``' else 0)

    #7 : length of the word
    features.append(len(word))

    #8 : first letter is capitalised (checked for every position, including the first word)
    features.append(word[0].isupper())

    #9 : ending in "ing"
    features.append(word.endswith("ing"))

    #10 : ending in "ly"
    features.append(word.endswith("ly"))

    #11 : contains a digit
    features.append(re.search(r'\d', word) is not None)

    #12 : contains a hyphen
    features.append('-' in word)

    return features

In [8]:
# Feature vector layout: the word-level features of the current word followed
# by the one-hot encoding of the previous word's POS tag.
def getFeatures(sentence,index,tags):
    """Build the feature vector for position ``index`` of ``sentence``.

    The vector is the output of ``getWordFeatures(sentence, index)`` followed
    by ``len(unique_tags)`` values describing the previous tag: the one-hot
    encoding of ``tags[index-1]`` when ``index >= 1``, otherwise all zeros.
    """
    vector = list(getWordFeatures(sentence, index))
    if index < 1:
        # No previous word: pad with an all-zero tag block.
        vector += [0] * len(unique_tags)
    else:
        vector += getOneHotTagEncoding(tags[index-1])
    return vector

# Example: feature vector of 'pound' (position 3), whose previous tag is 'DT'.
features_found = getFeatures(['Confidence','in','the','pound','is','widely','expected'],3,['NN','IN','DT','NN','VBZ','RB','VBN'])
# Length of one feature vector, taken from the example above.
feature_length = len(features_found)
print(features_found)

[0, 0.1, 1, 0, 0, 0, 5, False, False, False, False, False, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [14]:
# Build the feature matrix for a whole corpus
def getFeatureMatrix(train_sentenses,train_pos_sentenses):
    """Return one feature vector per token, in corpus order.

    For every sentence ``i`` and every position ``j`` in it, the row is
    ``getFeatures(sentence, j, tags)`` where ``tags`` is
    ``train_pos_sentenses[i]``. The first sentence and its tags are printed.
    """
    rows = []
    for sentence_no, sentence in enumerate(train_sentenses):
        posTags = train_pos_sentenses[sentence_no]
        if sentence_no == 0:
            print(sentence,posTags)
        for position in range(len(sentence)):
            rows.append(getFeatures(sentence, position, posTags))
    return rows

def getLabelVector(posTags):
    """Map each tag in ``posTags`` to its integer id via ``tag_to_int``.

    A tag missing from ``tag_to_int`` raises ``KeyError``.
    """
    return [tag_to_int[tag] for tag in posTags]

In [15]:
# X: one feature row per training token, built sentence by sentence.
X = getFeatureMatrix(train_sentenses,train_pos_sentenses)
# y: integer tag id per training token, from the flattened tag list (same token order as X).
y = getLabelVector(train_POS_tags)

['Confidence', 'in', 'the', 'pound', 'is', 'widely', 'expected', 'to', 'take', 'another', 'sharp', 'dive', 'if', 'trade', 'figures', 'for', 'September', ',', 'due', 'for', 'release', 'tomorrow', ',', 'fail', 'to', 'show', 'a', 'substantial', 'improvement', 'from', 'July', 'and', 'August', "'s", 'near-record', 'deficits', '.'] ['NN', 'IN', 'DT', 'NN', 'VBZ', 'RB', 'VBN', 'TO', 'VB', 'DT', 'JJ', 'NN', 'IN', 'NN', 'NNS', 'IN', 'NNP', ',', 'JJ', 'IN', 'NN', 'NN', ',', 'VB', 'TO', 'VB', 'DT', 'JJ', 'NN', 'IN', 'NNP', 'CC', 'NNP', 'POS', 'JJ', 'NNS', '.']


In [16]:
# Fit the tag classifier. In scikit-learn C is the inverse regularisation
# strength, so C=1e5 applies only a very weak penalty.
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(X,y)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [None]:
# Load the test corpus: one list of words and one list of POS tags per sentence.
test_sentenses,test_pos_sentenses = createSentense('test.txt')

# Greedy MEMM decoding over the C=44 tags (a first step towards the modified
# Viterbi search): each word is tagged left to right, and the tag predicted
# for the previous word is fed back as a feature for the current one.
def modifiedViterbi(sentenses, max_sentences=2):
    """Greedily predict a POS-tag sequence for each sentence.

    Parameters
    ----------
    sentenses : list of list of str
        Tokenised sentences to tag.
    max_sentences : int or None, optional
        Decode only the first ``max_sentences`` sentences. The default of 2
        keeps the limit that was hard-coded in the original loop; pass
        ``None`` to decode every sentence. Expected to be non-negative.

    Returns
    -------
    list of list of str
        One list of predicted tags per decoded sentence, aligned with its
        words. An empty sentence yields an empty list.
    """
    if max_sentences is None:
        selected = sentenses
    else:
        selected = sentenses[:max_sentences]

    predictions = list()
    for sentense in selected:
        # Tags predicted so far for THIS sentence; getFeatures reads
        # posTags[i-1], so the history must restart with every sentence.
        posTags = list()
        for i in range(len(sentense)):
            x_row = getFeatures(sentense, i, posTags)
            # predict expects a 2-D input, hence the single-row list.
            label = logreg.predict([x_row])[0]
            # The classifier was trained on integer tag ids; map back to the tag.
            posTags.append(str(int_to_tag[int(label)]))
        predictions.append(posTags)

    return predictions

# Decode the test sentences and show the result.
prediction = modifiedViterbi(test_sentenses)
print(prediction)