## Structured prediction: PoS tagging Twitter
### Ildar Nurgaliev (Innopolis university)

### Feature extraction

In [20]:
# utils
import os
get_full_path = lambda f_name : os.path.join(os.getcwd(), 'data', f_name)

cng_postfix = lambda f_name, ext: f_name.rsplit( ".", 1 )[ 0 ] + ext

### Analyse train

In [3]:
def show_form(f_name, labels=None, size_show = 5):
    fname = get_full_path(f_name)
    res = []
    content = []
    with open(fname, 'r') as f:
        content = f.readlines()
    d = {}
    for line in content:
        line = line.strip() # remove newline from the end of the line
    
        if not len(line): #skip blank lines
            continue

        label, word = line.split("\t")
        if labels:
            if label in labels:
                if label in d:
                    d[label].append(word)
                else:
                   d[label] = [word] 
        else:
            if label in d:
                d[label].append(word)
            else:
               d[label] = [word]
    for v in d:
        print("%s[%d]\t%s" % (v, len(d[v]), " || ".join(d[v][:size_show])))
show_form('pos_train.conll', ('UH'), 127)

UH[344]	&lt; || 3 || : || 3 || Ugh || LOL || SMH || hey || aaahh || :( || XD || wait || :o || :D || :D || :D || Oh || :/ || &lt; || 3( || : || hahaha || aww || lol || Mmmm || Damn || Hey || there || LOL || btw || Amazing || yehhh || ( || : || Good || golly || good || morning || lol || ahhhh || =( || :) || Yeah || :' || Anytime || biotch || :) || haha || yes || :) || :) || xxx || right || lool || lol || Yes-sir || yep || btw || xD || lol || lol || yo || hey || :) || :) || lush || like || : || L || LMAO || ?: || P || :) || WTF || :D || &lt; || &lt; || 3 || yo || oh || =( || =) || alright || bro || ya || know....he || WELL || Okay || :' || ) || shit || :D || xHahaa || ;) || x || Morning || :( || hell || Lol || Yes || plz || :) || :) || o_O || yo || :( || (( || Yo || : || 3 || so || yeah~ || :D || OMG || yaaaayyy || ;) || smh || ok || Yes || ugh || so || :) || Hey || lmao || okay || :o || &lt;


### Analyse test

In [8]:
show_form('pos_test.conll')

CD	['2', '7', '2', '2010', '40s']
RB	['not', 'away', 'again', 'again', 'there']
PDT	['Half']
NNP	['Benitez', 'Soulja', 'Boy', 'TheDeAndreWay', 'DeAndre']
JJ	['4-1', 'New', 'ready', 'wet', 'cold']
URL	['http://youtu.be/pnhXhR07s14', 'com', 'souljaboytellem-iga.ning.com', 'http://bit.ly/fkdrr6', 'http://4sq.com/epmWHV']
JJR	['less', 'Less', 'More', 'better']
EX	['there', 'there', 'there', 'There']
VBD	['were', 'heard', 'came', 'tweeted', 'were']
CC	['But', 'and', 'and', 'and', 'and']
PRP	['I', 'you', 'it', 'it', 'you']
VBZ	['leads', 'scores', 'is', 'loves', '*kisses']
PRP$	['my', 'yo', 'your', 'my', 'my']
WP	['what', 'whts', 'WHAT', 'who', 'what']
RBR	['more', 'more', 'more', 'more', 'Earlier']
TO	['to', 'to', 'to', 'to', 'to']
,	[',', ',', ',', ',', ',']
POS	["'s", "'s", "'s", "'s"]
NONE	['[', ']']
WRB	['when', 'where', 'when', 'When', 'WHEN']
VBP	['suppose', "'m", 'love', 'LUVZ', 'live']
:	[':', '|', '-', ':', ':']
RP	['ON', 'out', 'out', 'out', 'off']
IN	['til', 'in', 'for', 'after', 

### Check labels of train and test datasets, outliers detection

In [4]:
def get_labels(f_name):
    fname = get_full_path(f_name)
    res = []
    content = []
    s = set()
    with open(fname, 'r') as f:
        for line in f.readlines():
            line = line.strip() # remove newline from the end of the line
    
            if not len(line): #skip blank lines
                continue

            label, _ = line.split("\t")
            s.add(label)
    return s
        
train_l = get_labels('pos_train.conll')
test_l = get_labels('pos_test.conll')

print("TRAIN/test:")
show_form('pos_train.conll', (train_l - test_l))
print()
print("TEST/train:")
show_form('pos_test.conll', (test_l - train_l))
print()
print("Intersection: %s" % (train_l & test_l))

TRAIN/test:
O[1]	"..
FW[3]	Etc || Etc || Etc
VPP[1]	please
NNPS[6]	kids || Engineers || queens || Monsters || Eats
TD[1]	a
RBS[1]	most
LS[1]	1

TEST/train:
PDT[1]	Half
NONE[2]	[ || ]

Intersection: {'RBR', 'NN', ':', 'RB', 'HT', 'RT', '(', 'EX', 'VBD', 'CC', 'JJ', 'WRB', ',', 'WP', 'JJS', 'USR', ')', 'WDT', 'MD', 'TO', 'CD', 'DT', 'SYM', '.', 'JJR', 'RP', 'PRP$', 'IN', 'NNP', 'VB', 'VBP', 'UH', 'URL', "''", 'VBN', 'VBG', 'PRP', 'NNS', 'VBZ', 'POS'}


### PoS taggins:
https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
#### Train
- ...
- NNPS	Proper noun, plural

#### Test
- None	nothing
- PDT	Predeterminer

### Get sentences and tag its words

In [21]:
def get_sentences(f_name):
    fname = get_full_path(f_name)
    rows = []
    with open(fname, 'r') as f:
        rows = f.readlines()
    sents = []
    sent = []
    for row in rows:
        row = row.strip()
        if row == '':
            if len(sent) > 0:
                sents.append(sent)
            sent = []
            continue
        pos, word = row.split('\t',1)
        sent.append(word + "_" + pos)
    print(f_name, len(sents), 'sentences')
    return sents
for x in get_sentences('pos_train.conll')[:1]:
    print(x)

pos_train.conll 551 sentences
['Antick_NNP', 'Musings_NNP', 'post_NN', ':_:', 'Book-A-Day_NNP', '2010_CD', '#_NN', '243_CD', '(_(', '10/4_CD', ')_)', '--_:', 'Gray_NNP', 'Horses_NNP', 'by_IN', 'Hope_NNP', 'Larson_NNP', 'http://bit.ly/as8fvc_URL']


### Feature extractor

In [38]:
from nltk.corpus import words
from nltk.wsd import lesk

words = set(words.words())
def word_features(sent, iw):
    try:
        word, _ = sent[iw].rsplit( "_", 1 )
    except ValueError as v:
        print(sent[iw])
#   feature functions
    is_hashtag = lambda x : x[0] == '#'
    is_user = lambda x : x[0] == '@'
    is_url = lambda x : 'http' in x or 'com' in x
    is_upper = lambda x : x[0].isupper()
    is_digit = lambda x : x[0].isdigit()
    is_word = lambda x : x.lower() in words
#     pos = lambda x : 
    def pos(x):
        p = lesk(sent, x.lower())
        if p:
            return p.pos()
        else:
            "0"
    
    def num_same_elemseq(x):
        el = len(set(x))
        if el == 1:
            return "1"
        elif el == 2:
            return "2"
        elif el == 0:
            return "0"
        else:
            return "+"
    def rev_type(x):
        if is_upper(x):
            return "1"
        elif is_digit(x):
            return "2"
        elif is_user(x):
            return "3"
        elif is_hashtag(x):
            return "4"
        elif is_url(x):
            return "5"
        else:
            return "0"
        
        
#     word features
    features = [
        'w.lower=' + word.lower(),
        'w.len=%s' % len(word),
        'w.last=' + word[-1],
        'w.sameseq=%s' % num_same_elemseq(word),
        'w.isupper=%s' % is_upper(word),
        'w.type=%s' % rev_type(word),
        'w.iw=%s' % iw
                
    ]
    if (is_word(word)):
        features.extend([
                'w.isword=True',
                'w[-2]=' + word[-2:],
#                 'w[-3]=' + word[-3:],
            ])
    else:
        features.append('w.isword=False')
        
#   relative words features (PRE)
    if iw > 0:
        pre_word = sent[iw - 1][0]
        features.extend([
            '-w.lower=' + pre_word.lower(),
            '-w.len=%s' % len(pre_word),
            '-w.isupper=%s' % is_upper(pre_word),
            '-w[-1]=' + pre_word[-1],
            '-w.sameseq=%s' % num_same_elemseq(pre_word),
            '-w.type=%s' % rev_type(pre_word)
        ])
        
        if (is_word(pre_word)):
            features.extend([
                '-w.isword=True',
                '-w[-2]=' + pre_word[-2:],
                '-w.pos=%s' % pos(pre_word)
            ])
        else:
            features.append('w.isword=False')
        
    else:
        features.append('F:BoS')
#   relative words features (POST)
    if iw < len(sent)-1:
        post_word = sent[iw + 1][0]
        features.extend([
            '+w.lower=' + post_word.lower(),
            '+w.len=%s' % len(post_word),
            '+w.isupper=%s' % is_upper(post_word),
            '+w.last=%s' % post_word[-1],
            '+w.sameseq=%s' % num_same_elemseq(post_word),
            '+w.type=%s' % rev_type(post_word)
        ])
        
        if (is_word(post_word)):
            features.extend([
                '+w.isword=True',
                '+w[-2]=' + post_word[-2:],
                '+w.pos=%s' % pos(post_word)
            ])
        else:
            features.append('+w.isword=False')
        
    else:
        features.append('F:EoS')
   
                
    return features

sent_word_features=lambda sent:[word_features(sent, i) for i in range(len(sent))]
sent_word_labels=lambda sent: [word_tag.rsplit('_',1)[1] for word_tag in sent]

### Create feature files

In [39]:
%%time
outliers_tag = set(('LS',
    'TD',
    'FW',
    'NNPS',
    'VPP',
    'RBS',
    'O'))
def filter_tag(x):
    return x not in outliers_tag

def features_preserve(f_name):
    sents = get_sentences(f_name)
    res = []
    for sent in sents:
        tags = sent_word_labels(sent)
        features = sent_word_features(sent)
        for label, crfsuite_features in zip(tags, features):
            if filter_tag(label):
                res.append(label + "\t" + "\t".join(crfsuite_features))
        res.append('\n')
    # output file
    f_name = cng_postfix(f_name, '.input')
    fname = get_full_path(f_name)
    # preserve data
    with open(fname, 'w') as f:
        for word in res:
            f.write(word)
            f.write('\n')
    print("saved to", f_name)
            
features_preserve('pos_train.conll')
features_preserve('pos_test.conll')

pos_train.conll 551 sentences
saved to pos_train.input
pos_test.conll 118 sentences
saved to pos_test.input
CPU times: user 1.62 s, sys: 36.3 ms, total: 1.65 s
Wall time: 1.68 s


### Train model

In [52]:
%%time
from subprocess import check_output, call

fmodel='my1.model'
ftrain='data/pos_train.input'
ftest='data/pos_test.input'

algs = (
    'lbfgs'                # L-BFGS with L1/L2 regularization
    'l2sgd'                # SGD with L2-regularization
    'ap'                   # Averaged Perceptron
    'pa'                   # Passive Aggressive
    'arow'                 # Adaptive Regularization of Weights (AROW)
    )

# res = call(['crfsuite', 'learn', '-m', fmodel, -p',  'feature.possible_states=1',ftrain])
# res = call(['crfsuite', 'learn', '-m', fmodel,'-p', 'feature.possible_states=1', 
#              '-p', 'feature.minfreq=2', '-p', 'feature.possible_transitions=1', '-p', 'feature.minfreq=2', ftrain])

# res = call(['crfsuite', 'learn', '-m', fmodel, '-a', 'lbfgs', '-p', 'c1=0.1', '-p', 'c2=0.01',
#   '-p', 'feature.possible_transitions=1', ftrain])
# res = call(['crfsuite', 'learn', '-m', fmodel, '-a', 'lbfgs', '-p', 'linesearch=StrongBacktracking', #MoreThuente Backtracking StrongBacktracking
#             '-p', 'c1=0.1', '-p', 'c2=0.01', ftrain])

# res = call(['crfsuite', 'learn', '-m', fmodel, '-a', 'l2sgd', '-p', 'c2=0.01', ftrain])

# res = call(['crfsuite', 'learn', '-m', fmodel, '-a', 'ap', '-p', 'max_iterations=150', ftrain])
#                                                                 0 1 2
res = call(['crfsuite', 'learn', '-m', fmodel, '-a', 'pa', '-p', 'type=1', '-p', 'c=0.1', '-p', 'max_iterations=300',
            '-p', 'error_sensitive=1', '-p', 'averaging=1', '-p', 'feature.possible_transitions=1', ftrain])

# res = call(['crfsuite', 'learn', '-m', fmodel, '-a', 'arow', '-p', 'max_iterations=200',
#             '-p', 'gamma=0.1', '-p', 'variance=0.08', ftrain])


if res == 0:
    out = check_output(['crfsuite', 'dump', fmodel, '|', 'less'])
#     print(out.decode())
else:
    print('Error')

CPU times: user 3.26 ms, sys: 13.1 ms, total: 16.4 ms
Wall time: 1min 5s


### Test model

In [53]:
print(fmodel)
out = check_output(['crfsuite', 'tag', '-q', '-m',fmodel, '-t' ,ftest])
print(out.decode())

my10.model
Performance by label (#match, #model, #ref) (precision, recall, F1):
    NNP: (133, 200, 169) (0.6650, 0.7870, 0.7209)
    NN: (227, 280, 286) (0.8107, 0.7937, 0.8021)
    : (77, 82, 81) (0.9390, 0.9506, 0.9448)
    CD: (33, 40, 35) (0.8250, 0.9429, 0.8800)
    (: (4, 4, 4) (1.0000, 1.0000, 1.0000)
    ): (5, 5, 6) (1.0000, 0.8333, 0.9091)
    IN: (158, 171, 167) (0.9240, 0.9461, 0.9349)
    URL: (22, 24, 23) (0.9167, 0.9565, 0.9362)
    RT: (24, 24, 24) (1.0000, 1.0000, 1.0000)
    USR: (63, 63, 64) (1.0000, 0.9844, 0.9921)
    HT: (26, 26, 26) (1.0000, 1.0000, 1.0000)
    .: (122, 125, 123) (0.9760, 0.9919, 0.9839)
    WRB: (22, 23, 22) (0.9565, 1.0000, 0.9778)
    PRP: (158, 161, 164) (0.9814, 0.9634, 0.9723)
    VBP: (68, 86, 82) (0.7907, 0.8293, 0.8095)
    MD: (27, 28, 27) (0.9643, 1.0000, 0.9818)
    RB: (83, 100, 92) (0.8300, 0.9022, 0.8646)
    VB: (67, 86, 89) (0.7791, 0.7528, 0.7657)
    UH: (42, 61, 63) (0.6885, 0.6667, 0.6774)
    VBG: (32, 40, 35) (0.8000, 0.91