In [22]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from sklearn.metrics import recall_score
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter
from spacy.tokenizer import Tokenizer
from spacy.lang import en, sr, sl, ga, da, hr, it, es
import matplotlib.pyplot as plt
import os

In [33]:
langs=['es','it','sr','hr','da','en','ir','sl']

In [46]:
def lower_case(text):
    """
    This makes a string lowercase
    """
    new_text=''
    for i in text:
        if i.isupper():
            new_text=new_text+i.lower()
        else:
            new_text=new_text+i
    return new_text

def clean(ini_list):
    """
    Helper function to lower_case all strings in a list
    """
    temp_list=[]
    for token in ini_list:
        clean_token=lower_case(token)
        temp_list.append(clean_token)
    return temp_list

def create_pred_list(true,pred):
    """
    This function creates a list of ones and zeroes for how many tokens of pred
    are in true.
    """
    temp_list=[]
    count=Counter(true)
    for token in true:
        
        if token in pred and count[token]>0:
            temp_list.append(1)
            count[token]=count[token]-1
        else:
            temp_list.append(0)
    return temp_list

def make_tokenized_pair(file_name):
    """
    This function takes a conll-style file, and returns a list of lists
    with [[nontokenized text, tokenized text list, string_tokenized],...] where 
    string tokenized is a string of tokens separated with the omega_ts separator
    """
    final_list=[]
    with open(file_name,'r') as file:
        temp_list=[]
        temp_token_list=[]
        tokenized_strings=[]
        temp_string=''
        flag=0
        flag2=0
        for line in file:
            line=line.strip()
            if line!='':
                if line[0]=='#' and line[2]=='t':
                    text=line[9:]
                    temp_list.append(text)
                    flag=1
                    
                elif flag==1 and line[0]!='#':
                    split=line.split('\t')
                    try:
                        temp_token_list.append(split[1])
                        temp_string=temp_string+split[1]+'omega_ts'
                    except:
                        temp_token_list.append(split[0])
                        temp_string=temp_string+split[0]+'omega_ts'
            else:
                temp_list.append(temp_token_list)
                temp_list.append(temp_string)
                tokenized_strings.append(temp_string)
                if temp_list not in final_list:
                    final_list.append(temp_list)
                temp_list=[]
                temp_token_list=[]
                temp_string=''
    return final_list


# NLTK tweet

In [169]:
stats=dict()#create a stats dictionary
low_scores=[]
tokenizer=TweetTokenizer()
for i in langs:
    stats[i]=[]
    current_lang=os.getcwd()+'/machamp/data/'+i+'_conl.eval'
    data=make_tokenized_pair(current_lang)
    for line in data:
        #creating the lists to compare
        tokenized = tokenizer.tokenize(line[0])
        tokenized_true = line[2].split('omega_ts')[:-1]
        #normalization for the languages that have discrepancies 
        if i=='es':
            tokenized = clean(tokenized)
        if i =='da':
            tokenized_true=clean(tokenized_true)
            tokenized = clean(tokenized)

        ones=[1 for i in tokenized_true]
        pred_list=create_pred_list(tokenized_true,tokenized)
        recall=recall_score(y_true=ones, y_pred=pred_list)
        #grabbing recall score
        score=recall
        #putting all low scores into a list to look over
        if score<0.93:
            low_scores.append([tokenized,tokenized_true,score,i])
        stats[i].append(score)

In [170]:
for lang in stats:
    print('Average score for ', lang,': ', sum(stats[lang])/len(stats[lang]))

Average score for  es :  0.8931693432272717
Average score for  sr :  0.9704317895206129
Average score for  sl :  0.960971956073257
Average score for  hr :  0.9580496890536514
Average score for  da :  0.9411341615656754
Average score for  en :  0.8943863136216621
Average score for  ir :  0.9753090372040067
Average score for  it :  0.9608047228660299


In [94]:
for i in low_scores:
    if i[3]=='sl':
        print('The True ones: ', i[1])
        print('The Pred ones: ', i[0])
        print('score: ', i[2])
        print('\n')

The True ones:  ['probejte', 'si', 'downloadat', 'popup', '-', 'blocker', ',', 'mogoce', 'bo', 'kej', 'pomagov', 'ceprov', 'dvomim', 'sam', 'probat', 'pa', 'ni', 'greh']
The Pred ones:  ['probejte', 'si', 'downloadat', 'popup-blocker', ',', 'mogoce', 'bo', 'kej', 'pomagov', 'ceprov', 'dvomimsam', 'probat', 'pa', 'ni', 'greh']
score:  0.7222222222222222


The True ones:  ['Lohk', 'pa', 'kakšno', 'pivce', 'v', 'soboto', '....']
The Pred ones:  ['Lohk', 'pa', 'kakšno', 'pivce', 'v', 'soboto', '...']
score:  0.8571428571428571


The True ones:  ['ThePirateBay', 'je', 'bil', 'itak', 'shitty', 'site', 's', 'popupi', 'in', 'tono', 'reklam', '.']
The Pred ones:  ['ThePirateBayje', 'bil', 'itak', 'shitty', 'site', 's', 'popupi', 'in', 'tono', 'reklam', '.']
score:  0.8333333333333334


The True ones:  ['@StellarGirl_', 'hahaha', ':))']
The Pred ones:  ['@StellarGirl_', 'hahaha', ':)', ')']
score:  0.6666666666666666


The True ones:  ['#LINK#']
The Pred ones:  ['#LINK', '#']
score:  0.0


The T

score:  0.6666666666666666


The True ones:  ['Še', 'vedno', 'potrebujete', 'občasno', 'varstvo', '??']
The Pred ones:  ['Še', 'vedno', 'potrebujete', 'občasno', 'varstvo', '?', '?']
score:  0.8333333333333334


The True ones:  ['p.', 's.', 'ponudbe', 'na', 'zasebno']
The Pred ones:  ['p', '.', 's', '.', 'ponudbe', 'na', 'zasebno']
score:  0.6


The True ones:  ['@cikibucka', '@freewiseguy', '@JJansaSDS', 'Ti', 'ovca', 'janševa', ',', 'kje', 'pa', 'sise', 'učila', 'angleščineTam', 'kot', 'alenka', '.']
The Pred ones:  ['@cikibucka', '@freewiseguy', '@JJansaSDS', 'Ti', 'ovca', 'janševa', ',', 'kje', 'pa', 'siseučila', 'angleščineTamkot', 'alenka', '.']
score:  0.7333333333333333


The True ones:  ['Prej', 'pa', 'so', 'lovili', 'na', 'minihakelzih', 'pri', 'prevodu', ',', 'kaj', 'je', 'boljshe', '?']
The Pred ones:  ['Prej', 'pa', 'so', 'lovili', 'na', 'minihakelzihpri', 'prevodu', ',', 'kaj', 'je', 'boljshe', '?']
score:  0.8461538461538461


The True ones:  ['Brendi', '!?']
The Pred on


The True ones:  ['lej', ',', 'čez', 'mesec', 'dni', 'bodo', 'tud', 'žlebi', 'novi', ':)))']
The Pred ones:  ['lej', ',', 'čez', 'mesec', 'dni', 'bodo', 'tud', 'žlebi', 'novi', ':)', ')', ')']
score:  0.9


The True ones:  ['Mater', 'še', 'vedno', 'neki', 'manjka', '......']
The Pred ones:  ['Mater', 'še', 'vedno', 'neki', 'manjka', '...']
score:  0.8333333333333334


The True ones:  ['to', 'be', 'done', '....']
The Pred ones:  ['to', 'be', 'done', '...']
score:  0.75


The True ones:  ['@3.', '8)', '......']
The Pred ones:  ['@3', '.', '8)', '...']
score:  0.3333333333333333


The True ones:  ['hi', 'hi', 'hi', ';);)']
The Pred ones:  ['hi', 'hi', 'hi', ';)', ';)']
score:  0.75


The True ones:  ['Slike', '...', '#LINK#']
The Pred ones:  ['Slike', '...', '#LINK', '#']
score:  0.6666666666666666


The True ones:  ['nekoč', 'danes', 'odstranjevanje', 'zračne', 'rje']
The Pred ones:  ['nekočdanesodstranjevanje', 'zračne', 'rje']
score:  0.4


The True ones:  ['Bemti', ',', 'ko', 'ti', 's

# SpaCy eval

In [99]:
langs=['es','sr','sl','hr','da','en','ir','it']
text_dict=dict()
tokenizers=[]


nlp = es.Spanish()
tokenizer_es = Tokenizer(nlp.vocab)
tokenizers.append(tokenizer_es)


nlp = sr.Serbian()
tokenizer_sr = Tokenizer(nlp.vocab)
tokenizers.append(tokenizer_sr)


nlp = sl.Slovenian()
tokenizer_sl = Tokenizer(nlp.vocab)
tokenizers.append(tokenizer_sl)

nlp = hr.Croatian()
tokenizer_hr = Tokenizer(nlp.vocab)
tokenizers.append(tokenizer_hr)


nlp = da.Danish()
tokenizer_da = Tokenizer(nlp.vocab)
tokenizers.append(tokenizer_da)


nlp = en.English()
tokenizer_en = Tokenizer(nlp.vocab)
tokenizers.append(tokenizer_en)


nlp = ga.Irish()
tokenizer_ir = Tokenizer(nlp.vocab)
tokenizers.append(tokenizer_ir)


nlp = it.Italian()
tokenizer_it = Tokenizer(nlp.vocab)
tokenizers.append(tokenizer_it)

In [113]:
stats=dict()
low_scores=[]
for tokenizer,lang in zip(tokenizers,langs):
    stats[lang]=[]
    current_lang=os.getcwd()+'/machamp/data/'+lang+'_conl.eval'
    data=make_tokenized_pair(current_lang)
    for line in data:
        tokenized = [i.text for i in tokenizer(line[0])]
        tokenized_true = line[2].split('omega_ts')[:-1]
        if i=='es':
            tokenized = clean(tokenized)
        if i =='da':
            tokenized_true=clean(tokenized_true)
            tokenized = clean(tokenized)
        ones=[1 for i in tokenized_true]
        pred_list=create_pred_list(tokenized_true,tokenized)
        recall=recall_score(y_true=ones, y_pred=pred_list)
        score=recall
        if score<0.93:
            low_scores.append([tokenized,tokenized_true,score,lang])
        stats[lang].append(score)

In [114]:
for lang in stats:
    print('Average score for ', lang,': ', sum(stats[lang])/len(stats[lang]))

Average score for  es :  0.9566301738650272
Average score for  sr :  0.7526440729807663
Average score for  sl :  0.680309730510343
Average score for  hr :  0.7538599107598141
Average score for  da :  0.5811236570581381
Average score for  en :  0.7244511635156132
Average score for  ir :  0.7626815695947229
Average score for  it :  0.7961049364583433


In [132]:
for i in low_scores:
    if i[3]=='hr':
        for j in i[0]:
            if "'" in j:
                print('The True ones: ', i[1])
                print('The Pred ones: ', i[0])
                print('score: ', i[2])
                print('\n')
                continue

The True ones:  ["''", 'nećemo', 'noćas', 'doma', 'nećemo', 'do', 'zore', 'jednom', 'se', 'samo', 'živi', 'jednom', 'se', 'umire', "''", '...', 'itd', ':)']
The Pred ones:  ["''nećemo", 'noćas', 'doma', 'nećemo', 'do', 'zore', 'jednom', 'se', 'samo', 'živi', 'jednom', 'se', "umire''...itd", ':)']
score:  0.7222222222222222


The True ones:  ["''", 'nećemo', 'noćas', 'doma', 'nećemo', 'do', 'zore', 'jednom', 'se', 'samo', 'živi', 'jednom', 'se', 'umire', "''", '...', 'itd', ':)']
The Pred ones:  ["''nećemo", 'noćas', 'doma', 'nećemo', 'do', 'zore', 'jednom', 'se', 'samo', 'živi', 'jednom', 'se', "umire''...itd", ':)']
score:  0.7222222222222222


The True ones:  ['@Karudijan3', 'Gledam', 'i', 'sušam', 'jer', 'je', '@maki_992', 'spomenula', "'", 'harakiri', 'bez', 'žaljenja', "'", 'pa', 'da', 'nešto', 'ne', 'propustim', ':)', '@NataaPoli']
The Pred ones:  ['@Karudijan3', 'Gledam', 'i', 'sušam', 'jer', 'je', '@maki_992', 'spomenula', "'harakiri", 'bez', "žaljenja'", 'pa', 'da', 'nešto', '

score:  0.7777777777777778




# Twokenizer eval 

In [133]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Twokenize -- a tokenizer designed for Twitter text in English and some other European languages.
This tokenizer code has gone through a long history:

(1) Brendan O'Connor wrote original version in Python, http://github.com/brendano/tweetmotif
       TweetMotif: Exploratory Search and Topic Summarization for Twitter.
       Brendan O'Connor, Michel Krieger, and David Ahn.
       ICWSM-2010 (demo track), http://brenocon.com/oconnor_krieger_ahn.icwsm2010.tweetmotif.pdf
(2a) Kevin Gimpel and Daniel Mills modified it for POS tagging for the CMU ARK Twitter POS Tagger
(2b) Jason Baldridge and David Snyder ported it to Scala
(3) Brendan bugfixed the Scala port and merged with POS-specific changes
    for the CMU ARK Twitter POS Tagger
(4) Tobi Owoputi ported it back to Java and added many improvements (2012-06)

Current home is http://github.com/brendano/ark-tweet-nlp and http://www.ark.cs.cmu.edu/TweetNLP

There have been at least 2 other Java ports, but they are not in the lineage for the code here.

Ported to Python by Myle Ott <myleott@gmail.com>.
"""
from __future__ import unicode_literals

import operator
import re
import sys

try:
    from html.parser import HTMLParser
except ImportError:
    from HTMLParser import HTMLParser

try:
    import html
except ImportError:
    pass

def regex_or(*items):
    return '(?:' + '|'.join(items) + ')'

Contractions = re.compile(u"(?i)(\w+)(n['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$", re.UNICODE)
Whitespace = re.compile(u"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE)

punctChars = r"['\"“”‘’.?!…,:;]"
#punctSeq   = punctChars+"+"	#'anthem'. => ' anthem '.
punctSeq   = r"['\"“”‘’]+|[.?!,…]+|[:;]+"	#'anthem'. => ' anthem ' .
entity     = r"&(?:amp|lt|gt|quot);"
#  URLs


# BTO 2012-06: everyone thinks the daringfireball regex should be better, but they're wrong.
# If you actually empirically test it the results are bad.
# Please see https://github.com/brendano/ark-tweet-nlp/pull/9

urlStart1  = r"(?:https?://|\bwww\.)"
commonTLDs = r"(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)"
ccTLDs	 = r"(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" + \
r"bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" + \
r"er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" + \
r"hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" + \
r"lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" + \
r"nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" + \
r"sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" + \
r"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)"	#TODO: remove obscure country domains?
urlStart2  = r"\b(?:[A-Za-z\d-])+(?:\.[A-Za-z0-9]+){0,3}\." + regex_or(commonTLDs, ccTLDs) + r"(?:\."+ccTLDs+r")?(?=\W|$)"
urlBody    = r"(?:[^\.\s<>][^\s<>]*?)?"
urlExtraCrapBeforeEnd = regex_or(punctChars, entity) + "+?"
urlEnd     = r"(?:\.\.+|[<>]|\s|$)"
url        = regex_or(urlStart1, urlStart2) + urlBody + "(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")"


# Numeric
timeLike   = r"\d+(?::\d+){1,2}"
#numNum     = r"\d+\.\d+"
numberWithCommas = r"(?:(?<!\d)\d{1,3},)+?\d{3}" + r"(?=(?:[^,\d]|$))"
numComb	 = u"[\u0024\u058f\u060b\u09f2\u09f3\u09fb\u0af1\u0bf9\u0e3f\u17db\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6\u00a2-\u00a5\u20a0-\u20b9]?\\d+(?:\\.\\d+)+%?"

# Abbreviations
boundaryNotDot = regex_or("$", r"\s", r"[“\"?!,:;]", entity)
aa1  = r"(?:[A-Za-z]\.){2,}(?=" + boundaryNotDot + ")"
aa2  = r"[^A-Za-z](?:[A-Za-z]\.){1,}[A-Za-z](?=" + boundaryNotDot + ")"
standardAbbreviations = r"\b(?:[Mm]r|[Mm]rs|[Mm]s|[Dd]r|[Ss]r|[Jj]r|[Rr]ep|[Ss]en|[Ss]t)\."
arbitraryAbbrev = regex_or(aa1, aa2, standardAbbreviations)
separators  = "(?:--+|―|—|~|–|=)"
decorations = u"(?:[♫♪]+|[★☆]+|[♥❤♡]+|[\u2639-\u263b]+|[\ue001-\uebbb]+)"
thingsThatSplitWords = r"[^\s\.,?\"]"
embeddedApostrophe = thingsThatSplitWords+r"+['’′]" + thingsThatSplitWords + "*"

#  Emoticons
# myleott: in Python the (?iu) flags affect the whole expression
#normalEyes = "(?iu)[:=]" # 8 and x are eyes but cause problems
normalEyes = "[:=]" # 8 and x are eyes but cause problems
wink = "[;]"
noseArea = "(?:|-|[^a-zA-Z0-9 ])" # doesn't get :'-(
happyMouths = r"[D\)\]\}]+"
sadMouths = r"[\(\[\{]+"
tongue = "[pPd3]+"
otherMouths = r"(?:[oO]+|[/\\]+|[vV]+|[Ss]+|[|]+)" # remove forward slash if http://'s aren't cleaned

# mouth repetition examples:
# @aliciakeys Put it in a love song :-))
# @hellocalyclops =))=))=)) Oh well

# myleott: try to be as case insensitive as possible, but still not perfect, e.g., o.O fails
#bfLeft = u"(♥|0|o|°|v|\\$|t|x|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)".encode('utf-8')
bfLeft = u"(♥|0|[oO]|°|[vV]|\\$|[tT]|[xX]|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)"
bfCenter = r"(?:[\.]|[_-]+)"
bfRight = r"\2"
s3 = r"(?:--['\"])"
s4 = r"(?:<|&lt;|>|&gt;)[\._-]+(?:<|&lt;|>|&gt;)"
s5 = "(?:[.][_]+[.])"
# myleott: in Python the (?i) flag affects the whole expression
#basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
basicface = "(?:" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5

eeLeft = r"[＼\\ƪԄ\(（<>;ヽ\-=~\*]+"
eeRight= u"[\\-=\\);'\u0022<>ʃ）/／ノﾉ丿╯σっµ~\\*]+"
eeSymbol = r"[^A-Za-z0-9\s\(\)\*:=-]"
eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight

oOEmote = r"(?:[oO]" + bfCenter + r"[oO])"


emoticon = regex_or(
        # Standard version  :) :( :] :D :P
        "(?:>|&gt;)?" + regex_or(normalEyes, wink) + regex_or(noseArea,"[Oo]") + regex_or(tongue+r"(?=\W|$|RT|rt|Rt)", otherMouths+r"(?=\W|$|RT|rt|Rt)", sadMouths, happyMouths),

        # reversed version (: D:  use positive lookbehind to remove "(word):"
        # because eyes on the right side is more ambiguous with the standard usage of : ;
        regex_or("(?<=(?: ))", "(?<=(?:^))") + regex_or(sadMouths,happyMouths,otherMouths) + noseArea + regex_or(normalEyes, wink) + "(?:<|&lt;)?",

        #inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
        eastEmote.replace("2", "1", 1), basicface,
        # iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
        # TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this

        # myleott: o.O and O.o are two of the biggest sources of differences
        #          between this and the Java version. One little hack won't hurt...
        oOEmote
)

Hearts = "(?:<+/?3+)+" #the other hearts are in decorations

Arrows = regex_or(r"(?:<*[-―—=]*>+|<+[-―—=]*>*)", u"[\u2190-\u21ff]+")

# BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes
# "hello (#hashtag)" ==> "hello (#hashtag )"  WRONG
# "hello (#hashtag)" ==> "hello ( #hashtag )"  RIGHT
# "hello (@person)" ==> "hello (@person )"  WRONG
# "hello (@person)" ==> "hello ( @person )"  RIGHT
# ... Some sort of weird interaction with edgepunct I guess, because edgepunct
# has poor content-symbol detection.

# This also gets #1 #40 which probably aren't hashtags .. but good as tokens.
# If you want good hashtag identification, use a different regex.
Hashtag = "#[a-zA-Z0-9_]+"  #optional: lookbehind for \b
#optional: lookbehind for \b, max length 15
AtMention = "[@＠][a-zA-Z0-9_]+"

# I was worried this would conflict with at-mentions
# but seems ok in sample of 5800: 7 changes all email fixes
# http://www.regular-expressions.info/email.html
Bound = r"(?:\W|^|$)"
Email = regex_or("(?<=(?:\W))", "(?<=(?:^))") + r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}(?=" +Bound+")"

# We will be tokenizing using these regexps as delimiters
# Additionally, these things are "protected", meaning they shouldn't be further split themselves.
Protected  = re.compile(
    regex_or(
        Hearts,
        url,
        Email,
        timeLike,
        #numNum,
        numberWithCommas,
        numComb,
        emoticon,
        Arrows,
        entity,
        punctSeq,
        arbitraryAbbrev,
        separators,
        decorations,
        embeddedApostrophe,
        Hashtag,
        AtMention), re.UNICODE)

# Edge punctuation
# Want: 'foo' => ' foo '
# While also:   don't => don't
# the first is considered "edge punctuation".
# the second is word-internal punctuation -- don't want to mess with it.
# BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days.
# I remember it causing lots of trouble in the past as well.  Would be good to revisit or eliminate.

# Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
#edgePunctChars    = r"'\"“”‘’«»{}\(\)\[\]\*&" #add \\p{So}? (symbols)
edgePunctChars    = u"'\"“”‘’«»{}\\(\\)\\[\\]\\*&" #add \\p{So}? (symbols)
edgePunct    = "[" + edgePunctChars + "]"
notEdgePunct = "[a-zA-Z0-9]" # content characters
offEdge = r"(^|$|:|;|\s|\.|,)"  # colon here gets "(hello):" ==> "( hello ):"
EdgePunctLeft  = re.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")", re.UNICODE)
EdgePunctRight = re.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge, re.UNICODE)

def splitEdgePunct(input):
    input = EdgePunctLeft.sub(r"\1\2 \3", input)
    input = EdgePunctRight.sub(r"\1 \2\3", input)
    return input

# The main work of tokenizing a tweet.
def simpleTokenize(text):

    # Do the no-brainers first
    splitPunctText = splitEdgePunct(text)

    textLength = len(splitPunctText)

    # BTO: the logic here got quite convoluted via the Scala porting detour
    # It would be good to switch back to a nice simple procedural style like in the Python version
    # ... Scala is such a pain.  Never again.

    # Find the matches for subsequences that should be protected,
    # e.g. URLs, 1.0, U.N.K.L.E., 12:53
    bads = []
    badSpans = []
    for match in Protected.finditer(splitPunctText):
        # The spans of the "bads" should not be split.
        if (match.start() != match.end()): #unnecessary?
            bads.append( [splitPunctText[match.start():match.end()]] )
            badSpans.append( (match.start(), match.end()) )

    # Create a list of indices to create the "goods", which can be
    # split. We are taking "bad" spans like
    #     List((2,5), (8,10))
    # to create
    #     List(0, 2, 5, 8, 10, 12)
    # where, e.g., "12" here would be the textLength
    # has an even length and no indices are the same
    indices = [0]
    for (first, second) in badSpans:
        indices.append(first)
        indices.append(second)
    indices.append(textLength)

    # Group the indices and map them to their respective portion of the string
    splitGoods = []
    for i in range(0, len(indices), 2):
        goodstr = splitPunctText[indices[i]:indices[i+1]]
        splitstr = goodstr.strip().split(" ")
        splitGoods.append(splitstr)

    #  Reinterpolate the 'good' and 'bad' Lists, ensuring that
    #  additonal tokens from last good item get included
    zippedStr = []
    for i in range(len(bads)):
        zippedStr = addAllnonempty(zippedStr, splitGoods[i])
        zippedStr = addAllnonempty(zippedStr, bads[i])
    zippedStr = addAllnonempty(zippedStr, splitGoods[len(bads)])

    # BTO: our POS tagger wants "ur" and "you're" to both be one token.
    # Uncomment to get "you 're"
    #splitStr = []
    #for tok in zippedStr:
    #    splitStr.extend(splitToken(tok))
    #zippedStr = splitStr

    return zippedStr

def addAllnonempty(master, smaller):
    for s in smaller:
        strim = s.strip()
        if (len(strim) > 0):
            master.append(strim)
    return master

# "foo   bar " => "foo bar"
def squeezeWhitespace(input):
    return Whitespace.sub(" ", input).strip()

# Final pass tokenization based on special patterns
def splitToken(token):
    m = Contractions.search(token)
    if m:
        return [m.group(1), m.group(2)]
    return [token]

# Assume 'text' has no HTML escaping.
def tokenize(text):
    return simpleTokenize(squeezeWhitespace(text))


# Twitter text comes HTML-escaped, so unescape it.
# We also first unescape &amp;'s, in case the text has been buggily double-escaped.
def normalizeTextForTagger(text):
    assert sys.version_info[0] >= 3 and sys.version_info[1] > 3, 'Python version >3.3 required'
    text = text.replace("&amp;", "&")
    text = html.unescape(text)
    return text

# This is intended for raw tweet text -- we do some HTML entity unescaping before running the tagger.
#
# This function normalizes the input text BEFORE calling the tokenizer.
# So the tokens you get back may not exactly correspond to
# substrings of the original text.
def tokenizeRawTweetText(text):
    tokens = tokenize(normalizeTextForTagger(text))
    return tokens


if __name__ == '__main__':
    for line in sys.stdin:
        print(' '.join(tokenizeRawTweetText(line)))


In [135]:
stats=dict()
low_scores=[]
tokenizer=TweetTokenizer()
for i in langs:
    stats[i]=[]
    current_lang=os.getcwd()+'/machamp/data/'+i+'_conl.eval'
    data=make_tokenized_pair(current_lang)
    for line in data:
        tokenized = tokenize(line[0])
        tokenized_true = line[2].split('omega_ts')[:-1]
        if i=='es':
            tokenized = clean(tokenized)
        if i =='da':
            tokenized_true=clean(tokenized_true)
            tokenized = clean(tokenized)

        ones=[1 for i in tokenized_true]
        pred_list=create_pred_list(tokenized_true,tokenized)
        recall=recall_score(y_true=ones, y_pred=pred_list)
        score=recall
        if score<0.93:
            low_scores.append([tokenized,tokenized_true,score,i])
        stats[i].append(score)

In [136]:
for lang in stats:
    print('Average score for ', lang,': ', sum(stats[lang])/len(stats[lang]))

Average score for  es :  0.905361809540099
Average score for  sr :  0.97403194004355
Average score for  sl :  0.9682785634631763
Average score for  hr :  0.9758417008964796
Average score for  da :  0.9005999649932893
Average score for  en :  0.9419050952073615
Average score for  ir :  0.9707942123597965
Average score for  it :  0.9651606028383585


In [147]:
for i in low_scores:
    if i[3]=='da':
        print('The True ones: ', i[1])
        print('The Pred ones: ', i[0])
        print('score: ', i[2])
        print('\n')

The True ones:  ['eeeej', '´', '?']
The Pred ones:  ['eeeej´', '?']
score:  0.3333333333333333


The True ones:  ['eej', '´', '?']
The Pred ones:  ['eej´', '?']
score:  0.3333333333333333


The True ones:  ['19-10-', '2007', '18:39', 'skrev', 'katiiiee', 'p13', 'erhm', '.', '-', 'sweety', '?']
The Pred ones:  ['19-10-2007', '18:39', 'skrev', 'katiiiee', 'p13', 'erhm', '.', '-', 'sweety', '?']
score:  0.8181818181818182


The True ones:  ['saa', 'skal', 'vi', 'sq', 'lave', 'sange', 'samn', 'd', ';', '<3', '3', 'nar', 'jeg', 'høre', 'dig', 'synge', ',', '.']
The Pred ones:  ['saa', 'skal', 'vi', 'sq', 'lave', 'sange', 'samn', 'd', ';<33', 'nar', 'jeg', 'høre', 'dig', 'synge', ',.']
score:  0.7222222222222222


The True ones:  [';', 'd', 'i', 'just', 'love', 'teh', 'scrabbles', '!']
The Pred ones:  [';', 'di', 'just', 'love', 'teh', 'scrabbles', '!']
score:  0.75


The True ones:  ['_', '\\', '-', '...', 'af', 'ren', 'og', 'sk', 'aer', 'jalouxi', '.', '_', 'arto', 'historier', ':', '11-09

# Evaluating MaChAmp

In [157]:
stats = dict()
low_scores=[]
for i in langs:
    stats[i]=[]#load in the results from the predictions 
    pred_file=os.getcwd()+'/machamp/predictions/{}_result.out'.format(i)
    pred_list=make_tokenized_pair(pred_file)
    
    true_file=os.getcwd()+'/machamp/data/{}_conl.eval'.format(i)
    true_list=make_tokenized_pair(true_file)
    glob_counter=0
    for pred,true in zip(pred_list,true_list):
        counter=1
        while pred[0]!=true[0]:
            true=true_list[glob_counter+counter]
            counter+=1
        tokenized=pred[2].split('omega_ts')[:-1]
        tokenized_true=true[2].split('omega_ts')[:-1]
        if i=='es':
            tokenized = clean(tokenized)
        if i =='da':
            tokenized_true=clean(tokenized_true)
            tokenized = clean(tokenized)

        ones=[1 for i in tokenized_true]
        predicted_list=create_pred_list(tokenized_true,tokenized)
        recall=recall_score(y_true=ones, y_pred=predicted_list)
        score=recall
        if recall<0.9:
            low_scores.append([tokenized,tokenized_true,score,i])
        stats[i].append(score)
        glob_counter+=1


es 439 439
sr 2730 2730
sl 2977 2977
hr 3099 3099
da 84 84
en 1419 1419
ir 345 345
it 2677 2677


In [158]:
for lang in stats:
    print('Average score for ', lang,': ', sum(stats[lang])/len(stats[lang]))

Average score for  es :  0.9280632977167205
Average score for  sr :  0.9904905610368828
Average score for  sl :  0.9841375742308036
Average score for  hr :  0.9837810025683275
Average score for  da :  0.9333860519562285
Average score for  en :  0.9857017316093317
Average score for  ir :  0.9879718721889826
Average score for  it :  0.9948890216062959


In [168]:
for i in low_scores:
    if i[3]=='sr':
        print('The True ones: ', i[1])
        print('The Pred ones: ', i[0])
        print('score: ', i[2])
        print('\n')

The True ones:  ['Ja', 'sam', 'na', 'nivou', '12godišnjaka', '.']
The Pred ones:  ['Ja', 'sam', 'na', 'nivou', '12', 'godišnjaka', '.']
score:  0.8333333333333334


The True ones:  ['1.', 'Omiljena', 'boja', '2.', 'Ime', 'mog', 'ljubimca', '3.', 'Ja', 'sam', 'odeljenje', '8', '/', '__']
The Pred ones:  ['1.', 'Omiljena', 'boja', '2.', 'Ime', 'mog', 'ljubimca', '3.', 'Ja', 'sam', 'odeljenje', '8', '/__']
score:  0.8571428571428571


The True ones:  ['@billyjane', "Vid'", 'u', 'pravu', 'si', ',', 'bokte', '.']
The Pred ones:  ['@billyjane', "Vid'u", 'pravu', 'si', ',', 'bokte', '.']
score:  0.75


The True ones:  ['C:', 'C:']
The Pred ones:  ['C:C:']
score:  0.0


The True ones:  ['e', 'sad', 'zbog', 'vase', 'reklame', 'sta', 'je', 'za', 'vas', 'najbolje', 'moju', 'babu', 'interesuje', 'sta', 'je', 'poza', '69', '...']
The Pred ones:  ['e', 'sad', 'zbog', 'vase', 'reklame', 'sta', 'je', 'za', 'vas', 'najbolje', 'moju', 'babu', 'interesuje', 'sta', 'je', 'poza69', '...']
score:  0.8888888