In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy.tokens import Token
from spacy.tokenizer import Tokenizer
from spacy.matcher import Matcher
from spacy.lang.tokenizer_exceptions import URL_PATTERN
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex
import re

In [22]:
# clean text before spacy
def cleanText(text):
    """Flatten raw text onto a single line before it is passed to spaCy.

    Leading and trailing whitespace is stripped, every run of line
    terminators (LF, CR or CRLF) becomes one space, and every run of spaces
    is collapsed to a single space. Other whitespace such as tabs is left
    untouched.

    Parameters
    ----------
    text : str
        Raw text, possibly containing line breaks.

    Returns
    -------
    str
        The single-line text without doubled spaces.
    """
    # A single str.replace("  ", " ") pass only halves each run of spaces, so
    # blank lines or Windows line endings used to leave double spaces behind;
    # the regexes collapse whole runs instead.
    single_line = re.sub(r"[\r\n]+", " ", text.strip())
    return re.sub(r" {2,}", " ", single_line)

# deal with more symbols to separate tokens
def custom_tokenizer_modified(nlp):
    """Build a tokenizer whose prefix and suffix rules also include "-".

    The prefix and suffix patterns are the defaults of ``nlp`` with "-"
    appended to each list. spaCy's URL pattern is supplied as ``token_match``.
    Only prefix, suffix and token-match rules are handed to the tokenizer:
    no infix rules and no tokenizer exceptions are passed.

    Parameters
    ----------
    nlp : spaCy language object
        Provides the shared vocab and the default prefix/suffix patterns.

    Returns
    -------
    Tokenizer
        A tokenizer bound to ``nlp.vocab``.
    """
    # spaCy's default patterns have to be passed along explicitly when the
    # tokenizer is replaced, otherwise the standard behaviour is lost
    prefix_patterns = tuple([*nlp.Defaults.prefixes, "-"])
    suffix_patterns = tuple([*nlp.Defaults.suffixes, "-"])
    find_prefix = compile_prefix_regex(prefix_patterns).search
    find_suffix = compile_suffix_regex(suffix_patterns).search

    # spaCy's stock URL regex, used unchanged as the token_match rule
    match_url = re.compile(URL_PATTERN).match

    return Tokenizer(
        nlp.vocab,
        prefix_search=find_prefix,
        suffix_search=find_suffix,
        token_match=match_url,
    )

# Test customized tokenization on Sample

In [None]:
# Load the large Spanish spaCy model
nlp = spacy.load('es_core_news_lg')
# Replace the default tokenizer with the custom one
nlp.tokenizer = custom_tokenizer_modified(nlp)
# Read the raw text; the context manager closes the file handle, which the
# bare open(...).read() call never did
with open("data/Sample-text.txt", encoding="utf-8") as sample_file:
    text_Sample = sample_file.read()
# Clean the text
clean_text_Sample = cleanText(text_Sample)
# Run the pipeline to get a sequence of Token objects
doc_Sample = nlp(clean_text_Sample)

In [None]:
# Initialize the Matcher with the pipeline's vocab
matcher = Matcher(nlp.vocab)

# Deal with Hashtag
###############################################################
# Pattern for a hashtag: a '#' token directly followed by an alphabetic token
# (IS_ALPHA is not restricted to ASCII).
# The patterns are passed as a list in second position: this form is accepted
# from spaCy v2.2.2 on, whereas the legacy add(key, None, pattern) call is
# rejected by spaCy v3.
matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ALPHA": True}]])

# Register token extension for hashtag (force=True makes the cell re-runnable)
Token.set_extension("is_hashtag", default=False, force=True)

# Run the matcher over the document
matches = matcher(doc_Sample)

# Collect the hashtag spans first; merging happens in one retokenize block
hashtags = []
for match_id, start, end in matches:
    if doc_Sample.vocab.strings[match_id] == "HASHTAG":
        hashtags.append(doc_Sample[start:end])
with doc_Sample.retokenize() as retokenizer:
    for span in hashtags:
        # The merge is only applied when the block exits, so the flag is set
        # through `attrs` on the merged token instead of on the two tokens
        # that are about to disappear.
        retokenizer.merge(span, attrs={"_": {"is_hashtag": True}})
##############################################################

# Assign tag to tokens. The checks are independent and run in order, so when
# several apply the last one wins (Hashtag is the weakest, Punctuation the
# strongest).
for token in doc_Sample:
    if token._.is_hashtag:
        token.tag_ = 'Hashtag'
    if token.like_url:
        token.tag_ = 'URL'
    if token.like_email:
        token.tag_ = 'Email'
    if token.is_stop:
        token.tag_ = 'Stop Word'
    if token.like_num:
        token.tag_ = 'Number'
    if token.is_punct:
        token.tag_ = 'Punctuation'

In [None]:
# Collect the per-token attributes into a table, one row per token
df_Sample = pd.DataFrame({
    'Token': [token.text for token in doc_Sample],
    'POS': [token.pos_ for token in doc_Sample],
    # IOB flag of the named-entity annotation (not the entity type)
    'NE': [token.ent_iob_ for token in doc_Sample],
    'Lemma': [token.lemma_ for token in doc_Sample],
    'Tag': [token.tag_ for token in doc_Sample],
})
df_Sample.head()

In [None]:
# Load the gold-standard token table for the sample.
# NOTE(review): read_csv's default NA handling turns cells such as "NA" or
# "null" into NaN -- confirm no such token occurs in the 'Token' column.
Sample_GS = pd.read_csv(
    'data/Sample-GS.csv',
    encoding='utf-8',
)
print(Sample_GS)

In [None]:
# Compare the gold-standard tokens with the spaCy tokens for the sample.
# The outer join on 'Token' keeps the rows of both tables; the 'Exist' column
# records whether a token text occurs in both tables or in only one of them.
# NOTE(review): the join key is the token text, so repeated tokens are paired
# with every occurrence on the other side and the joined frame can get large.
outer_df_Sample = pd.merge(
    Sample_GS,
    df_Sample,
    how='outer',
    on='Token',
    indicator='Exist',
)
# Keep only the rows whose token text is missing from one of the two tables
unmatched_Sample = outer_df_Sample['Exist'] != 'both'
diff_df_Sample = outer_df_Sample[unmatched_Sample]
print("Number of non-matching token for Sample is %s" % len(diff_df_Sample))

# Test customized tokenization on OpinionArticles

In [23]:
# Load the large Spanish spaCy model
nlp = spacy.load('es_core_news_lg')
# Replace the default tokenizer with the custom one
nlp.tokenizer = custom_tokenizer_modified(nlp)
# Read the raw text; the context manager closes the file handle, which the
# bare open(...).read() call never did
with open("data/OpinionArticles-text.txt", encoding="utf-8") as opinion_file:
    text_OpinionArticles = opinion_file.read()
# Clean the text
clean_text_OpinionArticles = cleanText(text_OpinionArticles)
# Run the pipeline to get a sequence of Token objects
doc = nlp(clean_text_OpinionArticles)

In [24]:
# Initialize the Matcher with the pipeline's vocab
matcher = Matcher(nlp.vocab)

# Deal with Hashtag
###############################################################
# Pattern for a hashtag: a '#' token directly followed by an alphabetic token
# (IS_ALPHA is not restricted to ASCII).
# The patterns are passed as a list in second position: this form is accepted
# from spaCy v2.2.2 on, whereas the legacy add(key, None, pattern) call is
# rejected by spaCy v3.
matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ALPHA": True}]])

# Register token extension for hashtag (force=True makes the cell re-runnable)
Token.set_extension("is_hashtag", default=False, force=True)

# Run the matcher over the document
matches = matcher(doc)

# Collect the hashtag spans first; merging happens in one retokenize block
hashtags = []
for match_id, start, end in matches:
    if doc.vocab.strings[match_id] == "HASHTAG":
        hashtags.append(doc[start:end])
with doc.retokenize() as retokenizer:
    for span in hashtags:
        # The merge is only applied when the block exits, so the flag is set
        # through `attrs` on the merged token instead of on the two tokens
        # that are about to disappear.
        retokenizer.merge(span, attrs={"_": {"is_hashtag": True}})
##############################################################

# Assign tag to tokens. The checks are independent and run in order, so when
# several apply the last one wins (Hashtag is the weakest, Punctuation the
# strongest).
for token in doc:
    if token._.is_hashtag:
        token.tag_ = 'Hashtag'
    if token.like_url:
        token.tag_ = 'URL'
    if token.like_email:
        token.tag_ = 'Email'
    if token.is_stop:
        token.tag_ = 'Stop Word'
    if token.like_num:
        token.tag_ = 'Number'
    if token.is_punct:
        token.tag_ = 'Punctuation'

In [25]:
# Collect the per-token attributes into a table, one row per token
df = pd.DataFrame({
    'Token': [token.text for token in doc],
    'POS': [token.pos_ for token in doc],
    # IOB flag of the named-entity annotation (not the entity type)
    'NE': [token.ent_iob_ for token in doc],
    'Lemma': [token.lemma_ for token in doc],
    'Tag': [token.tag_ for token in doc],
})
df.head()

Unnamed: 0,Token,POS,NE,Lemma,Tag
0,"ARTICLE0003,""Qué",INTJ,O,"ARTICLE0003,""Qué",PROPN
1,es,AUX,O,ser,Stop Word
2,el,DET,O,el,Stop Word
3,"""",PUNCT,O,"""",Punctuation
4,sharenting,INTJ,O,sharenting,PROPN


In [26]:
# Load the retokenized gold-standard token table for the opinion articles.
# NOTE(review): read_csv's default NA handling turns cells such as "NA" or
# "null" into NaN -- confirm no such token occurs in the 'Token' column.
GoldStandard_csv_OA = pd.read_csv(
    'data/OpinionArticlesRetokenized-GS.csv',
    encoding='utf-8',
)
print(GoldStandard_csv_OA)

       New_Index        Token Language Anglicism Adapted
0              1  ARTICLE0003      NaN     FALSE     NaN
1              2            ,      NaN     FALSE     NaN
2              3            "      NaN     FALSE     NaN
3              4          Qué      NaN     FALSE     NaN
4              5           es      NaN     FALSE     NaN
...          ...          ...      ...       ...     ...
13326      13327     millones      NaN     FALSE     NaN
13327      13328           de      NaN     FALSE     NaN
13328      13329   visitantes      NaN     FALSE     NaN
13329      13330       únicos      NaN     FALSE     NaN
13330      13331            "      NaN     FALSE     NaN

[13331 rows x 5 columns]


In [27]:
# Load the WebAnno export of the opinion-article gold standard.
# NOTE(review): read_csv's default NA handling turns cells such as "NA" or
# "null" into NaN -- confirm no such token occurs in the 'Token' column.
WebAnno_csv_OA = pd.read_csv(
    'data/OpinionArticles-WebAnno-GS.csv',
    encoding='utf-8',
)
print(WebAnno_csv_OA)

      text-token   token-char        Token Adapted EngCategory
0            1-1         0-11  ARTICLE0003       _           _
1            1-2        11-12            ,       _           _
2            1-3        12-13            "       _           _
3            1-4        13-16          Qué       _           _
4            1-5        17-19           es       _           _
...          ...          ...          ...     ...         ...
13320     511-11  69361-69369     millones       _           _
13321     511-12  69370-69372           de       _           _
13322     511-13  69373-69383   visitantes       _           _
13323     511-14  69384-69390       únicos       _           _
13324     511-15  69390-69391            "       _           _

[13325 rows x 5 columns]


In [30]:
# Compare the gold-standard tokens with the spaCy tokens for OpinionArticles.
# The outer join on 'Token' keeps the rows of both tables; the 'Exist' column
# records whether a token text occurs in both tables or in only one of them.
# NOTE(review): the join key is the token text, so repeated tokens are paired
# with every occurrence on the other side -- the stored output shows row
# labels above two million for inputs of roughly 13k rows.
outer_df_OA = pd.merge(
    GoldStandard_csv_OA,
    df,
    how='outer',
    on='Token',
    indicator='Exist',
)
# Keep only the rows whose token text is missing from one of the two tables
unmatched_OA = outer_df_OA['Exist'] != 'both'
diff_df_OA = outer_df_OA[unmatched_OA]
print("Number of non-matching token for OpinionArticles is %s" % len(diff_df_OA))
print(diff_df_OA)
# Save the mismatches for manual inspection (header row, no index column)
diff_df_OA.to_csv(r'data/diff-token-OA.csv', index=None, header=True)

Number of non-matching token for OpinionArticles is 77
         New_Index          Token Language Anglicism Adapted   POS   NE  \
0              1.0    ARTICLE0003      NaN     FALSE     NaN   NaN  NaN   
2246082      705.0              %      NaN     FALSE     NaN   NaN  NaN   
2246083      768.0              %      NaN     FALSE     NaN   NaN  NaN   
2246084     9026.0              %      NaN     FALSE     NaN   NaN  NaN   
2246085    12523.0              %      NaN     FALSE     NaN   NaN  NaN   
...            ...            ...      ...       ...     ...   ...  ...   
2273376        NaN        Educ.ar      NaN       NaN     NaN  INTJ    B   
2273377        NaN  libre/abierto      NaN       NaN     NaN  INTJ    O   
2273378        NaN            10%      NaN       NaN     NaN   SYM    O   
2273379        NaN            25%      NaN       NaN     NaN   SYM    O   
2273380        NaN          12,8%      NaN       NaN     NaN   SYM    O   

                 Lemma                      