In [114]:
from pathlib import Path
url = 'https://github.com/UniversalDependencies/UD_Spanish-AnCora.git'
path = Path('/tmp/spanish-corpus/')

In [115]:
from conllu import parse_incr

In [116]:
!git clone $url $path && echo 'Done.'

fatal: destination path '/tmp/spanish-corpus' already exists and is not an empty directory.


In [119]:
import numpy as np

In [126]:
word_list = []
first_word_list = []
counter = 0
conllu_file = 'es_ancora-ud-dev.conllu'
with open(path / conllu_file, 'r', encoding='utf-8') as f:
    for tokenlist in parse_incr(f):
        for token in tokenlist:
            word_list.append(token)
        first_word_list.append(tokenlist[0])

$$ C(tag) $$

$$ C(\text{word} \cap \text{tag}) $$ 

$$ C(\text{word}_{\text{first}}) $$ 

$$ C(\text{tag} \cap \text{tag}_{\text{prev}})$$

In [101]:
import pandas as pd
import numpy as np

In [102]:
df = pd.DataFrame(word_list)
df = df[['form','upos']]
df.rename(columns={'form': 'word', 'upos': 'tag'}, inplace=True)
df.word = df.word.str.lower()
df['tag_prev'] = df['tag'].shift(1)
df

Unnamed: 0,word,tag,tag_prev
0,el,DET,
1,gobernante,NOUN,DET
2,",",PUNCT,NOUN
3,con,ADP,PUNCT
4,ganada,ADJ,ADP
...,...,...,...
55477,llamaba,VERB,PRON
55478,francisco,PROPN,VERB
55479,franco,PROPN,PROPN
55480,"""",PUNCT,PROPN


In [131]:
print(df.head().to_markdown())

|    | word       | tag   | tag_prev   |
|---:|:-----------|:------|:-----------|
|  0 | el         | DET   |            |
|  1 | gobernante | NOUN  | DET        |
|  2 | ,          | PUNCT | NOUN       |
|  3 | con        | ADP   | PUNCT      |
|  4 | ganada     | ADJ   | ADP        |


In [135]:
tag_count = df['tag'].value_counts()
tagpair_count = df[['tag', 'tag_prev']].drop(index=0).value_counts()

In [104]:
wordtag_count = df[['word', 'tag']].value_counts()

In [155]:
pd.Series(list(map(lambda x: x['upos'], first_word_list))).value_counts(normalize=True)

DET      0.362757
ADP      0.155381
PROPN    0.112455
PUNCT    0.082225
PRON     0.063482
ADV      0.056832
CCONJ    0.032648
NOUN     0.027207
SCONJ    0.024184
VERB     0.024184
NUM      0.019952
AUX      0.016324
ADJ      0.010883
_        0.009069
PART     0.001814
INTJ     0.000605
dtype: float64

In [157]:
df_init = pd.DataFrame(first_word_list)
df_init = df_init.apply(lambda row: {"init_tag": row["upos"]}, axis = 1, result_type='expand')
inittag_prob = df_init["init_tag"].value_counts(normalize=True)
for tag_type in tag_count.index:
    if tag_type not in inittag_prob.index:
        inittag_prob[tag_type] = 0
assert(inittag_prob)
inittag_prob

DET      0.362757
ADP      0.155381
PROPN    0.112455
PUNCT    0.082225
PRON     0.063482
ADV      0.056832
CCONJ    0.032648
NOUN     0.027207
SCONJ    0.024184
VERB     0.024184
NUM      0.019952
AUX      0.016324
ADJ      0.010883
_        0.009069
PART     0.001814
INTJ     0.000605
SYM      0.000000
Name: init_tag, dtype: float64

In [106]:
tag_prob = tag_count / tag_count.sum()
tagpair_prob = tagpair_count / tagpair_count.sum()
wordtag_prob = wordtag_count / wordtag_count.sum()

In [158]:
prior = tagpair_prob /  tag_prob[tagpair_count.reset_index()['tag_prev']].values
prior

tag    tag_prev
NOUN   DET         0.705362
DET    ADP         0.514030
ADP    NOUN        0.323474
PUNCT  NOUN        0.217386
NOUN   ADP         0.194653
                     ...   
SCONJ  NUM         0.001143
ADJ    SYM         0.027778
PART   PROPN       0.000247
VERB   INTJ        0.100002
PART   PRON        0.000317
Length: 212, dtype: float64

In [108]:
likelihood = wordtag_prob / tag_prob[wordtag_count.reset_index()['tag']].values
likelihood

word       tag  
de         ADP      0.461457
,          PUNCT    0.451372
el         DET      0.342932
la         DET      0.232469
.          PUNCT    0.263208
                      ...   
estimular  VERB     0.000219
estimulan  VERB     0.000219
estimar    VERB     0.000219
estimado   ADJ      0.000282
únicos     ADJ      0.000282
Length: 10526, dtype: float64

In [109]:
Path.mkdir(Path.cwd().parent.parent.parent / 'data', exist_ok=True)

In [110]:
prior.sort_index().to_pickle(Path.cwd().parent.parent.parent / 'data' / 'transitionHMM.npy')
likelihood.sort_index().to_pickle(Path.cwd().parent.parent.parent / 'data' / 'emissionHMM.npy')
inittag_prob.sort_index().to_pickle(Path.cwd().parent.parent.parent / 'data' / 'inittagprobHMM.npy')