# Get all verbal lemmas for Quechua in CONLL 2017

In [1]:
import pandas as pd

In [2]:
quechua_dev_path = "quechua-dev"

In [3]:
quechua_dev = pd.read_csv(quechua_dev_path, header=None, sep= "\t")

In [4]:
quechua_dev.columns

Int64Index([0, 1, 2], dtype='int64')

In [5]:
# what is inside the third column? The tags!
quechua_dev[2] 

0          N;PSS2P;NOM;PL
1          N;PSS1S;ACC;SG
2         N;PSS1PE;ABL;SG
3          N;PSS2P;NOM;PL
4          N;PSS3P;NOM;SG
              ...        
995      N;PSS1S;EXCLV;SG
996            V;PRS;3;PL
997            N;COMPV;SG
998    ADJ;PSS3S;COMPV;PL
999      N;PSS2S;EXCLV;PL
Name: 2, Length: 1000, dtype: object

In [6]:
# keep only the verbal lemmas!
# They have the tag "V" in their tagset

In [7]:
verbs_index = quechua_dev[2].apply(lambda st: st.startswith("V;"))

In [8]:
quechua_dev_verbs = quechua_dev[verbs_index]
quechua_dev_verbs

Unnamed: 0,0,1,2
6,hunt'ay,hunt'aychik,V;IMP;POS;2;PL
12,tiyamuy,tiyamuq,V;NFIN;AGT
15,t'ustuy,t'ustuychik,V;IMP;POS;2;PL
16,ch'isiyay,ch'isiyarqanku,V;PST;FH;3;PL
33,pisipay,pisipanqa,V;FUT;3;SG
...,...,...,...
960,ch'ipchiy,ch'ipchisqanki,V;PST;NFH;2;SG
981,hayratachiy,amahayratachiychu,V;IMP;NEG;2;SG
986,ch'isiyay,ch'isiyanqa,V;FUT;3;SG
988,anchaykachay,anchaykacharqaniku,V;PST;FH;1+EXCL;PL


In [9]:
# All the lemmas are in infinitive, which always ends with "-y"
all(quechua_dev_verbs[0].apply(lambda st: st.endswith("y")))

True

In [10]:
# see which tags are used for verbs
tags = ";".join(
    list(quechua_dev_verbs[2])
)
tags = set(tags.split(";"))
tags

{'1',
 '1+EXCL',
 '1+INCL',
 '2',
 '3',
 'AGT',
 'FH',
 'FUT',
 'IMP',
 'NEG',
 'NFH',
 'NFIN',
 'PL',
 'POS',
 'PRS',
 'PST',
 'SG',
 'V'}

In [12]:
# how many different tagsets are there?
len(set(list(quechua_dev_verbs[2])))

37

In [13]:
tags_list = list(set(list(quechua_dev_verbs[2])))
tags_list.sort()
tags_list

['V;FUT;1+INCL;PL',
 'V;FUT;1;SG',
 'V;FUT;2;PL',
 'V;FUT;2;SG',
 'V;FUT;3;PL',
 'V;FUT;3;SG',
 'V;IMP;NEG;2;PL',
 'V;IMP;NEG;2;SG',
 'V;IMP;NEG;3;PL',
 'V;IMP;NEG;3;SG',
 'V;IMP;POS;2;PL',
 'V;IMP;POS;2;SG',
 'V;IMP;POS;3;PL',
 'V;IMP;POS;3;SG',
 'V;NFIN',
 'V;NFIN;AGT',
 'V;PRS;1+EXCL;PL',
 'V;PRS;1+INCL;PL',
 'V;PRS;1;SG',
 'V;PRS;2;PL',
 'V;PRS;2;SG',
 'V;PRS;3;PL',
 'V;PRS;3;SG',
 'V;PST;FH;1+EXCL;PL',
 'V;PST;FH;1+INCL;PL',
 'V;PST;FH;1;SG',
 'V;PST;FH;2;PL',
 'V;PST;FH;2;SG',
 'V;PST;FH;3;PL',
 'V;PST;FH;3;SG',
 'V;PST;NFH;1+EXCL;PL',
 'V;PST;NFH;1+INCL;PL',
 'V;PST;NFH;1;SG',
 'V;PST;NFH;2;PL',
 'V;PST;NFH;2;SG',
 'V;PST;NFH;3;PL',
 'V;PST;NFH;3;SG']

In [14]:
# some examples

quechua_dev_verbs.groupby(quechua_dev_verbs[2]).sample(1)

Unnamed: 0,0,1,2
670,añaychay,añaychasunchik,V;FUT;1+INCL;PL
440,q'apay,q'apasaq,V;FUT;1;SG
191,sinchiyay,sinchiyankichik,V;FUT;2;PL
227,allay,allanki,V;FUT;2;SG
77,hamuy,hamunqaku,V;FUT;3;PL
661,qunqay,qunqanqa,V;FUT;3;SG
857,wikch'uy,amawikch'uychikchu,V;IMP;NEG;2;PL
518,chapatiyay,amachapatiyaychu,V;IMP;NEG;2;SG
579,tiyamuy,amatiyamuchunkuchu,V;IMP;NEG;3;PL
788,chimpay,amachimpachunchu,V;IMP;NEG;3;SG


In [16]:
# add columns
quechua_dev_verbs.columns = ['root', 'inflection', 'tags']
quechua_dev_verbs.head()

Unnamed: 0,root,inflection,tags
6,hunt'ay,hunt'aychik,V;IMP;POS;2;PL
12,tiyamuy,tiyamuq,V;NFIN;AGT
15,t'ustuy,t'ustuychik,V;IMP;POS;2;PL
16,ch'isiyay,ch'isiyarqanku,V;PST;FH;3;PL
33,pisipay,pisipanqa,V;FUT;3;SG


# make a column of root + tags

In [22]:
# put the tagset in brackets
quechua_dev_verbs['root_and_tags'] = (
    [root + "[" + tag + "]" for root, tag in zip(list(quechua_dev_verbs['root']), list(quechua_dev_verbs['tags']))]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  quechua_dev_verbs['root_and_tags'] = (


In [23]:
quechua_dev_verbs.head()

Unnamed: 0,root,inflection,tags,root_and_tags
6,hunt'ay,hunt'aychik,V;IMP;POS;2;PL,hunt'ay[V;IMP;POS;2;PL]
12,tiyamuy,tiyamuq,V;NFIN;AGT,tiyamuy[V;NFIN;AGT]
15,t'ustuy,t'ustuychik,V;IMP;POS;2;PL,t'ustuy[V;IMP;POS;2;PL]
16,ch'isiyay,ch'isiyarqanku,V;PST;FH;3;PL,ch'isiyay[V;PST;FH;3;PL]
33,pisipay,pisipanqa,V;FUT;3;SG,pisipay[V;FUT;3;SG]


# save

In [24]:
# first sort alphabetically
quechua_dev_verbs = quechua_dev_verbs.sort_values('root_and_tags')

In [27]:
#save the dataset
quechua_dev_verbs.to_csv(
    "quechua_dev_verbs.csv", 
    index=False,
)

In [33]:
# save the roots
quechua_dev_verbs['root'].to_csv('quechua_lemmas.txt', header=None, index=False)