
@author: Lucas Ferro Antunes de Oliveira

This script transforms the PUCPR corpus tagset into the Penn Treebank tagset (with no INFLECTION tags)
The original tagset: https://visl.sdu.dk/visl/pt/info/portsymbol.html


In [None]:
# Importing libraries

import matplotlib.pyplot as plt
import io
import collections
import unidecode


In [None]:
# Prefix prepended (by plain string concatenation) to every corpus input and
# output path used in this notebook. With the empty string, paths are relative
# to the current working directory; a non-empty value must therefore end with
# a path separator.
_PATH_CORPORA = ""

In [None]:
# Frequency counters filled by the first conversion pass (original word forms)

# Tag -> count tables: dictTags is keyed by the original PUCPR tag, the two
# "PostNormalization" tables by the adapted tag (first and second pass).
dictTags = dict()
dictTagsPostNormalization = dict()
dictTagsPostNormalization2 = dict()

# One word -> count table per PUCPR tag handled by the replace rules.
# The generator yields a fresh dict for every name, so no table is shared.
(dictPRN, dictART, dictN, dictPREP, dictADV, dictNUM,
 dictADJ, dictIN, dictNPROP, dictPCP, dictV) = (dict() for _ in range(11))

In [None]:
# Word -> count tables for the second conversion pass (accent-stripped,
# lower-cased word forms), one per PUCPR tag handled by the replace rules.
# The generator yields a fresh dict for every name, so no table is shared.
(dictPRN2, dictART2, dictN2, dictPREP2, dictADV2, dictNUM2,
 dictADJ2, dictIN2, dictNPROP2, dictPCP2, dictV2) = (dict() for _ in range(11))

In [None]:
# Output file 1: the corpus with the adapted tagset and the original word
# forms. The handle stays open for the conversion pass and is closed in a
# later cell; the -DOCSTART- header is the first line of the file.
fw = io.open(
    _PATH_CORPORA + "PUCPR-tagset-adapted-penntreebank-update.txt",
    mode='w',
    encoding='utf8',
)
fw.write("-DOCSTART- -X- -X- O\n")

In [None]:
# Output file 2: the corpus with the adapted tagset and normalized
# (accent-stripped, lower-cased) word forms. The handle stays open for the
# second conversion pass and is closed in a later cell.
# NOTE(review): the file name spells "penntrebank" (single "e"), unlike the
# first output file ("penntreebank"); kept as is because downstream code may
# rely on the exact name - confirm before renaming.
fw2 = io.open(
    _PATH_CORPORA + "PUCPR-tagset-adapted-penntrebank-no-accents.txt",
    mode='w',
    encoding='utf8',
)
fw2.write("-DOCSTART- -X- -X- O\n")

In [None]:
# Pass 1: convert the PUCPR tags to Penn Treebank tags, keeping the original
# word forms, and write one "word tag O - O" line per token to fw.
#
# Input, as parsed below: one sentence per line, tokens separated by a single
# space, each token written as "word_TAG".
#
# The corpus path uses "/" so it resolves on every OS; the former
# "corpora\PUCPR.txt" literal only worked on Windows and relied on the
# invalid escape sequence "\P".
with io.open(_PATH_CORPORA + "corpora/PUCPR.txt", 'r', encoding='utf8') as f:
    text = f.read()

# Replace lists (sets, because they are only used for membership tests)
# More on https://www.infoescola.com/portugues/pronomes/ and https://www.infoescola.com/portugues/adverbios/
PRNtoDT = {
    "esta", "este", "estas", "estes", "isto", "isso", "aquilo",
    "aquele", "aquela", "aqueles", "aquelas",
    "meu", "minha", "meus", "minhas", "teu", "tua", "teus", "tuas",
    "sua", "seu", "suas", "seus",
    "nosso", "nossa", "nossos", "nossas", "vosso", "vossa", "vossos", "vossas",
    "cujo", "cuja", "cujos", "cujas",
    "demais", "os demais", "as demais", "todo", "todos", "qual",
    "alguma", "algum", "algumas", "alguns",
    "outro", "outra", "outros", "outras",
    "mesmo", "mesma", "mesmos", "mesmas",
    "neste", "nesta", "nestes", "nestas",
    "tanto", "cada", "quanto", "quantos", "quanta", "quantas",
}
PRNtoPRP = {
    "eu", "tu", "ele", "ela", "nós", "vós", "eles", "elas",
    "me", "mim", "comigo", "te", "ti", "contigo", "se", "si", "consigo",
    "o", "a", "lhe", "nos", "conosco", "vos", "convosco",
    "os", "as", "lhes", "lo", "la", "los", "las",
}
PRNtoPRPS = {
    "meu", "minha", "meus", "minhas", "teu", "tua", "teus", "tuas",
    "nosso", "nossa", "nossos", "nossas", "vosso", "vossa", "vossos", "vossas",
    "seu", "sua", "seus", "suas",
}
# A PRNtoSPEC list is intentionally not applied: SPEC has no matching tag in
# the Penn Treebank tagset.

# Iterate through sentences
for s in text.split("\n"):

    # Blank lines (e.g. the one produced by a trailing newline) hold no
    # tokens; skipping them avoids an IndexError on the tag lookup below.
    if not s.strip():
        continue

    # Empty line = sentence separator in the output file
    fw.write("\n")

    # Iterate through tokens
    for t in s.split(" "):

        item = t.split("_")

        # Empty tokens (double spaces) and tokens without a "_TAG" part
        # cannot be converted; skip them instead of crashing.
        if len(item) < 2:
            continue

        word = item[0]
        tag = item[1]
        word_lower = word.lower()

        # Survey of the original tags
        dictTags[tag] = dictTags.get(tag, 0) + 1

        # REPLACE RULES
        #
        # The rules run in sequence on purpose: a tag rewritten by an early
        # rule (e.g. ART -> ADJ) can be picked up by a later one (ADJ -> JJ).
        # New tags are assigned directly; the former tag.replace("ART", ...)
        # calls silently did nothing when the case-insensitive test matched a
        # tag that was not fully upper-case.

        # 1) ART -> DT (determiners)
        if tag.upper() == "ART":
            if word_lower == "uterina":  # Specific rule for (uterina == art) / 1 error
                tag = "ADJ"  # becomes JJ in rule 12
            else:
                tag = "DT"
                dictART[word] = dictART.get(word, 0) + 1

        # 2) IN -> UH (interjections)
        if tag.upper() == "IN":
            dictIN[word] = dictIN.get(word, 0) + 1
            tag = "UH"

        # 3) CJ -> CC or IN (coordinating and subordinating conjunctions)
        # Runs after rule 2, so the IN assigned here is not turned into UH.
        if tag.upper() == "CJ":
            if word_lower in ("e", "ou"):
                tag = "CC"  # e, ou (coordinating conjunctions)
            else:
                tag = "IN"  # "que", "se", "porque", etc (subordinating conjunctions)

        # 4) PCP -> VB (past participle -> verb base form)
        if tag.upper() == "PCP":
            tag = "VB"
            dictPCP[word] = dictPCP.get(word, 0) + 1

        # 5) PRN -> DT / ADJ / PRP / PRP$
        if tag.upper() == "PRN":
            dictPRN[word] = dictPRN.get(word, 0) + 1

            if word_lower in PRNtoDT:  # determiners
                tag = "DT"
            elif word_lower == "próprio":  # specific rule; becomes JJ in rule 12
                tag = "ADJ"
            elif word_lower in PRNtoPRP:  # personal pronouns
                tag = "PRP"
            elif word_lower in PRNtoPRPS:  # possessive pronouns
                # NOTE(review): every PRNtoPRPS entry is also in PRNtoDT, so
                # this branch is currently unreachable and possessives are
                # tagged DT. Order kept as in the original - confirm intent.
                tag = "PRP$"
            else:
                tag = "DT"  # determiners

        # 6) NPROP -> NNP (proper nouns, singular)
        if tag.upper() == "NPROP":
            dictNPROP[word] = dictNPROP.get(word, 0) + 1
            tag = "NNP"

        # 7) Rule for "x" when it's tagged as preposition - manually check if it's plural or singular
        if tag.upper() == "PREP" and word_lower == "x":
            word = "vezes"  # also covers "X", which replace("x", ...) missed

        # 8) PREP -> IN (preposition)
        if tag.upper() == "PREP":
            dictPREP[word] = dictPREP.get(word, 0) + 1
            tag = "IN"

        # 9) N -> NN (noun, singular); exact, case-sensitive match as before
        if tag == "N":
            dictN[word] = dictN.get(word, 0) + 1
            tag = "NN"

        # 10) ADV -> RB (adverb)
        if tag.upper() == "ADV":
            dictADV[word] = dictADV.get(word, 0) + 1
            tag = "RB"

        # 11) NUM -> CD (cardinal numbers)
        if tag.upper() == "NUM":
            dictNUM[word] = dictNUM.get(word, 0) + 1
            tag = "CD"

        # 12) ADJ -> JJ (adjectives)
        if tag.upper() == "ADJ":
            dictADJ[word] = dictADJ.get(word, 0) + 1
            tag = "JJ"

        # 13) V -> VB (verbs, base form); exact, case-sensitive match as before
        if tag == "V":
            dictV[word] = dictV.get(word, 0) + 1
            tag = "VB"

        # Survey of the adapted tags
        dictTagsPostNormalization[tag] = dictTagsPostNormalization.get(tag, 0) + 1

        # Write
        fw.write(word + " " + tag + " " + "O - O" + "\n")

In [None]:
# Pass 2: same tag conversion as the first pass, but every word is
# accent-stripped (unidecode) and lower-cased before it is counted and
# written; one "word tag O - O" line per token goes to fw2.
#
# Input, as parsed below: one sentence per line, tokens separated by a single
# space, each token written as "word_TAG".
#
# The corpus path uses "/" so it resolves on every OS; the former
# "corpora\PUCPR.txt" literal only worked on Windows and relied on the
# invalid escape sequence "\P".
with io.open(_PATH_CORPORA + "corpora/PUCPR.txt", 'r', encoding='utf8') as f:
    text = f.read()

# Replace lists (sets, because they are only used for membership tests)
# More on https://www.infoescola.com/portugues/pronomes/ and https://www.infoescola.com/portugues/adverbios/
PRNtoDT = {
    "esta", "este", "estas", "estes", "isto", "isso", "aquilo",
    "aquele", "aquela", "aqueles", "aquelas",
    "meu", "minha", "meus", "minhas", "teu", "tua", "teus", "tuas",
    "sua", "seu", "suas", "seus",
    "nosso", "nossa", "nossos", "nossas", "vosso", "vossa", "vossos", "vossas",
    "cujo", "cuja", "cujos", "cujas",
    "demais", "os demais", "as demais", "todo", "todos", "qual",
    "alguma", "algum", "algumas", "alguns",
    "outro", "outra", "outros", "outras",
    "mesmo", "mesma", "mesmos", "mesmas",
    "neste", "nesta", "nestes", "nestas",
    "tanto", "cada", "quanto", "quantos", "quanta", "quantas",
}
# The accented entries ("nós", "vós") can never match an accent-stripped
# word; their unaccented forms ("nos", "vos") are already in the set.
PRNtoPRP = {
    "eu", "tu", "ele", "ela", "nós", "vós", "eles", "elas",
    "me", "mim", "comigo", "te", "ti", "contigo", "se", "si", "consigo",
    "o", "a", "lhe", "nos", "conosco", "vos", "convosco",
    "os", "as", "lhes", "lo", "la", "los", "las",
}
PRNtoPRPS = {
    "meu", "minha", "meus", "minhas", "teu", "tua", "teus", "tuas",
    "nosso", "nossa", "nossos", "nossas", "vosso", "vossa", "vossos", "vossas",
    "seu", "sua", "seus", "suas",
}
# Not applied (no matching tag in the Penn Treebank tagset):
###PRNtoSPEC = ["que", "quem", "nada", "cada um", "as quais", "os quais", "a qual", "o qual", "onde", "todo o mundo"]

# Iterate through sentences
for s in text.split("\n"):

    # Blank lines (e.g. the one produced by a trailing newline) hold no
    # tokens; skipping them avoids an IndexError on the tag lookup below.
    if not s.strip():
        continue

    # Empty line = sentence separator in the output file
    fw2.write("\n")

    # Iterate through tokens
    for t in s.split(" "):

        item = t.split("_")

        # Empty tokens (double spaces) and tokens without a "_TAG" part
        # cannot be converted; skip them instead of crashing.
        if len(item) < 2:
            continue

        # Normalize accents and case with one call each. The former
        # per-character loops recomputed the same value on every iteration
        # and, for an empty word, left the previous token's value (or an
        # undefined name) behind.
        unaccented_word = unidecode.unidecode(item[0])
        lowerword = unaccented_word.lower()

        tag2 = item[1]

        # REPLACE RULES
        #
        # The rules run in sequence on purpose: a tag rewritten by an early
        # rule (e.g. ART -> ADJ) can be picked up by a later one (ADJ -> JJ).
        # New tags are assigned directly; the former tag2.replace("ART", ...)
        # calls silently did nothing when the case-insensitive test matched a
        # tag that was not fully upper-case.

        # 1) ART -> DT (determiners)
        if tag2.upper() == "ART":
            if lowerword == "uterina":  # Specific rule for (uterina == art) / 1 error
                tag2 = "ADJ"  # becomes JJ in rule 12
            else:
                tag2 = "DT"
                dictART2[lowerword] = dictART2.get(lowerword, 0) + 1

        # 2) IN -> UH (interjections)
        if tag2.upper() == "IN":
            dictIN2[lowerword] = dictIN2.get(lowerword, 0) + 1
            tag2 = "UH"

        # 3) CJ -> CC or IN (coordinating and subordinating conjunctions)
        # Runs after rule 2, so the IN assigned here is not turned into UH.
        if tag2.upper() == "CJ":
            if lowerword in ("e", "ou"):
                tag2 = "CC"  # e, ou (coordinating conjunctions)
            else:
                tag2 = "IN"  # "que", "se", "porque", etc (subordinating conjunctions)

        # 4) PCP -> VB (past participle -> verb base form)
        if tag2.upper() == "PCP":
            tag2 = "VB"
            dictPCP2[lowerword] = dictPCP2.get(lowerword, 0) + 1

        # 5) PRN -> DT / ADJ / PRP / PRP$
        if tag2.upper() == "PRN":
            dictPRN2[lowerword] = dictPRN2.get(lowerword, 0) + 1

            if lowerword in PRNtoDT:  # determiners
                tag2 = "DT"
            elif lowerword == "proprio":
                # Specific rule; becomes JJ in rule 12. The word has already
                # lost its accent here, so the accented spelling "próprio"
                # used before could never match.
                tag2 = "ADJ"
            elif lowerword in PRNtoPRP:  # personal pronouns
                tag2 = "PRP"
            elif lowerword in PRNtoPRPS:  # possessive pronouns
                # NOTE(review): every PRNtoPRPS entry is also in PRNtoDT, so
                # this branch is currently unreachable and possessives are
                # tagged DT. Order kept as in the original - confirm intent.
                tag2 = "PRP$"
            else:
                tag2 = "DT"  # determiners

        # 6) NPROP -> NNP (proper nouns, singular)
        if tag2.upper() == "NPROP":
            dictNPROP2[lowerword] = dictNPROP2.get(lowerword, 0) + 1
            tag2 = "NNP"

        # 7) Rule for "x" when it's tagged as preposition - manually check if it's plural or singular
        if tag2.upper() == "PREP" and lowerword == "x":
            lowerword = "vezes"

        # 8) PREP -> IN (preposition)
        if tag2.upper() == "PREP":
            dictPREP2[lowerword] = dictPREP2.get(lowerword, 0) + 1
            tag2 = "IN"

        # 9) N -> NN (noun, singular); exact, case-sensitive match as before
        if tag2 == "N":
            dictN2[lowerword] = dictN2.get(lowerword, 0) + 1
            tag2 = "NN"

        # 10) ADV -> RB (adverb)
        if tag2.upper() == "ADV":
            dictADV2[lowerword] = dictADV2.get(lowerword, 0) + 1
            tag2 = "RB"

        # 11) NUM -> CD (cardinal numbers)
        if tag2.upper() == "NUM":
            dictNUM2[lowerword] = dictNUM2.get(lowerword, 0) + 1
            tag2 = "CD"

        # 12) ADJ -> JJ (adjectives)
        if tag2.upper() == "ADJ":
            dictADJ2[lowerword] = dictADJ2.get(lowerword, 0) + 1
            tag2 = "JJ"

        # 13) V -> VB (verbs, base form); exact, case-sensitive match as before
        if tag2 == "V":
            dictV2[lowerword] = dictV2.get(lowerword, 0) + 1
            tag2 = "VB"

        # Other corrections - Common errors

        # If the word "isso" is labelled as DET (it is SPEC in the UNIFESP corpus)

        # TODO: If you find any other common error - put the correction here

        # TODO: normalize multiword expressions like UNIFESP corpus

        # Normalization survey
        dictTagsPostNormalization2[tag2] = dictTagsPostNormalization2.get(tag2, 0) + 1

        # Write
        fw2.write(lowerword + " " + tag2 + " " + "O - O" + "\n")

In [None]:
# Close the first output file (adapted tagset, original word forms) so that
# everything written by the first conversion pass is flushed to disk.
fw.close()

In [None]:
# Close the second output file (adapted tagset, normalized word forms) so
# that everything written by the second conversion pass is flushed to disk.
fw2.close()

In [None]:
# Number of tagged tokens in the corpus: the per-tag counts of the first pass
# (largest first), summed up.
numberoftags = sorted(dictTagsPostNormalization.values(), reverse=True)
totalnumber = sum(numberoftags)
print(totalnumber)

# The following cells show every word and its count in the corpus for each tag.

In [None]:
# Adapted tag counts of the first pass, most frequent first
for entry, total in sorted(dictTagsPostNormalization.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Adapted tag counts of the second (normalized words) pass, most frequent first
for entry, total in sorted(dictTagsPostNormalization2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Original PUCPR tag counts, most frequent first
for entry, total in sorted(dictTags.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the ART rule (first pass), most frequent first
for entry, total in sorted(dictART.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the ART rule (second pass, normalized words), most frequent first
for entry, total in sorted(dictART2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the PCP rule (first pass), most frequent first
for entry, total in sorted(dictPCP.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the PCP rule (second pass, normalized words), most frequent first
for entry, total in sorted(dictPCP2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the PRN rule (first pass), most frequent first
for entry, total in sorted(dictPRN.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the PRN rule (second pass, normalized words), most frequent first
for entry, total in sorted(dictPRN2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the NPROP rule (first pass), most frequent first
for entry, total in sorted(dictNPROP.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the NPROP rule (second pass, normalized words), most frequent first
for entry, total in sorted(dictNPROP2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the PREP rule (first pass), most frequent first
for entry, total in sorted(dictPREP.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the PREP rule (second pass, normalized words), most frequent first
for entry, total in sorted(dictPREP2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the N rule (first pass), most frequent first
for entry, total in sorted(dictN.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the N rule (second pass, normalized words), most frequent first
for entry, total in sorted(dictN2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the ADV rule (first pass), most frequent first
for entry, total in sorted(dictADV.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the ADV rule (second pass, normalized words), most frequent first
for entry, total in sorted(dictADV2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the NUM rule (first pass), most frequent first
for entry, total in sorted(dictNUM.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the NUM rule (second pass, normalized words), most frequent first
for entry, total in sorted(dictNUM2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the ADJ rule (first pass), most frequent first
for entry, total in sorted(dictADJ.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the ADJ rule (second pass, normalized words), most frequent first
for entry, total in sorted(dictADJ2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the IN (interjection) rule (first pass), most frequent first
for entry, total in sorted(dictIN.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the IN (interjection) rule (second pass, normalized words), most frequent first
for entry, total in sorted(dictIN2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the V rule (first pass), most frequent first
for entry, total in sorted(dictV.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))

In [None]:
# Words counted by the V rule (second pass, normalized words), most frequent first
for entry, total in sorted(dictV2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((entry, str(total))))