
@author: Lucas Ferro Antunes de Oliveira

This script transforms the UNIFESP corpus tagset into the Penn Treebank tagset (with some inflection tags)


In [None]:
#importing libraries

import matplotlib.pyplot as plt
import io
import collections
import unidecode


In [None]:
# Prefix prepended to every input/output file path opened in this notebook.
# The empty string leaves those paths relative to the current working directory.
_PATH_CORPORA = ""

In [None]:
# Frequency counters filled by the first conversion pass (the one writing
# through `fw`).  Every counter is an independent, initially empty dict.

# Tag histograms: tags as read from the corpus, and tags as written out.
dictTags, dictTagsPostNormalization = dict(), dict()

# Word histograms, one per target tag.
(
    dictIN, dictDT, dictNN, dictNNS, dictCD, dictJJ, dictPRP,
    dictNNP, dictVB, dictVBP, dictVBG, dictVBN, dictRB, dictCC,
) = (dict() for _ in range(14))

In [None]:
# Frequency counters filled by the second conversion pass (the one writing
# through `fw2`).  Every counter is an independent, initially empty dict.

# Tag histograms: tags as read from the corpus, and tags as written out.
dictTags2, dictTagsPostNormalization2 = dict(), dict()

# Word histograms, one per target tag.
(
    dictIN2, dictDT2, dictNN2, dictNNS2, dictCD2, dictJJ2, dictPRP2,
    dictNNP2, dictVB2, dictVBP2, dictVBG2, dictVBN2, dictRB2, dictCC2,
) = (dict() for _ in range(14))

In [None]:
# Output file for the corpus with the adapted tagset.  It starts with a
# -DOCSTART- header line; the handle stays open so that a later cell can
# append the converted tokens and then close it.
fw = io.open(
    _PATH_CORPORA + "UNIFESP-tagset-adapted-penntreebank-update.txt",
    mode='w',
    encoding='utf8',
)
fw.write("-DOCSTART- -X- -X- O\n")

In [None]:
# Output file for the corpus with the adapted tagset and accents removed.
# It starts with a -DOCSTART- header line; the handle stays open so that a
# later cell can append the converted tokens and then close it.
fw2 = io.open(
    _PATH_CORPORA + "UNIFESP-tagset-adapted-penntreebank-no-accents.txt",
    mode='w',
    encoding='utf8',
)
fw2.write("-DOCSTART- -X- -X- O\n")

In [None]:
# Convert the UNIFESP corpus to the adapted Penn Treebank tagset, keeping the
# original spelling of every word.
#
# Input : "corpora/UNIFESP.txt" - one sentence per line, tokens separated by a
#         single space, each token written as word_TAG.
# Output: through the already-open handle `fw`, a blank line before every
#         sentence followed by one "word TAG O - O" line per token.
# Side effects: fills dictTags (tags as read), dictTagsPostNormalization (tags
#         as written) and the per-tag word counters.


def _bump(counter, key):
    """Add one to the count stored under *key* in the dict *counter*."""
    counter[key] = counter.get(key, 0) + 1


# Read the whole corpus; the input file is only needed for this statement.
with io.open(_PATH_CORPORA + "corpora/UNIFESP.txt", 'r', encoding='utf8') as f:
    text = f.read()

# Pronoun word lists.
# More on https://www.infoescola.com/portugues/pronomes/ and https://www.infoescola.com/portugues/adverbios/
# NOTE: no rule in this cell reads these lists; they are kept because other
# cells may rely on the names.
PRNtoDT = ["esta", "este", "estas", "estes", "isto", "isso", "aquilo", "aquele", "aquela", "aqueles", "aquelas", "meu", "minha", "meus", "minhas", "teu", "tua", "teus", "tuas", "sua", "seu", "suas", "seus", "nosso", "nossa", "nossos", "nossas", "vosso", "vossa", "vossos", "vossas", "cujo", "cuja", "cujos", "cujas", "demais", "os demais", "as demais", "todo", "todos", "qual","alguma","algum","algumas","alguns","outro","outra","outros","outras","mesmo","mesma","mesmos","mesmas","neste","nesta","nestes","nestas","tanto","cada", "quanto", "quantos", "quanta", "quantas"]
PRNtoPRP = ["eu", "tu", "ele", "ela", "nós", "vós", "eles", "elas", "me", "mim", "comigo", "te", "ti", "contigo", "se", "si", "consigo", "o", "a", "lhe", "nos", "conosco", "vos", "convosco", "os", "as", "lhes", "lo", "la", "los", "las", "mim"]
PRNtoPRPS = ["meu", "minha", "meus", "minhas", "teu", "tua","teus", "tuas", "nosso", "nossa", "nossos", "nossas", "vosso", "vossa", "vossos", "vossas", "seu", "sua", "seus", "suas"]

# Plain "tag starts with X -> Penn tag Y" rewrites and the word counter each
# one feeds.  The prefixes are mutually exclusive and none of the produced
# tags starts with one of them, so at most one entry fires per token.
_PREFIX_RULES = (
    ("PROP", "NNP", dictNNP),  # 3) proper nouns
    ("SPEC", "DT", dictDT),    # 5) SPEC to determiners
    ("DET", "DT", dictDT),     # 6) determiners and their inflections
    ("PERS", "PRP", dictPRP),  # 7) personal pronouns
    ("ADJ", "JJ", dictJJ),     # 8) adjectives
    ("NUM", "CD", dictCD),     # 9) cardinal numbers and their inflections
    ("ADV", "RB", dictRB),     # 10) adverbs
)

# Iterate through sentences
for s in text.split("\n"):

    # A blank line (for instance the one produced by a trailing newline) holds
    # no token; skipping it avoids an IndexError on the missing tag below.
    if not s.strip():
        continue

    # Blank line separating this sentence from the previous block
    fw.write("\n")

    # Iterate through tokens
    for t in s.split(" "):

        item = t.split("_")
        if len(item) < 2:
            # Empty or untagged token (e.g. doubled space): nothing to convert.
            continue

        word = item[0]
        tag = item[1]

        _bump(dictTags, tag)

        # Replace rules.  They are applied in sequence, so a tag produced by
        # one rule can still be matched by a later one (rule 13 relies on it).

        # 1) PRP to IN (prepositions)
        if tag.upper() == "PRP":
            tag = tag.replace("PRP", "IN")
            _bump(dictIN, word)

        # 2) N and its inflections (NFS, NMS, NMP, NFP, NM) to NN.
        # The word is counted once, and only when the tag really became NN:
        # the previous code counted singular nouns twice and also counted
        # every other tag starting with "N" (such as NUM) as a noun.
        # NOTE(review): plural nouns (NMP/NFP) are written as NN, not NNS, so
        # dictNNS stays empty - confirm whether plurals should be kept apart.
        if tag.startswith("N"):
            for inflected in ("NFS", "NMS", "NMP", "NFP", "NM"):
                tag = tag.replace(inflected, "NN")
            if tag.upper() == "NN":
                _bump(dictNN, word)

        # 4) Verbs: every tag starting with V is counted in dictVB
        if tag.startswith("V"):
            upper_tag = tag.upper()
            if upper_tag in ("VINF", "VFIN", "VGER"):
                # 4.1-4.3) infinitive, finite and gerund forms
                tag = tag.replace(upper_tag, "VB")
            elif tag.startswith("VPCP"):
                # 4.4) past participles and their gender/number inflections
                for participle in ("VPCPMS", "VPCPFS", "VPCPMP", "VPCPFP"):
                    tag = tag.replace(participle, "VB")
            else:
                # Any other verb form
                tag = "VB"
            _bump(dictVB, word)

        # 3, 5-10) Prefix based rewrites
        for prefix, penn_tag, counter in _PREFIX_RULES:
            if tag.startswith(prefix):
                tag = penn_tag
                _bump(counter, word)

        # 11) KC to CC or IN (coordinating or subordinating conjunctions)
        if tag.upper() == "KC":
            if word.lower() in ("e", "ou"):
                # e, ou (coordinating conjunctions)
                tag = "CC"
                _bump(dictCC, word)
            else:
                # "que", "se", "porque", etc (subordinating conjunctions)
                tag = "IN"
                _bump(dictIN, word)

        # 12) KS to IN (subordinating conjunctions)
        if tag.upper() == "KS":
            tag = "IN"
            _bump(dictIN, word)

        # Other corrections - Common errors
        # 13) The word SUS ending up with the IN tag is really a proper noun
        if word.lower() == "sus" and tag.upper() == "IN":
            word = "SUS"
            tag = "NNP"
            _bump(dictNNP, word)

        # 14) Rule for 'finanças', probably tagged wrong
        if tag.upper() == "<NFP":
            tag = tag.replace("<NFP", "NN")
            _bump(dictNN, word)

        # 15-17) Words tagged as hyphen-separated (EC) although they are not:
        # "sem" becomes a preposition, "pré" and "micro" become nouns.
        if tag.upper() == "EC":
            lowered = word.lower()
            if lowered == "sem":
                tag = tag.replace("EC", "IN")
                _bump(dictIN, word)
            elif lowered in ("pré", "micro"):
                tag = tag.replace("EC", "NN")
                _bump(dictNN, word)

        ### Review word "Espontânea" - line 2972 - UNIFESP, with two tags for being spread out ###
        ### Review words of type "de=acordo" tagged with "PP" ###

        # TODO: If you find any other common error - put the correction here

        # Histogram of the tags as they are written out
        _bump(dictTagsPostNormalization, tag)

        # Write
        fw.write(word + " " + tag + " " + "O - O" + "\n")
        


In [None]:
# Convert the UNIFESP corpus to the adapted Penn Treebank tagset, writing
# every word lower-cased and with its accents removed.
#
# Input : "corpora/UNIFESP.txt" - one sentence per line, tokens separated by a
#         single space, each token written as word_TAG.
# Output: through the already-open handle `fw2`, a blank line before every
#         sentence followed by one "word TAG O - O" line per token.
# Side effects: fills dictTags2 (tags as read), dictTagsPostNormalization2
#         (tags as written) and the per-tag "...2" word counters.


def _bump(counter, key):
    """Add one to the count stored under *key* in the dict *counter*."""
    counter[key] = counter.get(key, 0) + 1


# Read the whole corpus; the input file is only needed for this statement.
with io.open(_PATH_CORPORA + "corpora/UNIFESP.txt", 'r', encoding='utf8') as f:
    text = f.read()

# Pronoun word lists.
# More on https://www.infoescola.com/portugues/pronomes/ and https://www.infoescola.com/portugues/adverbios/
# NOTE: no rule in this cell reads these lists; they are kept because other
# cells may rely on the names.
PRNtoDT = ["esta", "este", "estas", "estes", "isto", "isso", "aquilo", "aquele", "aquela", "aqueles", "aquelas", "meu", "minha", "meus", "minhas", "teu", "tua", "teus", "tuas", "sua", "seu", "suas", "seus", "nosso", "nossa", "nossos", "nossas", "vosso", "vossa", "vossos", "vossas", "cujo", "cuja", "cujos", "cujas", "demais", "os demais", "as demais", "todo", "todos", "qual","alguma","algum","algumas","alguns","outro","outra","outros","outras","mesmo","mesma","mesmos","mesmas","neste","nesta","nestes","nestas","tanto","cada", "quanto", "quantos", "quanta", "quantas"]
PRNtoPRP = ["eu", "tu", "ele", "ela", "nós", "vós", "eles", "elas", "me", "mim", "comigo", "te", "ti", "contigo", "se", "si", "consigo", "o", "a", "lhe", "nos", "conosco", "vos", "convosco", "os", "as", "lhes", "lo", "la", "los", "las", "mim"]
PRNtoPRPS = ["meu", "minha", "meus", "minhas", "teu", "tua","teus", "tuas", "nosso", "nossa", "nossos", "nossas", "vosso", "vossa", "vossos", "vossas", "seu", "sua", "seus", "suas"]
###PRNtoSPEC = ["que", "quem", "nada", "cada um", "as quais", "os quais", "a qual", "o qual", "onde", "todo o mundo"]

# Plain "tag starts with X -> Penn tag Y" rewrites and the word counter each
# one feeds.  The prefixes are mutually exclusive and none of the produced
# tags starts with one of them, so at most one entry fires per token.
_PREFIX_RULES_2 = (
    ("PROP", "NNP", dictNNP2),  # proper nouns
    ("SPEC", "DT", dictDT2),    # SPEC to determiners
    ("DET", "DT", dictDT2),     # determiners and their inflections
    ("PERS", "PRP", dictPRP2),  # personal pronouns
    ("ADJ", "JJ", dictJJ2),     # adjectives
    ("NUM", "CD", dictCD2),     # cardinal numbers and their inflections
    ("ADV", "RB", dictRB2),     # adverbs
)

# Iterate through sentences
for s in text.split("\n"):

    # A blank line (for instance the one produced by a trailing newline) holds
    # no token; skipping it avoids an IndexError on the missing tag below.
    if not s.strip():
        continue

    # Blank line separating this sentence from the previous block
    fw2.write("\n")

    # Iterate through tokens
    for t in s.split(" "):

        item = t.split("_")
        if len(item) < 2:
            # Empty or untagged token (e.g. doubled space): nothing to convert.
            continue

        # Normalize accents and case.  One call handles the whole word; the
        # previous per-character loops repeated the same work and left the
        # value of the preceding token in place when the word was empty.
        raw_word = item[0]
        word2 = unidecode.unidecode(raw_word).lower()

        tag2 = item[1]

        _bump(dictTags2, tag2)

        # Replace rules.  They are applied in sequence, so a tag produced by
        # one rule can still be matched by a later one (the SUS rule relies
        # on it).

        # PRP to IN (prepositions)
        if tag2.upper() == "PRP":
            tag2 = tag2.replace("PRP", "IN")
            _bump(dictIN2, word2)

        # N and its inflections (NFS, NMS, NMP, NFP, NM) to NN.
        # The word is counted once, and only when the tag really became NN:
        # the previous code counted nouns up to three times and also counted
        # every other tag starting with "N" (such as NUM) as a noun.
        # NOTE(review): plural nouns (NMP/NFP) are written as NN, not NNS, so
        # dictNNS2 stays empty - confirm whether plurals should be kept apart.
        if tag2.startswith("N"):
            for inflected in ("NFS", "NMS", "NMP", "NFP", "NM"):
                tag2 = tag2.replace(inflected, "NN")
            if tag2.upper() == "NN":
                _bump(dictNN2, word2)

        # Verbs: every tag starting with V is counted in dictVB2
        if tag2.startswith("V"):
            upper_tag = tag2.upper()
            if upper_tag in ("VINF", "VFIN", "VGER"):
                # Infinitive, finite and gerund forms.
                # NOTE(review): the original author flagged the VFIN rule for
                # review (a VBP target was mentioned); it still maps to VB.
                tag2 = tag2.replace(upper_tag, "VB")
            elif tag2.startswith("VPCP"):
                # Past participles and their gender/number inflections
                for participle in ("VPCPMS", "VPCPFS", "VPCPMP", "VPCPFP"):
                    tag2 = tag2.replace(participle, "VB")
            else:
                # Any other verb form
                tag2 = "VB"
            _bump(dictVB2, word2)

        # Prefix based rewrites.  SPEC is handled here with tag2/word2; the
        # previous code tested the stale `tag`/`word` left over from the
        # first pass, so SPEC tags were never converted in this file.
        for prefix, penn_tag, counter in _PREFIX_RULES_2:
            if tag2.startswith(prefix):
                tag2 = penn_tag
                _bump(counter, word2)

        # KC to CC or IN (coordinating or subordinating conjunctions)
        if tag2.upper() == "KC":
            if word2 in ("e", "ou"):
                # e, ou (coordinating conjunctions)
                tag2 = "CC"
                _bump(dictCC2, word2)
            else:
                # "que", "se", "porque", etc (subordinating conjunctions)
                tag2 = "IN"
                _bump(dictIN2, word2)

        # KS to IN (subordinating conjunctions)
        if tag2.upper() == "KS":
            tag2 = "IN"
            _bump(dictIN2, word2)

        # Other corrections - Common errors
        # The word SUS ending up with the IN tag is really a proper noun
        if word2 == "sus" and tag2.upper() == "IN":
            word2 = "SUS"
            tag2 = "NNP"
            _bump(dictNNP2, word2)

        # Rule for 'finanças', probably tagged wrong
        if tag2.upper() == "<NFP":
            tag2 = tag2.replace("<NFP", "NN")
            _bump(dictNN2, word2)

        # Words tagged as hyphen-separated (EC) although they are not:
        # "sem" becomes a preposition, "pré" and "micro" become nouns.
        # "pré" is matched on the original spelling because word2 has already
        # lost its accent and could never equal "pré".
        if tag2.upper() == "EC":
            if word2 == "sem":
                tag2 = tag2.replace("EC", "IN")
                _bump(dictIN2, word2)
            elif raw_word.lower() == "pré" or word2 == "micro":
                tag2 = tag2.replace("EC", "NN")
                _bump(dictNN2, word2)

        ### Review word "Espontânea" - line 2972 - UNIFESP, with two tags for being spread out ###
        ### Review words of type "de=acordo" tagged with "PP" ###

        # TODO: If you find any other common error - put the correction here

        # Histogram of the tags as they are written out
        _bump(dictTagsPostNormalization2, tag2)

        # Write
        fw2.write(word2 + " " + tag2 + " " + "O - O" + "\n")
        


In [None]:
# Flush and close the output file written by the first conversion pass.
fw.close()

In [None]:
# Flush and close the output file written by the second (no-accents) pass.
fw2.close()

In [None]:
# Tags as read from the corpus in the first pass, most frequent first.
for key, value in sorted(dictTags.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Tags as read from the corpus in the second pass, most frequent first.
for key, value in sorted(dictTags2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Total number of tagged tokens written by the first pass: the sum of the
# per-tag counts after conversion (kept in descending order in numberoftags).
numberoftags = sorted(dictTagsPostNormalization.values(), reverse=True)

totalnumber = sum(numberoftags)
print(totalnumber)

# The following cells show every word and its count in the corpus for each tag.

In [None]:
# Tags as written by the first pass, most frequent first.
for key, value in sorted(dictTagsPostNormalization.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Tags as written by the second pass, most frequent first.
for key, value in sorted(dictTagsPostNormalization2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted as verbs (VB) in the first pass, most frequent first.
for key, value in sorted(dictVB.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted as verbs (VB) in the second pass, most frequent first.
for key, value in sorted(dictVB2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Contents of the VBP word counter, most frequent first.
# NOTE(review): no visible conversion rule adds entries to dictVBP, so this
# is expected to print nothing - confirm.
for key, value in sorted(dictVBP.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Contents of the second-pass VBP word counter, most frequent first.
# NOTE(review): no visible conversion rule adds entries to dictVBP2, so this
# is expected to print nothing - confirm.
for key, value in sorted(dictVBP2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Contents of the VBG word counter, most frequent first.
# NOTE(review): no visible conversion rule adds entries to dictVBG, so this
# is expected to print nothing - confirm.
for key, value in sorted(dictVBG.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Contents of the second-pass VBG word counter, most frequent first.
# NOTE(review): no visible conversion rule adds entries to dictVBG2, so this
# is expected to print nothing - confirm.
for key, value in sorted(dictVBG2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Contents of the VBN word counter, most frequent first.
# NOTE(review): no visible conversion rule adds entries to dictVBN, so this
# is expected to print nothing - confirm.
for key, value in sorted(dictVBN.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Contents of the second-pass VBN word counter, most frequent first.
# NOTE(review): no visible conversion rule adds entries to dictVBN2, so this
# is expected to print nothing - confirm.
for key, value in sorted(dictVBN2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted as nouns (NN) in the first pass, most frequent first.
for key, value in sorted(dictNN.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted as nouns (NN) in the second pass, most frequent first.
for key, value in sorted(dictNN2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Contents of the NNS word counter, most frequent first.
# NOTE(review): no visible conversion rule adds entries to dictNNS, so this
# is expected to print nothing - confirm.
for key, value in sorted(dictNNS.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Contents of the second-pass NNS word counter, most frequent first.
# NOTE(review): no visible conversion rule adds entries to dictNNS2, so this
# is expected to print nothing - confirm.
for key, value in sorted(dictNNS2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted as proper nouns (NNP) in the first pass, most frequent first.
for key, value in sorted(dictNNP.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted as proper nouns (NNP) in the second pass, most frequent first.
for key, value in sorted(dictNNP2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted as cardinal numbers (CD) in the first pass, most frequent first.
for key, value in sorted(dictCD.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted as cardinal numbers (CD) in the second pass, most frequent first.
for key, value in sorted(dictCD2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted as personal pronouns (PRP) in the first pass, most frequent first.
for key, value in sorted(dictPRP.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted as personal pronouns (PRP) in the second pass, most frequent first.
for key, value in sorted(dictPRP2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted as adjectives (JJ) in the first pass, most frequent first.
for key, value in sorted(dictJJ.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted as adjectives (JJ) in the second pass, most frequent first.
for key, value in sorted(dictJJ2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted as determiners (DT) in the first pass, most frequent first.
for key, value in sorted(dictDT.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted as determiners (DT) in the second pass, most frequent first.
for key, value in sorted(dictDT2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted under IN (prepositions and subordinating conjunctions) in the
# first pass, most frequent first.
for key, value in sorted(dictIN.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted under IN (prepositions and subordinating conjunctions) in the
# second pass, most frequent first.
for key, value in sorted(dictIN2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted as coordinating conjunctions (CC) in the first pass, most
# frequent first.
for key, value in sorted(dictCC.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted as coordinating conjunctions (CC) in the second pass, most
# frequent first.
for key, value in sorted(dictCC2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted as adverbs (RB) in the first pass, most frequent first.
for key, value in sorted(dictRB.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))

In [None]:
# Words counted as adverbs (RB) in the second pass, most frequent first.
for key, value in sorted(dictRB2.items(), key=lambda pair: pair[1], reverse=True):
    print(" = ".join((key, str(value))))