In [1]:
import spacy
import corpus_tools
import corruption_tools 
import os
from tqdm import tqdm

In [2]:
# Load the spaCy pipeline named 'es_core_news_md'; the resulting `nlp_med` object is
# handed to every corruptor and to the sentence generator in the cells below.
nlp_med = spacy.load('es_core_news_md')

In [9]:

# Names of the corruption types, in the order they are tried for each sentence
corruptortypes = ["verbRM", "verbInfl", "adjInfl"]

# One corruptor object per corruption type, keyed by the type's name.
# Each is constructed with its own name and the shared spaCy pipeline.
corruptors = {
    "verbRM": corruption_tools.VerbRemover("verbRM", nlp_med),
    "verbInfl": corruption_tools.VerbInflCorruptor("verbInfl", nlp_med),
    "adjInfl": corruption_tools.AdjInflCorruptor("adjInfl", nlp_med),
}

# Per-type tally of sentences written so far, plus a tally of sentences
# for which no corruption could be produced
corruptCount = {name: 0 for name in corruptortypes}
uncorrupted_count = 0

# Resolve the input corpus path; every output file name uses it as a prefix.
in_corpus_filename = os.path.abspath("../data/exp1_mini/exp1_mini-CT")
out_corpus_folder = in_corpus_filename + "_"

outfiles = {}
processed_count = 0

# NOTE(review): files are opened with the platform's default text encoding, exactly as
# before. If the corpus is known to be UTF-8, consider passing encoding="utf-8" explicitly.
# The `with` block guarantees the input corpus is closed (it was previously left open).
with open(in_corpus_filename, "r") as in_corpus_file:
    # Sentence generator: yields the parsed sentences that the loop below iterates over
    sentence_gen = corpus_tools.sentence_generator(in_corpus_file, nlp_med)

    try:
        # Create one output file per type of corrupted sentence
        for kind in corruptortypes:
            outname = out_corpus_folder + "corrupted-by_" + kind
            outfiles[kind] = open(outname, "w")

        # Iterate over parsed sentences and test each for corruptibility
        print("Begining Corruption")
        for parsed_sentence in tqdm(sentence_gen):

            # Ask every corruptor whether it can act on this sentence and
            # remember the target it reports (None means "not applicable")
            possib_trans = {}
            for cor_type in corruptortypes:
                target = corruptors[cor_type].test_possible(parsed_sentence)
                if target is not None:
                    possib_trans[cor_type] = target

            success = False
            while possib_trans and not success:
                # Pick one of the still-possible corruptions (per the original author's
                # note: the valid one with the fewest sentences so far)
                kind, target = corruption_tools.select_corruption(possib_trans, corruptCount)
                # Corrupt the sentence
                corruptedVersion = corruptors[kind].transform(parsed_sentence, target)
                if corruptedVersion is not None:
                    # Save the corrupted sentence to the file for its corruption type
                    outfiles[kind].write(corruptedVersion + " <eos>\n")
                    corruptCount[kind] += 1
                    # Finish the while loop
                    success = True
                else:
                    # Drop the failed candidate so the loop either tries a different
                    # corruption or terminates. select_corruption's implementation is
                    # not visible here; if it leaves possib_trans untouched, retrying
                    # without this removal could loop forever. pop() with a default is
                    # a no-op if the entry was already removed.
                    possib_trans.pop(kind, None)

            if not success:
                uncorrupted_count += 1
            processed_count += 1
    finally:
        # Close every output file that was opened, even if the loop raised
        for outfile in outfiles.values():
            outfile.close()
# Report the per-type tallies, then the overall total and the number of
# sentences that could not be corrupted
for trans_type, written in corruptCount.items():
    print(trans_type + ":" + str(written))
total = sum(corruptCount.values())
print("Total: {0}".format(total))
print("Incorruptible: {0}".format(uncorrupted_count))



6it [00:00, 56.35it/s]

Begining Corruption


235it [00:02, 113.86it/s]

verbRM:60
verbInfl:58
adjInfl:62
Total: 180
Incorruptible: 55





In [7]:
# Parse a sample sentence and keep the root token of its first sentence span
sent = nlp_med(
    "Señor Presidente , quieren que mis primeras palabras , al iniciar mi intervención sobre el Libro Blanco , sean de felicitación al ponente , Sr. von Wogau ."
)
root = list(sent.sents)[0].root


In [8]:

# Show each direct child of the root as "<token>: <dependency label>, <tag>"
for child in root.children:
    pieces = [str(child), ": ", child.dep_, ", ", child.tag_]
    print("".join(pieces))

Presidente: nsubj, PROPN___
felicitación: ccomp, NOUN__Gender=Fem|Number=Sing
.: punct, PUNCT__PunctType=Peri


In [17]:
# NOTE(review): `adj` is not assigned in any cell visible in this notebook, so this cell
# relies on leftover kernel state and will fail on a fresh Restart & Run All — TODO define it.
adj.dep_

'amod'

In [20]:
# NOTE(review): depends on `adj`, which no visible cell defines (hidden kernel state).
# Displays the part-of-speech attribute of the token that `adj` is attached to.
adj.head.pos_

'NOUN'