In [1]:
import spacy
import corpus_tools
import corruption_tools
import os
import sys
from tqdm import tqdm

# Load the spaCy pipeline that every corruptor below shares.
print("Loading language models")
nlp_med = spacy.load('es_core_news_md')

# Registry of corruptor instances keyed by their short type code.
# Each corruptor receives a label and the shared spaCy pipeline.
corruptors = {
    "RV": corruption_tools.VerbRemover("verbRM", nlp_med),
    "VA": corruption_tools.VerbInflCorruptor("verbInfl", nlp_med),
    "AA": corruption_tools.AdjInflCorruptor("adjInfl", nlp_med),
}

# Type codes in the order the downstream cells iterate over them.
corruptortypes = ["RV", "VA", "AA"]



Loading language models


In [2]:

# Build one pair of output files per corruptor type from the base corpus.
#
# For every parsed sentence, each corruptor is first asked whether it can be
# applied (test_possible); every applicable corruptor then produces a
# corrupted variant (transform). For each successful corruption two lines are
# written:
#   <basename>_<kind>_corrupted    -> "0 <kind> <corrupted sentence> <eos>"
#   <basename>_<kind>_uncorrupted  -> "1 G <original sentence> <eos>"
# so the two files of a given kind stay line-aligned.

# Load sentence generator
#in_corpus_filename = sys.argv[1]
corpus_basename = os.path.abspath("../Data/exp3-pairtest/exp3-pairtest")
in_corpus_fn = corpus_basename + "-base"

# Number of sentence pairs written per corruptor type (initialized once).
corruptCount = {kind: 0 for kind in corruptortypes}
# NOTE(review): these two counters are never incremented in this cell; they
# are kept only in case a later cell reads them.
uncorrupted_count = 0
processed_count = 0

corrupted_files = {}
uncorrupted_files = {}

# NOTE(review): files are opened with the platform default encoding, as
# before; confirm the corpus encoding before pinning one explicitly.
in_corpus_file = open(in_corpus_fn, "r")
try:
    sentence_gen = corpus_tools.sentence_generator(in_corpus_file, nlp_med)

    # Create outfiles for each type of corrupted sentence
    for kind in corruptortypes:
        corrupted_name = corpus_basename + "_" + kind + "_corrupted"
        corrupted_files[kind] = open(corrupted_name, "w")
        uncorrupted_name = corpus_basename + "_" + kind + "_uncorrupted"
        uncorrupted_files[kind] = open(uncorrupted_name, "w")

    # Iterate parsed sentences and test for corruptibility
    print("Begining Corruption")
    for parsed_sentence in tqdm(sentence_gen):

        # Test every corruptor first and remember the targets it found, so
        # all tests run before any transformation is applied.
        possib_trans = {}
        for cor_type in corruptortypes:
            target = corruptors[cor_type].test_possible(parsed_sentence)
            if target is not None:
                possib_trans[cor_type] = target

        # Execute each of the possible transformations
        for kind, target in possib_trans.items():
            corruptedVersion = corruptors[kind].transform(parsed_sentence, target)
            if corruptedVersion is None:
                # The corruptor could not produce a variant; write nothing so
                # the corrupted/uncorrupted files stay aligned.
                continue

            # Save the corrupted sentence and its original to the pair of
            # files belonging to this corruptor type.
            outline = "0 {} {} <eos>\n".format(kind, corruptedVersion)
            corrupted_files[kind].write(outline)

            uncorrupted = parsed_sentence.text
            unc_line = "1 G {} <eos>\n".format(uncorrupted)
            uncorrupted_files[kind].write(unc_line)
            corruptCount[kind] += 1
finally:
    # Close every handle that was opened, even if the long-running loop
    # failed part-way, so buffered output is flushed and nothing leaks.
    for handle in [in_corpus_file,
                   *corrupted_files.values(),
                   *uncorrupted_files.values()]:
        handle.close()

# Print summary to console
total = 0
for trans_type in corruptCount:
    print(trans_type + ":" + str(corruptCount[trans_type]))
    total += corruptCount[trans_type]
print("Total: {0}".format(total))


19it [00:00, 86.77it/s]

Begining Corruption


302762it [1:01:47, 81.65it/s]

RV:145432
VA:103787
AA:160744
Total: 409963





In [6]:
# Run the pipeline on a sample sentence written without accent marks.
sample_text = "Yo soy una oracion , sin tildes"
sent = nlp_med(sample_text)

In [8]:
# Display the text of the parsed sample to compare it with the input string.
sent.text

'Yo soy una oracion , sin tildes'