# seq2seq: incorrect sentences -> correct sentences

In [19]:
import json

def apply_corrections(sentence, corrections):
    """
    Return the sentence with the given M2 edits applied.

    The sentence is split on whitespace and each annotation line
    ("A start end|||type|||correction|||...") rewrites the token slice
    [start:end].  Edits are applied from the last list entry to the first,
    so offsets of earlier edits are not shifted by later ones (this
    presumes the list is in ascending span order, as in an M2 file).
    Every line is applied regardless of its trailing annotator field.

    Returns the edited tokens joined by single spaces.
    """
    tokens = sentence.split()

    for annotation in reversed(corrections):
        fields = annotation.split("|||")
        offsets = fields[0].split()[1:]  # drop the leading "A" marker
        start = int(offsets[0])
        end = int(offsets[1])
        replacement = fields[2]

        # "-1 -1" marks an annotation that touches no tokens.
        if (start, end) == (-1, -1):
            continue

        # An explicit "-NONE-" deletes the span; any other text replaces it
        # (start == end inserts, an empty replacement also deletes).
        tokens[start:end] = [] if replacement == "-NONE-" else replacement.split()

    return " ".join(tokens)

def extract_correct_sentences_from_m2(filepath, output_file):
    """
    Convert an M2 file into JSON-lines source/target sentence pairs.

    Lines starting with "S " open a new sentence and lines starting with
    "A " are collected as edits for the most recent sentence; every other
    line is ignored.  Each non-empty sentence produces one record
    {"translation": {"src": <original>, "tgt": <corrected>}} and the
    records are written one JSON object per line.

    Args:
        filepath: Path of the M2 file to read (decoded as UTF-8).
        output_file: Path of the JSON-lines file to write (UTF-8).  Missing
            parent directories are created.
    """
    import os  # local import: the function must not depend on notebook cell order

    sentence_pairs = []

    def _flush(sentence, edits):
        # Emit the pair for a finished sentence; empty sentences are skipped.
        if sentence:
            corrected_sentence = apply_corrections(sentence, edits)
            sentence_pairs.append({"translation": {"src": sentence, "tgt": corrected_sentence}})

    current_sentence = ""
    corrections = []
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:  # stream instead of loading every line up front
            if line.startswith('S '):  # Start of a new sentence
                _flush(current_sentence, corrections)
                current_sentence = line[2:].strip()
                corrections = []
            elif line.startswith('A '):  # Correction for the current sentence
                corrections.append(line.strip())
    # The last sentence has no following "S " line to trigger its flush.
    _flush(current_sentence, corrections)

    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened with an encoding that can represent them.
    with open(output_file, 'w', encoding='utf-8') as out_file:
        for pair in sentence_pairs:
            out_file.write(json.dumps(pair, ensure_ascii=False) + '\n')


In [20]:
# m2_filepath = './wi+locness/m2/A.dev.gold.bea19.m2'  # Adjust to your M2 file's location
# output_filepath = './wi+locness/seq2seq/A.dev.gold.bea19.json'  # Output file for corrected sentences
# extract_correct_sentences_from_m2(m2_filepath, output_filepath)
# print(f"Corrected sentences have been saved to {output_filepath}")

import os
# Convert every .m2 file found under `path`.  The output path is derived by
# replacing 'm2' in the folder path with 'seq2seq_t5' and '.m2' in the file
# name with '.json'.
path = './wi+locness/m2/'
for foldername, subfolders, filenames in os.walk(path):
    for filename in filenames:
        if not filename.endswith('.m2'):
            continue  # only M2 annotation files are converted
        m2_filepath = os.path.join(foldername, filename)
        output_filepath = os.path.join(
            foldername.replace('m2', 'seq2seq_t5'),
            filename.replace('.m2', '.json'),
        )
        extract_correct_sentences_from_m2(m2_filepath, output_filepath)
        print(f"Corrected sentences have been saved to {output_filepath}")

Corrected sentences have been saved to ./wi+locness/seq2seq_t5/N.dev.gold.bea19.json
Corrected sentences have been saved to ./wi+locness/seq2seq_t5/ABCN.dev.gold.bea19.json
Corrected sentences have been saved to ./wi+locness/seq2seq_t5/A.dev.gold.bea19.json
Corrected sentences have been saved to ./wi+locness/seq2seq_t5/B.dev.gold.bea19.json
Corrected sentences have been saved to ./wi+locness/seq2seq_t5/C.dev.gold.bea19.json
Corrected sentences have been saved to ./wi+locness/seq2seq_t5/B.train.gold.bea19.json
Corrected sentences have been saved to ./wi+locness/seq2seq_t5/C.train.gold.bea19.json
Corrected sentences have been saved to ./wi+locness/seq2seq_t5/ABC.train.gold.bea19.json
Corrected sentences have been saved to ./wi+locness/seq2seq_t5/A.train.gold.bea19.json


# seq2seq: template -> correct sentences

## 2-class

In [11]:
import json

def apply_templates(sentence, corrections):
    """
    Build a sentinel-masked template string from a sentence and its M2 edits.

    The sentence is split on whitespace.  Every edit span [start:end] is
    replaced in the sentence by a sentinel token "<extra_id_k>", and the
    template lists each sentinel followed by the original tokens it
    replaced.  An empty span (start == end) is widened by one token so
    that it covers the token following the insertion point.

    Args:
        sentence: Source sentence with whitespace-separated tokens.
        corrections: M2 annotation lines of the form
            "A start end|||type|||correction|||...".  Lines whose span is
            "-1 -1" are skipped.

    Returns:
        "<template> </s> <masked sentence>".  When no edit applies, the
        template part is empty and the result starts with " </s> ".
    """
    words = sentence.split()
    template = []
    spans = []
    for correction in corrections:
        parts = correction.split("|||")
        offsets = parts[0].split()[1:]  # drop the leading "A" marker
        start, end = int(offsets[0]), int(offsets[1])

        if start == -1 and end == -1:  # annotation that touches no tokens
            continue
        if start == end:
            end += 1

        # Number sentinels by how many edits were kept so far, not by the
        # position in `corrections`; otherwise a skipped "-1 -1" line makes
        # the template ids disagree with the ids in the masked sentence.
        template.append(f'<extra_id_{len(spans)}>')
        template.extend(words[start:end])
        spans.append((start, end))

    # Replace back-to-front so earlier offsets are not shifted.
    # NOTE(review): spans are not checked for overlap; an insertion directly
    # before an edited token yields two identical spans and the second
    # replacement overwrites the first sentinel.
    for index in range(len(spans) - 1, -1, -1):
        start, end = spans[index]
        words[start:end] = [f'<extra_id_{index}>']

    return f"{' '.join(template)} </s> {' '.join(words)}"


def apply_corrections(sentence, corrections):
    """
    Return the sentence with the given M2 edits applied.

    The sentence is split on whitespace and each annotation line
    ("A start end|||type|||correction|||...") rewrites the token slice
    [start:end].  Edits are applied from the last list entry to the first,
    so offsets of earlier edits are not shifted by later ones (this
    presumes the list is in ascending span order, as in an M2 file).
    Every line is applied regardless of its trailing annotator field.

    Returns the edited tokens joined by single spaces.
    """
    tokens = sentence.split()

    for annotation in reversed(corrections):
        fields = annotation.split("|||")
        offsets = fields[0].split()[1:]  # drop the leading "A" marker
        start = int(offsets[0])
        end = int(offsets[1])
        replacement = fields[2]

        # "-1 -1" marks an annotation that touches no tokens.
        if (start, end) == (-1, -1):
            continue

        # An explicit "-NONE-" deletes the span; any other text replaces it
        # (start == end inserts, an empty replacement also deletes).
        tokens[start:end] = [] if replacement == "-NONE-" else replacement.split()

    return " ".join(tokens)

def extract_correct_sentences_from_m2(filepath, output_file):
    """
    Convert an M2 file into JSON-lines template/target sentence pairs.

    Lines starting with "S " open a new sentence and lines starting with
    "A " are collected as edits for the most recent sentence; every other
    line is ignored.  Each non-empty sentence produces one record
    {"translation": {"src": <template>, "tgt": <corrected>}}, where the
    template comes from apply_templates and the corrected sentence from
    apply_corrections.  Records are written one JSON object per line.

    Args:
        filepath: Path of the M2 file to read (decoded as UTF-8).
        output_file: Path of the JSON-lines file to write (UTF-8).  Missing
            parent directories are created.
    """
    import os  # local import: the function must not depend on notebook cell order

    sentence_pairs = []

    def _flush(sentence, edits):
        # Emit the pair for a finished sentence; empty sentences are skipped.
        if sentence:
            corrected_sentence = apply_corrections(sentence, edits)
            template_sentence = apply_templates(sentence, edits)
            sentence_pairs.append({"translation": {"src": template_sentence, "tgt": corrected_sentence}})

    current_sentence = ""
    corrections = []
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:  # stream instead of loading every line up front
            if line.startswith('S '):  # Start of a new sentence
                _flush(current_sentence, corrections)
                current_sentence = line[2:].strip()
                corrections = []
            elif line.startswith('A '):  # Correction for the current sentence
                corrections.append(line.strip())
    # The last sentence has no following "S " line to trigger its flush.
    _flush(current_sentence, corrections)

    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened with an encoding that can represent them.
    with open(output_file, 'w', encoding='utf-8') as out_file:
        for pair in sentence_pairs:
            out_file.write(json.dumps(pair, ensure_ascii=False) + '\n')


In [12]:
# m2_filepath = './wi+locness/m2/A.dev.gold.bea19.m2'  # Adjust to your M2 file's location
# output_filepath = './wi+locness/template2seq/A.dev.gold.bea19.json'  # Output file for corrected sentences
# extract_correct_sentences_from_m2(m2_filepath, output_filepath)
# print(f"Corrected sentences have been saved to {output_filepath}")

import os
# Convert every .m2 file found under `path`.  The output path is derived by
# replacing 'm2' in the folder path with 'template2seq_2class_t5' and '.m2'
# in the file name with '.json'.
path = './wi+locness/m2/'
for foldername, subfolders, filenames in os.walk(path):
    for filename in filenames:
        if not filename.endswith('.m2'):
            continue  # only M2 annotation files are converted
        m2_filepath = os.path.join(foldername, filename)
        output_filepath = os.path.join(
            foldername.replace('m2', 'template2seq_2class_t5'),
            filename.replace('.m2', '.json'),
        )
        extract_correct_sentences_from_m2(m2_filepath, output_filepath)
        print(f"Corrected sentences have been saved to {output_filepath}")

Corrected sentences have been saved to ./wi+locness/template2seq_2class_t5/N.dev.gold.bea19.json
Corrected sentences have been saved to ./wi+locness/template2seq_2class_t5/ABCN.dev.gold.bea19.json
Corrected sentences have been saved to ./wi+locness/template2seq_2class_t5/A.dev.gold.bea19.json
Corrected sentences have been saved to ./wi+locness/template2seq_2class_t5/B.dev.gold.bea19.json
Corrected sentences have been saved to ./wi+locness/template2seq_2class_t5/C.dev.gold.bea19.json
Corrected sentences have been saved to ./wi+locness/template2seq_2class_t5/B.train.gold.bea19.json
Corrected sentences have been saved to ./wi+locness/template2seq_2class_t5/C.train.gold.bea19.json
Corrected sentences have been saved to ./wi+locness/template2seq_2class_t5/ABC.train.gold.bea19.json
Corrected sentences have been saved to ./wi+locness/template2seq_2class_t5/A.train.gold.bea19.json


## 4-class

In [23]:
import json

def apply_templates(sentence, corrections):
    """
    Build a typed, sentinel-masked template string from a sentence and its
    M2 edits.

    The sentence is split on whitespace.  Every edit span [start:end] is
    replaced in the sentence by a sentinel token "<extra_id_k>", and the
    template lists each sentinel, the coarse edit type (the text before the
    first ':' of the annotation's type field) followed by ':', and the
    original tokens the sentinel replaced.  An empty span (start == end) is
    widened by one token so that it covers the token following the
    insertion point.

    Args:
        sentence: Source sentence with whitespace-separated tokens.
        corrections: M2 annotation lines of the form
            "A start end|||type|||correction|||...".  Lines whose span is
            "-1 -1" are skipped.

    Returns:
        "<template> </s> <masked sentence>".  When no edit applies, the
        template part is empty and the result starts with " </s> ".
    """
    words = sentence.split()
    template = []
    spans = []
    for correction in corrections:
        parts = correction.split("|||")
        offsets = parts[0].split()[1:]  # drop the leading "A" marker
        start, end = int(offsets[0]), int(offsets[1])

        if start == -1 and end == -1:  # annotation that touches no tokens
            continue
        if start == end:
            end += 1

        # Keep only the coarse class, e.g. "R" from "R:VERB:SVA".
        correction_type = parts[1].split(':')[0]

        # Number sentinels by how many edits were kept so far, not by the
        # position in `corrections`; otherwise a skipped "-1 -1" line makes
        # the template ids disagree with the ids in the masked sentence.
        template.append(f'<extra_id_{len(spans)}> {correction_type}:')
        template.extend(words[start:end])
        spans.append((start, end))

    # Replace back-to-front so earlier offsets are not shifted.
    # NOTE(review): spans are not checked for overlap; an insertion directly
    # before an edited token yields two identical spans and the second
    # replacement overwrites the first sentinel.
    for index in range(len(spans) - 1, -1, -1):
        start, end = spans[index]
        words[start:end] = [f'<extra_id_{index}>']

    return f"{' '.join(template)} </s> {' '.join(words)}"


def apply_corrections(sentence, corrections):
    """
    Return the sentence with the given M2 edits applied.

    The sentence is split on whitespace and each annotation line
    ("A start end|||type|||correction|||...") rewrites the token slice
    [start:end].  Edits are applied from the last list entry to the first,
    so offsets of earlier edits are not shifted by later ones (this
    presumes the list is in ascending span order, as in an M2 file).
    Every line is applied regardless of its trailing annotator field.

    Returns the edited tokens joined by single spaces.
    """
    tokens = sentence.split()

    for annotation in reversed(corrections):
        fields = annotation.split("|||")
        offsets = fields[0].split()[1:]  # drop the leading "A" marker
        start = int(offsets[0])
        end = int(offsets[1])
        replacement = fields[2]

        # "-1 -1" marks an annotation that touches no tokens.
        if (start, end) == (-1, -1):
            continue

        # An explicit "-NONE-" deletes the span; any other text replaces it
        # (start == end inserts, an empty replacement also deletes).
        tokens[start:end] = [] if replacement == "-NONE-" else replacement.split()

    return " ".join(tokens)

def extract_correct_sentences_from_m2(filepath, output_file):
    """
    Convert an M2 file into JSON-lines template/target sentence pairs.

    Lines starting with "S " open a new sentence and lines starting with
    "A " are collected as edits for the most recent sentence; every other
    line is ignored.  Each non-empty sentence produces one record
    {"translation": {"src": <template>, "tgt": <corrected>}}, where the
    template comes from apply_templates and the corrected sentence from
    apply_corrections.  Records are written one JSON object per line.

    Args:
        filepath: Path of the M2 file to read (decoded as UTF-8).
        output_file: Path of the JSON-lines file to write (UTF-8).  Missing
            parent directories are created.
    """
    import os  # local import: the function must not depend on notebook cell order

    sentence_pairs = []

    def _flush(sentence, edits):
        # Emit the pair for a finished sentence; empty sentences are skipped.
        if sentence:
            corrected_sentence = apply_corrections(sentence, edits)
            template_sentence = apply_templates(sentence, edits)
            sentence_pairs.append({"translation": {"src": template_sentence, "tgt": corrected_sentence}})

    current_sentence = ""
    corrections = []
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:  # stream instead of loading every line up front
            if line.startswith('S '):  # Start of a new sentence
                _flush(current_sentence, corrections)
                current_sentence = line[2:].strip()
                corrections = []
            elif line.startswith('A '):  # Correction for the current sentence
                corrections.append(line.strip())
    # The last sentence has no following "S " line to trigger its flush.
    _flush(current_sentence, corrections)

    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened with an encoding that can represent them.
    with open(output_file, 'w', encoding='utf-8') as out_file:
        for pair in sentence_pairs:
            out_file.write(json.dumps(pair, ensure_ascii=False) + '\n')

In [24]:
# m2_filepath = './wi+locness/m2/A.dev.gold.bea19.m2'  # Adjust to your M2 file's location
# output_filepath = './wi+locness/template2seq/A.dev.gold.bea19.json'  # Output file for corrected sentences
# extract_correct_sentences_from_m2(m2_filepath, output_filepath)
# print(f"Corrected sentences have been saved to {output_filepath}")

import os
# Convert every .m2 file found under `path`.  The output path is derived by
# replacing 'm2' in the folder path with 'template2seq_4class_t5' and '.m2'
# in the file name with '.json'.
path = './wi+locness/m2/'
for foldername, subfolders, filenames in os.walk(path):
    for filename in filenames:
        if not filename.endswith('.m2'):
            continue  # only M2 annotation files are converted
        m2_filepath = os.path.join(foldername, filename)
        output_filepath = os.path.join(
            foldername.replace('m2', 'template2seq_4class_t5'),
            filename.replace('.m2', '.json'),
        )
        extract_correct_sentences_from_m2(m2_filepath, output_filepath)
        print(f"Corrected sentences have been saved to {output_filepath}")

516
Corrected sentences have been saved to ./wi+locness/template2seq_4class_t5/N.dev.gold.bea19.json
790
Corrected sentences have been saved to ./wi+locness/template2seq_4class_t5/ABCN.dev.gold.bea19.json
754
Corrected sentences have been saved to ./wi+locness/template2seq_4class_t5/A.dev.gold.bea19.json
647
Corrected sentences have been saved to ./wi+locness/template2seq_4class_t5/B.dev.gold.bea19.json
790
Corrected sentences have been saved to ./wi+locness/template2seq_4class_t5/C.dev.gold.bea19.json
751
Corrected sentences have been saved to ./wi+locness/template2seq_4class_t5/B.train.gold.bea19.json
652
Corrected sentences have been saved to ./wi+locness/template2seq_4class_t5/C.train.gold.bea19.json
1094
Corrected sentences have been saved to ./wi+locness/template2seq_4class_t5/ABC.train.gold.bea19.json
1094
Corrected sentences have been saved to ./wi+locness/template2seq_4class_t5/A.train.gold.bea19.json
