Note: This code was written to run on Google Colab; minor adjustments to the file loading may be needed if you want to run it locally.

# Setup

## Imports

In [None]:
# Generic Imports
import os
import csv

# Install XML reader dependencies
!pip install beautifulsoup4
!pip install lxml
from bs4 import BeautifulSoup as bs

## Load the dataset

In [None]:
# Upload the iceErrorCorpus XML files as a zip archive named 'data.zip', then unzip it.
# The parsing cell scans a folder named 'data', so the archive is expected to unpack into that folder.
from google.colab import files
# NOTE(review): files.upload() presumably opens Colab's upload prompt and returns the uploaded files - confirm.
uploaded = files.upload()
!unzip data.zip

## Configure the output settings

In [None]:
# Not read by any of the code in this notebook section; kept as a switch for later use.
removeCorrections = False
# When True, preprocessSentence only keeps sentences that carry at least one
# error code listed in errorsToBeIncluded; when False every revised sentence is kept.
filterSomeErrors = True
# BREADCRUMBS
# Error codes noted down for possible later experiments; nothing reads this list yet.
# A comma was missing between 'genitive' and 'wrong-prep', which silently merged
# them into the single entry 'genitivewrong-prep'.
# NOTE(review): the entries after 'genitive' repeat the start of the list - confirm
# whether the repetition is intentional.
maybeTestTheseLater = ['wrong-prep','ind4def','collocation','n4nn','í4ý','i4y','ind4sub-verb','nn4n','dir4loc','u4y',
                       'af4að','pro4reflexive','að4af','agreement-concord','nominal-inflection','compound-collocation',
                       'gen-escape','agreement-pred','adjective-inflection','def4ind','agreement-pro','case-prep','verb-inflection',
                       'sub4ind-conj','dative-sub','adj4adv','pro-inflection','sub4ind','act4mid','mid4act','ind4sub-conj',
                       'numeral-inflection','case-verb','genitive','wrong-prep','ind4def','collocation','n4nn','í4ý',
                       'i4y','ind4sub-verb','nn4n','dir4loc','u4y','af4að','pro4reflexive','að4af']
# Error codes a sentence must contain (at least one of) to be kept while filterSomeErrors is True.
errorsToBeIncluded = ['nominal-inflection','adjective-inflection','verb-inflection','pro-inflection','numeral-inflection']

## Load helper functions meant to be abstracted

In [None]:
def returnCorrectBSSentences(document):
    """Parse a corpus XML file and return its sentences without the original spans.

    Every ``<original>`` element is removed from the parsed tree, so the
    returned ``<s>`` tags only keep what remains once those spans are gone.

    Args:
        document: Path of the XML file; anything ``open()`` accepts (a string
            path or an ``os.DirEntry``).

    Returns:
        The result of ``find_all("s")`` on the pruned tree, i.e. the
        BeautifulSoup ``<s>`` tags of the document.
    """
    # Decode explicitly as UTF-8 so the Icelandic characters are read the same
    # way on every platform instead of depending on the locale default.
    # NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.
    with open(document, "r", encoding="utf-8") as file:
        content = file.read()
    bs_correct_content = bs(content, "lxml")
    # Calling the soup object is BeautifulSoup shorthand for find_all().
    for original in bs_correct_content("original"):
        original.decompose()
    return bs_correct_content.find_all("s")

def returnIncorrectBSSentences(document):
    """Parse a corpus XML file and return its sentences without the corrected spans.

    Every ``<corrected>`` element is removed from the parsed tree, so the
    returned ``<s>`` tags only keep what remains once those spans are gone.

    Args:
        document: Path of the XML file; anything ``open()`` accepts (a string
            path or an ``os.DirEntry``).

    Returns:
        The result of ``find_all("s")`` on the pruned tree, i.e. the
        BeautifulSoup ``<s>`` tags of the document.
    """
    # Decode explicitly as UTF-8 so the Icelandic characters are read the same
    # way on every platform instead of depending on the locale default.
    # NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.
    with open(document, "r", encoding="utf-8") as file:
        content = file.read()
    bs_incorrect_content = bs(content, "lxml")
    # Calling the soup object is BeautifulSoup shorthand for find_all().
    for corrected in bs_incorrect_content("corrected"):
        corrected.decompose()
    return bs_incorrect_content.find_all("s")

def preprocessSentence(taggedSentence, label):
    """Turn one BeautifulSoup ``<s>`` tag into a CSV row, or ``None`` to skip it.

    The plain-text sentence is the text of every ``<w>`` tag joined by single
    spaces, followed by a full stop.

    Args:
        taggedSentence: The ``<s>`` tag to process.
        label: Value stored in the row's label column (the notebook passes
            "correct" or "incorrect").

    Returns:
        ``[plainTextSentence, label, errorCodes]`` for a sentence that has a
        ``<revision>`` element, where ``errorCodes`` lists the ``xtype`` value
        of each ``<error>`` tag. ``None`` when the sentence has no
        ``<revision>``, or when the module-level ``filterSomeErrors`` is
        truthy and none of the codes appears in ``errorsToBeIncluded``.
    """
    # Use the argument itself; the previous version read a module-level
    # `sentence` and only worked because the calling loop used that name.
    words = [word.get_text() for word in taggedSentence.find_all("w")]
    plainTextSentence = " ".join(words) + "."

    # Sentences without a revision produce no row.
    if taggedSentence.find("revision") is None:
        return None

    # Read the attribute from the parsed tags instead of slicing the
    # serialized markup, which broke when xtype was not the last attribute.
    errorCodes = [
        error["xtype"]
        for error in taggedSentence.find_all("error")
        if error.has_attr("xtype")
    ]

    # With filtering on, keep the sentence only if it has a wanted error code.
    if filterSomeErrors and not any(code in errorsToBeIncluded for code in errorCodes):
        return None
    return [plainTextSentence, label, errorCodes]

def createCSV(data):
    """Write the rows to ``CorrectionsData.csv`` in the working directory.

    The file is opened in write mode, so an existing file of that name is
    replaced. A header row ``text,label,Error`` is written first.

    Args:
        data: Iterable of rows, each an iterable of the three column values.
    """
    # Write UTF-8 explicitly so the Icelandic text is stored the same way on
    # every platform; newline='' lets the csv module control line endings.
    with open('CorrectionsData.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["text", "label", "Error"])
        writer.writerows(data)


# A currently unused function that was used to help decide which error codes are relevant
def outputUniqueErrorCodes():
    """Print the number of distinct values in the ``Error`` column, then the values.

    Reads ``labeledData.csv`` from the working directory. Values are compared
    as whole cell contents, so two cells count as the same only when they are
    identical.
    """
    import pandas as pd

    # Only the Error column is needed, so select it directly rather than
    # dropping the other columns one by one.
    error_column = pd.read_csv("labeledData.csv")["Error"]
    # Series.unique() avoids passing a plain list to pd.unique().
    errorCodes = error_column.unique()
    print(len(errorCodes))
    print(errorCodes)

# Parse the files

In [None]:
# Rows built from the parsed sentences, collected here before they are written to a CSV file
correctOutput = []
incorrectOutput = []

# Collect the regular files in the data folder. Sub-directories are skipped
# because they cannot be opened as a document, the scandir iterator is closed
# as soon as the listing is done, and sorting by name keeps the row order
# reproducible (os.scandir yields entries in arbitrary order).
with os.scandir('data') as entries:
    documents = sorted(
        (entry for entry in entries if entry.is_file()),
        key=lambda entry: entry.name,
    )

# Individually load each file in the data folder
for document in documents:
    # Create two bs (beautiful soup) lists of sentences from the same file:
    # one with the original spans removed, one with the corrected spans removed
    correctSentenceSoup = returnCorrectBSSentences(document)
    incorrectSentenceSoup = returnIncorrectBSSentences(document)

    # Process each sentence into the row format expected by the model and keep the rows that are not skipped.
    # NOTE: keep the loop variable named `sentence`; preprocessSentence may read
    # the module-level `sentence` rather than its own argument.
    for sentence in correctSentenceSoup:
        correctRow = preprocessSentence(sentence, "correct")
        if correctRow is not None:
            correctOutput.append(correctRow)

    # Same processing for the sentences that keep the original (uncorrected) text
    for sentence in incorrectSentenceSoup:
        incorrectRow = preprocessSentence(sentence, "incorrect")
        if incorrectRow is not None:
            incorrectOutput.append(incorrectRow)


# Download the file to your local PC

In [None]:
# Write the rows collected in correctOutput to CorrectionsData.csv (the file name is fixed inside createCSV)
createCSV(correctOutput)
# Hand the file to Colab's download helper so it can be saved locally
files.download("CorrectionsData.csv")

In [None]:
# Write the rows collected in incorrectOutput to CorrectionsData.csv.
# createCSV opens that file in write mode, so any existing CorrectionsData.csv is replaced.
createCSV(incorrectOutput)
# Hand the file to Colab's download helper so it can be saved locally
files.download("CorrectionsData.csv")