<div class="alert" style="background-color:#fff; color:white; padding:0px 10px; border-radius:5px;"><h1 style='margin:15px 15px; color:#0b1354; font-size:40px'>1.2 Data Preparation - Convert to CSV</h1>
</div>


<div class="alert alert-info" style="background-color:#0b1354; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'>Automate Code (Final)</h2>
</div>

In [8]:
import lxml.etree
import csv
import pandas as pd
import os
from os import listdir, path
from tqdm import tqdm

#import xml.etree.ElementTree as ET
#xml = lxml.etree.parse('my_folder/001_dummy.xml')

In [9]:
####################
# Reset Processing #
####################
# The conversion loop appends to the output CSV, so any copy left over from an
# earlier run is removed before processing starts.
# os.path.isfile() is True only for an existing regular file, so one call
# covers both the "exists" and the "is a file" check.

file = 'amica_data_original_raw_version.csv'
if not os.path.isfile(file):
    # Nothing on disk to remove.
    print("File cleared")
else:
    os.remove(file)
    print("File deleted")

    
########################
# Input XML files path #
########################
# Folder that holds all the parsed XML files.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the processing order (and therefore the row order of the output CSV)
# the same on every run and every machine.

mypath = 'xml_folder'
files = sorted(path.join(mypath, f) for f in listdir(mypath) if f.endswith('.xml'))

################################
# Convert XML file to CSV file #
################################
# The code is written to accommodate the structure of XML files parsed with
# bratreader. Every file goes through four parts:
#   Part I   - rebuild each sentence from its word elements (the corpus text)
#   Part II  - turn the annotation attributes into label rows
#   Part III - left-join the labels onto the sentences through the "tag" column
#   Part IV  - append the merged rows to the CSV file for further preprocessing
# tqdm wraps the file list itself (not the enumerate object) so the progress
# bar knows the total number of files.

# (annotation attribute, role, harmfulness score) triples tested on every
# annotation element. The order is the order in which rows are emitted, which
# matters because Part III keeps only the first row per tag.
_ROLE_ATTRIBUTES = [
    ('one_Harasser', 'Harasser', '1'),
    ('one_Victim', 'Victim', '1'),
    ('one_Bystander_defender', 'Bystander_defender', '1'),
    ('one_Bystander_assistant', 'Bystander_assistant', '1'),
    ('two_Harasser', 'Harasser', '2'),
    ('two_Victim', 'Victim', '2'),
    ('two_Bystander_defender', 'Bystander_defender', '2'),
    ('two_Bystander_assistant', 'Bystander_assistant', '2'),
]


def _extract_sentences(xml):
    """Part I - build the text (corpus) table of one parsed XML document.

    Parameters
    ----------
    xml : parsed lxml document
        Must support ``xpath``; the children of every ``//sentences`` element
        are treated as sentences and their children as words.

    Returns
    -------
    pandas.DataFrame
        Columns ``tag`` and ``text``: one row per sentence element that has at
        least one child. ``text`` is the space-joined text of the non-empty
        words; ``tag`` is the ``id`` attribute of the sentence's first word.
    """
    rows = []
    for sentences in xml.xpath('//sentences'):
        for sent in sentences:
            words = list(sent)
            if not words:
                continue  # sentence element without any word children
            # The tag always comes from the first word element, even when that
            # word carries no text, so a sentence can never inherit the tag of
            # the previous sentence.
            tag = words[0].get('id')
            # word.text is None for an element without text; skip those and
            # empty strings alike.
            text = ' '.join(word.text for word in words if word.text)
            rows.append((tag, text))
    return pd.DataFrame(rows, columns=["tag", "text"])


def _extract_annotations(xml):
    """Part II - build the annotation (label) table of one parsed XML document.

    Parameters
    ----------
    xml : parsed lxml document
        Must support ``xpath``; the children of every ``//annotations`` element
        are inspected.

    Returns
    -------
    pandas.DataFrame
        Columns ``tag``, ``label``, ``role``, ``harmfulness_score`` and
        ``oth_language``. One row is written for every role attribute present
        on an annotation, plus one row when ``Other_language`` is present.
        ``tag`` holds the annotation's ``words`` attribute. The frame is empty
        (but has all columns) when the document has no annotations.
    """
    # The row list is created once per document, outside the loop over
    # <annotations> elements, so a document without that element yields an
    # empty table instead of reusing rows left over from a previous document.
    rows = []
    for annotations in xml.xpath('//annotations'):
        for ann in annotations:
            words = ann.get('words')

            # Write a row for each role/harmfulness attribute (if it exists)
            for attribute, role, harm_score in _ROLE_ATTRIBUTES:
                if ann.get(attribute) is not None:
                    rows.append((words, 'Cyberbullying', role, harm_score, ''))

            # Tag for corpus mixed with another language
            if ann.get('Other_language') is not None:
                rows.append((words, '', '', '', '1'))

    cols = ["tag", "label", "role", "harmfulness_score", "oth_language"]
    return pd.DataFrame(rows, columns=cols)


for index, file in enumerate(tqdm(files)):
    xml = lxml.etree.parse(file)

    ##################################
    # Part I - Extract Text (Corpus) #
    ##################################
    df = _extract_sentences(xml)

    ########################################
    # Part II - Extract Annotation (Label) #
    ########################################
    df2 = _extract_annotations(xml)

    #############################################
    # Part III - Merge Text and Annotation Data #
    #############################################
    # Left join keeps every sentence; unlabelled sentences get NaN labels.
    df_final = df.merge(df2, on='tag', how='left')
    df_final['file_index'] = file
    # A tag with several annotation rows keeps only the first one emitted.
    df_final = df_final.drop_duplicates(subset=['tag', 'file_index'], keep='first')

    ###############################
    # Part IV - Write to CSV file #
    ###############################
    # Append mode: the header is written only together with the first file.
    df_final.to_csv('amica_data_original_raw_version.csv', mode='a', header=(index == 0))
print("DONE AND COMPLETE") # End of Code

File cleared


8323it [01:18, 106.41it/s]

DONE AND COMPLETE



