In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from google.colab import files
import zipfile
import glob

import xml.etree.ElementTree as ET

# **Dataset import**

In [None]:
# Upload the microtexts corpus archive through google.colab's files.upload().
# The later extraction cell expects the uploaded file to be named
# 'arg-microtexts-master.zip'.

uploaded = files.upload()

Saving arg-microtexts-master.zip to arg-microtexts-master.zip


In [None]:
def extract_zip(path_zip,path_destination):
  """Extract the whole content of a zip archive.

  path_zip: path to the zip file to read.
  path_destination: path of the directory in which the content is stored.
  """
  archive=zipfile.ZipFile(path_zip, 'r')
  try:
    archive.extractall(path_destination)
  finally:
    # Always release the file handle, even when the extraction fails.
    archive.close()

In [None]:
# Unpack the uploaded archive into the 'arg-microtexts-master' directory.

extract_zip('arg-microtexts-master.zip','arg-microtexts-master')

# **Extract argumentative sections and their relations**

In [None]:
# Collect, in sorted order, the paths of the files matching micro_*.xml in the
# corpus/en folder of the extracted archive. These xml files contain the
# argumentative sections of the dataset and their relations.
# NOTE(review): the directory name appears twice in the pattern, presumably
# because the archive has its own top-level folder - confirm if the layout changes.

xml_files = sorted(glob.glob("arg-microtexts-master/arg-microtexts-master/corpus/en/micro_*.xml"))

In [None]:
# Preview the first five xml file paths.
xml_files[0:5]

['arg-microtexts-master/arg-microtexts-master/corpus/en/micro_b001.xml',
 'arg-microtexts-master/arg-microtexts-master/corpus/en/micro_b002.xml',
 'arg-microtexts-master/arg-microtexts-master/corpus/en/micro_b003.xml',
 'arg-microtexts-master/arg-microtexts-master/corpus/en/micro_b004.xml',
 'arg-microtexts-master/arg-microtexts-master/corpus/en/micro_b005.xml']

In [None]:
# Parse every xml file into its ElementTree representation.

list_xml_tree_representation=list(map(ET.parse,xml_files))

# Then keep the root element of each parsed tree.

list_xml_root=[parsed_tree.getroot() for parsed_tree in list_xml_tree_representation]

In [None]:
# Sanity check: print the tag of the root element of the first five documents.
for i in range(5):
  print(list_xml_root[i].tag)

arggraph
arggraph
arggraph
arggraph
arggraph


In [None]:
# list_argumentative_sections holds, for each document in the corpus, the list
# of the texts of its argumentative sections.
list_argumentative_sections=[]
# list_arg_section_id_in_document holds, for each document in the corpus, the
# list of the ids that identify the sections inside the document.
list_arg_section_id_in_document=[]

# relations_type_list holds one element per relation in the corpus, equal to
# 'sup' (support) or 'reb' (attack) according to the type of the relation.
relations_type_list=[]
# relations_sections_list holds the (src, trg) pair of section identifiers of
# each relation (same position as in relations_type_list).
relations_sections_list=[]
# relations_document_map holds the index of the document containing each
# relation (same position as in relations_type_list).
relations_document_map=[]

for root_id, document_root in enumerate(list_xml_root):
  section_texts=[]
  # Ids carried by the 'edu' nodes themselves.
  original_section_ids=[]
  # Ids the sections are renamed to by the edges that start from an 'edu' id.
  renamed_section_ids=[]
  for node in document_root:
    # The 'edu' nodes contain the text of a section and its id in the document.
    if node.tag=='edu':
      section_texts.append(node.text)
      original_section_ids.append(node.get('id'))
      continue
    if node.tag!='edge':
      continue

    source_id=node.get('src')
    target_id=node.get('trg')
    edge_type=node.get('type')
    if source_id in original_section_ids:
      # Edge starting from an original section id: its target is the new id
      # used for that section by the relation edges.
      renamed_section_ids.append(target_id)
    elif source_id in renamed_section_ids and edge_type in ('sup','reb'):
      # Support/attack relation whose source is an already renamed section.
      relations_type_list.append(edge_type)
      relations_sections_list.append((source_id,target_id))
      relations_document_map.append(root_id)
  list_argumentative_sections.append(section_texts)
  # The renamed ids are stored: they are the ones referenced by the relations.
  list_arg_section_id_in_document.append(renamed_section_ids)

In [None]:
# Print every section of the first document (index 0) with its identifier.
# Texts and identifiers are read from two parallel lists paired by position.
for i in range(len(list_argumentative_sections[0])):
  print("section {} in document {}: {}".format(list_arg_section_id_in_document[0][i],0,list_argumentative_sections[0][i]))

section a1 in document 0: Yes, it's annoying and cumbersome to separate your rubbish properly all the time.
section a2 in document 0: Three different bin bags stink away in the kitchen and have to be sorted into different wheelie bins.
section a3 in document 0: But still Germany produces way too much rubbish
section a4 in document 0: and too many resources are lost when what actually should be separated and recycled is burnt.
section a5 in document 0: We Berliners should take the chance and become pioneers in waste separation!


In [None]:
# Print the first five relations: type, (src, trg) section ids and the index
# of the document that contains the relation.
for i in range(5):
  print("{} relation between {} in document {}".format(relations_type_list[i],relations_sections_list[i],relations_document_map[i]))

reb relation between ('a1', 'a5') in document 0
sup relation between ('a2', 'a1') in document 0
sup relation between ('a1', 'a3') in document 1
sup relation between ('a2', 'a3') in document 1
reb relation between ('a4', 'a3') in document 1


# **Extract sentences**

In [None]:
# Every argumentative section is contained in a single sentence, while one
# sentence may be made of several consecutive sections.

# Flat list of all the sentences of the corpus, in document order.

sentence_list=[]

# For each document (outer list), one entry per argumentative section (inner
# list, ordered by the position of the section in the text): the index in
# sentence_list of the complete sentence that contains the section.
argument_to_sentence_map=[]

# NOTE(review): this list is declared but never filled or read by the cells
# visible here; it is kept in case other code relies on the name.

arg_section_len_list=[]

for i in range(len(list_argumentative_sections)):
  sentence=""
  document_argument_to_sentence_map=[]
  for j in range(len(list_argumentative_sections[i])):
    # A node without text yields None: treat it as an empty section.
    section=list_argumentative_sections[i][j] or ""

    # Consecutive sections of the same sentence used to be glued together
    # directly, fusing words (e.g. 'rubbishand'). Insert a single space unless
    # one of the two sides already provides whitespace.
    if sentence and section and not (sentence[-1].isspace() or section[0].isspace()):
      sentence+=" "
    sentence+=section

    # The sentence under construction will be stored at this index.
    document_argument_to_sentence_map.append(len(sentence_list))
    # A section which ends a sentence has '.', '?' or '!' as last character.
    # endswith() is also safe when the accumulated text is still empty.
    if sentence.endswith(('.','?','!')):
      sentence_list.append(sentence)
      sentence=""

  # Store the trailing text of a document that lacks a final terminator.
  if sentence:
    sentence_list.append(sentence)

  argument_to_sentence_map.append(document_argument_to_sentence_map)

In [None]:
# Preview the first ten reconstructed sentences.
sentence_list[:10]

["Yes, it's annoying and cumbersome to separate your rubbish properly all the time.",
 'Three different bin bags stink away in the kitchen and have to be sorted into different wheelie bins.',
 'But still Germany produces way too much rubbishand too many resources are lost when what actually should be separated and recycled is burnt.',
 'We Berliners should take the chance and become pioneers in waste separation!',
 'One can hardly move in Friedrichshain or Neukölln these days without permanently scanning the ground for dog dirt.',
 "And when bad luck does strike and you step into one of the many 'land mines' you have to painstakingly scrape the remains off your soles.",
 'Higher fines are therefore the right measure against negligent, lazy or simply thoughtless dog owners.',
 "Of course, first they'd actually need to be caught in the act by public order officers,but once they have to dig into their pockets, their laziness will sure vanish!",
 'Health insurance companies should not cove

In [None]:
# Total number of sentences in the corpus.
len(sentence_list)

450

In [None]:
# Sentence index of each section of the first document (index 0).
argument_to_sentence_map[0]

[0, 1, 2, 2, 3]

# **Create the Dataframe**

In [None]:
# Build the rows of the desired dataframe: for each relation, its type, the
# index of its document, and for both components their text and the index of
# the sentence that contains them.
# The 'supporting_*' columns describe the source of the relation and the
# 'supported_*' columns its target, also when the relation type is 'attacks'.
relations_list=[]
column_names=['type','doc_id','supporting_text','supporting_sentence_id','supported_text','supported_sentence_id']


for i in range(len(relations_document_map)):
  doc_id=relations_document_map[i]
  supporting_section,supported_section=relations_sections_list[i]
  # Every relation type other than 'sup' is labelled as an attack.
  relation_type='supports' if relations_type_list[i]=='sup' else 'attacks'

  # Reset the looked-up values for every relation: without this, a section id
  # missing from the document would silently reuse the text and sentence id
  # found for the previous relation (or raise NameError on the first one).
  supporting_text=supporting_sentence_id=None
  supported_text=supported_sentence_id=None
  # Section ids, texts and sentence indexes are parallel lists: position j in
  # one corresponds to position j in the others.
  for j,section_id in enumerate(list_arg_section_id_in_document[doc_id]):
    if section_id==supporting_section:
      supporting_text=list_argumentative_sections[doc_id][j]
      supporting_sentence_id=argument_to_sentence_map[doc_id][j]
    if section_id==supported_section:
      supported_text=list_argumentative_sections[doc_id][j]
      supported_sentence_id=argument_to_sentence_map[doc_id][j]
  relations_list.append([relation_type,doc_id,supporting_text,supporting_sentence_id,supported_text,supported_sentence_id])

In [None]:
# Create the dataframe: one row per relation, with the columns listed in
# column_names.
Essay_ann_dataset=pd.DataFrame(relations_list,columns=column_names)

In [None]:
# Preview the first rows of the dataframe.
Essay_ann_dataset.head()

Unnamed: 0,type,doc_id,supporting_text,supporting_sentence_id,supported_text,supported_sentence_id
0,attacks,0,"Yes, it's annoying and cumbersome to separate ...",0,We Berliners should take the chance and become...,3
1,supports,0,Three different bin bags stink away in the kit...,1,"Yes, it's annoying and cumbersome to separate ...",0
2,supports,1,One can hardly move in Friedrichshain or Neukö...,4,Higher fines are therefore the right measure a...,6
3,supports,1,And when bad luck does strike and you step int...,5,Higher fines are therefore the right measure a...,6
4,attacks,1,"Of course, first they'd actually need to be ca...",7,Higher fines are therefore the right measure a...,6


In [None]:
# Print some characteristics of the dataset: total number of relations and
# number of relations of each type.
relation_type_column=Essay_ann_dataset['type']
print("Informations about the dataset:\n")
print("Number element in the dataset: {}".format(len(Essay_ann_dataset)))
# Summing a boolean mask counts the rows that satisfy the condition.
print("Number support relations: {}".format(int((relation_type_column=='supports').sum())))
print("Number attack relations: {}".format(int((relation_type_column=='attacks').sum())))

Informations about the dataset:

Number element in the dataset: 371
Number support relations: 263
Number attack relations: 108


In [None]:
# Export the dataset to a csv file, without writing the dataframe index.
Essay_ann_dataset.to_csv('microtext_relation_dataset.csv',index=False)