In [None]:
!pip install mendelai-brat-parser

import nltk
# Tokenizer models (presumably what nltk.word_tokenize relies on further down — confirm).
nltk.download('punkt')
# WordNet corpus (presumably what WordNetLemmatizer relies on further down — confirm).
nltk.download('wordnet')
# NOTE(review): no POS tagging is visible in this notebook; confirm that the two
# tagger resources below are still needed.
nltk.download('averaged_perceptron_tagger')

nltk.download('universal_tagset')

Collecting mendelai-brat-parser
  Downloading mendelai_brat_parser-0.0.4-py3-none-any.whl (4.2 kB)
Installing collected packages: mendelai-brat-parser
Successfully installed mendelai-brat-parser-0.0.4
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [None]:
import pandas as pd
import numpy as np

from google.colab import files
import zipfile
import glob

import regex as re

import xml.etree.ElementTree as ET
from nltk.stem import WordNetLemmatizer

In [None]:
# Words and n-grams that the appendix of the guidelines of the Stab and Gurevych
# dataset lists as indicative of a premise or of a claim.
# The n-grams that do not occur in any of the datasets considered have already
# been removed from the list.

claim_premise_indicators = [
    'accordingly', 'consequently', 'conclude that', 'clearly',
    'demonstrates that', 'hence', 'implies', 'in short', 'in conclusion',
    'indicates that', 'it follows that', 'it should be clear that',
    'it should be clear', 'so', 'suggests that', 'therefore', 'thus',
    'to sum up', 'assuming that', 'as', 'besides', 'because', 'deduced',
    'derived from', 'due to', 'for', 'for example', 'for instance',
    'for the reason that', 'furthermore', 'given that', 'in addition',
    'in light of', 'in that', 'in view of', 'indicated by', 'is supported by',
    'moreover', 'since', 'whereas',
]

# **Dataset import**

In [None]:
# Import the microtexts dataset.
# The uploaded archive is expected to be named 'arg-microtexts-master.zip',
# since it is extracted by that name further down.

uploaded = files.upload()

Saving arg-microtexts-master.zip to arg-microtexts-master.zip


In [None]:
def extract_zip(path_zip,path_destination):
  """Unpack every member of a zip archive into a directory.

  Args:
    path_zip: path to the zip file to read.
    path_destination: directory that receives the extracted content.
  """
  with zipfile.ZipFile(path_zip, mode='r') as archive:
    archive.extractall(path=path_destination)

In [None]:
# Extract zip file.
# The content is unpacked into a directory named like the archive; the corpus
# paths used later therefore contain 'arg-microtexts-master' twice.

extract_zip('arg-microtexts-master.zip','arg-microtexts-master')

# **Extract argumentative sections and their relations**

In [None]:
# Collect the paths of the xml files that contain the argumentative sections of
# the dataset and their relations, then order them alphabetically.

xml_files = glob.glob("arg-microtexts-master/arg-microtexts-master/corpus/en/micro_*.xml")
xml_files.sort()

In [None]:
# List of xml files of the essays.
xml_files[0:5]

['arg-microtexts-master/arg-microtexts-master/corpus/en/micro_b001.xml',
 'arg-microtexts-master/arg-microtexts-master/corpus/en/micro_b002.xml',
 'arg-microtexts-master/arg-microtexts-master/corpus/en/micro_b003.xml',
 'arg-microtexts-master/arg-microtexts-master/corpus/en/micro_b004.xml',
 'arg-microtexts-master/arg-microtexts-master/corpus/en/micro_b005.xml']

In [None]:
# Parse every xml file into its ElementTree representation.

list_xml_tree_representation = list(map(ET.parse, xml_files))

# Then take the root element of each parsed tree.

list_xml_root = [parsed_tree.getroot() for parsed_tree in list_xml_tree_representation]

In [None]:
# list_argumentative_sections holds, for each document in the corpus, the list
# of the texts of its argumentative sections ('edu' nodes), in document order.
list_argumentative_sections=[]
# list_arg_section_id_in_document holds, for each document in the corpus, the
# ids that the 'edge' nodes assign to those sections. Each inner list is aligned
# position by position with the matching list in list_argumentative_sections;
# a section that no edge starts from gets None.
list_arg_section_id_in_document=[]


for xml_root in list_xml_root:
  section_texts=[]
  section_original_ids=[]
  # Maps the original id of a section (edge 'src') to its new id (edge 'trg').
  new_id_by_original_id={}
  for child in xml_root:
    # The nodes tagged 'edu' carry the text of a section and its unique id in the document.
    if child.tag=='edu':
      section_texts.append(child.text)
      section_original_ids.append(child.get('id'))
    # In the 'edge' nodes the original id of a section (src) is substituted with a new one (trg).
    elif child.tag=='edge':
      # Only the first edge leaving a given source is kept.
      new_id_by_original_id.setdefault(child.get('src'),child.get('trg'))
  list_argumentative_sections.append(section_texts)
  # The ids are resolved only once the whole document has been read, so that
  # they follow the order of the sections rather than the order of the edges,
  # and do not depend on the 'edu' nodes preceding the 'edge' nodes.
  list_arg_section_id_in_document.append(
      [new_id_by_original_id.get(original_id) for original_id in section_original_ids])

In [None]:
# Show every section of the first document next to its id.
for position, section_text in enumerate(list_argumentative_sections[0]):
  section_id = list_arg_section_id_in_document[0][position]
  print("section {} in document {}: {}".format(section_id, 0, section_text))

section a1 in document 0: Yes, it's annoying and cumbersome to separate your rubbish properly all the time.
section a2 in document 0: Three different bin bags stink away in the kitchen and have to be sorted into different wheelie bins.
section a3 in document 0: But still Germany produces way too much rubbish
section a4 in document 0: and too many resources are lost when what actually should be separated and recycled is burnt.
section a5 in document 0: We Berliners should take the chance and become pioneers in waste separation!


# **Extract sentences**

In [None]:
# Every argumentative section lies inside a single sentence, but a sentence can
# be split across several consecutive sections.

# Get the list of all the sentences: the sections of a document are accumulated
# until one of them closes the sentence.

sentence_list=[]

for document_sections in list_argumentative_sections:
  pending_sections=[]
  for section_text in document_sections:
    # Sections without text (None or only whitespace) carry nothing to append
    # and would otherwise break the check on the last character.
    section_text=(section_text or "").strip()
    if not section_text:
      continue

    pending_sections.append(section_text)

    # An argumentative section which ends a sentence has a '.','?' or '!' as last character.
    if section_text.endswith(('.','?','!')):
      # Joined with a space so that the last word of a section is not fused
      # with the first word of the next one.
      sentence_list.append(" ".join(pending_sections))
      pending_sections=[]

  # Sections left over at the end of a document still form a sentence.
  if pending_sections:
    sentence_list.append(" ".join(pending_sections))

In [None]:
sentence_list[:10]

["Yes, it's annoying and cumbersome to separate your rubbish properly all the time.",
 'Three different bin bags stink away in the kitchen and have to be sorted into different wheelie bins.',
 'But still Germany produces way too much rubbishand too many resources are lost when what actually should be separated and recycled is burnt.',
 'We Berliners should take the chance and become pioneers in waste separation!',
 'One can hardly move in Friedrichshain or Neukölln these days without permanently scanning the ground for dog dirt.',
 "And when bad luck does strike and you step into one of the many 'land mines' you have to painstakingly scrape the remains off your soles.",
 'Higher fines are therefore the right measure against negligent, lazy or simply thoughtless dog owners.',
 "Of course, first they'd actually need to be caught in the act by public order officers,but once they have to dig into their pockets, their laziness will sure vanish!",
 'Health insurance companies should not cove

In [None]:
len(sentence_list)

450

# **List imported n-gram words**

In [None]:
# Vocabulary of the distinct single words that make up the imported n-grams.
list_words=list({
    word
    for indicator in claim_premise_indicators
    for word in indicator.split(" ")
})

In [None]:
list_words[0:10]

['in',
 'short',
 'therefore',
 'that',
 'derived',
 'reason',
 'furthermore',
 'due',
 'as',
 'it']

# **Customised Tokenizer**

In [None]:
def build_tokenizer(text):
  """Tokenize a text, keeping only the words that occur in the imported n-grams.

  Every token that is not in list_words, neither as it appears in the text nor
  after lemmatization, is replaced by the placeholder "|". The returned list
  therefore has one entry per token of the text, and words that are not
  adjacent in the text are never adjacent in the result.

  Matching is case-sensitive and the entries of list_words are lowercase: any
  case normalization has to happen before this function is called.

  Args:
    text: the string to tokenize.

  Returns:
    The list of kept words and "|" placeholders, in text order.
  """

  wordnet_lemmatizer = WordNetLemmatizer()

  lemmas=[]
  for word in nltk.word_tokenize(text):
    # The token itself is checked first: lemmatization can rewrite a word of the
    # vocabulary into a form that is no longer in it (e.g. 'as' is reduced to
    # 'a' by the default noun lemmatization), which would silently drop it.
    if word in list_words:
      lemmas.append(word)
      continue

    # Text normalization through lemmatization.
    word_lemma=wordnet_lemmatizer.lemmatize(word)
    if word_lemma in list_words:
      lemmas.append(word_lemma)
    else:
      # The character "|" is not present in the imported n-grams.
      lemmas.append("|")

  return lemmas

# **Bag of words**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def bag_of_ngram(sentence_list: list, ngram: int)-> pd.DataFrame:
  """Build the bag of n-grams of a corpus as a dataframe of counts.

  The corpus is tokenized with build_tokenizer and every n-gram of length 1 to
  `ngram` is counted.

  Args:
    sentence_list: the documents of the corpus, one string each.
    ngram: the maximum length of the n-grams to count.

  Returns:
    A dataframe with one row per document and one column per n-gram found in
    the corpus, holding the number of occurrences.
  """
  vectorizer=CountVectorizer(tokenizer=build_tokenizer,ngram_range=(1,ngram))
  bag_ngram=vectorizer.fit_transform(sentence_list)

  # get_feature_names() was removed in scikit-learn 1.2; its replacement is used
  # when available, the old method only on versions that predate it.
  if hasattr(vectorizer,"get_feature_names_out"):
    feature_names=vectorizer.get_feature_names_out()
  else:
    feature_names=vectorizer.get_feature_names()

  return pd.DataFrame(bag_ngram.toarray(), columns=feature_names)

# **Bag of ngram export**

In [None]:
# n-grams up to length 5 are counted: the longest imported indicator,
# 'it should be clear that', is five words long.
bag_ngram_dataframe=bag_of_ngram(sentence_list, 5)

In [None]:
# The imported n-grams that never occur in the corpus have no column in the bag
# of n-grams just created: print each of them and add it as an all-zero column.

ngram_list=bag_ngram_dataframe.columns

for ngram in claim_premise_indicators:
  if ngram in ngram_list:
    continue
  print(ngram)
  bag_ngram_dataframe[ngram]=[0]*len(sentence_list)

consequently
conclude that
demonstrates that
implies
in short
in conclusion
indicates that
it follows that
it should be clear that
it should be clear
suggests that
to sum up
assuming that
as
deduced
derived from
for the reason that
given that
in light of
indicated by
is supported by
whereas


In [None]:
bag_ngram_dataframe[claim_premise_indicators].head()

Unnamed: 0,accordingly,consequently,conclude that,clearly,demonstrates that,hence,implies,in short,in conclusion,indicates that,it follows that,it should be clear that,it should be clear,so,suggests that,therefore,thus,to sum up,assuming that,as,besides,because,deduced,derived from,due to,for,for example,for instance,for the reason that,furthermore,given that,in addition,in light of,in that,in view of,indicated by,is supported by,moreover,since,whereas
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Export only the columns of the imported n-grams, in the order of
# claim_premise_indicators and without the row index.
bag_ngram_dataframe[claim_premise_indicators].to_csv('microtext_BOW_appendix_words.csv',index=False)