In [None]:
!pip install mendelai-brat-parser

import nltk
# Download the NLTK resources one by one (each call is a no-op when the package is already present).
# 'punkt' backs nltk.word_tokenize, which is used further down to tokenize the essays.
# NOTE(review): 'wordnet', 'averaged_perceptron_tagger' and 'universal_tagset' are not used
# by any cell visible in this notebook - confirm whether they are still required.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

nltk.download('universal_tagset')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import regex as re
import math
from collections import Counter
import random

from google.colab import files
import zipfile
import glob

from brat_parser import get_entities_relations_attributes_groups

# **Dataset import**

In [None]:
# Upload the Stab and Gurevych dataset (2017 version) with the Colab upload helper.
# The extraction cell below expects the uploaded archive to be named
# 'ArgumentAnnotatedEssays-2.0.zip'.

uploaded = files.upload()

In [None]:
def extract_zip(path_zip,path_destination):
  """Unpack every member of a zip archive into a directory.

  path_zip: path of the zip file to read.
  path_destination: directory that receives the extracted content.
  """
  with zipfile.ZipFile(path_zip, mode='r') as archive:
    archive.extractall(path=path_destination)


In [None]:
# The dataset ships as a zip inside a zip: unpack the uploaded archive first,
# then the brat project archive that it contains.
for archive_path, target_dir in (
    ('ArgumentAnnotatedEssays-2.0.zip', 'ArgumentAnnotatedEssays-2.0'),
    ('ArgumentAnnotatedEssays-2.0/ArgumentAnnotatedEssays-2.0/brat-project-final.zip', 'ArgumentAnnotatedEssays'),
):
  extract_zip(archive_path, target_dir)

# **Create and visualize the required Dataframes**

In [None]:
# Paths of the plain-text essays, ordered by file name.
txt_files = glob.glob("ArgumentAnnotatedEssays/brat-project-final/essay*.txt")
txt_files.sort()

# Paths of the brat annotation files of the same essays, ordered the same way.
ann_files = glob.glob("ArgumentAnnotatedEssays/brat-project-final/essay*.ann")
ann_files.sort()

In [None]:
# Preview the first five essay text paths.
txt_files[0:5]

['ArgumentAnnotatedEssays/brat-project-final/essay001.txt',
 'ArgumentAnnotatedEssays/brat-project-final/essay002.txt',
 'ArgumentAnnotatedEssays/brat-project-final/essay003.txt',
 'ArgumentAnnotatedEssays/brat-project-final/essay004.txt',
 'ArgumentAnnotatedEssays/brat-project-final/essay005.txt']

In [None]:
# Preview the first five annotation (.ann) paths.
ann_files[0:5]

['ArgumentAnnotatedEssays/brat-project-final/essay001.ann',
 'ArgumentAnnotatedEssays/brat-project-final/essay002.ann',
 'ArgumentAnnotatedEssays/brat-project-final/essay003.ann',
 'ArgumentAnnotatedEssays/brat-project-final/essay004.ann',
 'ArgumentAnnotatedEssays/brat-project-final/essay005.ann']

In [None]:
# Parse every .ann file. Each result unpacks into four dictionaries
# (entities, relations, attributes, groups), as the function name suggests.
ann_disctionaries = []
for ann_path in ann_files:
  ann_disctionaries.append(get_entities_relations_attributes_groups(ann_path))

# Turn the first three dictionaries of every essay into DataFrames (one row per dictionary key):
#  - entities   -> the annotated argumentative components,
#  - relations  -> the links from premises to other components of the argumentative structure,
#  - attributes -> the annotations that tie claims to the major claims.
# The fourth dictionary (groups) is not used.
essay_ann_datasets = []
essay_premise_argument_relations_dataset = []
essay_claim_majorClaim_relations_dataset = []
for entities, relations, attributes, _groups in ann_disctionaries:
  essay_ann_datasets.append(pd.DataFrame.from_dict(entities, orient='index'))
  essay_premise_argument_relations_dataset.append(pd.DataFrame.from_dict(relations, orient='index'))
  essay_claim_majorClaim_relations_dataset.append(pd.DataFrame.from_dict(attributes, orient='index'))

In [None]:
# Order the components of every essay by the start offset of their first span fragment.
# sort_values returns new frames, so the frames built by the parsing cell are not modified.
essay_ann_datasets = [
    dataset.sort_values(by='span', key=lambda col: col.map(lambda span: span[0][0]))
    for dataset in essay_ann_datasets
]

# Add, as first column, the position of the essay in the file lists.
# DataFrame.insert raises ValueError when the column already exists, so the guard
# keeps this cell re-runnable without changing the result of the first run.
for doc_id, dataset in enumerate(essay_ann_datasets):
  if 'doc_id' not in dataset.columns:
    dataset.insert(0, 'doc_id', doc_id)

# Stack the per-essay frames into one corpus-level frame.
essay_argument_dataset = pd.concat(essay_ann_datasets)

In [None]:
# Replace every "span" value (a tuple of fragments, e.g. ((503, 575),)) with its first
# fragment (503, 575).
# This is done with one column-level assignment instead of a per-row
# `frame['span'][i] = ...`: that chained form triggers SettingWithCopyWarning, depends on
# integer keys falling back to positions on a string-labelled index, and does not write
# through when pandas Copy-on-Write is active.
# The isinstance check leaves already-converted values untouched, so the cell can be re-run.
essay_argument_dataset['span'] = essay_argument_dataset['span'].map(
    lambda span: span[0] if isinstance(span[0], (tuple, list)) else span
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Preview the first rows of the corpus-level component table.
essay_argument_dataset.head()

Unnamed: 0,doc_id,id,type,span,text
T1,0,T1,MajorClaim,"(503, 575)",we should attach more importance to cooperatio...
T3,0,T3,Claim,"(591, 714)","through cooperation, children can learn about ..."
T4,0,T4,Premise,"(716, 851)",What we acquired from team work is not only ho...
T5,0,T5,Premise,"(853, 1086)","During the process of cooperation, children ca..."
T6,0,T6,Premise,"(1088, 1191)",All of these skills help them to get on well w...


In [None]:
# Relation frame of the first essay (built from the second parser dictionary).
essay_premise_argument_relations_dataset[0]

Unnamed: 0,id,type,subj,obj
R1,R1,supports,T4,T3
R2,R2,supports,T5,T3
R3,R3,supports,T6,T3
R4,R4,supports,T10,T11
R5,R5,supports,T9,T11
R6,R6,supports,T8,T7


In [None]:
# Attribute frame of the first essay (built from the third parser dictionary).
essay_claim_majorClaim_relations_dataset[0]

Unnamed: 0,id,type,target,values
A1,A1,Stance,T3,"(For,)"
A2,A2,Stance,T7,"(Against,)"
A3,A3,Stance,T11,"(For,)"


# **IOB and word list of the Corpus**

In [None]:
# Full text of every essay, in the same order as txt_files.
# Each file is read inside a `with` block so its handle is closed right after the read
# (the one-line comprehension left every handle open until garbage collection).
# The annotation offsets are later used as character indices into these strings, so the
# decoding is made explicit instead of depending on the platform default.
# NOTE(review): UTF-8 is assumed for the corpus files - confirm against the dataset README.
files_text=[]
for txt_path in txt_files:
  with open(txt_path, encoding='utf-8') as essay_file:
    files_text.append(essay_file.read())

In [None]:
# For every essay, collect one [span, type] array per argumentative section:
# the (start, end) offsets stored in the "span" column of the annotation frame
# and the type of the section. Rows keep the order of the annotation frame.

sorted_span=[]
for doc_index in range(len(ann_files)):
  in_this_essay = essay_argument_dataset['doc_id'] == doc_index
  span_and_type = essay_argument_dataset.loc[in_this_essay, ['span','type']]
  sorted_span.append(list(span_and_type.values))

In [None]:
# Spans and types collected for the first essay.
sorted_span[0]

[array([(503, 575), 'MajorClaim'], dtype=object),
 array([(591, 714), 'Claim'], dtype=object),
 array([(716, 851), 'Premise'], dtype=object),
 array([(853, 1086), 'Premise'], dtype=object),
 array([(1088, 1191), 'Premise'], dtype=object),
 array([(1212, 1301), 'Premise'], dtype=object),
 array([(1332, 1376), 'Claim'], dtype=object),
 array([(1387, 1492), 'Premise'], dtype=object),
 array([(1549, 1846), 'Premise'], dtype=object),
 array([(1927, 1992), 'Claim'], dtype=object),
 array([(2154, 2231), 'MajorClaim'], dtype=object)]

In [None]:
# Flatten the (start, end) pairs of every essay into one list of cut points,
# framed by the beginning (0) and the end (text length) of the essay.

span_points=[]

for essay_index, essay_sections in enumerate(sorted_span):
  # The text starts at offset 0.
  boundaries = [0]
  for span_and_type in essay_sections:
    # span_and_type[0] is the (start, end) tuple; both offsets become cut points.
    boundaries.extend(span_and_type[0])
  # The text ends at its own length.
  boundaries.append(len(files_text[essay_index]))
  span_points.append(boundaries)

In [None]:
# Cut points computed for the first essay.
span_points[0]

[0,
 503,
 575,
 591,
 714,
 716,
 851,
 853,
 1086,
 1088,
 1191,
 1212,
 1301,
 1332,
 1376,
 1387,
 1492,
 1549,
 1846,
 1927,
 1992,
 2154,
 2231,
 2232]

In [None]:
# Cut every essay at its cut points: consecutive points delimit one slice of text.

split_text=[]

for essay_index, essay_text in enumerate(files_text):
  cut_points = span_points[essay_index]
  slices = []
  for start, end in zip(cut_points[:-1], cut_points[1:]):
    slices.append(essay_text[start:end])
  split_text.append(slices)

In [None]:
# First ten text slices of the first essay.
split_text[0][0:10]

["Should students be taught to compete or to cooperate?\n\nIt is always said that competition can effectively promote the development of economy. In order to survive in the competition, companies continue to improve their products and service, and as a result, the whole society prospers. However, when we discuss the issue of competition or cooperation, what we are concerned about is not the whole society, but the development of an individual's whole life. From this point of view, I firmly believe that ",
 'we should attach more importance to cooperation during primary education',
 '.\nFirst of all, ',
 'through cooperation, children can learn about interpersonal skills which are significant in the future life of all students',
 '. ',
 'What we acquired from team work is not only how to achieve the same goal with others but more importantly, how to get along with others',
 '. ',
 'During the process of cooperation, children can learn about how to listen to opinions of others, how to com

In [None]:
# Build, for every essay, the list of its lower-cased tokens and a parallel list of labels:
# 'I' for the tokens of an argumentative section, 'O' for every other token.
# No 'B' label is produced, so the labelling is effectively IO rather than full IOB.

Y_IOB=[]
X_essay_word_list=[]

for sections in split_text:

  IOB=[]
  essay_word_list=[]

  # The slices alternate non-argumentative / argumentative text and always start with a
  # non-argumentative one (the text before the first annotated span, which holds the
  # title), so the parity of the position decides the label.
  for position, section in enumerate(sections):

    # Remove the title (the line followed by an empty line) from the opening slice only.
    # Running the substitution on every slice, as before, would also delete text from any
    # later slice that happens to contain an empty line.
    if position == 0:
      section = re.sub(r".*\n\n","",section)

    # Divide the text into tokens.
    seq=nltk.word_tokenize(section.lower())

    essay_word_list+=seq
    IOB+=['I' if position % 2 else 'O']*len(seq)

  Y_IOB.append(IOB)
  X_essay_word_list.append(essay_word_list)

In [None]:
# Show tokens 80-119 of the first essay next to their labels.
first_essay_words = X_essay_word_list[0]
first_essay_labels = Y_IOB[0]
for position in range(80,120):
  print(first_essay_words[position]+" - "+first_essay_labels[position])

i - O
firmly - O
believe - O
that - O
we - I
should - I
attach - I
more - I
importance - I
to - I
cooperation - I
during - I
primary - I
education - I
. - O
first - O
of - O
all - O
, - O
through - I
cooperation - I
, - I
children - I
can - I
learn - I
about - I
interpersonal - I
skills - I
which - I
are - I
significant - I
in - I
the - I
future - I
life - I
of - I
all - I
students - I
. - O
what - I


# **Find the sentences that contain argumentative sections**

In [None]:
# Rebuild the sentences of every essay from its token list and, for every argumentative
# section (a maximal run of tokens whose label is not 'O'), record the sentence in which
# it starts and how many sentence terminators fall inside it.

# Every sentence of the corpus, in reading order. A sentence is the concatenation of its
# tokens, each followed by one space; "." is left out, "?" and "!" are kept.
sentence_list=[]

# One entry per argumentative section of the corpus (ordered by essay, then by position
# in the text): index in sentence_list of the sentence in which the section starts.
argument_to_sentence_map=[]

# One entry per argumentative section (same order): number of sentences that are closed
# while the section is still open, i.e. the sentences it covers besides the one it ends in.
arg_section_len_list=[]

# Tokens that close a sentence.
SENTENCE_TERMINATORS=(".","?","!")

for essay_id in range(len(X_essay_word_list)):
  sentence=""
  # True while the tokens being read belong to an argumentative section.
  argument=False
  count_sentences_in_arg_section=0

  for word, bio_of_word in zip(X_essay_word_list[essay_id], Y_IOB[essay_id]):

    if bio_of_word!='O':
      if not argument:
        # A section opens in the sentence currently being built, which will be stored
        # at index len(sentence_list).
        argument_to_sentence_map.append(len(sentence_list))
        argument=True
    elif argument:
      # First 'O' token after a section: the section is complete.
      arg_section_len_list.append(count_sentences_in_arg_section)
      count_sentences_in_arg_section=0
      argument=False

    # Every token except "." becomes part of the sentence text.
    if word!=".":
      sentence+=word+" "

    # A terminator closes the sentence, unless nothing has been collected yet.
    if word in SENTENCE_TERMINATORS and sentence!="":
      sentence_list.append(sentence)
      if argument:
        count_sentences_in_arg_section+=1
      sentence=""

  # The essay ended inside a section: close it.
  if argument:
    arg_section_len_list.append(count_sentences_in_arg_section)

  # Store a trailing sentence that has no terminator. It is not added to any section
  # length: the earlier code had a branch for that, but it ran after the section had
  # already been closed and was therefore never executed.
  if sentence!="":
    sentence_list.append(sentence)

In [None]:
# Number of sentences collected over the whole corpus.
len(sentence_list)

6738

In [None]:
# Number of argumentative sections found (one starting-sentence entry per section).
len(argument_to_sentence_map)

6089

In [None]:
# Number of section-length entries. It has to equal len(argument_to_sentence_map),
# because both lists are attached as columns to the same frame further down.
len(arg_section_len_list)

6089

In [None]:
# Count the argumentative sections whose stored length is greater than zero.
count = sum(1 for section_length in arg_section_len_list if section_length > 0)

print(count)

4


# **Relations Dataframe**

In [None]:
# Attach the two per-section lists to the component frame as new columns.
# The assignment is positional: it relies on the lists and the frame listing the
# argumentative sections in the same order (by essay, then by position in the text).
for column_name, column_values in (
    ('starting_sentence_id', argument_to_sentence_map),
    ('section_length', arg_section_len_list),
):
  essay_argument_dataset[column_name]=column_values

In [None]:
# Preview the frame with its starting_sentence_id and section_length columns.
essay_argument_dataset.head()

Unnamed: 0,doc_id,id,type,span,text,starting_sentence_id,section_length
T1,0,T1,MajorClaim,"(503, 575)",we should attach more importance to cooperatio...,3,0
T3,0,T3,Claim,"(591, 714)","through cooperation, children can learn about ...",4,0
T4,0,T4,Premise,"(716, 851)",What we acquired from team work is not only ho...,5,0
T5,0,T5,Premise,"(853, 1086)","During the process of cooperation, children ca...",6,0
T6,0,T6,Premise,"(1088, 1191)",All of these skills help them to get on well w...,7,0


In [None]:
# Keep only the components that are not major claims (major claims have no labelled relations).
is_not_major_claim = essay_argument_dataset['type']!='MajorClaim'
essay_argument_dataset = essay_argument_dataset[is_not_major_claim]

In [None]:
# Preview the frame after the MajorClaim rows were removed.
essay_argument_dataset.head()

Unnamed: 0,doc_id,id,type,span,text,starting_sentence_id,section_length
T3,0,T3,Claim,"(591, 714)","through cooperation, children can learn about ...",4,0
T4,0,T4,Premise,"(716, 851)",What we acquired from team work is not only ho...,5,0
T5,0,T5,Premise,"(853, 1086)","During the process of cooperation, children ca...",6,0
T6,0,T6,Premise,"(1088, 1191)",All of these skills help them to get on well w...,7,0
T8,0,T8,Premise,"(1212, 1301)",the significance of competition is that how to...,8,0


In [None]:
# Build the rows of the relation dataset. For every premise, every other component of the
# same essay is a candidate target. A pair that carries an annotated relation is always
# kept with that relation type; an unrelated pair is kept, with type "None", only for a
# small random sample.
# Each row holds: relation type, essay identifier, then text, argumentative type,
# starting sentence id and section length of the supporting and of the supported component.
relations_list=[]
column_names=[
    'type','doc_id',
    'supporting_text','supporting_type','supporting_sentence_id','supporting_section_length',
    'supported_text','supported_type','supported_sentence_id','supported_section_length',
]

# An unrelated pair is kept when a uniform draw in [0, 1) exceeds this value (about 5% of them).
NONE_PAIR_DRAW_THRESHOLD=0.95
# Private generator with a fixed seed: the exported dataset is the same on every run and
# the global `random` state is left untouched.
none_pair_rng=random.Random(42)

# Per-essay caches, filled on first use, so that neither the component frame nor the
# relation frame is scanned again for every candidate pair.
arguments_by_essay={}
relation_lookup_by_essay={}

for supporting_argument in essay_argument_dataset.itertuples():
  if supporting_argument.type!='Premise':
    continue

  essay_id=supporting_argument.doc_id
  supporting_argument_id=supporting_argument.id

  if essay_id not in arguments_by_essay:
    arguments_by_essay[essay_id]=essay_argument_dataset[essay_argument_dataset['doc_id']==essay_id]
    # (subj, obj) -> relation type; a later row overrides an earlier one, exactly as a
    # top-to-bottom scan of the relation frame would.
    relation_lookup_by_essay[essay_id]={
        (relation_row.subj, relation_row.obj): relation_row.type
        for relation_row in essay_premise_argument_relations_dataset[essay_id].itertuples()
    }
  relation_lookup=relation_lookup_by_essay[essay_id]

  for supported_argument in arguments_by_essay[essay_id].itertuples():
    supported_argument_id=supported_argument.id
    if supported_argument_id==supporting_argument_id:
      continue

    relation=relation_lookup.get((supporting_argument_id, supported_argument_id), "None")

    # The random draw happens only for unrelated pairs (short-circuit `or`).
    if relation!="None" or none_pair_rng.random()>NONE_PAIR_DRAW_THRESHOLD:
      relations_list.append([
          relation,
          essay_id,
          supporting_argument.text,
          supporting_argument.type,
          supporting_argument.starting_sentence_id,
          supporting_argument.section_length,
          supported_argument.text,
          supported_argument.type,
          supported_argument.starting_sentence_id,
          supported_argument.section_length,
      ])

In [None]:
# Create the relation DataFrame: one row per entry of relations_list,
# with the columns named by column_names.
Essay_ann_dataset=pd.DataFrame(relations_list,columns=column_names)

In [None]:
# Preview the first relation rows.
Essay_ann_dataset.head()

Unnamed: 0,type,doc_id,supporting_text,supporting_type,supporting_sentence_id,supporting_section_length,supported_text,supported_type,supported_sentence_id,supported_section_length
0,supports,0,What we acquired from team work is not only ho...,Premise,5,0,"through cooperation, children can learn about ...",Claim,4,0
1,supports,0,"During the process of cooperation, children ca...",Premise,6,0,"through cooperation, children can learn about ...",Claim,4,0
2,,0,"During the process of cooperation, children ca...",Premise,6,0,All of these skills help them to get on well w...,Premise,7,0
3,supports,0,All of these skills help them to get on well w...,Premise,7,0,"through cooperation, children can learn about ...",Claim,4,0
4,,0,All of these skills help them to get on well w...,Premise,7,0,"without the cooperation, there would be no vic...",Claim,14,0


In [None]:
# Print some characteristics of the relation dataset: its size and a breakdown by relation type.
relation_type = Essay_ann_dataset['type']
has_relation = relation_type!='None'
is_support = relation_type=='supports'
is_attack = relation_type=='attacks'
supports_a_premise = (Essay_ann_dataset['supported_type']=='Premise') & is_support

print("Informations about the dataset:\n")
print("Number element in the dataset: {}".format(len(Essay_ann_dataset)))
print("Number effective relations: {}".format(len(Essay_ann_dataset.loc[has_relation])))
print("Number support relations: {}".format(len(Essay_ann_dataset.loc[is_support])))
print("Number attack relations: {}".format(len(Essay_ann_dataset.loc[is_attack])))
print("Number premise supported by other premises: {}".format(len(Essay_ann_dataset.loc[supports_a_premise])))

Informations about the dataset:

Number element in the dataset: 6293
Number effective relations: 3832
Number support relations: 3613
Number attack relations: 219
Number premise supported by other premises: 667


In [None]:
# Export the relation dataset to 'essay_relation_dataset.csv' in the working directory.
# index=False leaves the DataFrame index out of the file.
Essay_ann_dataset.to_csv('essay_relation_dataset.csv',index=False)