In [None]:
!pip install mendelai-brat-parser

import nltk
# Download the NLTK resources needed by the rest of the notebook.
# 'punkt' is unpacked under tokenizers/ (see the recorded output); it is
# presumably what nltk.word_tokenize relies on - confirm against the NLTK docs.
nltk.download('punkt')
# 'wordnet' is the corpus behind the WordNetLemmatizer imported further down.
nltk.download('wordnet')
# NOTE(review): no POS tagger is called in the cells visible here, so the two
# tagger resources below may be leftovers - confirm before removing them.
nltk.download('averaged_perceptron_tagger')

# Last expression of the cell: its return value is what the cell displays.
nltk.download('universal_tagset')

Collecting mendelai-brat-parser
  Downloading mendelai_brat_parser-0.0.4-py3-none-any.whl (4.2 kB)
Installing collected packages: mendelai-brat-parser
Successfully installed mendelai-brat-parser-0.0.4
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [None]:
import pandas as pd
import numpy as np

from google.colab import files
import zipfile
import glob
from brat_parser import get_entities_relations_attributes_groups

import regex as re

from nltk.stem import WordNetLemmatizer

In [None]:
# Words and n-grams listed in the appendix of the annotation guidelines of the
# Stab and Gurevych dataset as indicators of a claim or of a premise.
# According to the original note, the n-grams that do not occur in any of the
# datasets considered have already been removed from this list.
# The longest entry ('it should be clear that') is five words long.

claim_premise_indicators = [
  'accordingly', 'consequently', 'conclude that', 'clearly',
  'demonstrates that', 'hence', 'implies', 'in short', 'in conclusion',
  'indicates that', 'it follows that', 'it should be clear that',
  'it should be clear', 'so', 'suggests that', 'therefore', 'thus',
  'to sum up', 'assuming that', 'as', 'besides', 'because', 'deduced',
  'derived from', 'due to', 'for', 'for example', 'for instance',
  'for the reason that', 'furthermore', 'given that', 'in addition',
  'in light of', 'in that', 'in view of', 'indicated by', 'is supported by',
  'moreover', 'since', 'whereas',
]

# **Import the dataset with the original corpus**

In [None]:
# Import the Stab and Gurevych dataset (2017 version).
# `files` is google.colab.files, so this cell only works inside Colab.
# The extraction cell below expects the uploaded archive to be named
# 'ArgumentAnnotatedEssays-2.0.zip' in the current working directory.

uploaded = files.upload()

Saving ArgumentAnnotatedEssays-2.0.zip to ArgumentAnnotatedEssays-2.0.zip


In [None]:
# Helper that unpacks a zip archive into a directory.

def extract_zip(path_zip,path_destination):
  """Extract every member of a zip archive.

  Args:
    path_zip: path to the zip file to read.
    path_destination: directory that receives the extracted members.

  Returns:
    None.
  """
  with zipfile.ZipFile(path_zip, mode='r') as archive:
    archive.extractall(path=path_destination)

In [None]:
# The corpus is a zip nested inside another zip: unpack the outer archive
# first, then the brat project archive it contains.

for archive_path, target_dir in [
  ('ArgumentAnnotatedEssays-2.0.zip', 'ArgumentAnnotatedEssays-2.0'),
  ('ArgumentAnnotatedEssays-2.0/ArgumentAnnotatedEssays-2.0/brat-project-final.zip', 'ArgumentAnnotatedEssays'),
]:
  extract_zip(archive_path, target_dir)

# **Create the ann Dataframe**

In [None]:
# Collect the paths of the essay text (.txt) files of the dataset.
# Both lists are sorted so that, assuming every essay has both files,
# txt_files[i] and ann_files[i] refer to the same essay.
txt_files = sorted(glob.glob("ArgumentAnnotatedEssays/brat-project-final/essay*.txt"))

# Collect the paths of the brat annotation (.ann) files of the same essays.
ann_files = sorted(glob.glob("ArgumentAnnotatedEssays/brat-project-final/essay*.ann"))

In [None]:
# Preview the first five essay text paths to check the glob pattern and order.
txt_files[0:5]

['ArgumentAnnotatedEssays/brat-project-final/essay001.txt',
 'ArgumentAnnotatedEssays/brat-project-final/essay002.txt',
 'ArgumentAnnotatedEssays/brat-project-final/essay003.txt',
 'ArgumentAnnotatedEssays/brat-project-final/essay004.txt',
 'ArgumentAnnotatedEssays/brat-project-final/essay005.txt']

In [None]:
# Preview the first five annotation paths; they should mirror the .txt preview.
ann_files[0:5]

['ArgumentAnnotatedEssays/brat-project-final/essay001.ann',
 'ArgumentAnnotatedEssays/brat-project-final/essay002.ann',
 'ArgumentAnnotatedEssays/brat-project-final/essay003.ann',
 'ArgumentAnnotatedEssays/brat-project-final/essay004.ann',
 'ArgumentAnnotatedEssays/brat-project-final/essay005.ann']

In [None]:
# Parse every .ann file; the parser returns four dictionaries per file
# (entities, relations, attributes, groups).
ann_disctionaries = [get_entities_relations_attributes_groups(ann_path) for ann_path in ann_files]

# Turn the entities dictionary of each essay into a DataFrame (one row per
# entity) and tag it with a 'doc_id' column holding the essay's position in
# ann_files, placed as the first column.
essay_ann_datasets = []
for doc_id, parsed in enumerate(ann_disctionaries):
  entities = parsed[0]
  essay_frame = pd.DataFrame.from_dict(entities, orient='index')
  essay_frame.insert(0, 'doc_id', doc_id)
  essay_ann_datasets.append(essay_frame)

# Stack the per-essay frames into one DataFrame. The per-essay index labels
# are kept as they are, so they are not guaranteed to be unique across essays.
Essay_ann_dataset = pd.concat(essay_ann_datasets)

In [None]:
# Substitute the values in the "span" column: each value is a tuple of
# (start, end) tuples and is replaced by its first inner (start, end) tuple.
#
# The whole column is reassigned in one step. The previous element-wise form
# `Essay_ann_dataset['span'][i] = ...` is chained assignment: it raises the
# SettingWithCopyWarning, has no effect when pandas Copy-on-Write is enabled,
# and relied on integer keys being treated as positions on a non-integer index.
#
# NOTE: this step is not idempotent. Re-running the cell on already flattened
# spans would reduce each (start, end) pair to its start offset, so rebuild
# Essay_ann_dataset (cell above) before running it again.
Essay_ann_dataset['span'] = Essay_ann_dataset['span'].map(lambda fragments: fragments[0])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Preview the first rows of the combined annotation DataFrame.
Essay_ann_dataset.head()

# **IOB and word list**

In [None]:
# List of the texts of the essays, in the same order as txt_files.
# Each file is opened in a `with` block so its handle is closed right after
# reading (the previous bare open(...).read() never closed the files).
# The encoding is set explicitly so that the character offsets used later to
# cut the text do not depend on the platform default; UTF-8 is what Colab
# uses by default. NOTE(review): confirm the corpus files are UTF-8.
files_text = []
for txt_path in txt_files:
  with open(txt_path, encoding='utf-8') as txt_handle:
    files_text.append(txt_handle.read())

In [None]:
# Report every essay that contains the character "|". The character is used
# later as a placeholder token, so it should not occur in the corpus.
for essay in files_text:
  if "|" in essay:
      print("One found!")

In [None]:
# For each essay, collect the [span, type] pair of every annotated
# (argumentative) section and order the pairs by the start offset of the span.
# The offsets come from the "span" column of the annotation DataFrame and the
# kind of section from its "type" column.

sorted_span = []
for doc_id in range(len(ann_files)):
  belongs_to_essay = Essay_ann_dataset['doc_id'] == doc_id
  essay_rows = Essay_ann_dataset.loc[belongs_to_essay, ['span', 'type']].values
  # Each row is [span, type]; row[0][0] is the start offset of the span.
  sorted_span.append(sorted(essay_rows, key=lambda row: row[0][0]))

In [None]:
# Inspect the ordered [span, type] pairs of the first essay.
sorted_span[0]

In [None]:
# Flatten the (start, end) pairs of each essay into one list of cut points:
# [0, start_1, end_1, start_2, end_2, ..., len(text)].
# Consecutive points delimit the segments the text is split into later.

span_points = []

for essay_idx, essay_spans in enumerate(sorted_span):
  # The first cut point is the beginning of the text.
  boundaries = [0]
  for annotation in essay_spans:
    # annotation[0] is the (start, end) span; both offsets become cut points.
    boundaries.extend(annotation[0])
  # The last cut point is the end of the text.
  boundaries.append(len(files_text[essay_idx]))
  span_points.append(boundaries)

In [None]:
# Inspect the cut points computed for the first essay.
span_points[0]

In [None]:
# Cut every essay at its cut points: each pair of consecutive points
# (begin, end) yields the segment text[begin:end].

split_text = []

for essay_idx, essay_text in enumerate(files_text):
  cut_points = span_points[essay_idx]
  segments = []
  for begin, end in zip(cut_points, cut_points[1:]):
    segments.append(essay_text[begin:end])
  split_text.append(segments)

In [None]:
# Inspect the first ten segments of the first essay.
split_text[0][0:10]

In [None]:
# Build, for every essay, the list of its tokens and the parallel list of IOB
# labels: 'O' outside argumentative sections, 'B-P'/'I-P' for premises and
# 'B-C'/'I-C' for every other annotated type.

Y_IOB = []
X_essay_word_list = []

for essay_idx, essay_segments in enumerate(split_text):

  essay_labels = []
  essay_tokens = []

  # Position, in sorted_span[essay_idx], of the next annotation to consume.
  annotation_cursor = 0

  # Drop any line that is followed by an empty line (this removes the title
  # from the first segment), then turn each remaining newline into a
  # standalone " | " marker.
  cleaned_segments = [
    re.sub(r"\n", " | ", re.sub(r".*\n\n", "", segment))
    for segment in essay_segments
  ]

  # Segments alternate by construction: the first one (which holds the title)
  # is non-argumentative, the next one is an annotated section, and so on.
  for segment_idx, segment in enumerate(cleaned_segments):

    # Lower-case the segment and split it into tokens.
    tokens = nltk.word_tokenize(segment.lower())
    essay_tokens.extend(tokens)

    inside_annotation = segment_idx % 2 == 1
    if not inside_annotation:
      essay_labels.extend(['O'] * len(tokens))
      continue

    # 'Premise' maps to the P tags, any other annotation type to the C tags.
    if sorted_span[essay_idx][annotation_cursor][1] == 'Premise':
      begin_tag, inside_tag = 'B-P', 'I-P'
    else:
      begin_tag, inside_tag = 'B-C', 'I-C'

    # One "begin" tag followed by "inside" tags for the remaining tokens.
    essay_labels.append(begin_tag)
    essay_labels.extend([inside_tag] * (len(tokens) - 1))
    annotation_cursor += 1

  Y_IOB.append(essay_labels)
  X_essay_word_list.append(essay_tokens)

In [None]:
# Show tokens 80 to 119 of the first essay next to their IOB labels.
for position in range(80, 120):
  print(f"{X_essay_word_list[0][position]} - {Y_IOB[0][position]}")

# **Find the sentences that contain an argumentative section**

In [None]:
# Rebuild the sentences of every essay from its tokens and record, for each
# sentence, whether it contains any argumentative token, any claim token and
# any premise token. The four lists below are parallel (one entry per sentence).
#
# NOTE(review): "." and the newline marker "|" are left out of the sentence
# text, while "?" and "!" are kept in it. A terminator met while the current
# sentence is still empty does not close a sentence, so flags raised by it
# carry over to the next sentence - confirm that both are intended.

sentence_list = []
sentences_argumentative_map = []
sentences_claim_presence_map = []
sentences_premise_presence_map = []

for essay_words, essay_labels in zip(X_essay_word_list, Y_IOB):
  sentence = ""
  argumentative = claim = premise = False

  for word, bio_of_word in zip(essay_words, essay_labels):

    # Any label other than 'O' makes the sentence argumentative; the C tags
    # mark a claim, every other non-'O' tag a premise.
    if bio_of_word != 'O':
      argumentative = True
      if bio_of_word in ('I-C', 'B-C'):
        claim = True
      else:
        premise = True

    # Append every token except the newline marker and the full stop.
    if word not in ("|", "."):
      sentence += word + " "

    # '.', '?' or '!' closes the sentence, provided it is not empty.
    if word in (".", "?", "!") and sentence != "":
      sentence_list.append(sentence)
      sentences_argumentative_map.append(argumentative)
      sentences_claim_presence_map.append(claim)
      sentences_premise_presence_map.append(premise)

      # Start a fresh sentence.
      sentence = ""
      argumentative = claim = premise = False

  # Text left after the last terminator of the essay forms a final sentence.
  if sentence != "":
    sentence_list.append(sentence)
    sentences_argumentative_map.append(argumentative)
    sentences_claim_presence_map.append(claim)
    sentences_premise_presence_map.append(premise)

    sentence = ""
    argumentative = claim = premise = False

# **List imported n-gram words**

In [None]:
# Collect the distinct single words that make up the imported n-grams.
# The words go through a set, so the order of list_words is arbitrary.
unique_words = set()
for indicator in claim_premise_indicators:
  unique_words.update(indicator.split(" "))

list_words = list(unique_words)

In [None]:
# Peek at ten of the collected words (their order is arbitrary, see above cell's set).
list_words[0:10]

# **Customised Tokenizer**

In [None]:
# Tokenizer handed to CountVectorizer; it relies on lemmatization.
def build_tokenizer(text):
  """Tokenize a text and keep only the words of the imported n-grams.

  The text is split with nltk.word_tokenize and every token is lemmatized
  with WordNetLemmatizer (no part-of-speech argument is passed). A lemma found
  in list_words is kept; any other token is replaced by the placeholder "|",
  a character that does not occur in the original corpus.

  NOTE(review): the membership test is done on the lemma, so an indicator word
  whose lemma differs from its surface form would be replaced by "|" as well -
  verify on the actual list.

  Args:
    text: the string to tokenize.

  Returns:
    A list with one entry per token: the lemma or "|".
  """
  lemmatizer = WordNetLemmatizer()
  token_lemmas = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))
  return [lemma if lemma in list_words else "|" for lemma in token_lemmas]

# **Bag of words**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Transformation of the passed corpus into the DataFrame of the bag of n-grams
# contained in it.
def bag_of_ngram(sentence_list: list, ngram: int)-> pd.DataFrame:
  """Count the n-grams of every sentence.

  Args:
    sentence_list: the sentences (strings) to vectorize.
    ngram: maximum n-gram length; all lengths from 1 to ngram are counted.

  Returns:
    A DataFrame with one row per sentence and one column per n-gram found by
    the vectorizer, holding the occurrence counts. Tokens are produced by
    build_tokenizer.
  """
  vectorizer=CountVectorizer(tokenizer=build_tokenizer,ngram_range=(1,ngram))
  bag_ngram=vectorizer.fit_transform(sentence_list)

  # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
  # get_feature_names_out() replaces it. The old method is only a fallback for
  # installations that predate the new one.
  if hasattr(vectorizer, "get_feature_names_out"):
    feature_names=vectorizer.get_feature_names_out()
  else:
    feature_names=vectorizer.get_feature_names()

  # NOTE: toarray() makes the count matrix dense before it is wrapped.
  dataframe=pd.DataFrame(bag_ngram.toarray(), columns=feature_names)

  return dataframe

# **Bag of ngram export**

In [None]:
# Count all the n-grams up to length 5: the longest imported indicator
# ('it should be clear that') is five words long.
bag_ngram_dataframe=bag_of_ngram(sentence_list, 5)

In [None]:
# Some of the imported n-grams never occur in the bag of n-grams just built.
# Print each of them and add it as a column filled with zeros, so that every
# indicator has a column.

ngram_list = bag_ngram_dataframe.columns

missing_indicators = [
  indicator for indicator in claim_premise_indicators
  if indicator not in ngram_list
]

for indicator in missing_indicators:
  print(indicator)
  bag_ngram_dataframe[indicator] = [0] * len(sentence_list)

In [None]:
# Preview only the indicator columns, in the order of claim_premise_indicators.
bag_ngram_dataframe[claim_premise_indicators].head()

In [None]:
# Export the indicator columns (one row per sentence) to CSV; index=False
# leaves the row index out of the file.
bag_ngram_dataframe[claim_premise_indicators].to_csv('essay_BOW_appendix_words.csv',index=False)