In [None]:
!pip install mendelai-brat-parser

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

nltk.download('universal_tagset')

Collecting mendelai-brat-parser
  Downloading mendelai_brat_parser-0.0.4-py3-none-any.whl (4.2 kB)
Installing collected packages: mendelai-brat-parser
Successfully installed mendelai-brat-parser-0.0.4
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [None]:
import pandas as pd
import numpy as np

from google.colab import files
import zipfile
import glob

import regex as re

from nltk.stem import WordNetLemmatizer

In [None]:
# Words and n-grams listed in the appendix of the Stab and Gurevych annotation
# guidelines as indicators of a premise or of a claim.
# The n-grams that occur in none of the datasets considered have already been
# removed from this list.

claim_premise_indicators=[
  'accordingly',
  'consequently',
  'conclude that',
  'clearly',
  'demonstrates that',
  'hence',
  'implies',
  'in short',
  'in conclusion',
  'indicates that',
  'it follows that',
  'it should be clear that',
  'it should be clear',
  'so',
  'suggests that',
  'therefore',
  'thus',
  'to sum up',
  'assuming that',
  'as',
  'besides',
  'because',
  'deduced',
  'derived from',
  'due to',
  'for',
  'for example',
  'for instance',
  'for the reason that',
  'furthermore',
  'given that',
  'in addition',
  'in light of',
  'in that',
  'in view of',
  'indicated by',
  'is supported by',
  'moreover',
  'since',
  'whereas',
]

# **Import the dataset with the original corpus**

In [None]:
# Upload the claim positions file (claim-positions.txt) through the Colab
# file picker; the upload is saved under the same name in the working directory.
print("\nImport claim-positions.txt:\n")
uploaded = files.upload()

# Upload the premise positions file (premise-positions.txt) in the same way.
# `uploaded` is overwritten here; later cells read both files from disk by name.
print("\nImport premise-positions.txt:\n")
uploaded = files.upload()


Import claim-positions.txt:



Saving claim-positions.txt to claim-positions.txt

Import premise-positions.txt:



Saving premise-positions.txt to premise-positions.txt


In [None]:
# Upload the IBM Debater dataset archive through the Colab file picker.

uploaded = files.upload()

Saving IBM_Debater_(R)_CE-EMNLP-2015.v3.zip to IBM_Debater_(R)_CE-EMNLP-2015.v3.zip


In [None]:
def import_zip(source: str, destination =""):
  """Extract every member of the zip archive `source`.

  `destination` is passed unchanged to `ZipFile.extractall` as the target
  directory. Nothing is returned.
  """
  with zipfile.ZipFile(source, mode='r') as archive:
    archive.extractall(destination)

In [None]:
# Extract the uploaded dataset archive, using import_zip's default destination.
import_zip("IBM_Debater_(R)_CE-EMNLP-2015.v3.zip")

# **Claim and premise extraction**

In [None]:
# Read the files containing the labeled claims and premises.
# Context managers close both file handles as soon as the content is read
# (a bare open(...).read() leaves the handle open until garbage collection).
with open("claim-positions.txt") as claim_file:
  claim_data=claim_file.read()
with open("premise-positions.txt") as premise_file:
  premise_data=premise_file.read()

In [None]:
# One list element per line of each positions file.
claim_lines, premise_lines = (content.split("\n") for content in (claim_data, premise_data))

In [None]:
# Split every line into its fields; '\t' is the character that separates the
# components of a line in both files.
# The last element of each line list is discarded (presumably the empty string
# left after the final newline of the file — verify against the input files).
# The first row, which describes the content of the other rows, is still part
# of both lists here; it is skipped when the dataframes are created.

claim_list=[row.split("\t") for row in claim_lines[:-1]]
premise_list=[row.split("\t") for row in premise_lines[:-1]]

In [None]:
# Show the header row and the first claim row.
claim_list[:2]

[['Sentence',
  'Article position in list',
  'Sentence starting point',
  'Sentence ending point'],
 ['exposure to violent video games causes at least a temporary increase in aggression and that this exposure correlates with aggression in the real world',
  '0',
  '418',
  '568']]

In [None]:
# Create the two dataframes. The header row of each list is skipped and the
# strings representing numeric values are converted to integers.

claim_dataframe=pd.DataFrame(
  claim_list[1:],
  columns=['sentence','article_id','start','end'],
).astype({'article_id':int,'start':int,'end':int})

premise_dataframe=pd.DataFrame(
  premise_list[1:],
  columns=['sentence','article_id','start','end'],
).astype({'article_id':int,'start':int,'end':int})

In [None]:
# Preview the first rows of the claim dataframe.
claim_dataframe.head()

Unnamed: 0,sentence,article_id,start,end
0,exposure to violent video games causes at leas...,0,418,568
1,video game violence is not related to serious ...,0,829,907
2,some violent video games may actually have a p...,0,1004,1082
3,exposure to violent video games causes both sh...,0,1442,1577
4,they increase the violent tendencies among youth,0,3900,3948


In [None]:
# Add the type of argumentative section to both dataframes.
# Each label list has one entry per dataframe row (the parsed list minus its
# header row).
claim_type=["claim"]*(len(claim_list)-1)
premise_type=["premise"]*(len(premise_list)-1)

claim_dataframe["type"]=claim_type
premise_dataframe["type"]=premise_type

In [None]:
# Preview the claim dataframe with the new "type" column.
claim_dataframe.head()

Unnamed: 0,sentence,article_id,start,end,type
0,exposure to violent video games causes at leas...,0,418,568,claim
1,video game violence is not related to serious ...,0,829,907,claim
2,some violent video games may actually have a p...,0,1004,1082,claim
3,exposure to violent video games causes both sh...,0,1442,1577,claim
4,they increase the violent tendencies among youth,0,3900,3948,claim


In [None]:
# Concatenate the two dataframes into one (claims first, then premises).
# The row labels of the two inputs are kept as they are, so index values repeat.
argumentative_dataframe=pd.concat([claim_dataframe,premise_dataframe])

# **IOB and word list**

In [None]:
# Extract the original article texts from the nested articles.zip into the
# extracted dataset folder.
import_zip("IBM_Debater_(R)_CE-EMNLP-2015.v3/articles.zip","IBM_Debater_(R)_CE-EMNLP-2015.v3")

In [None]:
# Collect the paths of the cleaned articles in lexicographic order.
# NOTE(review): article_id values are later used as positions in articles_text,
# so this ordering has to match the one used to build the positions files — confirm.
txt_articles = sorted(glob.glob("IBM_Debater_(R)_CE-EMNLP-2015.v3/articles/clean_*.txt"))

# Read every article inside a context manager so that each file handle is
# closed right after reading instead of being left open.
articles_text=[]
for file in txt_articles:
  with open(file) as article_file:
    articles_text.append(article_file.read())

In [None]:
# Show the raw text of the first article.
articles_text[0]



In [None]:
# Sorted identifiers of the articles that have at least one labeled claim or premise.
articles_used=sorted(set(argumentative_dataframe['article_id']))

In [None]:
# Number of distinct articles referenced by the labeled sections.
len(articles_used)

513

In [None]:
# First article identifiers in use.
articles_used[:5]

[0, 3, 19, 21, 32]

In [None]:
# For every article in use, collect the [start, end, type] triple of each of
# its argumentative sections, ordered by start offset.
# The offsets come from the "start" and "end" columns of the dataframe holding
# both kinds of argumentative sentences (argumentative_dataframe).

sorted_span=[]
for article in articles_used:
  belongs_to_article=argumentative_dataframe['article_id'] == article
  article_spans=argumentative_dataframe.loc[belongs_to_article,['start','end','type']].values
  sorted_span.append(sorted(list(article_spans), key=lambda span: span[0]))

In [None]:
# One list of spans per article in use.
len(sorted_span)

513

In [None]:
# First spans of the first article, ordered by start offset.
sorted_span[0][0:15]

[array([394, 568, 'premise'], dtype=object),
 array([394, 670, 'premise'], dtype=object),
 array([394, 568, 'premise'], dtype=object),
 array([394, 670, 'premise'], dtype=object),
 array([394, 670, 'premise'], dtype=object),
 array([394, 568, 'premise'], dtype=object),
 array([394, 568, 'premise'], dtype=object),
 array([394, 568, 'premise'], dtype=object),
 array([394, 568, 'premise'], dtype=object),
 array([418, 568, 'claim'], dtype=object),
 array([673, 907, 'premise'], dtype=object),
 array([673, 907, 'premise'], dtype=object),
 array([673, 907, 'premise'], dtype=object),
 array([731, 907, 'premise'], dtype=object),
 array([731, 907, 'premise'], dtype=object)]

In [None]:
# Join together all the argumentative spans that cover overlapping portions
# of the same article.
# The spans of an article are visited in order of start offset. A span that
# starts before the end of the section being built is merged into it; when a
# claim and a premise are merged the section type becomes "both".
# The saved outputs of the following cells predate the fix below, so their
# counts change on the next run.

non_repeated_argumentative=[]

for article in sorted_span:
  non_repeated=[]
  start,end,argument_type=article[0]
  for span_start,span_end,span_type in article[1:]:
    if end>span_start:
      # Overlap: widen the current section and track whether the types differ.
      start=min(start,span_start)
      end=max(end,span_end)
      if argument_type!=span_type:
        argument_type="both"
    else:
      # No overlap: the current section is complete, start a new one.
      non_repeated.append([start,end,argument_type])
      start,end,argument_type=span_start,span_end,span_type
  # Store the section still being built when the spans run out. Without this
  # the last argumentative section of every article was silently dropped.
  non_repeated.append([start,end,argument_type])
  non_repeated_argumentative.append(non_repeated)

In [None]:
# First merged sections of the first article as [start, end, type].
non_repeated_argumentative[0][0:10]

[[394, 670, 'both'],
 [673, 907, 'both'],
 [911, 1111, 'both'],
 [1323, 1577, 'both'],
 [2944, 3540, 'premise'],
 [3900, 3948, 'claim'],
 [3965, 4215, 'both'],
 [4217, 4479, 'premise'],
 [5514, 5803, 'both'],
 [6679, 7317, 'both']]

In [None]:
# Total number of merged argumentative sections over all the articles.
sum([len(non_repeated) for non_repeated in non_repeated_argumentative])

2925

In [None]:
# Flatten the (start, end) pairs of the merged sections of each article into
# one list of cut points: 0, start_1, end_1, start_2, end_2, ..., length of the
# article text. Consecutive points delimit the sections of the article.
non_repeated_span_points=[]

for article_index, sections in enumerate(non_repeated_argumentative):
  # The list always begins at the start of the text.
  list_points=[0]
  for sect in sections:
    list_points.extend(sect[:2])
  # ...and always finishes at the end of the text.
  list_points.append(len(articles_text[articles_used[article_index]]))
  non_repeated_span_points.append(list_points)

In [None]:
# First cut points of the first article.
non_repeated_span_points[0][0:10]

[0, 394, 670, 673, 907, 911, 1111, 1323, 1577, 2944]

In [None]:
# Check whether two argumentative sections touch each other, i.e. are not
# separated by at least one character of non-argumentative text.
# The first pair of points is skipped: an article starting with an
# argumentative sentence has two zeros at the beginning of its list, and that
# is not a problem.
argumentative_regions_connected=False

for i, points in enumerate(non_repeated_span_points):
  for j in range(1,len(points)-2):
    if points[j]==points[j+1]:
      argumentative_regions_connected=True
      print(i)
      print(j)
print(argumentative_regions_connected)

False


In [None]:
# Cut every article at its cut points: each pair of consecutive points
# delimits one section of the text.

split_text=[]

for z, article_position in enumerate(articles_used):
  article=articles_text[article_position]
  boundaries=non_repeated_span_points[z]
  split_text.append([article[begin:finish] for begin, finish in zip(boundaries, boundaries[1:])])

In [None]:
# First sections of the first article.
split_text[0][:10]

['. \n\nControversies over video games often center on topics such as video game graphic violence, sex and sexism, violent and gory scenes, partial or full nudity, portrayal of criminal behavior, racism, and other provocative and objectionable material. \n\nVideo games have been studied for links to addiction and aggression. Earlier meta-analyses (an analysis of several studies) were conflicting. ',
 'A 2001 study found that exposure to violent video games causes at least a temporary increase in aggression and that this exposure correlates with aggression in the real world. A decrease in prosocial behavior (caring about the welfare and rights of others) was also noted [REF',
 ']. ',
 'Another 2001 meta-analyses using similar methods[REF] and a more recent 2009 study focusing specifically on serious aggressive behavior[REF] concluded that video game violence is not related to serious aggressive behavior in real life',
 '. \n\n',
 'Many potential positive effects have been proposed [REF]

In [None]:
# Number of articles split into sections.
len(split_text)

513

In [None]:
# Keep only the articles with at least 6 sections (argumentative and
# non-argumentative counted together); the same positions are kept in the
# parallel lists.
not_remove=[position for position, sections in enumerate(split_text) if len(sections)>5]
split_text=[split_text[kept] for kept in not_remove]
non_repeated_argumentative=[non_repeated_argumentative[kept] for kept in not_remove]
articles_used=[articles_used[kept] for kept in not_remove]

In [None]:
# Number of articles left after the filter.
len(split_text)

271

In [None]:
# Build, for every article, the list of lower-cased tokens
# (X_article_word_list) and the parallel list of IOB labels (Y_IOB).
# Labels: 'O' for tokens outside argumentative sections; 'B-x' for the first
# token and 'I-x' for the following tokens of an argumentative section, where
# x is 'P' (premise), 'C' (claim) or 'B' (any other type, i.e. "both").
Y_IOB=[]
X_article_word_list=[]

# Label suffix of each section type; any other type falls back to 'B'.
tag_suffix_by_type={'premise':'P','claim':'C'}

for i in range(len(split_text)):

  IOB=[]
  article_word_list=[]

  # next_type is the next position still not considered in the list of
  # argumentative sections of the article examined.
  next_type=0

  # Replace every run of newlines with the stand-alone placeholder token "ù".
  texts=[re.sub(r"\n+"," ù ",text) for text in split_text[i]]
  # Remove the "[REF]" markers, including the ones cut in two by a section
  # boundary ("[REF" at the end of a section, "]" at the start of the next).
  texts=[re.sub(r"\[REF[\]$]*","",text) for text in texts]
  texts=[re.sub(r"^\]","",text) for text in texts]
  # Remove the literal "|-" sequence. The pipe has to be escaped: unescaped it
  # is the alternation operator, so the old pattern (empty string OR "-")
  # never removed the "|" character.
  texts=[re.sub(r"\|-","",text) for text in texts]

  # The first section is never argumentative: a (possibly empty) leading
  # section always precedes the first argumentative one.
  next_token_is_argumentative=False

  for section in texts:

    # Divide the text into tokens.
    seq=nltk.word_tokenize(section.lower())

    article_word_list+=seq

    if next_token_is_argumentative:
      # Emit labels only when the section produced tokens; otherwise a 'B-'
      # label without a token would shift every following label.
      if seq:
        suffix=tag_suffix_by_type.get(non_repeated_argumentative[i][next_type][2],'B')
        IOB+=['B-'+suffix]+['I-'+suffix]*(len(seq)-1)
      next_type += 1
    else:
      IOB+=['O']*len(seq)

    # An argumentative section is followed by a non-argumentative section and vice-versa.
    next_token_is_argumentative=not next_token_is_argumentative

  # Every token must have exactly one label.
  assert len(IOB)==len(article_word_list)

  Y_IOB.append(IOB)
  X_article_word_list.append(article_word_list)

In [None]:
# Print tokens 80-99 of the first article next to their IOB labels.
for i in range(80,100):
  print(X_article_word_list[0][i]+" - "+Y_IOB[0][i])

causes - I-B
at - I-B
least - I-B
a - I-B
temporary - I-B
increase - I-B
in - I-B
aggression - I-B
and - I-B
that - I-B
this - I-B
exposure - I-B
correlates - I-B
with - I-B
aggression - I-B
in - I-B
the - I-B
real - I-B
world - I-B
. - I-B


# **Articles' sentences list**

In [None]:
# Get the list of sentences of each article.
# A sentence is the space-joined run of tokens up to a ".", "?" or "!" token.

article_sentence_list=[]

for article_words in X_article_word_list:
  temp_article_sentence_list=[]
  sentence=""

  for word in article_words:

    # Skip the newline placeholder ("ù") and the full stop; every other token
    # (including "?" and "!") becomes part of the sentence text.
    if word not in ("ù","."):
      sentence+=word+" "

    # ".", "?" and "!" close the current sentence, if it has any content.
    if word in (".","?","!") and sentence!="":
      temp_article_sentence_list.append(sentence)
      sentence=""

  # Keep the text after the last terminator as a final sentence, as the flat
  # sentence_list built further down does. Without it, that trailing text was
  # never seen by the sentence-length filter.
  if sentence!="":
    temp_article_sentence_list.append(sentence)

  article_sentence_list.append(temp_article_sentence_list)

In [None]:
# First two sentences of the first article.
article_sentence_list[0][0:2]

['controversies over video games often center on topics such as video game graphic violence , sex and sexism , violent and gory scenes , partial or full nudity , portrayal of criminal behavior , racism , and other provocative and objectionable material ',
 'video games have been studied for links to addiction and aggression ']

In [None]:
# Tokenize every sentence of every article: one list of tokens per sentence,
# grouped by article.
sentences_word_list=[[nltk.word_tokenize(sentence) for sentence in article] for article in article_sentence_list]

In [None]:
# First tokens of the first sentence of the first article.
sentences_word_list[0][0][:10]

['controversies',
 'over',
 'video',
 'games',
 'often',
 'center',
 'on',
 'topics',
 'such',
 'as']

In [None]:
# Remove the articles that contain a sentence of more than 150 tokens.
# `default=0` keeps an article for which no sentence was extracted instead of
# raising ValueError on max() of an empty sequence.
# NOTE: not_remove indexes sentences_word_list, which is not filtered here, so
# this cell must not be re-run on its own once the lists below are filtered.
not_remove=[i for i in range(len(sentences_word_list)) if max((len(sentence) for sentence in sentences_word_list[i]), default=0)<=150]
X_article_word_list=[X_article_word_list[i] for i in not_remove]
Y_IOB=[Y_IOB[i] for i in not_remove]
split_text=[split_text[i] for i in not_remove]
non_repeated_argumentative=[non_repeated_argumentative[i] for i in not_remove]
articles_used=[articles_used[i] for i in not_remove]

In [None]:
# Number of articles kept by the sentence-length filter.
len(not_remove)

251

In [None]:
# Get the flat list of all the sentences of all the remaining articles.

sentence_list=[]

for article_words in X_article_word_list:
  sentence=""

  for word in article_words:

    # The newline placeholder ("ù") and the full stop are left out of the
    # sentence text; every other token is appended, followed by a space.
    if word!="ù" and word!=".":
      sentence+=word+" "

    # ".", "!" and "?" end the sentence, provided it is not empty.
    if word in ('.','!','?') and sentence!="":
      sentence_list.append(sentence)
      sentence=""

  # Text left after the last terminator of the article is a sentence too.
  if sentence!="":
    sentence_list.append(sentence)

In [None]:
# Total number of sentences collected.
len(sentence_list)

44936

# **List of imported n-gram words**

In [None]:
# Distinct single words that make up the imported n-grams.
list_words=list({word for indicator in claim_premise_indicators for word in indicator.split(" ")})

In [None]:
# Sample of the distinct indicator words.
list_words[0:10]

['for',
 'moreover',
 'hence',
 'view',
 'supported',
 'suggests',
 'sum',
 'short',
 'by',
 'indicated']

# **Customised Tokenizer**

In [None]:
# Tokenizer based on lemmatization.
def build_tokenizer(text):
  """Tokenize `text` and keep only the lemmas that belong to `list_words`.

  Every token produced by nltk.word_tokenize is lemmatized with
  WordNetLemmatizer. A lemma found in `list_words` is returned as it is; any
  other token is replaced by the placeholder "|". The result therefore has
  exactly one element per token of `text`.
  """
  lemmatizer = WordNetLemmatizer()

  # Text normalization through lemmatization.
  candidate_lemmas = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))

  return [lemma if lemma in list_words else "|" for lemma in candidate_lemmas]

# **Bag of words**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Transformation of the passed corpus into the dataframe of the bag of n-grams
# contained in it.
def bag_of_ngram(sentence_list: list, ngram: int)-> pd.DataFrame:
  """Count the n-grams of every sentence in `sentence_list`.

  Args:
    sentence_list: the sentences to vectorize, one string each.
    ngram: maximum n-gram size; all sizes from 1 to `ngram` are counted.

  Returns:
    A dense dataframe with one row per sentence and one column per n-gram
    found by a CountVectorizer that tokenizes with `build_tokenizer`.
  """
  vectorizer=CountVectorizer(tokenizer=build_tokenizer,ngram_range=(1,ngram))
  bag_ngram=vectorizer.fit_transform(sentence_list)

  # get_feature_names() no longer exists in recent scikit-learn releases, where
  # get_feature_names_out() replaces it; fall back to the old name only when
  # the new one is unavailable.
  if hasattr(vectorizer,"get_feature_names_out"):
    feature_names=vectorizer.get_feature_names_out()
  else:
    feature_names=vectorizer.get_feature_names()

  dataframe=pd.DataFrame(bag_ngram.toarray(), columns=feature_names)

  return dataframe

# **Bag of ngram export**

In [None]:
# Bag of 1-grams to 4-grams over all the sentences.
bag_ngram_dataframe=bag_of_ngram(sentence_list, 4)

In [None]:
# Manually check for the presence of the 5-gram.
# There is only one, and the RAM is not enough to compute the bag of 5-grams.
#
# str.find() returns -1 (truthy) when the phrase is absent and 0 (falsy) only
# when the sentence begins with it, so `not sentence.find(...)` flagged just
# the sentences starting with the phrase. A membership test is used instead;
# both sides are padded with spaces so that only whole tokens match (the
# sentences are space-joined tokens).

gram5_phrase=" it should be clear that "
gram5_column=[1 if gram5_phrase in " "+sentence+" " else 0 for sentence in sentence_list]

In [None]:
# Some of the imported n-grams do not appear in the bag of n-grams just
# created. Print each of them and add it as a new column: the manually computed
# column for the 5-gram, a column of zeros for any other.

ngram_list=bag_ngram_dataframe.columns

for ngram in claim_premise_indicators:
  if ngram in ngram_list:
    continue
  print(ngram)
  if ngram=='it should be clear that':
    bag_ngram_dataframe[ngram]=gram5_column
  else:
    bag_ngram_dataframe[ngram]=[0]*len(sentence_list)

in conclusion
it should be clear that
for the reason that


In [None]:
# Preview the indicator columns, in the order of the imported list.
bag_ngram_dataframe[claim_premise_indicators].head()

Unnamed: 0,accordingly,consequently,conclude that,clearly,demonstrates that,hence,implies,in short,in conclusion,indicates that,it follows that,it should be clear that,it should be clear,so,suggests that,therefore,thus,to sum up,assuming that,as,besides,because,deduced,derived from,due to,for,for example,for instance,for the reason that,furthermore,given that,in addition,in light of,in that,in view of,indicated by,is supported by,moreover,since,whereas
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Export only the indicator columns to CSV, without the row index.
bag_ngram_dataframe[claim_premise_indicators].to_csv('Debater_BOW_appendix_words.csv',index=False)