# Combining Classification-Centred and Relation-Based Argumentation Mining Methods


<center><img src='../figures/pipeline-light-dark.png'></center>

## Import Libraries

In [1]:
# Native python 3.7.9 libraries             [https://docs.python.org/3.7/]
from collections import OrderedDict
from itertools import chain
import pprint
import os

# NLTK Libraries                            [https://www.nltk.org/]
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tree import Tree
from nltk.parse import CoreNLPParser
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

# Pandas library                            [https://pandas.pydata.org/docs/]
import pandas as pd

# Numpy library                             [https://numpy.org/doc/stable/contents.html]
import numpy as np

# Gensim libraries                          [https://radimrehurek.com/gensim/]
from gensim.models import Phrases
from gensim import corpora
from gensim import models

# Spacy libraries                           [https://spacy.io/api]
import spacy
from spacy.lang.en import English

# Textblob libraries                        [https://textblob.readthedocs.io/en/dev/]
from textblob import TextBlob

# String similarity libraries               [https://pypi.org/project/strsim/#cosine-similarity]
from similarity.cosine import Cosine      

# Scikit-Learn libraries                    [https://scikit-learn.org/stable/]
from sklearn.preprocessing import normalize
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

RuntimeError: Cython extensions are unavailable. Without them, this gensim functionality is disabled. If you've installed from a package, ask the package maintainer to include Cython extensions. If you're building gensim from source yourself, run `python setup.py build_ext --inplace` and retry. 

## Define utility functions

In [None]:

def print_files_in_list(file_list):
    '''
    :param file_list: An iterable of file names
    :return:          No return

    :description:     Writes every entry of the given list to standard output, one entry per line
    :use:             Quick visual check of what a filename list contains
    '''
    # Build the whole listing first, then emit it in a single call; nothing is written for an empty list
    listing = ''.join('{}\n'.format(name) for name in file_list)
    print(listing, end='')


def get_files(file_path):
    '''
    :param file_path: A string pointing to a directory containing files
    :return:          A list with the names (not full paths) of the csv files in that directory

    :description:     Lists the directory and keeps only the entries whose name ends with '.csv'
    :use:             Convenience function for collecting the csv results of a previous stage
    '''
    csv_names = []

    for entry in os.listdir(file_path):
        # The check is a plain, case-sensitive suffix test on the entry name
        if entry.endswith('.csv'):
            csv_names.append(entry)

    return csv_names

def get_text_files(file_path):
    '''
    :param file_path: A string pointing to a directory containing files
    :return:          A dataframe with one row per '.txt' file found in the directory. The index holds the
                      labels 'text_0', 'text_1', ... and the single column 'text' holds each file's full content

    :description:     Reads every '.txt' file in the directory (decoded as UTF-8) into a dataframe
    :use:             Convenience function for loading a folder of raw text documents in one call
    '''
    # Sort the names so that the 'text_<n>' labels do not depend on the order in which the OS lists the directory
    file_list = sorted(f for f in os.listdir(file_path) if f.endswith('.txt'))

    labels = []
    contents = []

    for i, fc in enumerate(file_list):
        print("Currently processing: {}".format(fc))
        # os.path.join picks the correct separator for the current platform
        with open(os.path.join(file_path, fc), encoding='utf8') as f:
            contents.append(f.read())
        labels.append('text_{}'.format(i))

    # Building the frame from explicit columns keeps the 'text' column present even when no file was found
    text_df = pd.DataFrame({'text': contents}, index=labels)
    return text_df

def get_wordnet_pos(treebank_tag):
    '''
    :param treebank_tag: a string indicating the part of speech (e.g. a tag produced by nltk's pos_tag)
    :return:             the matching wordnet part-of-speech constant (wordnet.ADJ, wordnet.VERB, wordnet.NOUN
                         or wordnet.ADV), or an empty string when the tag starts with none of J / V / N / R

    :description:        Maps the first letter of a part-of-speech tag onto a wordnet part-of-speech constant
    :use:                Selecting the part of speech handed to the WordNet lemmatizer
    '''
    # (tag prefix, name of the wordnet constant) pairs, checked in this order
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )

    for prefix, constant_name in prefix_table:
        if treebank_tag.startswith(prefix):
            # The constant is only looked up on the wordnet object once a prefix has matched
            return getattr(wordnet, constant_name)

    return ''

## Input Document

<center><img src='../figures/pipeline-light-dark-inp.png'></center>

In [None]:
# Set data path to read the clean files from the IBM Debater Dataset.
# The path is assembled with os.path.join so the notebook is not tied to one operating system's separator.
data_path  = os.path.join('..', 'IBM_Debater_(R)_CE-EMNLP-2015.v3', 'articles')

# Collect every text file (name ending in ".txt") found in the dataset directory.
# The names are sorted so that slices of the list, e.g. text_files[:1], pick the same files on every run.
text_files = sorted(f for f in os.listdir(data_path) if f.endswith('.txt'))

# Uncomment this code to see the list of text files
# print_files_in_list(text_files)

### Data Inspection

In [None]:
# TODO

## Stage A:      Segmentation and Unique Identification Number Assignment

<center><img src='../figures/pipeline-light-dark-seg.png'></center>


In [None]:
def clause_segmentation(parse_string):
    '''
    :param parse_string:    A flattened parse tree string of an individual sentence
    :return:                A list of clauses (strings); fragments of one word or less are left out

    :description:           Rebuilds the parse tree, gathers the text below every S / SBAR node, trims each
                            fragment at the point where the following fragment begins, and returns the
                            fragments that still contain more than one word
    :use:                   To segment text into individual clauses
    :raises ValueError:     When a fragment does not contain the (already trimmed) fragment that follows it,
                            because the trimming step relies on str.index
    '''

    # Step 1: Form a parse tree from the flattened parse string
    tree = Tree.fromstring(parse_string)

    # Step 2: Gather the leaf text of every sub-tree labelled as a clause (S or SBAR), in traversal order
    fragments = [
        ' '.join(node.leaves())
        for node in tree.subtrees()
        if node.label() in ("S", "SBAR")
    ]

    # Step 3: Walk backwards from the second-to-last fragment, keeping only the part of each fragment that
    # precedes the following fragment. The following fragment has already been trimmed when it is used here.
    for position in range(len(fragments) - 2, -1, -1):
        cut_at = fragments[position].index(fragments[position + 1])
        fragments[position] = fragments[position][:cut_at]

    # Step 4: Split into words, drop fragments with one word or none, and re-join with single spaces
    # For consideration: add the previous 1 word string to previous list, e.g. ['which'] -> ['adds', 'to', 'their' ...]
    word_lists = (fragment.split() for fragment in fragments)
    return [' '.join(words) for words in word_lists if len(words) > 1]

In [None]:
def create_unique_identifier(dictionary, entry_number, parser=None):
    '''
    :param dictionary:      An ordered dictionary of paragraphs
    :param entry_number:    The paragraph number (a key of `dictionary`)
    :param parser:          Optional parser object exposing `raw_parse(text)`. When omitted, a CoreNLPParser
                            pointing at http://localhost:9005 is created, as before
    :return:                An ordered dictionary whose keys are unique identifiers of the form
                            '<paragraph>.<sentence>.<clause>' and whose values are text fragments

    :description:           Splits the paragraph into sentences, parses each sentence, segments it into clauses
                            and labels every clause. A sentence that cannot be parsed or segmented, or that
                            yields no clause at all, is kept whole under the identifier '<paragraph>.<sentence>.1'
    :use:                   Stage A: segmentation and unique identification number assignment
    '''

    # Instantiate the CoreNLPParser, which parses the text and returns a parse tree of the text
    if parser is None:
        parser = CoreNLPParser(url='http://localhost:9005')

    # Instantiate a temporary unique identification number (UID) dictionary
    temp_unique_identifier_dict = OrderedDict()

    # Rename the entry number to make the programming / interpretation easier to work with
    paragraph_identifier = entry_number

    # Extract the text from the paragraph dictionary
    paragraph_text = dictionary[paragraph_identifier]

    # Step 1: tokenize the text into sentences
    sentences = sent_tokenize(paragraph_text)

    # Step 2: Iterate through the sentences, numbering them from n = 1
    for sentence_identifier, sentence_text in enumerate(sentences, start=1):
        try:
            # Step 3: create the parse tree of the sentence
            parse_tree = next(parser.raw_parse(sentence_text))

            # Step 4: convert the children of the parse tree to string form
            child_strings = [str(child) for child in parse_tree]

            # Step 5: flatten the first child onto a single line (collapse all whitespace runs)
            flattened_parse = ' '.join(child_strings[0].split())

            # Step 6: segment into clauses and get the clause list from the sentence
            clause_list = clause_segmentation(flattened_parse)

        # Parsing or segmentation failed: fall back to the whole sentence below.
        # Only Exception is caught so that KeyboardInterrupt / SystemExit still stop the run.
        except Exception:
            clause_list = []

        # If no clause could be extracted, keep the whole sentence so that no text is lost;
        # it receives the identifier that ends with x.x.1 (e.g. paragraph.sentence.1)
        if not clause_list:
            clause_list = [sentence_text]

        # Step 7: iterate through the clause list, and assign a unique identification number
        for clause_number, clause in enumerate(clause_list, start=1):
            unique_identifier = '{}.{}.{}'.format(paragraph_identifier, sentence_identifier, clause_number)
            temp_unique_identifier_dict[unique_identifier] = clause

    return temp_unique_identifier_dict

In [7]:
# Main Loop: Iterate through text files in text_data directory and process files into their unique id form

# Directory that receives the Stage A results (one csv per input text file).
# It is created up front so that a long parsing run cannot fail at the final save step.
stage_a_directory = os.path.join('..', 'results', 'stage_a')
os.makedirs(stage_a_directory, exist_ok=True)

for file in text_files[:1]:

    file_name = file

    # Steps 1-3: Open the file, read the raw data, and close the file again (the "with" block closes it
    # even when reading fails)
    with open(os.path.join(data_path, file_name), "r", encoding="utf8") as f:
        lines = f.readlines()

    # Step 4: Iterate through file, eliminating empty lines, which separate paragraphs.
    # Note: Each new line in the resulting list is a paragraph
    lines = [line for line in lines if line != '\n']

    # Step 5: Create a paragraph dictionary, indexed by order of paragraph
    paragraph_dict = OrderedDict(enumerate(lines, start=1))

    # Step 6: Initialize an ordered dictionary to store results of unique identifiers and text
    text_dictionary = OrderedDict()

    # Step 7: Iterate through the paragraph dictionary and update the text_dictionary with results.
    # Note: The final result is a dictionary of text fragments, indexed by their unique identifiers.
    for paragraph_number in paragraph_dict:
        text_dictionary.update(create_unique_identifier(paragraph_dict, paragraph_number))

    # Step 8: Convert dictionary into pandas dataframe
    text_df = pd.DataFrame.from_dict(text_dictionary, orient='index')
    text_df.reset_index(inplace=True)

    # Step 9: Rename the default columns to reflect the data
    text_df.rename(columns={'index': 'unique_identifier', 0: 'text'}, inplace=True)

    # Strip the extension so the result can be saved under the same base name
    file_name = os.path.splitext(file_name)[0]

    # Step 10: Save results as either a csv or excel file
    # Note: Uncomment / comment out relevant line. CSV file is default
    text_df.to_csv(os.path.join(stage_a_directory, '{}.csv'.format(file_name)), index=False)

    # Optional code for excel instead of csv output |
    #------------------------------------------------
    #
    # text_df.to_excel(
    #     os.path.join(stage_a_directory, '{}_unique_id.xlsx'.format(file_name)), index=False)
    # print(text_df.head())
    # print('Finished processing: {}'.format(file_name))



In [8]:
# Re-examine the data after Segmentation
# Shows at most the first 15 rows of text_df, i.e. the frame built for the last file handled by the
# Stage A main loop (this cell therefore needs that loop to have run at least once)
text_df.head(15)

Unnamed: 0,unique_identifier,text
0,1.1.1,Controversies over video games often center on...
1,2.1.1,Video games have been studied for links to add...
2,2.2.1,Earlier meta - analyses -LRB- an analysis of s...
3,2.3.1,A 2001 study found that exposure to violent vi...
4,2.4.1,A decrease in prosocial behavior -LRB- caring ...
5,2.5.1,Another 2001 meta-analyses using similar metho...
6,3.1.1,Many potential positive effects have been prop...
7,3.2.1,Recent research has suggested
8,3.2.2,some violent video games may actually have a p...
9,4.1.1,It has been argued there is generally a lack o...


## Stage B:      Classification

<center><img src='../figures/pipeline-light-dark-class.png'></center>

In [9]:
# TODO

## Stage C: Templating

<center><img src='../figures/pipeline-light-dark-temp.png'></center>

In [10]:
# Instantiate spaCy's English language class
nlp = English()
# Fix the seed of NumPy's global random number generator so stochastic steps can be repeated
# NOTE(review): presumably this is what makes the gensim LDA training below repeatable - confirm
np.random.seed(123456)
# Keep a handle on spaCy's built-in collection of English stop words
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

### Stage C.1:    Sentiment Analysis

In [11]:
# Declare file location of results from previous stage
file_path = os.path.join('..', 'results', 'stage_a')

# Set a save directory (created when missing so that saving cannot fail on a fresh checkout)
save_directory = os.path.join('..', 'results', 'stage_c_1')
os.makedirs(save_directory, exist_ok=True)

# Get files from previous stage
file_list = get_files(file_path)

# Iterate over files in file_list
for i, fc in enumerate(file_list):

    print("\t-File {} - Currently processing: {}".format(i, fc))
    # Read the previous file using pandas read_csv function
    temp_frame = pd.read_csv(os.path.join(file_path, fc))

    try:
        # The code below uses TextBlob's "sentiment" property, which returns a named tuple
        # "Sentiment" of the type "Polarity / Subjectivity"
        #
        # Example:
        #     testimonial = TextBlob("Textblob is amazingly simple to use. What great fun!")
        #     testimonial.sentiment
        #     Sentiment(polarity=0.39166666666666666, subjectivity=0.4357142857142857)
        #     testimonial.sentiment.polarity
        #     0.39166666666666666
        #
        # Each text is analysed once; both scores are then read from the same result
        sentiments = [TextBlob(t).sentiment for t in temp_frame['text']]
        temp_frame['sentiment_polarity'] = [sentiment[0] for sentiment in sentiments]
        temp_frame['sentiment_subjectivity'] = [sentiment[1] for sentiment in sentiments]

    # If the above code doesn't work (e.g. a non-text value in the column), set a default value to "skip"
    # for the whole file. Only Exception is caught so that an interrupt still stops the run.
    except Exception:
        temp_frame['sentiment_polarity']     = "skip"
        temp_frame['sentiment_subjectivity'] = "skip"

    # Housekeeping code to get file name and save stage progress using the same name, but to a different folder
    # Note: the index is written on purpose - the following stages drop the resulting 'Unnamed: 0' column
    save_file_name = os.path.splitext(fc)[0]
    save_path = os.path.join(save_directory, '{}.csv'.format(save_file_name))
    temp_frame.to_csv(save_path)
    print("Saving to:  {}".format(save_path))

	-File 0 - Currently processing: clean_1.csv
Saving to:  ..\results\stage_c_1/clean_1


### Stage C.2:    LDA / Topic Modelling

In [12]:
# Load the corpus used to train the topic model
file_path = os.path.join('..', 'lda_module_data', 'lda_module_data.csv')
data = pd.read_csv(file_path)

#print("Data shape [Raw]: {}".format(data.shape))
# Drop rows with missing values and renumber the remaining rows
data = data.dropna().reset_index(drop=True)

# The following line can be uncommented for testing purposes, else tests take far longer using the entire dataset
# data = data.head(200)
print("Data shape [Adjusted for NaN]: {}".format(data.shape))

# Sentence split -> word tokens per sentence -> part-of-speech tagged tokens per sentence
data['sentences'] = data.text.apply(sent_tokenize)
data['tokens_sentences'] = data['sentences'].apply(lambda sentences: [word_tokenize(sentence) for sentence in sentences])
data['POS_tokens'] = data['tokens_sentences'].apply(lambda tokens_sentences: [pos_tag(tokens) for tokens in tokens_sentences])

lemmatizer = WordNetLemmatizer()

# Lemmatize every token whose tag maps onto a wordnet part of speech; all other tokens are kept unchanged
data['tokens_sentences_lemmatized'] = data['POS_tokens'].apply(
    lambda list_tokens_POS: [
        [
            lemmatizer.lemmatize(word, get_wordnet_pos(tag)) if get_wordnet_pos(tag) != '' else word
            for word, tag in tokens_POS
        ]
        for tokens_POS in list_tokens_POS
    ]
)

stopwords_verbs = ['say', 'get', 'go', 'know', 'may', 'need',
                    'like', 'make', 'see', 'want', 'come', 'take',
                    'use', 'would', 'can']

stopwords_other = ['one', 'mr', 'bbc', 'image', 'getty',
                    'de', 'en', 'caption', 'also', 'copyright', 'something']

# The NLTK stop-word list is requested by its lower-case name; a capitalised name only resolves on
# file systems that ignore case
my_stopwords = stopwords.words('english') + stopwords_verbs + stopwords_other

# Set copy used purely for fast membership tests; my_stopwords itself stays a list
stopword_lookup = set(my_stopwords)

# Flatten the per-sentence token lists into one token list per document
data['tokens'] = data['tokens_sentences_lemmatized'].apply(lambda sentences: list(chain.from_iterable(sentences)))

# Keep lower-cased, purely alphabetic tokens longer than one character that are not stop words
data['tokens'] = data['tokens'].apply(lambda tokens: [token.lower() for token in tokens if token.isalpha()
                                                        and token.lower() not in stopword_lookup and len(token) > 1])


# Phrase detection: learn a bigram model on the token lists, then a second Phrases model on the
# bigram-transformed corpus, and finally replace `tokens` with the corpus passed through both models
print("Processing Tokens ", end="")
tokens = data['tokens'].tolist()
bigram_model = Phrases(tokens)
trigram_model = Phrases(bigram_model[tokens], min_count=1)
tokens = list(trigram_model[bigram_model[tokens]])

# Vocabulary and bag-of-words corpus: build the gensim dictionary, filter it with no_below=3
# (every other filter_extremes argument is left at its default), then convert each document to bag-of-words
print("Processing LDA dictionary ", end="")
dictionary_LDA = corpora.Dictionary(tokens)
dictionary_LDA.filter_extremes(no_below=3)
corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]

# Train the topic model with 20 topics and 4 passes over the corpus.
# alpha is given as one value (0.01) per topic and eta as one value (0.01) per dictionary entry.
# NOTE(review): no random_state is passed here - presumably repeatability relies on the NumPy seed set
# earlier in the notebook; confirm
num_topics = 20
print("Processing LDA model via Gensim... ", end="")
lda_model = models.LdaModel(corpus,
                            num_topics=num_topics,
                            id2word=dictionary_LDA,
                            passes=4, alpha=[0.01] * num_topics,
                            eta=[0.01] * len(dictionary_LDA.keys())
                            )
print("Complete!")

# Uncomment this code to print the topics
# ---------------------------------------
# print("Looking at Topics")
# for i, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=20):
#     print(str(i) + ": " + topic)
#     print()
# print_long_line()

# print("Allocating topics to documents")
# print("Article text: {}".format(data.text.loc[0][:500]))
# print("Topic: {}".format(lda_model[corpus[0]]))

# print("Predicting unseen text")

# Sample text used to check that the trained model can be queried
document = '''Controversies over video games often center on topics such as video game graphic violence, sex and sexism, violent and gory scenes, partial or full nudity, portrayal of criminal behavior, racism, and other provocative and objectionable material.

Video games have been studied for links to addiction and aggression. Earlier meta-analyses (an analysis of several studies) were conflicting. A 2001 study found that exposure to violent video games causes at least a temporary increase in aggression and that this exposure correlates with aggression in the real world. A decrease in prosocial behavior (caring about the welfare and rights of others) was also noted. Another 2001 meta-analyses using similar methods and a more recent 2009 study focusing specifically on serious aggressive behavior concluded that video game violence is not related to serious aggressive behavior in real life.

Many potential positive effects have been proposed. Recent research has suggested that some violent video games may actually have a prosocial effect in some contexts, for example, team play.

It has been argued there is generally a lack of quality studies which can be relied upon and that the video game industry has become an easy target for the media to blame for many modern day problems. The most recent large scale meta-anlysis-- examining 130 studies with over 130,000 subjects worldwide-- concluded that exposure to violent video games causes both short term and long term aggression in players and decreases empathy and prosocial behavior. However, this meta-analysis was severely criticized in the same issue of the same journal for a number of methodological flaws, including failure to distinguish clinically valid from unstandardized aggression measures and for failing to solicit studies from researchers who have questioned whether causal links exist, thus biasing the sample of included studies.
'''

print('Testing / Error Checking ')

# Word-tokenize the sample text (note: this re-binds the notebook-level name `tokens`)
tokens = word_tokenize(document)

# Human-readable description of every topic, addressed below by topic number
topics = lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=20)

# Topic distribution of the sample text: one (topic number, weight) pair per reported topic
document_bow = dictionary_LDA.doc2bow(tokens)
frame = pd.DataFrame(
    [
        (topic_number, round(topic_weight, 2), topics[topic_number][1])
        for topic_number, topic_weight in lda_model[document_bow]
    ],
    columns=['topic #', 'weight', 'words in topic'],
)

# Heaviest topic first; the frame is the cell's displayed result
frame = frame.sort_values(by="weight", ascending=False)
frame

Data shape [Adjusted for NaN]: (1289, 2)
Processing Tokens Processing LDA dictionary Processing LDA model via Gensim... Complete!
Processing test string / checking for erros... 


Unnamed: 0,topic #,weight,words in topic
3,14,0.34,"0.005*""vote"" + 0.004*""party"" + 0.003*""system"" ..."
2,13,0.31,"0.015*""israel"" + 0.007*""report"" + 0.005*""gaza""..."
1,5,0.19,"0.006*""right"" + 0.004*""condom"" + 0.004*""law"" +..."
0,2,0.16,"0.003*""report"" + 0.003*""right"" + 0.003*""protes..."


In [13]:
# Declare file location of results from previous stage
file_path      = os.path.join('..', 'results', 'stage_c_1')

# Set a save directory (created when missing so that saving cannot fail on a fresh checkout)
save_directory = os.path.join('..', 'results', 'stage_c_2')
os.makedirs(save_directory, exist_ok=True)

# Get files from previous stage
file_list = get_files(file_path)

# The topic descriptions do not change between rows, so they are fetched once
topics = lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=20)

for i, fc in enumerate(file_list):
    print("File {} - Currently processing: {}".format(i, fc))
    temp_frame = pd.read_csv(os.path.join(file_path, fc))

    # Default value; it stays in place for every row whose topics cannot be determined
    temp_frame['lda_topics'] = "None"
    for row in range(temp_frame.shape[0]):
        try:
            # Tokenize the text of THIS row and look up its topic distribution.
            # (The bag-of-words must be built from `text`, not from a variable left over from another cell,
            # otherwise every row receives the same topics.)
            # NOTE(review): the row text is only word-tokenized, while the training corpus was also
            # lemmatized, lower-cased and stop-word filtered - consider applying the same preprocessing here
            text = word_tokenize(temp_frame.iloc[row]['text'])
            row_topics = lda_model[dictionary_LDA.doc2bow(text)]

            # Store the topic numbers as a space separated string (each number is followed by one space)
            topic_string = ''.join('{} '.format(topic_number) for topic_number, _ in row_topics)

            temp_frame.loc[row, 'lda_topics'] = topic_string
        # Only Exception is caught so that an interrupt still stops the run
        except Exception:
            print("Error detected, leaving row as 'None'")

    # print(temp_frame.head())
    save_file_name = os.path.splitext(fc)[0]
    # Remove the index column written by the previous stage before saving (the save writes a fresh one)
    temp_frame = temp_frame.drop('Unnamed: 0', axis=1)
    temp_frame.to_csv(os.path.join(save_directory, '{}.csv'.format(save_file_name)))

File 0 - Currently processing: clean_1.csv


### Stage C.3:    Similarity Measures

In [16]:
# Declare file location of results from previous stage
file_path = os.path.join('..', 'results', 'stage_c_2')

# Set a save directory (created when missing so that saving cannot fail on a fresh checkout)
save_directory = os.path.join('..', 'results', 'stage_c_3')
os.makedirs(save_directory, exist_ok=True)

# Get files from previous stage
file_list = get_files(file_path)

# Cosine similarity object from the string similarity library, constructed with the argument 2
cosine = Cosine(2)

for i, fc in enumerate(file_list):
    print("File {} - Currently processing: {}".format(i, fc))
    temp_frame = pd.read_csv(os.path.join(file_path, fc))

    # save_frame keeps every column of the previous stage; temp_frame is reduced to identifier + text
    save_frame = temp_frame.copy()
    temp_frame.drop(['Unnamed: 0','sentiment_polarity', 'sentiment_subjectivity', 'lda_topics'], axis=1, inplace=True)

    # Map every unique identifier to its text
    unique_identifiers = temp_frame['unique_identifier'].tolist()
    text_dict = dict(zip(unique_identifiers, temp_frame['text']))

    # This code can also be used to calculate Cosine similarities, using sklearn, Non-negative matrix
    # factorization, and pandas "dot" function
    # Note: This method is an approximation and has an element of randomness.
    #------------------------------------------------------------------------

    # documents = new_frame['text'].tolist()
    # print(documents)

    # tfidf = TfidfVectorizer()
    # csr_mat = tfidf.fit_transform(documents)

    # model = NMF(n_components=20)
    # model.fit(csr_mat)
    # nmf_features = model.transform(csr_mat)
    # norm_features = normalize(nmf_features)

    # print(nmf_features)

    # df = pd.DataFrame(nmf_features, index=new_frame_list)
    # article = df.loc['1.1.1']
    # similarities = df.dot(article)

    # print(similarities)

    # Compute the profile of every text once, instead of once per compared pair
    profiles = {uid: cosine.get_profile(text) for uid, text in text_dict.items()}

    # One column per text: its similarity to every text of the file (row order = order of the identifiers)
    similarity_columns = {}
    for uid_1 in unique_identifiers:
        similarity_columns['cos_sim_to_{}'.format(uid_1)] = [
            cosine.similarity_profiles(profiles[uid_1], profiles[uid_2])
            for uid_2 in unique_identifiers
        ]

    # Attach all similarity columns in a single step rather than inserting them one at a time
    temp_frame = pd.concat(
        [temp_frame, pd.DataFrame(similarity_columns, index=temp_frame.index)],
        axis=1,
    )

    # Re-attach the similarity columns to the full frame; the merge uses the columns both frames share
    save_frame = save_frame.merge(temp_frame, how='left')
    save_frame.drop('Unnamed: 0', axis=1, inplace=True)
    save_file_name = os.path.splitext(fc)[0]
    save_frame.to_csv(os.path.join(save_directory, '{}.csv'.format(save_file_name)))



File 0 - Currently processing: clean_1.csv


### Stage C.4:    Create Template

## Stage D:      Adjustment

<center><img src='../figures/pipeline-light-dark-adj.png'></center>

## Output

<center><img src='../figures/pipeline-light-dark-out.png'></center>