# Relation Prediction in Argument Mining With Pre-trained Deep Bidirectional Transformers - Additional

This notebook shows or generates some additional results and visualizations not included in the thesis.

- Overall stats overview
- Attack/Support ratios
- Lime visualizations
- Scattertext visualizations 
- Accuracy distributions for NoDE + Procon
- (Visualizations of) predictions grouped by original (org) / response argument
- Agreement sentiment baselines

In [None]:
# Necessary imports and setups
# Basic python imports
import re
import sys 
import os

# For the sentiment baselines
import nltk
# Download Sentiment Lexicon
nltk.download('vader_lexicon')

# Datahandling and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Statistics and other stuff
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report
from IPython.display import IFrame, HTML
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Get the exact splits of the data
sys.path.append(os.path.abspath("../pytorch"))
from run_classifier_dataset_utils import processors

# Settings
# Do not hide rows in pandas
pd.set_option('display.max_rows', 999)
# Print everything with precision 2
# Use the fully qualified option name: the short pattern 'precision' is ambiguous on
# recent pandas versions (it also matches 'styler.format.precision') and raises OptionError.
pd.set_option('display.precision', 2)

In [None]:
# Read all necessary data
# Complete dataset; the "id" column is cast to str after loading
df = pd.read_csv('../data/complete_data.tsv', sep='\t')
df = df.astype({"id": str})

# Pre-computed statistics. Each .npy file is loaded with allow_pickle=True and
# unwrapped with .item() to get the single stored object back.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
data_stats_org, data_stats_resp = (
    np.load(stats_file, allow_pickle=True).item()
    for stats_file in ('../data/stats/data_stats_org.npy',
                       '../data/stats/data_stats_resp.npy')
)
data_stats_total = pd.read_csv('../data/stats/data_stats_total.tsv', sep='\t')

# Init the sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Helper functions
def disc_pol(x):
    """Discretize a float sentiment polarity into a class name.

    Returns 'positive' when ``x >= 0`` (zero counts as positive) and
    'negative' for everything else.
    """
    return 'positive' if x >= 0.00 else 'negative'

In [None]:
# Overall stats overview
# Rich-display the table that was read from ../data/stats/data_stats_total.tsv
display(data_stats_total)

In [None]:
# Attack/Support ratios
# Plot, for every original argument, the share of its responses that attack it (attack-ratio).
# Most arguments are only attacked or only supported (interesting for detecting arguments likely to be attacked/supported)
# If we disregard every argument that is answered only once, more arguments have an attack-ratio of 0.5
# For political, Task 2 (Attack/Support) is regarded

print("Attack ratio for the original arguments in debate train and political as:")
# 2 rows (one per dataset) x 2 columns (all arguments / without single-response arguments)
fig, (ax1, ax2) = plt.subplots(2, 2, figsize=(10, 4))
for data_set, ax in [('debate_train', ax1), ('political', ax2)]:
    # Number of responses per original argument; stored as an extra column on the loaded stats
    data_stats_org[data_set]["Total pairs nu"] = (
        data_stats_org[data_set]['Attacked'] + data_stats_org[data_set]['Supported'])
    # .iloc[:-1] skips the last row (presumably a summary/total row -- TODO confirm)
    df_plot = data_stats_org[data_set].iloc[:-1].apply(
        lambda r: pd.Series({
            "Attack-ratio": r["Attacked"] / r["Total pairs nu"],
            # The column name doubles as the subplot title (typo "exluding" fixed)
            "Attack-ratio (excluding arguments only attacked/supported once)":
                np.nan if r["Total pairs nu"] == 1 else r["Attacked"] / r["Total pairs nu"]}),
        axis=1)
    # One histogram per column, drawn into the two axes of this dataset's row
    df_plot.hist(density=False, ax=ax)
    ax[0].set_ylabel(data_set, rotation=0)

plt.tight_layout()

## Lime + Scattertexts
- LIME visualization of two example sentences, model trained using **only_response** (rest default options)
    - Details about LIME [here](https://github.com/marcotcr/lime)
    - To create LIME visualizations, add the `--do_visualization` flag when running `run_classifier_ba.py`; it then creates a file called `lime.html` in the specified `output_dir`.
- The scattertexts are produced by running `python datasets_plots.py`.


In [None]:
# Lime Visualization
# Renders the LIME explanation stored in ./LIME/lime.html inline.
# Some of the words have the expected influence, e.g. "are" and "not" (attack), "play" and "alcohol" (support)
# Others do not have the expected influence, e.g. "china" (attack and not support as expected)
# Overall, all weights are really small and the removal/replacement with UNK of a single word 
# does not change the prediction
HTML(filename='./LIME/lime.html')

In [None]:
# Lime
# Renders the LIME explanation stored in ./LIME/lime_pol.html inline
# (presumably for the political dataset, judging by the file name -- TODO confirm)
# All words have a very small impact
HTML(filename='./LIME/lime_pol.html')

In [None]:
# Scattertext of the responses in debate_train
# No special "attacking" or "supporting" words easily recognizable
# The words are either topic specific, e.g. China (in topic Chinaonechildpolicy there are more supports than attacks)
# Or they seem to be there by chance (small dataset), e.g. he, does
scattertext_debate_path = '../data/plots/scattertext_attack_supportdebate_train.html'
# Check for the HTML file itself (not just the plots directory), so that a missing
# plot leads to the hint below instead of an empty/broken IFrame.
if os.path.isfile(scattertext_debate_path):
    display(IFrame(src=scattertext_debate_path, width=950, height=500))
else:
    # Script name fixed: the plots are generated by datasets_plots.py
    print("Run cd ../data; python datasets_plots.py")

In [None]:
# Scattertext
# Scattertext of the authors in political
# The word usage of Nixon and Kennedy is quite different
scattertext_authors_path = '../data/plots/scattertext_nixon_kennedy.html'
# Check for the HTML file itself (not just the plots directory), so that a missing
# plot leads to the hint below instead of an empty/broken IFrame.
if os.path.isfile(scattertext_authors_path):
    display(IFrame(src=scattertext_authors_path, width=950, height=500))
else:
    # Script name fixed: the plots are generated by datasets_plots.py
    print("Run cd ../data; python datasets_plots.py")

In [None]:
# Histograms of the accuracy over all runs (30) of each hyper-parameter setting

if not os.path.isdir("../pytorch/res/node_both_procon"):
    # Results are missing: tell the user how to produce them
    print("You have to first reproduce the results for the NoDE procon dataset.\n"
          "../code_relation_prediction/pytorch ./run_all_node.sh procon")
else:
    print("Accuracy for 30 runs each of the 8 settings for NoDE+procon:")
    # Grid of 4 rows x 2 columns with shared axes, one panel per setting
    fig, ax = plt.subplots(nrows=4, ncols=2, figsize=(10, 10), sharex=True, sharey=True)
    eval_results = pd.read_csv('../pytorch/res/node_both_procon/eval_results.tsv', sep='\t')
    # A setting is one combination of these hyper-parameter columns
    setting_columns = ['_bert-model', '_num_epochs', '_batch_size', '_gradient_acc', '_learning_rate']
    eval_results_grouped = eval_results.groupby(setting_columns)
    for panel_idx, (_setting, runs) in enumerate(eval_results_grouped):
        # Fill the grid row by row
        grid_row, grid_col = divmod(panel_idx, 2)
        runs.hist(column='acc', ax=ax[grid_row, grid_col])
    plt.tight_layout()
    plt.show()

In [None]:
# NoDE Predictions with respect to the original arguments
# Does one original argument always get the same label or does it depend on the paired response argument?

# Load the data
if os.path.isdir("../pytorch/res/node_both_paper/"):
    node_pro = processors['node']('both')
    _, node_test_df = node_pro.get_dev_examples('../data')
    eval_preds = pd.read_csv('../pytorch/res/node_both_paper/eval_preds.csv')
    # Load the evaluation results of the SAME experiment as eval_preds, so this cell does
    # not depend on variables left over from the procon cell and the row positions used
    # below really refer to rows of eval_preds.
    # NOTE(review): assumes eval_results.tsv is written next to eval_preds.csv -- confirm.
    node_eval_results = pd.read_csv('../pytorch/res/node_both_paper/eval_results.tsv', sep='\t')
    node_eval_results_grouped = node_eval_results.groupby(
        ['_bert-model', '_num_epochs', '_batch_size', '_gradient_acc', '_learning_rate'])
    results = node_eval_results_grouped['acc'].agg([np.mean, np.min, np.max, np.std, np.median])
    # The group key of the setting with the highest mean accuracy
    bmodel, bepochs, bb, bga, blr = results['mean'].idxmax()
    # Rows (= runs) of the best setting; all five grouping keys are compared,
    # including the gradient accumulation, which was previously ignored
    best_pred_ps = node_eval_results.loc[
        (node_eval_results['_bert-model'] == bmodel)
        & (node_eval_results['_num_epochs'] == bepochs)
        & (node_eval_results['_batch_size'] == bb)
        & (node_eval_results['_gradient_acc'] == bga)
        & (node_eval_results['_learning_rate'] == blr)
    ].index
    # Only predictions from best setting: one column per run, the last column of eval_preds is dropped
    res = pd.concat([node_test_df.reset_index(drop=True),
                     eval_preds.iloc[best_pred_ps, :-1].transpose().reset_index(drop=True)], axis=1)
    # Rounded mean over the runs of the best setting
    res['Mean prediction'] = res[list(best_pred_ps)].mean(axis=1).round().values
    res = res.replace({0: 'attack', 1: 'support'})
    res = res.rename(columns={'label': 'Label'})
    preds_orgs = pd.crosstab(res['org'], [res['Mean prediction'], res['Label']])
    # Number of responses per original argument
    preds_orgs['total'] = preds_orgs.sum(axis=1)
    # Original arguments with many responses 
    print("Original arguments with many answers (Only for the second argument two different classes are predicted)")
    display(preds_orgs.loc[preds_orgs['total'] > 1].sort_values(by='total', ascending=False).head())
else:
    print("You have to first reproduce the results for the NoDE dataset.\n"
          "../code_relation_prediction/pytorch ./run_all_node.sh comp")

In [None]:
# Political Task 2 (Attack/Support) Predictions with respect to the original arguments and response arguments
# Does one original argument always get the same label or does it depend on the paired response argument?
# Does one response argument always get the same label or does it depend on the paired original argument?

# Load the data
if os.path.isdir("../pytorch/res/pol_as/"):
    pol_pro = processors['political-as']('both')
    # Element 3 of every split is concatenated as the test data. A list comprehension is used
    # instead of np.array(...)[:, 3] because building an array from ragged nested
    # sequences fails on recent NumPy versions.
    pol_test_df = pd.concat([split[3] for split in pol_pro.get_splits('../data')])
    eval_preds = pd.read_csv('../pytorch/res/pol_as/eval_preds.csv')
    # Use the first 10 rows of eval_preds (presumably one row per cross-validation
    # fold -- TODO confirm) and drop the cells holding a string that matches 'bert*'.
    # Collect first and concatenate once: Series.append was removed in pandas 2.0.
    fold_preds = [
        pd.Series(row.values[~row.str.contains('bert*', na=False, regex=True)])
        for _, row in eval_preds.head(10).iterrows()
    ]
    preds = pd.concat(fold_preds).dropna().astype(int)
    pol_test_df['preds'] = preds.values
    pol_test_df = pol_test_df.replace({0: 'attack', 1: 'support'})
    # Original arguments with many responses 
    preds_orgs = pd.crosstab(pol_test_df['org'], [pol_test_df['preds'], pol_test_df['label']])
    preds_orgs['total'] = preds_orgs.sum(axis=1)
    print("Original arguments with many answers (Only for the third and fifth argument two different classes "
          "are predicted)")
    display(preds_orgs.loc[preds_orgs['total'] > 1].sort_values(by='total', ascending=False).head())
    # Response arguments with many originals
    preds_resp = pd.crosstab(pol_test_df['response'], [pol_test_df['preds'], pol_test_df['label']])
    preds_resp['total'] = preds_resp.sum(axis=1)
    print("Response arguments with many originals (Only for the third argument two different classes are predicted)")
    display(preds_resp.loc[preds_resp['total'] > 1].sort_values(by='total', ascending=False).head())
else:
    print('You have to first reproduce the results for the Political dataset Task 2.\n'
          'python run_classifier_ba.py  --task_name "political-as" --output_dir res/pol_as/crossval1 --do_cross_val --do_lower_case --num_train_epochs 5 --max_seq_length 256 --train_batch_size 12 --learning_rate 2e-5')

In [None]:
# Agreement Sentiment Baselines
# (About as good as a random guess)

agreement_pro = processors['agreement']('both')
# Keep the splits as a plain list: building a NumPy array from ragged nested
# sequences fails on recent NumPy versions.
splits_data = list(agreement_pro.get_splits('../data'))
# Element 3 of every split is concatenated as the test data
ag_test_df = pd.concat([split[3] for split in splits_data])

# Discretized VADER compound score of the original and of the response text
ag_test_df['org_polarity'] = ag_test_df['org'].apply(lambda r: disc_pol(sid.polarity_scores(r)['compound']))
ag_test_df['resp_polarity'] = ag_test_df['response'].apply(lambda r: disc_pol(sid.polarity_scores(r)['compound']))
# Baseline 1: disagreement if original and response have a different polarity
ag_test_df['sent_both_baseline'] = np.where(
    ag_test_df['org_polarity'] != ag_test_df['resp_polarity'], 'disagreement', 'agreement')
# Baseline 2: disagreement if the response alone is negative
ag_test_df['sent_resp_baseline'] = np.where(
    ag_test_df['resp_polarity'] == 'negative', 'disagreement', 'agreement')

print("Agreement Sentiment 1 baseline:\n")
print(classification_report(ag_test_df['label'], ag_test_df['sent_both_baseline']))
print("\nAgreement Sentiment 2 baseline:\n")
print(classification_report(ag_test_df['label'], ag_test_df['sent_resp_baseline']))