# Result Showcase


In [None]:
# Necessary imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import IFrame, HTML
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Load the combined dataset and every pre-computed statistics file


def _load_pickled_object(path):
    """Return the single Python object stored in a pickled ``.npy`` file."""
    # NOTE: allow_pickle=True unpickles arbitrary objects; only use it on
    # locally generated, trusted files.
    return np.load(path, allow_pickle=True).item()


df = pd.read_csv('../data/complete_data.tsv', sep='\t')

# Pickled statistics objects (indexed by dataset name in the cells below)
data_stats_org = _load_pickled_object('../data/stats/data_stats_org.npy')
data_stats_resp = _load_pickled_object('../data/stats/data_stats_resp.npy')
data_stats_topic = _load_pickled_object('../data/stats/data_stats_topic.npy')
data_stats_sent = _load_pickled_object('../data/stats/sent_stats.npy')

# Tabular statistics stored as tab-separated files
data_stats_author = pd.read_csv('../data/stats/data_stats_author.tsv', sep='\t')
data_stats_total = pd.read_csv('../data/stats/data_stats_total.tsv', sep='\t')
data_nix_ken = pd.read_csv('../data/stats/data_nix_ken.tsv', sep='\t')

# Data overview
- NoDE
    - Link: http://www-sop.inria.fr/NoDE/NoDE-xml.html
    - Paper: https://pdfs.semanticscholar.org/16d1/6b8a37c5313fa8c8430fddc011f2a98d20c5.pdf
- Political
    - Link: https://dh.fbk.eu/resources/political-argumentation
    - Paper: https://aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/16393/16020
- Agreement
    - Link: https://dh.fbk.eu/resources/agreement-disagreement
    - Paper: https://www.aclweb.org/anthology/C16-1232

In [None]:
# Overview of all datasets:
# - number of unique arguments, number of total pairs,
#   number of attack/support and unrelated pairs
# - statistics about the total length (org + response) of the pairs,
#   which matters for the seq_len parameter of BERT
# Important notes:
# - debate_test/train is already repaired, but still not the same as in the paper
# - agreement had many rows that could not be parsed (e.g. empty resp or org);
#   they were excluded, so the dataset is smaller than reported in the paper
# - there are two duplicates in the political dataset
shown_datasets = ['debate_test', 'debate_train', 'procon', 'political', 'agreement']
is_shown = data_stats_total['dataset'].isin(shown_datasets)
data_stats_total.loc[is_shown]

In [None]:
# Debate train/test statistics by topic, stacked under the keys 'train' and 'test'
# Topics that do not match the paper are Interentaccess and Militaryservice
# For most topics the attack/support distribution is similar to the overall one
topic_stats_train = data_stats_topic['debate_train']
topic_stats_test = data_stats_topic['debate_test']
pd.concat([topic_stats_train, topic_stats_test], keys=['train', 'test'])

In [None]:
# Political dataset statistics by topic (displays the pre-computed table)
# Most topics have a similar distribution; minimum wage is an exception
data_stats_topic['political']

In [None]:
# Political dataset by author
# - the same author mostly supports himself
# - different authors mostly attack each other
# - the dataset is heavily imbalanced with respect to the author:
#   Kennedy occurs far more often
# First print the number of distinct values per column for each author,
# then display the author statistics with a blue background gradient
distinct_per_author = data_nix_ken.groupby("author").nunique()
print(distinct_per_author)
data_stats_author.style.background_gradient(cmap='Blues')

In [None]:
# Print every (org, response) pair that occurs more than once in the political dataset
data_set = 'political'
print(data_set + " Duplicates:")
df_check = df[df['org_dataset'] == data_set]
# keep=False marks all members of a duplicate group, not only the repeats
is_duplicate = df_check.duplicated(subset=['org', 'response'], keep=False)
print(df_check[is_duplicate])

# Length

In [None]:
# Box plots of the columns of df, one panel per dataset
# (length of org, resp and combined)
# Seq_len 128/200 covers ~75% of the debate dataset, 250 ~75% of the political dataset
fig, axes = plt.subplots(1, 2, figsize=(10, 4))  # 1 row, 2 columns
ax1, ax2 = axes

for data_set, ax in zip(('debate_extended', 'political'), axes):
    df_plot = df.loc[df['org_dataset'] == data_set]
    df_plot.boxplot(ax=ax)
    ax.set_title(data_set)
plt.tight_layout()

# Attack/Support ratios

In [None]:
# Histogram of the attack-ratio (attacked / tot) per original argument
# - most arguments are only attacked or only supported (interesting for
#   detecting arguments likely to be attacked/supported)
# - if every argument that is answered only once is disregarded, most
#   arguments have an attack-ratio of 0.5
# - in the political dataset many arguments are unrelated; unrelated
#   arguments are disregarded in this plot


def _attack_ratios(r):
    """Return both attack-ratio variants for one row of per-argument statistics."""
    ratio = r.attacked / r.tot
    return pd.Series({
        "Attack-ratio": ratio,
        # Arguments with a single answer are masked out with NaN
        "Attack-ratio (exluding arguments only attacked/supported once)": np.nan if r.tot == 1 else ratio,
    })


fig, axes = plt.subplots(2, 2, figsize=(10, 4))  # 2 rows, 2 columns
ax1, ax2 = axes
for data_set, ax in zip(('debate_extended', 'political'), axes):
    # iloc[:-1] skips the last row (presumably a totals row; confirm)
    df_plot = data_stats_org[data_set].iloc[:-1].apply(_attack_ratios, axis=1)
    # Ratio broken?
    df_plot.hist(density=False, ax=ax)
    ax[0].set_ylabel(data_set, rotation=0)

plt.tight_layout()

# Usage of arguments

In [None]:
# Left column: how many answers (ingoing links) an argument has
# Right column: how many outgoing links an argument has
# Most arguments only have one ingoing link, but some have many (~10 debate, ~30 political)
# In debate (original) every argument has exactly one outgoing link;
# in political most have one, but some have many (~8)
fig, axes = plt.subplots(3, 2, figsize=(10, 4))  # 3 rows, 2 columns
ax1, ax2, ax3 = axes

for data_set, ax in zip(('debate_test', 'debate_extended', 'political'), axes):
    org_ax, resp_ax = ax[0], ax[1]

    # iloc[:-1] skips the last row (presumably a totals row; confirm)
    org_totals = data_stats_org[data_set].iloc[:-1]['tot']
    # Ratio broken?
    org_totals.hist(density=True, ax=org_ax)
    org_ax.set_title('{0}, org'.format(data_set))

    resp_ax.set_title('{0}, resp'.format(data_set))
    resp_totals = data_stats_resp[data_set].iloc[:-1]['tot']
    # Ratio broken?
    resp_totals.hist(bins=np.arange(0, 10), ax=resp_ax)
plt.tight_layout()

# Visualizations Debate Responses
- Word scattertext of the responses in debate_train
- Lime and anchor visualization of an example sentence, using **only_response** (rest default options)
    - model has accuracy 53% (quite bad)
    - details about LIME [here](https://github.com/marcotcr/lime)
    - details about ANCHOR [here](https://github.com/marcotcr/anchor)

In [None]:
# Scattertext of the responses in debate_train (embeds the pre-rendered HTML file)
# No specifically "attacking" or "supporting" words are easily recognizable
# The words are either topic-specific, e.g. China (the topic Chinaonechildpolicy
# has more supports than attacks),
# or they seem to be there by chance (small dataset), e.g. he, does
IFrame(src='./scattertext_attack_supportdebate_train.html', width=950, height=500)

In [None]:
# LIME visualization (renders the pre-generated HTML file)
# Some words have the expected influence, e.g. "are" and "not" (attack), "play" and "alcohol" (support)
# Others do not have the expected influence, e.g. "china" (attack instead of the expected support)
# Overall, all weights are really small, and removing a single word or replacing it with UNK
# does not change the prediction
HTML(filename='./lime.html')

In [None]:
# Anchor visualization (renders the pre-generated HTML file)
# Anchor did not find a way to change some words so that the other class is predicted
HTML(filename='./anchor.html')

# Visualizations Political Authors
- WordClouds authors
- Scattertext authors
- Lime and Anchor, **only_org** (rest default), attack/support
    - Model acc: 70%, F1: 70%

In [None]:
# Word clouds for Nixon and for Kennedy, side by side
# Both often say the name of the other candidate; Nixon talks about President Eisenhower
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 10))  # 1 row, 2 columns

stopwords = set(STOPWORDS)
panels = (
    ('Nixon', "Nixon WordCloud", ax1),
    ('Kennedy', "Kennedy WordCloud", ax2),
)
for author, title, ax in panels:
    # Join all texts of this author into one string for the word cloud
    author_texts = data_nix_ken.loc[data_nix_ken["author"] == author, 'text']
    wordcloud = WordCloud(stopwords=stopwords).generate(" ".join(author_texts))
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.set_title(title)
    ax.set_axis_off()

plt.tight_layout()


In [None]:
# Scattertext of the authors in the political dataset (embeds the pre-rendered HTML file)
# The word usage of Nixon and Kennedy is quite different
IFrame(src='./scattertext_nixon_kennedy.html', width=950, height=500)

In [None]:
# LIME on the political dataset (renders the pre-generated HTML file)
# All words have a very small impact
HTML(filename='./lime_pol.html')

In [None]:
# Anchors on the political dataset (renders the pre-generated HTML file)
# No rule found
HTML(filename='./anchor_pol.html')

# Baselines
- TODO: for the grouped results, actually calculate the weighted average/baseline

In [None]:
# Major Class
def get_major_acc(x, classes=None):
    """Return the accuracy of always predicting the most frequent class.

    Args:
        x: A pandas Series (e.g. one row of a statistics table) holding the
            number of examples per class under the labels in ``classes``.
        classes: Labels of the class-count entries to consider. Any iterable
            of labels is accepted. Defaults to
            ``['unrelated', 'attack/disagreement', 'support/agreement']``.

    Returns:
        The largest class count divided by the sum of all considered class
        counts. NaN if the sum is zero.
    """
    if classes is None:
        classes = ['unrelated', 'attack/disagreement', 'support/agreement']
    # Rows taken from a mixed-type table are object-typed; cast to float as
    # get_major_class does so both helpers treat the counts identically.
    counts = x[list(classes)].astype('float64')
    return np.divide(counts.max(), counts.sum())

def get_major_class(x, classes=None):
    """Return the label of the most frequent class.

    Args:
        x: A pandas Series (e.g. one row of a statistics table) holding the
            number of examples per class under the labels in ``classes``.
        classes: Labels of the class-count entries to consider. Any iterable
            of labels is accepted. Defaults to
            ``['unrelated', 'attack/disagreement', 'support/agreement']``.

    Returns:
        The label in ``classes`` whose count is largest (the first one on ties).
    """
    if classes is None:
        classes = ['unrelated', 'attack/disagreement', 'support/agreement']
    # Cast to float so idxmax also works on object-typed rows of mixed tables
    return x[list(classes)].astype('float64').idxmax()

# Add the majority-class baseline (accuracy and label) to the overall statistics
for column, row_func in (('major_acc', get_major_acc), ('major_class', get_major_class)):
    data_stats_total[column] = data_stats_total.apply(row_func, axis=1)

# Show the baseline only for the two datasets used in the experiments below
is_shown = data_stats_total['dataset'].isin(['debate_test', 'political'])
data_stats_total.loc[is_shown, ['dataset', 'major_class', 'major_acc']]

In [None]:
# Majority-class baseline per topic (NoDE debate test set), attack/support
# Work on a copy so the baseline columns are not written back into the
# shared data_stats_topic table (same approach as the author cells)
attack_support = ['attack', 'support']
data = data_stats_topic['debate_test'].copy()
data['major_acc'] = data.apply(get_major_acc, args=(attack_support,), axis=1)
data['major_class'] = data.apply(get_major_class, args=(attack_support,), axis=1)
data[['topic', 'major_class', 'major_acc', 'tot']]

In [None]:
# Majority-class baseline per topic (political), attack/support/unrelated
# Work on a copy so the baseline columns are not written back into the
# shared data_stats_topic table (same approach as the author cells)
all_classes = ['attack', 'support', 'unrelated']
data = data_stats_topic['political'].copy()
data['major_acc'] = data.apply(get_major_acc, args=(all_classes,), axis=1)
data['major_class'] = data.apply(get_major_class, args=(all_classes,), axis=1)
data[['topic', 'major_class', 'major_acc', 'tot']]

In [None]:
# Majority-class baseline per topic (political), attack/support only
# Work on a copy so the baseline columns are not written back into the
# shared data_stats_topic table (same approach as the author cells)
attack_support = ['attack', 'support']
data = data_stats_topic['political'].copy()
data['major_acc'] = data.apply(get_major_acc, args=(attack_support,), axis=1)
data['major_class'] = data.apply(get_major_class, args=(attack_support,), axis=1)
data[['topic', 'major_class', 'major_acc', 'tot']]

In [None]:
# Majority-class baseline per author pair, attack/support/unrelated
all_classes = ['attack', 'support', 'unrelated']
data = data_stats_author.copy().assign(
    major_acc=lambda frame: frame.apply(get_major_acc, args=[all_classes], axis=1),
    major_class=lambda frame: frame.apply(get_major_class, args=[all_classes], axis=1),
)
data[['author_resp', 'author_org', 'major_class', 'major_acc', 'tot']]

In [None]:
# Majority-class baseline per author pair, attack/support only
attack_support = ['attack', 'support']
data = data_stats_author.copy().assign(
    major_acc=lambda frame: frame.apply(get_major_acc, args=[attack_support], axis=1),
    major_class=lambda frame: frame.apply(get_major_class, args=[attack_support], axis=1),
)
data[['author_resp', 'author_org', 'major_class', 'major_acc', 'tot']]

In [None]:
# Author pairs merged into "same author" / "different author"
# A very high accuracy is possible by only detecting whether org and response
# come from the same author or from different authors


def _author_relation(r):
    """Label a row 'Same' when response and original share the author."""
    if r['author_resp'] == r['author_org']:
        return 'Same'
    return 'Different'


# iloc[:-1] skips the last row (presumably a totals row; confirm)
data = data_stats_author.iloc[:-1].copy()
data['authors'] = data.apply(_author_relation, axis=1)
data = data.groupby('authors').sum().reset_index()

attack_support = ['attack', 'support']
data['major_acc'] = data.apply(get_major_acc, args=[attack_support], axis=1)
data['major_class'] = data.apply(get_major_class, args=[attack_support], axis=1)

data[['authors', 'major_class', 'major_acc', 'tot']]


In [None]:
# Sentiment analysis (nltk vader), responses only

# In debate test, supporting arguments often have a positive sentiment;
# attacking arguments show nothing special
resp_sent_node = data_stats_sent['respdebate_test']
resp_sent_political = data_stats_sent['resppolitical']
pd.concat([resp_sent_node, resp_sent_political], keys=['node', 'political'], sort=True)

In [None]:
# Sentiment of both org and response
# NoDE: attacks often have a different sentiment, supports often the same sentiment
# Political: nothing meaningful
both_sent_node = data_stats_sent['bothdebate_test']
both_sent_political = data_stats_sent['bothpolitical']
pd.concat([both_sent_node, both_sent_political], keys=['node', 'political'], sort=True)

In [None]:
# .... ?
# Major Class for every Org argument
# Major Class for every Resp argument (only political)

# Results

## NoDE paper
![](https://i.imgur.com/1N94Gjq.png)

In [None]:
# NoDE accuracy with different parameters

# Fixed: input=both, seq_len=128, warmup_prop=0.1, seed=42
# Tested: model=base-uncased,large-uncased, epochs=3,4,5, batch_size=8,12,16, lr=2e-5, 3e-5, 5e-5
# Gradient accumulation: batch_size/4 for bert_large
# (in principle equivalent, in practice different because of rounding errors etc.)
eval_results = pd.read_csv('../pytorch/node_both/eval_results.tsv', sep='\t')

# String aliases select the pandas aggregations explicitly; passing NumPy
# callables such as np.std to agg is deprecated in recent pandas versions
summary_stats = ['mean', 'min', 'max', 'std']

# Some stats: mean, min, max, std
# Paper acc 0.67, best bert acc 0.74, mean (bert-base) 0.62, baselines ~0.6
print(eval_results['acc'].agg(summary_stats))
# Somehow bert-large performs worse than bert-base
print(eval_results.groupby('_bert-model')['acc'].agg(summary_stats))
print()
# Print settings of best result
# idxmax() returns an index label, so select the row with .loc (label-based)
print(eval_results.loc[eval_results['acc'].idxmax()])

# Show the table
eval_results.head()

## Political Paper
![](https://i.imgur.com/yGlTYbd.png)
![](https://i.imgur.com/7yrDqQH.png)

In [None]:
# Political F1 CrossVal
# Fixed: input=both, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5

# Related/Unrelated
eval_results = pd.read_csv('../pytorch/pol_ru/eval_results.tsv', sep='\t')

# Some stats: mean, min, max, std
# Paper average F1 0.65, here average F1 0.68, baseline ?
# String aliases select the pandas aggregations explicitly; passing NumPy
# callables such as np.std to agg is deprecated in recent pandas versions
print(eval_results['f1'].agg(['mean', 'min', 'max', 'std']))
eval_results.head()

In [None]:
# Political F1 CrossVal
# Fixed: input=both, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5

# Attack/Support
eval_results = pd.read_csv('../pytorch/pol_as/eval_results.tsv', sep='\t')

# Some stats: mean, min, max, std
# Paper average F1 0.82, here average F1 0.73, baselines (author) ~0.85
# String aliases select the pandas aggregations explicitly; passing NumPy
# callables such as np.std to agg is deprecated in recent pandas versions
print(eval_results['f1'].agg(['mean', 'min', 'max', 'std']))
eval_results.head()

In [None]:
# Political F1 CrossVal
# Fixed: input=both, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5

# Attack/Support/Unrelated
eval_results = pd.read_csv('../pytorch/pol_asu/eval_results.tsv', sep='\t')

# Some stats: mean, min, max, std
# Paper only reported precision 0.57, here only the major class is predicted
# Use some tricks to cope with class imbalance!
# String aliases select the pandas aggregations explicitly; passing NumPy
# callables such as np.std to agg is deprecated in recent pandas versions
print(eval_results['f1'].agg(['mean', 'min', 'max', 'std']))
eval_results.head()

## Agreement Paper
- Accuracy 74%

In [None]:
# Agreement F1 CrossVal
# Comparison with Paper + Baselines
# Fixed: input=both, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=2, batch_size=12, lr=2e-5

# Agreement/Disagreement
eval_results = pd.read_csv('../pytorch/agreement/eval_results.tsv', sep='\t')
# Some stats: mean, min, max, std
# Paper average acc 0.74, here average acc 0.61
# TODO: non cross_val version had acc ~0.97! Probably parameters are bad, 2 epochs might not be enough
# (try again with a higher number of epochs)
# String aliases select the pandas aggregations explicitly; passing NumPy
# callables such as np.std to agg is deprecated in recent pandas versions
print(eval_results['acc'].agg(['mean', 'min', 'max', 'std']))
eval_results.head()

## Results analyzed

In [None]:
# Import the train/test split helpers used during training
import sys
import os

# Resolve the training-code directory relative to the notebook instead of a
# machine-specific absolute path; this is the same ../pytorch directory the
# result files in this notebook are read from.
# NOTE(review): assumes run_classifier_dataset_utils.py lives in ../pytorch; confirm
sys.path.append(os.path.abspath("../pytorch"))

from run_classifier_dataset_utils import processors

# Data processors for the NoDE and the political attack/support task, using
# both org and response as input
node_pro = processors['node']('both')
political_as_pro = processors['political-as']('both')

In [None]:
# NoDE results with respect to topic
_, node_test_df = node_pro.get_dev_examples('../data')
eval_preds = pd.read_csv('../pytorch/node_both/eval_preds.csv')

# Only predictions from bert-base (rows from position 27 on, last column dropped);
# transposed so that every run becomes one column next to the test examples
bert_base_preds = eval_preds.iloc[27:, :-1].transpose().reset_index(drop=True)
test_examples = node_test_df.reset_index(drop=True)
res = pd.concat([test_examples, bert_base_preds], axis=1)
res = res.replace({0: 'attack', 1: 'support'})

# For now, only one run (run 51) is used
# There are errors in every topic; no clear trend that some topics are better or worse
# More false classifications of attack than of support (support is the major class)
# Could also look at several runs, or the average, etc.
pd.crosstab(res['topic'], [res['label'], res[51]])

In [None]:
# Take the rounded mean prediction for all bert-base runs
# (rows from position 27 on; the last column is dropped; mean() averages
# column-wise, i.e. over the runs, and round() turns it into 0/1 again)
res['mean_round'] = eval_preds.iloc[27:,:-1].mean().round().values
# Map the numeric labels of the new column to their names
res = res.replace({0: 'attack', 1: 'support'})
# Topic vs. (true label, mean prediction)
pd.crosstab(res['topic'], [res['label'],res['mean_round']])

In [None]:
# All metrics can be recreated from the stored predictions,
# e.g. classification reports or confusion matrices
from sklearn.metrics import classification_report, confusion_matrix

# Predictions of run 51 (last column dropped), mapped to label names
run_51_preds = eval_preds.iloc[51, :-1].replace({0: 'attack', 1: 'support'})

print(classification_report(y_true=res['label'], y_pred=run_51_preds))

print(confusion_matrix(res['label'], run_51_preds))

In [None]:
# Political results with respect to topic
splits_data = political_as_pro.get_splits('../data')

# Get the test data and the test predictions
# Element 3 of every split is used as that split's test data (same element
# the original array indexing selected). A list comprehension avoids building
# a NumPy array out of DataFrames, which is fragile for differently sized folds.
pol_test_df = pd.concat([split[3] for split in splits_data])
eval_preds = pd.read_csv('../pytorch/pol_as/eval_preds.csv')

# Flatten the predictions (last column dropped) into one column
pol_test_df['preds'] = eval_preds.iloc[:, :-1].stack().values
pol_test_df = pol_test_df.replace({0: 'attack', 1: 'support'})

# Topic vs. (true label, prediction)
pd.crosstab(pol_test_df['topic'], [pol_test_df['label'], pol_test_df['preds']])

In [None]:
# Political results with respect to author
# Prediction vs. (org_stance, response_stance)
# NOTE(review): presumably the *_stance columns encode the author; confirm
pd.crosstab(pol_test_df['preds'], [pol_test_df['org_stance'],pol_test_df['response_stance']])

In [None]:
# Complete results political (predictions of all folds evaluated together)
# Precision/recall/F1 per class
print(classification_report(y_pred=pol_test_df['preds'], y_true=pol_test_df['label']))

# Confusion matrix (rows: true labels, columns: predicted labels)
print(confusion_matrix(y_pred=pol_test_df['preds'], y_true=pol_test_df['label']))

In [None]:
# Results with respect to the same org / same resp (does it always get the same label or not?)

# Same org (NoDE, run 51)
# One org does not always get the same prediction (but often)
pd.crosstab(res['org'], res[51])

In [None]:
# Same org, political dataset (only the first rows are shown)
# TODO: aggregate to get some useful insights
# (and maybe do it for every fold individually,
# because otherwise we might always predict one label for an org in one fold and another label in another fold)
pd.crosstab(pol_test_df['org'], pol_test_df['preds']).head()

In [None]:
# Same response, political dataset: predictions per response text (only the first rows are shown)
pd.crosstab(pol_test_df['response'], pol_test_df['preds']).head()

# Results other inputs
- TODO: test with only the orgs as input and with only the response as input
   - Arguments likely to be attacked/supported
   - Attackful/ing or supportful/ing arguments

In [None]:
# Only org

In [None]:
# Only resp

# Domain adaptation etc.
- TODO: do some domain adaptation etc.

In [None]:
# Train on one dataset, evaluate on another (without finetuning)

In [None]:
# With finetuning (reusing the classification layer)

In [None]:
# With finetuning + use a new classification layer 