# Result Showcase


In [None]:
# Necessary imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import IFrame
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Read all necessary data
# Pair-level table (tab-separated); later cells filter it by its 'org_dataset' column
df = pd.read_csv('../data/complete_data.tsv', sep='\t')
# Each .npy file stores one pickled object that is unwrapped with .item();
# later cells index these objects by key (e.g. data_stats_org['political'])
# NOTE(review): allow_pickle=True unpickles on load -- only use with trusted, locally generated files
data_stats_org = np.load('../data/stats/data_stats_org.npy', allow_pickle=True).item()
data_stats_resp = np.load('../data/stats/data_stats_resp.npy', allow_pickle=True).item()
data_stats_topic = np.load('../data/stats/data_stats_topic.npy', allow_pickle=True).item()
data_stats_sent = np.load('../data/stats/sent_stats.npy', allow_pickle=True).item()
# Tab-separated summary tables
data_stats_author = pd.read_csv('../data/stats/data_stats_author.tsv', sep='\t')
data_stats_total = pd.read_csv('../data/stats/data_stats_total.tsv', sep='\t')
data_nix_ken = pd.read_csv('../data/stats/data_nix_ken.tsv', sep='\t')

# Data overview

In [None]:
# General statistics of all datasets:
# - number of unique arguments, number of total pairs,
#   number of attack / support / unrelated pairs
# - statistics about the total length (org + response) of the pairs
# Important: debate_test/train is already repaired, but still not the same as in the paper
# The length matters for the seq_len parameter of BERT
data_stats_total[
    data_stats_total['dataset'].isin(
        ['debate_test', 'debate_train', 'procon', 'political', 'agreement']
    )
]

In [None]:
# Debate train/test statistics by topic, stacked under the outer keys 'train' and 'test'
# Topics that do not match the paper are Internetaccess and Militaryservice
# For most topics the attack/support distribution is similar to the overall distribution
pd.concat(
    [data_stats_topic['debate_train'], data_stats_topic['debate_test']],
    keys=['train', 'test'],
)

In [None]:
# Political statistics by topic (displayed as the cell output)
# Most topics have a similar distribution; minimum wage is an exception
data_stats_topic['political']

In [None]:
# Political by author
# Pairs by the same author mostly support each other
# Pairs by different authors mostly attack each other
# The dataset is heavily imbalanced with respect to the author: Kennedy occurs far more often
# Number of distinct values per column for each author
print(data_nix_ken.groupby("author").nunique())
# Author statistics table, shaded with a blue gradient (displayed as the cell output)
data_stats_author.style.background_gradient(cmap='Blues')

In [None]:
# Political duplicates: print every pair whose (org, response) combination
# occurs more than once; keep=False marks all copies, not only the repeats
for data_set in ('political',):
    print("{0} Duplicates:".format(data_set))
    df_check = df.loc[df['org_dataset'] == data_set]
    is_duplicate = df_check.duplicated(subset=['org', 'response'], keep=False)
    print(df_check.loc[is_duplicate])

# Length

In [None]:
# Box plots per dataset, one panel each
# (per the original note: length of org, resp and both combined)
# Seq_len 128/200 covers ~75% of the debate dataset, 250 covers ~75% of the political dataset
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

for data_set, ax in zip(('debate_extended', 'political'), (ax1, ax2)):
    # Restrict the pair table to the current dataset before plotting
    df_plot = df.loc[df['org_dataset'] == data_set]
    df_plot.boxplot(ax=ax)
    ax.set_title(data_set)
plt.tight_layout()

# Attack/Support ratios

In [None]:
# Plot which share of the answers to an argument are attacks (attack-ratio)
# Most arguments are only attacked or only supported
# (interesting for detecting arguments likely to be attacked/supported)
# If we disregard every argument that is answered only once, most arguments have an attack-ratio of 0.5
# In the political dataset many arguments are unrelated; unrelated arguments are disregarded in this plot
fig, (ax1, ax2) = plt.subplots(2, 2, figsize=(10, 4))  # 2 rows (one per dataset), 2 columns
for data_set, ax in [('debate_extended', ax1), ('political', ax2)]:
    # The last row of the stats table is skipped, as in the other cells using these tables
    stats = data_stats_org[data_set].iloc[:-1]
    # Column arithmetic instead of a row-wise apply: faster, and it still yields
    # both columns when the table is empty
    ratio = stats['attacked'] / stats['tot']
    df_plot = pd.DataFrame({
        "Attack-ratio": ratio,
        # NaN (and therefore not drawn) for arguments that were answered exactly once
        "Attack-ratio (exluding arguments only attacked/supported once)": ratio.where(stats['tot'] != 1),
    })
    # Ratio broken? (open question from the original author)
    df_plot.hist(density=False, ax=ax)
    ax[0].set_ylabel(data_set, rotation=0)

plt.tight_layout()

# Usage of arguments

In [None]:
# Left column: how many answers an argument has (ingoing links)
# Right column: how many outgoing links an argument has
# Most arguments only have one ingoing link, but some have many (~10 debate, ~30 political)
# In debate (original) every argument has exactly one outgoing link;
# in political most have one, but some have many (~8)
fig, (ax1, ax2, ax3) = plt.subplots(nrows=3, ncols=2, figsize=(10, 4))

for data_set, ax in zip(('debate_test', 'debate_extended', 'political'), (ax1, ax2, ax3)):
    # Ingoing links, drawn as a normalised histogram
    df_plot = data_stats_org[data_set].iloc[:-1]['tot']
    # Ratio broken?
    df_plot.hist(density=True, ax=ax[0])
    ax[0].set_title('{0}, org'.format(data_set))

    # Outgoing links, drawn as raw counts with bin edges 0..9
    df_plot = data_stats_resp[data_set].iloc[:-1]['tot']
    # Ratio broken?
    df_plot.hist(bins=np.arange(0, 10), ax=ax[1])
    ax[1].set_title('{0}, resp'.format(data_set))
plt.tight_layout()

# Visualizations Debate Responses
- Word scattertext of the responses in debate_train
- Lime and anchor visualization of an example sentence, using **only_response** (rest default options)
    - model has accuracy 53% (quite bad)
    - details about LIME [here](https://github.com/marcotcr/lime)
    - details about ANCHOR [here](https://github.com/marcotcr/anchor)

In [None]:
# Scattertext of the responses in debate_train (embeds the pre-rendered HTML plot)
# No distinctly "attacking" or "supporting" words are easily recognizable
# The words are either topic-specific, e.g. China (the topic Chinaonechildpolicy has more supports than attacks),
# or they seem to be there by chance (small dataset), e.g. he, does
IFrame(src='../data/plots/scattertext_attack_supportdebate_train.html', width=950, height=500)

In [None]:
# LIME visualization (embeds the pre-rendered HTML)
# Some words have the expected influence, e.g. "are" and "not" (attack), "play" and "alcohol" (support)
# Others do not, e.g. "china" (attack instead of the expected support)
# Overall, all weights are really small, and removing a single word / replacing it with UNK
# does not change the prediction
IFrame(src='./lime.html', width=1200, height=350)

In [None]:
# Anchor visualization (embeds the pre-rendered HTML)
# Anchor did not find a set of word changes that makes the model predict the other class
IFrame(src='./anchor.html', width=1200, height=350)

# Visualizations Political Authors
- WordClouds authors
- Scattertext authors
- Lime and Anchor, **only_org** (rest default)
    - Model acc: 70%, F1: 70%

In [None]:
# Word clouds for Nixon (left) and Kennedy (right)
# Both often say the name of the other candidate; Nixon talks about President Eisenhower
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 10))

stopwords = set(STOPWORDS)
for author, title, ax in (('Nixon', "Nixon WordCloud", ax1),
                          ('Kennedy', "Kennedy WordCloud", ax2)):
    # Concatenate all texts of this author into one space-separated string
    texts = data_nix_ken.loc[data_nix_ken["author"] == author, 'text']
    wordcloud = WordCloud(stopwords=stopwords).generate(" ".join(texts))
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.set_title(title)
    ax.set_axis_off()

plt.tight_layout()


In [None]:
# Scattertext of the authors in political (embeds the pre-rendered HTML plot)
# The word usage of Nixon and Kennedy is quite different
IFrame(src='../data/plots/scattertext_nixon_kennedy.html', width=950, height=500)

In [None]:
# LIME for the political author model (embeds the pre-rendered HTML)
# All words have a very small impact
IFrame(src='./lime_pol.html', width=1200, height=350)

In [None]:
# Anchors for the political author model (embeds the pre-rendered HTML)
# No rule found
IFrame(src='./anchor_pol.html', width=1200, height=350)

# Baselines
- TODO: for the grouped results, actually calculate the weighted average/baseline

In [None]:
# Major Class
def get_major_acc(x, classes=('unrelated', 'yes', 'no')):
    """Return the accuracy of always predicting the most frequent class.

    Args:
        x: Row (pandas Series) holding one count per class label.
        classes: Labels to take into account; any sequence (list or tuple).

    Returns:
        Largest class count divided by the sum of all class counts, or NaN
        when the counts sum to zero.
    """
    # list(...) forces label-wise selection (a tuple would be read as a single key);
    # the float cast mirrors get_major_class and copes with object-dtype rows
    counts = x[list(classes)].astype('float64')
    total = counts.sum()
    if total == 0:
        # No pairs at all: the accuracy is undefined
        return np.nan
    return counts.max() / total

def get_major_class(x, classes=('unrelated', 'yes', 'no')):
    """Return the label of the most frequent class.

    Args:
        x: Row (pandas Series) holding one count per class label.
        classes: Labels to take into account; any sequence (list or tuple).

    Returns:
        The label with the largest count; on ties the first one in `classes`.
    """
    # list(...) forces label-wise selection (a tuple would be read as a single key);
    # the float cast lets idxmax work on object-dtype rows
    return x[list(classes)].astype('float64').idxmax()

# Majority-class baseline per dataset: accuracy and label of the largest class
data_stats_total['major_acc'] = data_stats_total.apply(get_major_acc, axis=1)
data_stats_total['major_class'] = data_stats_total.apply(get_major_class, axis=1)

# Show the baseline only for the two evaluated datasets
is_baseline_set = data_stats_total['dataset'].isin(['debate_test', 'political'])
data_stats_total.loc[is_baseline_set, ['dataset', 'major_class', 'major_acc']]

In [None]:
# Major Class per Topic node
# Work on a copy so the baseline columns are not written into the shared
# data_stats_topic table (same pattern as the author cells)
data = data_stats_topic['debate_test'].copy()
data['major_acc'] = data.apply(get_major_acc, axis=1)
data['major_class'] = data.apply(get_major_class, axis=1)
data[['topic', 'major_class', 'major_acc', 'tot']]

In [None]:
# Major Class per Topic political
# Work on a copy so the baseline columns are not written into the shared
# data_stats_topic table (same pattern as the author cells)
data = data_stats_topic['political'].copy()
data['major_acc'] = data.apply(get_major_acc, axis=1)
data['major_class'] = data.apply(get_major_class, axis=1)
data[['topic', 'major_class', 'major_acc', 'tot']]

In [None]:
# Major Class per Topic political attack/support only
# Work on a copy so the baseline columns are not written into the shared
# data_stats_topic table (same pattern as the author cells)
data = data_stats_topic['political'].copy()
# Only the 'yes' and 'no' counts enter the baseline here
data['major_acc'] = data.apply(get_major_acc, args=[['yes','no']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['yes','no']], axis=1)
# NOTE: 'tot' is shown as stored in the stats table; it is not recomputed for yes/no only
data[['topic', 'major_class', 'major_acc', 'tot']]

In [None]:
# Majority-class baseline per (response author, original author) combination
data = data_stats_author.copy()
for column, baseline in (('major_acc', get_major_acc), ('major_class', get_major_class)):
    data[column] = data.apply(baseline, axis=1)
data[['author_resp', 'author_org', 'major_class', 'major_acc', 'tot']]

In [None]:
# Majority-class baseline per author combination, attack/support only
data = data_stats_author.copy()
# Extra positional argument for both helpers: restrict the classes to 'yes' and 'no'
yes_no_only = (['yes', 'no'],)
data['major_acc'] = data.apply(get_major_acc, axis=1, args=yes_no_only)
data['major_class'] = data.apply(get_major_class, axis=1, args=yes_no_only)
data[['author_resp', 'author_org', 'major_class', 'major_acc', 'tot']]

In [None]:
# Merged into same author / different author
# A very high accuracy is possible just by detecting whether both arguments
# come from the same author or from different authors
data = data_stats_author.iloc[:-1].copy()  # the last row is left out of the grouping
# Column-wise comparison instead of a row-wise apply (also works on an empty table)
data['authors'] = np.where(data['author_resp'] == data['author_org'], 'Same', 'Different')
# numeric_only=True: add up the counts only, never the author-name strings
data = data.groupby('authors').sum(numeric_only=True)
data = data.reset_index()
data['major_acc'] = data.apply(get_major_acc, args=[['yes','no']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['yes','no']], axis=1)

data[['authors', 'major_class', 'major_acc', 'tot']]


In [None]:
# Sentiment analysis (nltk vader)

# Responses only; debate test: supporting arguments often have a positive sentiment,
# attacking arguments show nothing special
pd.concat(
    [data_stats_sent['respdebate_test'], data_stats_sent['resppolitical']],
    keys=['node', 'political'],
    sort=True,
)

In [None]:
# Sentiment of both org and response
# Attacks often have a different sentiment, supports often have the same sentiment (node)
# Nothing meaningful for political
pd.concat(
    [data_stats_sent['bothdebate_test'], data_stats_sent['bothpolitical']],
    keys=['node', 'political'],
    sort=True,
)

In [None]:
# .... ?
# Major Class for every Org argument
# Major Class for every Resp argument (only political)

# Results

In [None]:
# Node accuracy with different parameters
# TODO: Comparison with Paper + Baselines

# Fixed: input=both, seq_len=128, warmup_prop=0.1, seed=42
# Tested: model=base-uncased,large-uncased, epochs=3,4,5, batch_size=8,12,16, lr=2e-5, 3e-5, 5e-5
# Gradient accumulation: batch_size/4 for bert_large
# (in principle equivalent, in practice different because of rounding errors etc.)
eval_preds = pd.read_csv('../pytorch/node_both/eval_preds.csv')
eval_results = pd.read_csv('../pytorch/node_both/eval_results.tsv', sep='\t')

# Some stats: mean, min, max, std
# String names select the pandas reductions directly (std is the sample std, ddof=1);
# passing the NumPy callables is deprecated in recent pandas
print(eval_results['acc'].agg(['mean', 'min', 'max', 'std']))
# Print settings of best result
# idxmax() returns an index label, so the row is looked up with .loc rather than .iloc
print(eval_results.loc[eval_results['acc'].idxmax()])

# Show the table
eval_results.head()

In [None]:
# Political F1 CrossVal
# Comparison with Paper + Baselines

In [None]:
# Agreement F1 CrossVal
# Comparison with Paper + Baselines

In [None]:
# Node results with respect to topic

In [None]:
# Political results with respect to topic

In [None]:
# Political results with respect to author

In [None]:
# Results with respect to same org, same resp (always gets the same label or not?)