# Result Showcase


In [None]:
# Necessary imports
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import IFrame, HTML
from wordcloud import WordCloud, STOPWORDS

# Import the train/test splits functions
import sys 
import os
# TODO: use a relative path or a module instead
sys.path.append(os.path.abspath("../pytorch"))

from run_classifier_dataset_utils import processors

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
import nltk
nltk.download('vader_lexicon')

In [None]:
# Load every pre-computed artefact the rest of the notebook works with
def _load_stats_object(path):
    """Return the single Python object wrapped in the ``.npy`` file at *path*."""
    # NOTE: allow_pickle=True unpickles arbitrary objects -- only use it on trusted files
    return np.load(path, allow_pickle=True).item()


# Tab-separated tables
df = pd.read_csv('../data/complete_data.tsv', sep='\t')
data_stats_author = pd.read_csv('../data/stats/data_stats_author.tsv', sep='\t')
data_stats_total = pd.read_csv('../data/stats/data_stats_total.tsv', sep='\t')
data_nix_ken = pd.read_csv('../data/stats/data_nix_ken.tsv', sep='\t')

# Pickled statistics objects (indexed by dataset name further down in the notebook)
data_stats_org = _load_stats_object('../data/stats/data_stats_org.npy')
data_stats_resp = _load_stats_object('../data/stats/data_stats_resp.npy')
data_stats_topic = _load_stats_object('../data/stats/data_stats_topic.npy')
data_stats_sent = _load_stats_object('../data/stats/sent_stats.npy')

# Data overview
- NoDE
    - Link: http://www-sop.inria.fr/NoDE/NoDE-xml.html
    - Paper: https://pdfs.semanticscholar.org/16d1/6b8a37c5313fa8c8430fddc011f2a98d20c5.pdf
- Political
    - Link: https://dh.fbk.eu/resources/political-argumentation
    - Paper: https://aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/16393/16020
- Agreement
    - Link: https://dh.fbk.eu/resources/agreement-disagreement
    - Paper: https://www.aclweb.org/anthology/C16-1232

In [None]:
# General statistics of all datasets:
#   - number of unique arguments, number of total pairs, number of attack/support and unrelated pairs
#   - statistics about the combined length (org + response) of the pairs
# Important: debate_test/debate_train are already repaired, but still not the same as in the paper.
# Important: agreement had many rows which could not be parsed (e.g. because resp or org was empty);
#            they were excluded, so the dataset is smaller than reported in the paper.
# Important: there are two duplicates in the political dataset.
# The lengths are important for choosing the seq_len parameter of BERT.
# Only the rows of the five datasets listed below are shown.
data_stats_total.loc[data_stats_total['Dataset'].isin(['debate_test', 'debate_train', 'procon', 'political', 'agreement'])]

In [None]:
# Debate train/test statistics by topic
# The topics that do not match the paper are Internetaccess and Militaryservice.
# For most topics the attack/support distribution is similar to the overall distribution.
# The train and test tables are stacked and labelled with an outer 'train'/'test' key.
pd.concat((data_stats_topic['debate_train'], data_stats_topic['debate_test']), keys=['train', 'test'])

In [None]:
# Political statistics by topic
# Most topics have a similar distribution; minimum wage is an exception.
data_stats_topic['political']

In [None]:
# Political dataset broken down by author
# Pairs by the same author mostly support each other,
# pairs by different authors mostly attack each other.
# The dataset is heavily imbalanced with respect to the author: Kennedy occurs far more often.
unique_per_author = data_nix_ken.groupby("author").nunique()
print(unique_per_author)

# Everything except the last three columns is exported for the thesis and displayed
author_table = data_stats_author.iloc[:, :-3]
author_table.to_csv('../data/thesis/author_imbalance.csv', index=False)
author_table.style.background_gradient(cmap='Blues')

In [None]:
# Show every (org, response) pair that occurs more than once in the political dataset
for data_set in ('political',):
    print(data_set + " Duplicates:")
    df_check = df.loc[df['org_dataset'] == data_set]
    # keep=False marks all members of a duplicate group, not only the repeats
    is_duplicate = df_check.duplicated(subset=['org', 'response'], keep=False)
    print(df_check.loc[is_duplicate])

# Length

In [None]:
# Box plots of the length columns (org, resp and combined) for two datasets
# Seq_len 128/200 covers ~75% of the debate dataset, 250 covers ~75% of the political dataset
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

for ax, data_set in zip((ax1, ax2), ('debate_extended', 'political')):
    df_plot = df.loc[df['org_dataset'] == data_set]
    df_plot.boxplot(ax=ax)
    ax.set_title(data_set)
plt.tight_layout()

In [None]:
# Plot for thesis: length box plots for the debate train and test sets, one panel per set
fig, ax = plt.subplots(1,2, figsize=(10,4), sharey=True)  # 1 row, 2 columns

# Train rows first, then test rows; sort=False in the groupby below keeps that order
df_plot = pd.concat((df[df['org_dataset'] == 'debate_train'], df[df['org_dataset'] == 'debate_test']))
# Human-readable names for the length columns
df_plot = df_plot.rename(columns={'org_len': 'Original', 'response_len': 'Response', 'complete_len': 'Combined'})
# Human-readable group names (the replacement is applied to every cell of the frame)
df_plot = df_plot.replace({'debate_test': 'Test', 'debate_train': 'Train'})
df_plot.groupby('org_dataset', sort=False).boxplot(ax=ax)
ax[0].set_ylabel('Length in WordPiece tokens')
plt.tight_layout()

plt.savefig('../data/thesis/node_length.pdf', bbox_inches='tight')

In [None]:
# Plot for thesis: length box plot for the political dataset
fig, ax = plt.subplots(1,1, figsize=(5,4), sharey=True)  # a single panel

df_plot = df[df['org_dataset'] == 'political']
# Human-readable names for the length columns
df_plot = df_plot.rename(columns={'org_len': 'Original', 'response_len': 'Response', 'complete_len': 'Combined'})
df_plot.groupby('org_dataset', sort=False).boxplot(ax=ax)
# Clear the panel title again
ax.set_title("")
ax.set_ylabel('Length in WordPiece tokens')
plt.tight_layout()

plt.savefig('../data/thesis/political_length.pdf', bbox_inches='tight')

In [None]:
# Plot for thesis: length box plot for the agreement dataset
fig, ax = plt.subplots(1,1, figsize=(5,4), sharey=True)  # a single panel

df_plot = df[df['org_dataset'] == 'agreement']
# Here the two texts of a pair are labelled "Argument 1" and "Argument 2"
df_plot = df_plot.rename(columns={'org_len': 'Argument 1', 'response_len': 'Argument 2', 'complete_len': 'Combined'})
df_plot.groupby('org_dataset', sort=False).boxplot(ax=ax)
# Clear the panel title again
ax.set_title("")
ax.set_ylabel('Length in WordPiece tokens')
plt.tight_layout()

plt.savefig('../data/thesis/agreement_length.pdf', bbox_inches='tight')

In [None]:
# Tokenization test: show how the uncased BERT tokenizer splits an example sentence
from pytorch_pretrained_bert.tokenization import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Example sentence (kept verbatim, including its spelling)
text = "Violent games make youth more agressive/violent."
tokenizer.tokenize(text)


# Attack/Support ratios

In [None]:
# Plot the attack-ratio of every argument (attacked pairs / total pairs)
# Most arguments are only attacked or only supported (interesting for detecting arguments likely to be attacked/supported)
# If we disregard every argument that is answered only once, most arguments have an attack-ratio of 0.5
# In the political dataset many arguments are unrelated; unrelated arguments are disregarded in this plot
fig, (ax1,ax2) = plt.subplots(2,2, figsize=(10,4))  # 2 rows, 2 columns
# ax1 / ax2 are the two rows of the grid; each row holds the two histograms of one dataset
for data_set, ax in [('debate_extended', ax1), ('political',ax2)]:
    # .iloc[:-1] leaves out the last row (presumably a totals row -- TODO confirm)
    # The second ratio is NaN for arguments with exactly one pair, so they drop out of that histogram
    df_plot = data_stats_org[data_set].iloc[:-1].apply(
        lambda r: pd.Series({"Attack-ratio": r["Attacked"] / r["Total pairs"],
                             "Attack-ratio (exluding arguments only attacked/supported once)": np.nan if r["Total pairs"] == 1 else r["Attacked"] / r["Total pairs"]}),
        axis=1)
    # TODO (open question from the author): is the ratio broken?
    df_plot.hist(density=False, ax=ax)
    # Label each row with its dataset name
    ax[0].set_ylabel(data_set, rotation=0)
    
plt.tight_layout()

# Usage of arguments

In [None]:
# First column shows how many answers an argument has (ingoing links)
# Second column shows how many outgoing links an argument has
# Most arguments only have one ingoing link, but some have many (~10 debate, ~30 political)
# In debate (original) every argument has only one outgoing link; in political most have one, but some have many (~8)
fig, (ax1,ax2,ax3) = plt.subplots(3,2, figsize=(10,4))  # 3 rows, 2 columns

# One row of the grid per dataset
for data_set, ax in [('debate_test', ax1), ('debate_extended', ax2), ('political',ax3)]:
    # .iloc[:-1] leaves out the last row (presumably a totals row -- TODO confirm)
    df_plot = data_stats_org[data_set].iloc[:-1]
    df_plot = df_plot['Total pairs']
    # TODO (open question from the author): is the ratio broken?
    # Left panel: normalised histogram of pairs per original argument
    df_plot.hist(density=True, ax=ax[0])
    ax[0].set_title('{0}, org'.format(data_set))
    ax[1].set_title('{0}, resp'.format(data_set))
    df_plot = data_stats_resp[data_set].iloc[:-1]
    df_plot = df_plot['Total pairs']
    # TODO (open question from the author): is the ratio broken?
    # Right panel: counts of pairs per response, with bin edges 0..9
    df_plot.hist(bins=np.arange(0, 10), ax=ax[1])
plt.tight_layout()

In [None]:
# Plot for thesis: number of pairs per original / per response argument in debate_test
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4), sharex=True)

data_set = 'debate_test'
# The last row of each statistics table is left out
org_stats = data_stats_org[data_set].iloc[:-1]
resp_stats = data_stats_resp[data_set].iloc[:-1].rename(
    columns={'Attacks': 'Attacked', 'Supports': 'Supported'})
df_plot = pd.concat((org_stats, resp_stats), keys=['Original', 'Response'])

link_bins = list(range(1, 10))  # bin edges 1..9
for part, ax in (('Original', ax1), ('Response', ax2)):
    df_plot.loc[part].hist(column=['Total pairs'], ax=ax, bins=link_bins)
    ax.set_title(part)

ax1.set_ylabel("Count")
ax1.set_xlabel("Number of ingoing links")
ax2.set_xlabel("Number of outgoing links")

plt.tight_layout()

plt.savefig('../data/thesis/node_hist.pdf', bbox_inches='tight')

In [None]:
# Plot for thesis: number of pairs per original / per response argument in political
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4), sharex=False, sharey=True)

data_set = 'political'
# The last row of each statistics table is left out
org_stats = data_stats_org[data_set].iloc[:-1]
resp_stats = data_stats_resp[data_set].iloc[:-1].rename(
    columns={'Attacks': 'Attacked', 'Supports': 'Supported'})
df_plot = pd.concat((org_stats, resp_stats), keys=['Original', 'Response'])

link_bins = list(range(1, 10))  # bin edges 1..9
for part, ax in (('Original', ax1), ('Response', ax2)):
    df_plot.loc[part].hist(column=['Total pairs'], bins=link_bins, ax=ax)
    ax.set_title(part)

ax1.set_ylabel("Count")
ax1.set_xlabel("Number of ingoing links")
ax2.set_xlabel("Number of outgoing links")

plt.tight_layout()

plt.savefig('../data/thesis/political_hist.pdf', bbox_inches='tight')

# Visualizations Debate Responses
- Word scattertext of the responses in debate_train
- Lime and anchor visualization of an example sentence, using **only_response** (rest default options)
    - model has accuracy 53% (quite bad)
    - details about LIME [here](https://github.com/marcotcr/lime)
    - details about ANCHOR [here](https://github.com/marcotcr/anchor)

In [None]:
# Scattertext of the responses in debate_train
# No special "attacking" or "supporting" words easily recognizable
# The words are either topic specific, e.g. China (in topic Chinaonechildpolicy there are more supports than attacks)
# Or they seem to be there by chance (small dataset), e.g. he, does
#IFrame(src='./scattertext_attack_supportdebate_train.html', width=950, height=500)

In [None]:
# LIME visualization
# Some words have the expected influence, e.g. "are" and "not" (attack), "play" and "alcohol" (support)
# Others do not have the expected influence, e.g. "china" (attack instead of the expected support)
# Overall, all weights are really small, and removing a single word / replacing it with UNK
# does not change the prediction
# Embed the pre-rendered LIME report stored next to the notebook
HTML(filename='./lime.html')

In [None]:
# Anchor Visualization
# Anchor did not find any set of word changes that makes the model predict the other class
#HTML(filename='./anchor.html')

# Visualizations Political Authors
- WordClouds authors
- Scattertext authors
- Lime and Anchor, **only_org** (rest default), attack/support
    - Model acc: 70%, F1: 70%

In [None]:
# Word clouds for Nixon and for Kennedy
# Both often say the name of the other candidate; Nixon talks about President Eisenhower
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 10))

stopwords = set(STOPWORDS)
panels = (
    (ax1, 'Nixon', "Nixon WordCloud"),
    (ax2, 'Kennedy', "Kennedy WordCloud"),
)
for ax, author, title in panels:
    # All texts of this author, joined into one space-separated string
    author_texts = data_nix_ken.loc[data_nix_ken["author"] == author, 'text']
    wordcloud = WordCloud(stopwords=stopwords).generate(" ".join(author_texts))
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.set_title(title)
    ax.set_axis_off()

plt.tight_layout()
plt.savefig('../data/thesis/authors_wordcloud.pdf', bbox_inches='tight')

In [None]:
# Scattertext
# Scattertext of the authors in political
# The word usage of Nixon and Kennedy is quite different
#IFrame(src='./scattertext_nixon_kennedy.html', width=950, height=500)

In [None]:
# LIME visualization for the political author model
# All words have a very small impact
# Embed the pre-rendered LIME report stored next to the notebook
HTML(filename='./lime_pol.html')

In [None]:
# Anchors
# No rule found
#HTML(filename='./anchor_pol.html')

# Baselines
- TODO: for the grouped results, actually calculate the weighted average/baseline

In [None]:
# Major Class
def get_major_acc(x, classes=('Unrelated', 'Attack/Disagreement', 'Support/Agreement')):
    """Return the accuracy of always predicting the majority class.

    Args:
        x: Row of class counts (a ``pandas.Series`` indexed by class label).
        classes: Labels of the count entries to take into account. Any
            iterable of labels is accepted; it is never modified.

    Returns:
        The largest of the selected counts divided by their sum.
    """
    # list(...) makes tuples and other iterables select several entries,
    # exactly like the list callers pass in
    counts = x[list(classes)]
    return np.divide(counts.max(), np.sum(counts))

def get_major_class(x, classes=('Unrelated', 'Attack/Disagreement', 'Support/Agreement')):
    """Return the label of the class with the largest count.

    Args:
        x: Row of class counts (a ``pandas.Series`` indexed by class label).
        classes: Labels of the count entries to take into account. Any
            iterable of labels is accepted; it is never modified.

    Returns:
        The label from *classes* whose count is largest (the first one on ties).
    """
    # Cast to float before idxmax: rows taken from a mixed-type frame are object-typed
    counts = x[list(classes)].astype('float64')
    return counts.idxmax()

# Majority-class baseline for every row of the overall statistics table
data_stats_total['major_acc'] = data_stats_total.apply(get_major_acc, axis='columns')
data_stats_total['major_class'] = data_stats_total.apply(get_major_class, axis='columns')

# Show the baseline for the two evaluated datasets only
shown_datasets = data_stats_total['Dataset'].isin(['debate_test', 'political'])
data_stats_total.loc[shown_datasets][['Dataset', 'major_class', 'major_acc']]

In [None]:
# Majority class per topic for the NoDE test set
# Work on a copy so the shared statistics table loaded at the top stays untouched
data = data_stats_topic['debate_test'].copy()
data['major_acc'] = data.apply(get_major_acc, args=[['Attack','Support']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['Attack','Support']], axis=1)
# Index by topic so later cells can look up the majority class of a topic
data = data.set_index('Topic')
data

In [None]:
# Evaluate the per-topic majority baseline on the NoDE test set
node_pro = processors['node']('both')
_, node_test_df = node_pro.get_dev_examples('../data')

# Majority class of each pair's topic, mapped to the lower-case label names
topic_majority = node_test_df['topic'].apply(lambda topic: data.loc[topic, 'major_class'])
node_test_df['major_topic_pred'] = topic_majority.replace({'Attack': 'attack', 'Support': 'support'})
print(classification_report(node_test_df['label'], node_test_df['major_topic_pred']))

In [None]:
# VADER sentiment analyzer from nltk; `sid` is reused by all sentiment baselines below
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

def disc_pol(x):
    """Discretise a polarity score: ``'positive'`` for ``x >= 0``, else ``'negative'``."""
    return 'positive' if x >= 0.00 else 'negative'

In [None]:
# Sentiment baselines on the NoDE test set
def _vader_label(text):
    """Return ``disc_pol`` of the VADER compound score of *text*."""
    return disc_pol(sid.polarity_scores(text)['compound'])


for text_column, polarity_column in (('org', 'org_polarity'), ('response', 'resp_polarity')):
    node_test_df[polarity_column] = node_test_df[text_column].apply(_vader_label)

# Baseline 1: same polarity of org and response -> support, otherwise attack
node_test_df['sent_both_baseline'] = node_test_df.apply(
    lambda pair: 'support' if pair['org_polarity'] == pair['resp_polarity'] else 'attack', axis=1)
# Baseline 2: negative response -> attack, otherwise support
node_test_df['sent_resp_baseline'] = node_test_df.apply(
    lambda pair: 'support' if pair['resp_polarity'] != 'negative' else 'attack', axis=1)

for baseline in ('sent_both_baseline', 'sent_resp_baseline'):
    print(classification_report(node_test_df['label'], node_test_df[baseline]))

In [None]:
# Majority class per topic, political related/unrelated (RU)
political_ru_pro = processors['political-ru']('both')
splits_data = np.array(political_ru_pro.get_splits('../data'))

# Get the test data of all splits (column 3 of the splits array)
pol_test_df = pd.concat(splits_data[:,3])

# Work on a copy so the shared statistics table loaded at the top stays untouched
data = data_stats_topic['political'].copy()
# "related" merges the attack and support pairs
data['related'] = data['Attack'] + data['Support']
data['major_acc'] = data.apply(get_major_acc, args=[['related', 'Unrelated']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['related', 'Unrelated']], axis=1)
data = data.set_index('Topic')

# Baseline 1: predict the majority class of the pair's topic
pol_test_df['major_topic_pred'] = pol_test_df['topic'].apply(lambda r: data.loc[r, 'major_class']).replace({'Unrelated': 'unrelated'})
print(classification_report(pol_test_df['label'], pol_test_df['major_topic_pred']))
# Baseline 2: always predict 'related'
pol_test_df['major_class'] = 'related'
print(classification_report(pol_test_df['label'], pol_test_df['major_class']))

def count_values(x, labels):
    """Return how many non-null ``'label'`` entries of *x* are contained in *labels*."""
    label_column = x['label']
    is_wanted = label_column.isin(labels)
    return label_column[is_wanted].count()

# Majority class per original argument (org), computed on the test pairs themselves
# One row per distinct org text: the text itself plus its related / unrelated pair counts
data = pol_test_df.groupby('org').apply(lambda r: pd.Series({'org': r['org'].iloc[0], 'related': count_values(r, ['related']), 'unrelated': count_values(r, ['unrelated'])}))
data['major_acc'] = data.apply(get_major_acc, args=[['related', 'unrelated']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['related', 'unrelated']], axis=1)
data = data.set_index('org')
data['total'] = data['related'] + data['unrelated']
# Org texts with exactly one pair are removed from the evaluation
orgs_only_once = data.loc[data['total'] == 1].index.to_list()
index = pol_test_df[pol_test_df['org'].isin(orgs_only_once)].index
# NOTE(review): drop() works on index labels -- assumes the labels of the concatenated splits are unique
pol_test_df = pol_test_df.drop(index)
# Predict the majority class of the pair's org for the remaining pairs
pol_test_df['major_org_pred'] = pol_test_df['org'].apply(lambda r: data.loc[r, 'major_class'])
print(classification_report(pol_test_df['label'], pol_test_df['major_org_pred']))
# Majority class per response, computed on the test pairs themselves
# The splits are loaded again, which restores the rows dropped from pol_test_df before
splits_data = np.array(political_ru_pro.get_splits('../data'))

# Get the test data of all splits (column 3 of the splits array)
pol_test_df = pd.concat(splits_data[:,3])
# One row per distinct response text: the text itself plus its related / unrelated pair counts
data = pol_test_df.groupby('response').apply(lambda r: pd.Series({'resp': r['response'].iloc[0], 'related': count_values(r, ['related']), 'unrelated': count_values(r, ['unrelated'])}))
data['major_acc'] = data.apply(get_major_acc, args=[['related', 'unrelated']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['related', 'unrelated']], axis=1)
data = data.set_index('resp')
data['total'] = data['related'] + data['unrelated']
# Responses with exactly one pair are removed from the evaluation
resps_only_once = data.loc[data['total'] == 1].index.to_list()
index = pol_test_df[pol_test_df['response'].isin(resps_only_once)].index
# NOTE(review): drop() works on index labels -- assumes the labels of the concatenated splits are unique
pol_test_df = pol_test_df.drop(index)
# Predict the majority class of the pair's response for the remaining pairs
pol_test_df['major_resp_pred'] = pol_test_df['response'].apply(lambda r: data.loc[r, 'major_class'])
print(classification_report(pol_test_df['label'], pol_test_df['major_resp_pred']))

In [None]:
# Majority class per (response author, original author) combination, related/unrelated
splits_data = np.array(political_ru_pro.get_splits('../data'))

# Get the test data of all splits (column 3 of the splits array)
pol_test_df = pd.concat(splits_data[:,3])

data = data_stats_author.copy()
# "related" merges the attack and support pairs
data['related'] = data['Attack'] + data['Support']
data['major_acc'] = data.apply(get_major_acc, args=[['related', 'Unrelated']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['related', 'Unrelated']], axis=1)
data = data.set_index(['Author resp', 'Author org'])

# Address the MultiIndex row with an explicit tuple and name the column in the same
# call, so the lookup cannot be mistaken for a (row, column) pair
pol_test_df['major_author'] = pol_test_df.apply(
    lambda r: data.loc[(r['response_stance'], r['org_stance']), 'major_class'],
    axis=1,
).replace({'Unrelated': 'unrelated'})
print(classification_report(pol_test_df['label'], pol_test_df['major_author']))
data

In [None]:
# Majority class per topic, political attack/support only (AS)
political_as_pro = processors['political-as']('both')
splits_data = np.array(political_as_pro.get_splits('../data'))

# Get the test data of all splits (column 3 of the splits array)
pol_test_df = pd.concat(splits_data[:,3])
# Work on a copy so the shared statistics table loaded at the top stays untouched
data = data_stats_topic['political'].copy()
data['major_acc'] = data.apply(get_major_acc, args=[['Attack','Support']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['Attack','Support']], axis=1)
data = data.set_index('Topic')

# Baseline 1: predict the majority class of the pair's topic
pol_test_df['major_topic_pred'] = pol_test_df['topic'].apply(lambda r: data.loc[r, 'major_class']).replace({'Attack': 'attack', 'Support': 'support'})
print(classification_report(pol_test_df['label'], pol_test_df['major_topic_pred']))
# Baseline 2: always predict 'attack'
pol_test_df['major_class'] = 'attack'
print(classification_report(pol_test_df['label'], pol_test_df['major_class']))

# Sentiment baselines: discretised VADER compound score of org and response
pol_test_df['org_polarity'] = pol_test_df['org'].apply(lambda r: disc_pol(sid.polarity_scores(r)['compound']))
pol_test_df['resp_polarity'] = pol_test_df['response'].apply(lambda r: disc_pol(sid.polarity_scores(r)['compound']))
# Different polarity of org and response -> attack, otherwise support
pol_test_df['sent_both_baseline'] = pol_test_df.apply(lambda r: 'attack' if r['org_polarity'] != r['resp_polarity'] else 'support', axis=1)
# Negative response -> attack, otherwise support
pol_test_df['sent_resp_baseline'] = pol_test_df.apply(lambda r: 'attack' if r['resp_polarity'] == 'negative' else 'support', axis=1)
print(classification_report(pol_test_df['label'], pol_test_df['sent_both_baseline']))
print(classification_report(pol_test_df['label'], pol_test_df['sent_resp_baseline']))

# Majority class per original argument (org), computed on the test pairs themselves
# One row per distinct org text: the text itself plus its attack / support pair counts
data = pol_test_df.groupby('org').apply(lambda r: pd.Series({'org': r['org'].iloc[0], 'attack': count_values(r, ['attack']), 'support': count_values(r, ['support'])}))
data['major_acc'] = data.apply(get_major_acc, args=[['attack', 'support']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['attack', 'support']], axis=1)
data = data.set_index('org')
data['total'] = data['attack'] + data['support']
# Org texts with exactly one pair are removed from the evaluation
orgs_only_once = data.loc[data['total'] == 1].index.to_list()
index = pol_test_df[pol_test_df['org'].isin(orgs_only_once)].index
# NOTE(review): drop() works on index labels -- assumes the labels of the concatenated splits are unique
pol_test_df = pol_test_df.drop(index)
# Predict the majority class of the pair's org for the remaining pairs
pol_test_df['major_org_pred'] = pol_test_df['org'].apply(lambda r: data.loc[r, 'major_class'])
print(classification_report(pol_test_df['label'], pol_test_df['major_org_pred']))

# Majority class per response, computed on the test pairs themselves
# The splits are loaded again, which restores the rows dropped from pol_test_df before
splits_data = np.array(political_as_pro.get_splits('../data'))

# Get the test data of all splits (column 3 of the splits array)
pol_test_df = pd.concat(splits_data[:,3])
# One row per distinct response text: the text itself plus its attack / support pair counts
data = pol_test_df.groupby('response').apply(lambda r: pd.Series({'resp': r['response'].iloc[0], 'attack': count_values(r, ['attack']), 'support': count_values(r, ['support'])}))
data['major_acc'] = data.apply(get_major_acc, args=[['attack', 'support']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['attack', 'support']], axis=1)
data = data.set_index('resp')
data['total'] = data['attack'] + data['support']
# Responses with exactly one pair are removed from the evaluation
resps_only_once = data.loc[data['total'] == 1].index.to_list()
index = pol_test_df[pol_test_df['response'].isin(resps_only_once)].index
# NOTE(review): drop() works on index labels -- assumes the labels of the concatenated splits are unique
pol_test_df = pol_test_df.drop(index)
# Predict the majority class of the pair's response for the remaining pairs
pol_test_df['major_resp_pred'] = pol_test_df['response'].apply(lambda r: data.loc[r, 'major_class'])
print(classification_report(pol_test_df['label'], pol_test_df['major_resp_pred']))

In [None]:
# Majority class per (response author, original author) combination, attack/support
# Rebuild the complete test frame first: the frame left over from the previous cell has
# already lost every pair whose response occurs only once, which would bias this baseline.
splits_data = np.array(political_as_pro.get_splits('../data'))
pol_test_df = pd.concat(splits_data[:,3])

data = data_stats_author.copy()
data['major_acc'] = data.apply(get_major_acc, args=[['Attack', 'Support']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['Attack', 'Support']], axis=1)
data = data.set_index(['Author resp', 'Author org'])

# Address the MultiIndex row with an explicit tuple and name the column in the same
# call, so the lookup cannot be mistaken for a (row, column) pair
pol_test_df['major_author'] = pol_test_df.apply(
    lambda r: data.loc[(r['response_stance'], r['org_stance']), 'major_class'],
    axis=1,
).replace({'Attack': 'attack', 'Support': 'support'})
print(classification_report(pol_test_df['label'], pol_test_df['major_author']))

In [None]:
# Author combinations collapsed into "same author" vs. "different author"
# Very high accuracy is possible by only detecting whether the author is the same or a different one
data = data_stats_author.iloc[:-1].copy()  # every row except the last one
same_author = data['Author resp'] == data['Author org']
data['authors'] = np.where(same_author, 'Same', 'Different')
data = data.groupby('authors').sum().reset_index()
for column, statistic in (('major_acc', get_major_acc), ('major_class', get_major_class)):
    data[column] = data.apply(statistic, args=[['Attack','Support']], axis=1)
data[['authors', 'major_acc', 'Total pairs']]


In [None]:
# Sentiment of both org and response
# Attacks often have different sentiment, supports often have the same sentiment (node)
# Nothing meaningful for political
# The two tables are stacked and labelled with an outer 'node'/'political' key
pd.concat((data_stats_sent['bothdebate_test'],data_stats_sent['bothpolitical']), keys=['node', 'political'], sort=True)

In [None]:
# Sentiment analysis (nltk VADER)

# Responses only: in debate test, supporting arguments often have a positive sentiment
# Attacking arguments show nothing special
# The two tables are stacked and labelled with an outer 'node'/'political' key
pd.concat((data_stats_sent['respdebate_test'],data_stats_sent['resppolitical']), keys=['node', 'political'], sort=True)

In [None]:
# .... ?
# Major Class for every Org argument
# Major Class for every Resp argument (only political)

In [None]:
# Majority class per topic for the agreement dataset
# Work on a copy so the shared statistics table loaded at the top stays untouched
data = data_stats_topic['agreement'].copy()
data['major_acc'] = data.apply(get_major_acc, args=[['Agreement','Disagreement']], axis=1)
data['major_class'] = data.apply(get_major_class, args=[['Agreement','Disagreement']], axis=1)
data = data.set_index('Topic')
agreement_pro = processors['agreement']('both')
splits_data = np.array(agreement_pro.get_splits('../data'))

# Get the test data of all splits (column 3 of the splits array)
ag_test_df = pd.concat(splits_data[:,3])
# Baseline 1: predict the majority class of the pair's topic
ag_test_df['major_topic_pred'] = ag_test_df['topic'].apply(lambda r: data.loc[r, 'major_class']).replace({'Agreement': 'agreement', 'Disagreement': 'disagreement'})
print(classification_report(ag_test_df['label'], ag_test_df['major_topic_pred']))
# Baseline 2: always predict 'disagreement'
ag_test_df['major'] = 'disagreement'
print(classification_report(ag_test_df['label'], ag_test_df['major']))

# Sentiment baselines on the agreement test pairs
def _vader_label(text):
    """Return ``disc_pol`` of the VADER compound score of *text*."""
    return disc_pol(sid.polarity_scores(text)['compound'])


for text_column, polarity_column in (('org', 'org_polarity'), ('response', 'resp_polarity')):
    ag_test_df[polarity_column] = ag_test_df[text_column].apply(_vader_label)

# Baseline 1: same polarity of org and response -> agreement, otherwise disagreement
ag_test_df['sent_both_baseline'] = ag_test_df.apply(
    lambda pair: 'agreement' if pair['org_polarity'] == pair['resp_polarity'] else 'disagreement', axis=1)
# Baseline 2: negative response -> disagreement, otherwise agreement
ag_test_df['sent_resp_baseline'] = ag_test_df.apply(
    lambda pair: 'agreement' if pair['resp_polarity'] != 'negative' else 'disagreement', axis=1)

for baseline in ('sent_both_baseline', 'sent_resp_baseline'):
    print(classification_report(ag_test_df['label'], ag_test_df[baseline]))

# Results

## NoDE paper
![](https://i.imgur.com/1N94Gjq.png)

In [None]:
# NoDE accuracy with different parameters

# Fixed: input=both, seq_len=128, warmup_prop=0.1, seed=42
# Tested: model=base-uncased,large-uncased, epochs=3,4,5, batch_size=8,12,16, lr=2e-5, 3e-5, 5e-5
# Gradient accumulation: batch_size/4 for bert_large
# (in principle equivalent, in practice different because of rounding errors etc.)
eval_results = pd.read_csv('../pytorch/res/node_both_paper/eval_results.tsv', sep='\t')

# Some stats: mean, min, max, std
# Paper acc 0.67, best bert acc 0.74, mean (bert-base) 0.62, baselines ~0.6
# Aggregations are given by name: pandas deprecates NumPy callables here, and the
# result labels of np.min/np.max depend on the installed NumPy version.
print(eval_results['acc'].agg(['mean', 'min', 'max', 'std']))
# Somehow bert-large performs worse than bert-base
print(eval_results.groupby('_bert-model')['acc'].agg(['mean', 'min', 'max', 'std']))
print()
# Print settings of best result (idxmax returns an index label, hence .loc)
print(eval_results.loc[eval_results['acc'].idxmax()])

eval_results_grouped = eval_results.groupby(['_bert-model', '_num_epochs', '_batch_size','_gradient_acc' ,'_learning_rate' ])
# Named aggregation pins the column names ('amin'/'amax') used below and in the exported table
results = eval_results_grouped['acc'].agg(mean='mean', amin='min', amax='max', std='std', median='median')
# Standard error = std / sqrt(number of runs of the setting); the author reports 10 runs per setting
runs_per_setting = eval_results_grouped['acc'].count()
results['ste'] = results['std'] / np.sqrt(runs_per_setting)
# 0.95 confidence interval (assumes a Gaussian distribution)
results['interval'] = results['ste'] * 1.96
results['upper'] = results['mean'] + results['interval']
results['lower'] = results['mean'] - results['interval']


print("best mean", results.loc[results['mean'].idxmax()]) # Best mean result
print("best upper", results.loc[results['upper'].idxmax()]) # Best upper conf bound
print("best median", results.loc[results['median'].idxmax()]) # Best median result
print("best worse", results.loc[results['amin'].idxmax()]) # Best worst-case result
print("best run", results.loc[results['amax'].idxmax()]) # Best overall result (one run)
print("smallest std", results.loc[results['std'].idxmin()]) # Result with smallest std



# Build the table for the thesis
results = results.reset_index()
# Column names: underscores become hyphens and a leading hyphen is stripped
results = results.rename(columns={element: re.sub("^-", "", element.replace("_", "-")) for element in results.columns.tolist()})
results = results.replace({"bert-base-uncased": 'bbu', "bert-large-uncased": 'blu'})
# Report the effective batch size (batch size times gradient-accumulation steps)
results['batch-size'] = results['batch-size'] * results['gradient-acc']
results = results.drop(columns=["gradient-acc"])
print(results.columns)
# Export the first nine columns (settings plus mean/amin/amax/std/median)
results.iloc[:,:9].to_csv('../data/thesis/node_all_results.csv', index=False)

In [None]:
# Processor for the NoDE dataset, created with the 'both' argument
node_pro = processors['node']('both')

In [None]:
# Per-class precision/recall of all runs of the parameter setting with the best mean accuracy
_, node_test_df = node_pro.get_dev_examples('../data')
eval_preds = pd.read_csv('../pytorch/res/node_both_paper/eval_preds.csv')

# Named aggregation keeps the column names independent of the installed NumPy version
results = eval_results_grouped['acc'].agg(mean='mean', amin='min', amax='max', std='std', median='median')
best_setting = results['mean'].idxmax()
bmodel, bepochs, bb, bga, blr = best_setting
print(bmodel, bepochs, bb, blr)
# Take the runs of exactly this group, so the gradient-accumulation value is matched as well
best_pred_ps = eval_results_grouped.get_group(best_setting).index
print(best_pred_ps)

# Only predictions from the best setting: one column per run, next to the test pairs
# (the last column of eval_preds is left out)
res = pd.concat([node_test_df.reset_index(drop=True), eval_preds.iloc[best_pred_ps,:-1].transpose().reset_index(drop=True)], axis=1)
res = res.replace({0: 'attack', 1: 'support'})
re_pre_dict = {'Precision support' : [], 'Precision attack': [], 'Recall support': [], 'Recall attack': []}
for ind in best_pred_ps:
    # labels=[...] fixes the order of the returned arrays: support first, attack second
    pre, rec, _, _ = precision_recall_fscore_support(res['label'], res[ind], labels=['support', 'attack'])
    re_pre_dict['Precision support'].append(pre[0])
    re_pre_dict['Precision attack'].append(pre[1])
    re_pre_dict['Recall support'].append(rec[0])
    re_pre_dict['Recall attack'].append(rec[1])

re_pre_df = pd.DataFrame(re_pre_dict)
re_pre_df.agg(['mean', 'min', 'max', 'std', 'median'])

In [None]:
# Plot the distribution of acc for every parameter setting (one histogram per group)
# Some look Gaussian; maybe 10 runs are not enough
eval_results_grouped.hist(column='acc')
plt.tight_layout()
plt.show()

## Political Paper
![](https://i.imgur.com/yGlTYbd.png)
![](https://i.imgur.com/7yrDqQH.png)

In [None]:
# Political F1, cross-validation
# Fixed: input=both, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5

# Related/Unrelated
eval_results = pd.read_csv('../pytorch/res/pol_ru/eval_results.tsv', sep='\t')

# Some stats: mean, min, max, std
# Paper average F1 0.65, here average F1 0.67, baseline ?
# Aggregations are given by name: pandas deprecates passing NumPy callables to agg
setting_columns = ['_bert-model', '_num_epochs', '_batch_size','_gradient_acc' ,'_learning_rate', '_seed']
print(eval_results.groupby(setting_columns)['f1'].agg(['mean', 'min', 'max', 'std']))
eval_results

In [None]:
# Political related/unrelated: per-split precision, recall and F1
political_ru_pro = processors['political-ru']('both')
splits_data = np.array(political_ru_pro.get_splits('../data'))

# Get the test data of all splits (column 3 of the splits array)
pol_test_df = pd.concat(splits_data[:,3])
eval_preds = pd.read_csv('../pytorch/res/pol_ru/eval_preds.csv')

# The predictions file is appended to on every run (author's note), so only its
# first rows -- one per split -- are used.
n_splits = len(splits_data)
split_preds = []
re_pre_f1_dict = {'Precision related' : [], 'Precision unrelated': [],
                  'Recall related': [], 'Recall unrelated': [], 'F1 related': [], 'F1 unrelated': []}
scores_df = pd.DataFrame(re_pre_f1_dict)
for i, row in eval_preds.head(n_splits).iterrows():
    # Drop the cells holding setting strings (regex match on 'bert*') and the NaN padding
    is_setting = row.str.contains('bert*', na=False, regex=True)
    preds_split_i = pd.Series(row.values[~is_setting]).dropna()
    preds_split_i = preds_split_i.replace({0: 'related', 1: 'unrelated'})
    split_preds.append(preds_split_i)
    pre, rec, f1, support = precision_recall_fscore_support(splits_data[i,3]['label'], preds_split_i, labels=['related', 'unrelated'])
    # Same order as the columns of scores_df: precision, recall, F1 (related, unrelated each)
    scores_df.loc[i] = np.concatenate((pre, rec, f1))

# Series.append no longer exists in pandas 2.x; concatenate the per-split predictions once
preds = pd.concat(split_preds)
pol_test_df['preds'] = preds.values

print(classification_report(pol_test_df['label'], pol_test_df['preds'], labels=['related', 'unrelated']))

# Unweighted mean of the two per-class scores
scores_df['Average Precision'] = (scores_df['Precision related']+scores_df['Precision unrelated'])/2
scores_df['Average Recall'] = (scores_df['Recall related']+scores_df['Recall unrelated'])/2
scores_df['Average F1'] = (scores_df['F1 related']+scores_df['F1 unrelated'])/2

scores_df.agg(['mean', 'std', 'min', 'max'])

In [None]:
from sklearn.metrics import confusion_matrix

# Toy example: 50 positives followed by 50 negatives,
# with 48 / 40 of them predicted as positive respectively
y_true = np.concatenate([np.ones(50), np.zeros(50)])
y_pred = np.concatenate([np.ones(48), np.zeros(2), np.ones(40), np.zeros(10)])
print(y_pred)

# Print the matrix: as a bare expression in the middle of the cell it would not be shown
print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred))
print(precision_recall_fscore_support(y_true, y_pred, average='micro'))

# Sanity check for the result tables in the paper
# Result: the reported average precision and recall seem to be macro/weighted averages,
# while the reported F1 score seems to be the micro average (= accuracy)?
# This also fits table 3.
# For table 4 it could fit too, because there are actually 3 classes (attack/support/unrelated)
# and not the full table is shown: the unrelated samples are always wrong, which lowers
# the accuracy but not the other scores in the table.

In [None]:
# Political F1 cross-validation, attack/support
# Fixed: input=both, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5
eval_results = pd.read_csv('../pytorch/res/pol_as/eval_results.tsv', sep='\t')

# F1 statistics (mean, min, max, std) per hyperparameter setting
# Paper average F1 0.82, here average F1 0.73, baselines (author) ~0.85
setting_cols = ['_bert-model', '_num_epochs', '_batch_size', '_gradient_acc', '_learning_rate', '_seed']
f1_by_setting = eval_results.groupby(setting_cols)['f1']
print(f1_by_setting.agg([np.mean, np.min, np.max, np.std]))
eval_results.head()

In [None]:
# Political attack/support: per-fold and pooled cross-validation scores
political_as_pro = processors['political-as']('both')
splits_data = np.array(political_as_pro.get_splits('../data'))

# Get the test data (element 3 of every split) and the test predictions
pol_test_df = pd.concat(splits_data[:,3])
eval_preds = pd.read_csv('../pytorch/res/pol_as/eval_preds.csv')

# NOTE: eval_preds.iloc[:10,:-1].stack() no longer works here, because runs
# are appended to the output file; the rows are therefore parsed one by one.
re_pre_f1_dict = {'Precision attack' : [], 'Precision support': [],
                  'Recall attack': [], 'Recall support': [], 'F1 attack': [], 'F1 support': []}
scores_df = pd.DataFrame(re_pre_f1_dict)
fold_preds = []
for i, row in eval_preds.iterrows():
    # Drop the string cells matched by the pattern and the NaN padding of the row
    preds_split_i = pd.Series(row.values[~row.str.contains('bert*', na=False, regex=True)]).dropna()
    preds_split_i = preds_split_i.replace({0: 'attack', 1: 'support'})
    fold_preds.append(preds_split_i)
    pre, rec, f1, support = precision_recall_fscore_support(splits_data[i,3]['label'], preds_split_i, labels=['attack', 'support'])
    print(pre, rec, f1, support)
    # Column order of scores_df: precision per class, recall per class, F1 per class
    scores_df.loc[i] = np.concatenate((pre, rec, f1))
    # Only the rows up to index 9 are evaluated
    if i == 9:
        break

# Series.append was removed in pandas 2.0, so the folds are concatenated once
preds = pd.concat(fold_preds) if fold_preds else pd.Series(dtype=object)
pol_test_df['preds'] = preds.values

print(classification_report(pol_test_df['label'], pol_test_df['preds'], labels=['attack', 'support']))

# Unweighted mean over the two classes for every fold
scores_df['Average Precision'] = (scores_df['Precision attack']+scores_df['Precision support'])/2
scores_df['Average Recall'] = (scores_df['Recall attack']+scores_df['Recall support'])/2
scores_df['Average F1'] = (scores_df['F1 attack']+scores_df['F1 support'])/2

scores_df.agg([np.mean, np.std, np.min, np.max])

In [None]:
# Political F1 cross-validation, attack/support/unrelated
# Fixed: input=both, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5
eval_results = pd.read_csv('../pytorch/res/pol_asu/eval_results.tsv', sep='\t')

# F1 statistics (mean, min, max, std) per hyperparameter setting
# Paper only reported precision 0.57, here average F1 0.60
# Use some tricks to cope with class imbalance!
setting_cols = ['_bert-model', '_num_epochs', '_batch_size', '_gradient_acc', '_learning_rate', '_seed']
f1_by_setting = eval_results.groupby(setting_cols)['f1']
print(f1_by_setting.agg([np.mean, np.min, np.max, np.std]))
eval_results.head()

In [None]:
# Political attack/support/unrelated: per-fold and pooled cross-validation scores
political_asu_pro = processors['political-asu']('both')
splits_data = np.array(political_asu_pro.get_splits('../data'))

# Get the test data (element 3 of every split) and the test predictions
pol_test_df = pd.concat(splits_data[:,3])
eval_preds = pd.read_csv('../pytorch/res/pol_asu/eval_preds.csv')

# NOTE: eval_preds.iloc[:10,:-1].stack() no longer works here, because runs
# are appended to the output file; the rows are therefore parsed one by one.
re_pre_f1_dict = {'Precision attack' : [], 'Precision support': [], 'Precision unrelated': [],
                  'Recall attack': [], 'Recall support': [], 'Recall unrelated': [],
                  'F1 attack': [], 'F1 support': [], 'F1 unrelated': []}
scores_df = pd.DataFrame(re_pre_f1_dict)
fold_preds = []
for i, row in eval_preds.iterrows():
    # Drop the string cells matched by the pattern and the NaN padding of the row
    preds_split_i = pd.Series(row.values[~row.str.contains('bert*', na=False, regex=True)]).dropna()
    preds_split_i = preds_split_i.replace({0: 'attack', 1: 'support', 2: 'unrelated'})
    fold_preds.append(preds_split_i)
    pre, rec, f1, support = precision_recall_fscore_support(splits_data[i,3]['label'], preds_split_i, labels=['attack', 'support', 'unrelated'])
    print(pre, rec, f1, support)
    # Column order of scores_df: precision per class, recall per class, F1 per class
    scores_df.loc[i] = np.concatenate((pre, rec, f1))
    # Only the rows up to index 9 are evaluated
    if i == 9:
        break

# Series.append was removed in pandas 2.0, so the folds are concatenated once
preds = pd.concat(fold_preds) if fold_preds else pd.Series(dtype=object)
pol_test_df['preds'] = preds.values

print(classification_report(pol_test_df['label'], pol_test_df['preds'], labels=['attack', 'support', 'unrelated']))
# Constant 'unrelated' baseline, sized from the data instead of a hard-coded sample count
print(classification_report(pol_test_df['label'], ['unrelated']*len(pol_test_df), labels=['attack', 'support', 'unrelated']))


# Unweighted mean over the three classes for every fold
scores_df['Average Precision'] = (scores_df['Precision attack']+scores_df['Precision support']+scores_df['Precision unrelated'])/3
scores_df['Average Recall'] = (scores_df['Recall attack']+scores_df['Recall support']+scores_df['Recall unrelated'])/3
scores_df['Average F1'] = (scores_df['F1 attack']+scores_df['F1 support']+scores_df['F1 unrelated'])/3

scores_df.agg([np.mean, np.std, np.min, np.max])

## Agreement Paper
- Accuracy 74%

In [None]:
# Agreement cross-validation, agreement/disagreement
# Comparison with paper + baselines
# Fixed: input=both, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=3, batch_size=12, lr=2e-5
eval_results = pd.read_csv('../pytorch/res/agreement_new/eval_results.tsv', sep='\t')

# Accuracy statistics over all runs: mean, min, max, std
# Paper average acc 0.74, here average acc 0.61
# TODO: the non-cross-validation version had acc ~0.97! The parameters are probably bad,
# 2 epochs might not be enough (try again with a higher number of epochs)
acc_scores = eval_results['acc']
print(acc_scores.agg([np.mean, np.min, np.max, np.std]))
eval_results.head()

In [None]:
# Agreement/disagreement: per-fold scores and pooled confusion table
agreement_pro = processors['agreement']('both')
splits_data = np.array(agreement_pro.get_splits('../data'))

# Get the test data (element 3 of every split) and the test predictions
ag_test_df = pd.concat(splits_data[:,3])
eval_preds = pd.read_csv('../pytorch/res/agreement_new/eval_preds.csv')

re_pre_f1_dict = {'Precision agreement' : [], 'Precision disagreement': [],
                  'Recall agreement': [], 'Recall disagreement': [], 'F1 agreement': [], 'F1 disagreement': []}
scores_df = pd.DataFrame(re_pre_f1_dict)
fold_preds = []
for i, row in eval_preds.iterrows():
    # Drop the string cells matched by the pattern and the NaN padding of the row
    preds_split_i = pd.Series(row.values[~row.str.contains('bert*', na=False, regex=True)]).dropna()
    # Both numeric and string-typed class ids are mapped to label names
    preds_split_i = preds_split_i.replace({0: 'agreement', 1: 'disagreement','0': 'agreement', '1': 'disagreement'})

    fold_preds.append(preds_split_i)
    pre, rec, f1, support = precision_recall_fscore_support(splits_data[i,3]['label'], preds_split_i, labels=['agreement', 'disagreement'])
    # Column order of scores_df: precision per class, recall per class, F1 per class
    scores_df.loc[i] = np.concatenate((pre, rec, f1))
    # Only the rows up to index 9 are evaluated
    if i == 9:
        break

# Series.append was removed in pandas 2.0, so the folds are concatenated once
preds = pd.concat(fold_preds) if fold_preds else pd.Series(dtype=object)
ag_test_df['preds'] = preds.values
# Printed explicitly; only the last expression of a cell is displayed
print(scores_df.describe())
pd.crosstab(ag_test_df['preds'], ag_test_df['label'])

## Results analyzed

In [None]:
# NoDE: predictions of the best hyperparameter setting, broken down by topic and by original argument
eval_results = pd.read_csv('../pytorch/res/node_both_paper/eval_results.tsv', sep='\t')
eval_results_grouped = eval_results.groupby(['_bert-model', '_num_epochs', '_batch_size','_gradient_acc' ,'_learning_rate' ])
_, node_test_df = node_pro.get_dev_examples('../data')
eval_preds = pd.read_csv('../pytorch/res/node_both_paper/eval_preds.csv')

results = eval_results_grouped['acc'].agg([np.mean, np.min, np.max, np.std, np.median])
# The group key with the highest mean accuracy identifies the best setting
bmodel, bepochs, bb, bga, blr = results['mean'].idxmax()
print(bmodel, bepochs, bb, bga, blr)
# Select the runs of exactly that group; every grouping column (including
# _gradient_acc) has to match, otherwise runs of other settings leak in
best_pred_ps = eval_results.loc[
    (eval_results['_bert-model'] == bmodel)
    & (eval_results['_num_epochs'] == bepochs)
    & (eval_results['_batch_size'] == bb)
    & (eval_results['_gradient_acc'] == bga)
    & (eval_results['_learning_rate'] == blr)
].index
print(best_pred_ps)

# Only predictions from best setting (one column per run, last column of eval_preds dropped)
res = pd.concat([node_test_df.reset_index(drop=True), eval_preds.iloc[best_pred_ps,:-1].transpose().reset_index(drop=True)], axis=1)
# Rounded mean over the runs of the best setting
res['Mean prediction'] = res[list(best_pred_ps)].mean(axis=1).round().values
res = res.replace({0: 'attack', 1: 'support'})
res = res.rename(columns={'label': 'Label'})

# The topic table is computed once and exported for the thesis
preds_topics = pd.crosstab(res['topic'], [res['Label'],res['Mean prediction']])
preds_topics.to_csv('../data/thesis/node_topics_preds.csv', index=True)

preds_orgs = pd.crosstab(res['org'], [res['Label'],res['Mean prediction']])
preds_orgs['total'] = preds_orgs.sum(axis=1)
# TODO: how to visualize? E.g. count the number of same predictions and divide by total to show
# how important the response arguments are (exclude arguments with total 1)
# Compare this number to the correct predictions?
preds_orgs

In [None]:
# NoDE results with respect to topic
_, node_test_df = node_pro.get_dev_examples('../data')
eval_preds = pd.read_csv('../pytorch/res/node_both_paper/eval_preds.csv')

# Only predictions from bert-base: rows 27 onwards, last column dropped,
# transposed so that every run becomes one column
bert_base_preds = eval_preds.iloc[27:,:-1].transpose().reset_index(drop=True)
res = pd.concat([node_test_df.reset_index(drop=True), bert_base_preds], axis=1)
res = res.replace({0: 'attack', 1: 'support'})

# For now, only one run (run 51) is used
# There are errors in every topic; no clear trend that some topics are better or worse
# More false classifications of attack than of support (support is the major class)
# Could also look at several runs, or an average, etc.
pd.crosstab(index=res['topic'], columns=[res['label'], res[51]])

In [None]:
# Rounded mean prediction over all bert-base runs (rows 27 onwards, last column dropped)
bert_base_runs = eval_preds.iloc[27:,:-1]
res['mean_round'] = bert_base_runs.mean().round().values
res = res.replace({0: 'attack', 1: 'support'})
pd.crosstab(index=res['topic'], columns=[res['label'], res['mean_round']])

In [None]:
# All metrics can be recreated from the available data,
# e.g. classification reports or confusion matrices
from sklearn.metrics import classification_report, confusion_matrix

# Predictions of row 51 (last column dropped), mapped to label names
run_51_labels = eval_preds.iloc[51,:-1].replace({0: 'attack', 1: 'support'})

print(classification_report(y_true=res['label'], y_pred=run_51_labels))

print(confusion_matrix(res['label'], run_51_labels))

In [None]:
# Political related/unrelated results with respect to topic
political_ru_pro = processors['political-ru']('both')
splits_data = political_ru_pro.get_splits('../data')

# Get the test data (element 3 of every split) and the test predictions
pol_test_df = pd.concat(np.array(splits_data)[:,3])
eval_preds = pd.read_csv('../pytorch/res/pol_ru/eval_preds.csv')

# NOTE: eval_preds.iloc[:10,:-1].stack() no longer works here, because runs
# are appended to the output file; the rows are therefore parsed one by one.
fold_preds = []
for i, row in eval_preds.iterrows():
    # Drop the string cells matched by the pattern
    fold_preds.append(pd.Series(row.values[~row.str.contains('bert*', na=False, regex=True)]))
    # Only the rows up to index 9 are used
    if i == 9:
        break
# Series.append was removed in pandas 2.0, so the folds are concatenated once
preds = pd.concat(fold_preds) if fold_preds else pd.Series(dtype=object)
# Remove the NaN padding of the rows before casting to class ids
preds = preds.dropna()
preds = preds.astype(int)

pol_test_df['preds'] = preds.values
# NOTE(review): this replaces 0/1 in every column of the frame, not only in 'preds'
pol_test_df = pol_test_df.replace({0: 'related', 1: 'unrelated'})

for name, group in pol_test_df.groupby('topic'):
    print(name)
    print(classification_report(group['label'], group['preds'], labels=['related', 'unrelated']))


pd.crosstab(pol_test_df['topic'], [pol_test_df['label'],pol_test_df['preds']])

In [None]:
# Political related/unrelated results with respect to topic (topic-wise splits)
political_ru_pro = processors['political-ru-topics']('both')
splits_data = political_ru_pro.get_splits('../data')

# Get the test data (element 3 of every split) and the test predictions
pol_test_df = pd.concat(np.array(splits_data)[:,3])
# Explicit column names are passed and the first parsed row is dropped afterwards
eval_preds = pd.read_csv('../pytorch/res/pol_ru_topics/eval_preds.csv', names=list(range(0,315))).iloc[1:]

# NOTE: eval_preds.iloc[:10,:-1].stack() no longer works here, because runs
# are appended to the output file; the rows are therefore parsed one by one.
fold_preds = []
for i, row in eval_preds.iterrows():
    # Drop the string cells matched by the pattern
    fold_preds.append(pd.Series(row.values[~row.str.contains('bert*', na=False, regex=True)]))
    # The index starts at 1 after the iloc[1:] above, so rows 1..5 are used
    if i == 5:
        break
# Series.append was removed in pandas 2.0, so the folds are concatenated once
preds = pd.concat(fold_preds) if fold_preds else pd.Series(dtype=object)
# Remove the NaN padding of the rows before casting to class ids
preds = preds.dropna()
preds = preds.astype(int)

pol_test_df['preds'] = preds.values
# NOTE(review): this replaces 0/1 in every column of the frame, not only in 'preds'
pol_test_df = pol_test_df.replace({0: 'related', 1: 'unrelated'})

for name, group in pol_test_df.groupby('topic'):
    print(name)
    print(classification_report(group['label'], group['preds'], labels=['related', 'unrelated']))

print(classification_report(pol_test_df['label'], pol_test_df['preds']))

pd.crosstab(pol_test_df['topic'], [pol_test_df['label'],pol_test_df['preds']])

In [None]:
# Political attack/support results with respect to topic and to author stance
political_as_pro = processors['political-as']('both')
splits_data = political_as_pro.get_splits('../data')

# Get the test data (element 3 of every split) and the test predictions
pol_test_df = pd.concat(np.array(splits_data)[:,3])
eval_preds = pd.read_csv('../pytorch/res/pol_as/eval_preds.csv')

# NOTE: eval_preds.iloc[:10,:-1].stack() no longer works here, because runs
# are appended to the output file; the rows are therefore parsed one by one.
fold_preds = []
for i, row in eval_preds.iterrows():
    # Drop the string cells matched by the pattern
    fold_preds.append(pd.Series(row.values[~row.str.contains('bert*', na=False, regex=True)]))
    # Only the rows up to index 9 are used
    if i == 9:
        break
# Series.append was removed in pandas 2.0, so the folds are concatenated once
preds = pd.concat(fold_preds) if fold_preds else pd.Series(dtype=object)
# Remove the NaN padding of the rows before casting to class ids
preds = preds.dropna()
preds = preds.astype(int)

pol_test_df['preds'] = preds.values
# NOTE(review): this replaces 0/1 in every column of the frame, not only in 'preds'
pol_test_df = pol_test_df.replace({0: 'attack', 1: 'support'})

for name, group in pol_test_df.groupby('topic'):
    print(name)
    print(classification_report(group['label'], group['preds'], labels=['attack', 'support']))

print(classification_report(pol_test_df['label'], pol_test_df['preds']))

# 'same' when both arguments carry the same stance value, 'different' otherwise
pol_test_df['authors'] = pol_test_df.apply(lambda r: 'same' if r['org_stance'] == r['response_stance'] else 'different', axis=1)

for name, group in pol_test_df.groupby('authors'):
    print(name)
    print(classification_report(group['label'], group['preds'], labels=['attack', 'support']))

for name, group in pol_test_df.groupby(['response_stance', 'org_stance']):
    print(name)
    print(classification_report(group['label'], group['preds'], labels=['attack', 'support']))


# Printed explicitly; only the last expression of a cell is displayed
print(pd.crosstab(pol_test_df['topic'], [pol_test_df['label'],pol_test_df['preds']]))
# Attack pairs whose two stance values are equal
pol_test_df.loc[(pol_test_df['authors'] == 'same') & (pol_test_df['label'] == 'attack')][['org', 'response', 'preds', 'id']]

In [None]:
# Political attack/support results with respect to topic (topic-wise splits)
political_as_pro = processors['political-as-topics']('both')
splits_data = political_as_pro.get_splits('../data')

# Get the test data (element 3 of every split) and the test predictions
pol_test_df = pd.concat(np.array(splits_data)[:,3])
# Explicit column names are passed and the first parsed row is dropped afterwards
eval_preds = pd.read_csv('../pytorch/res/pol_as_topics/eval_preds.csv', names=list(range(0,315))).iloc[1:]

# NOTE: eval_preds.iloc[:10,:-1].stack() no longer works here, because runs
# are appended to the output file; the rows are therefore parsed one by one.
fold_preds = []
for i, row in eval_preds.iterrows():
    # Drop the string cells matched by the pattern
    fold_preds.append(pd.Series(row.values[~row.str.contains('bert*', na=False, regex=True)]))
    # The index starts at 1 after the iloc[1:] above, so rows 1..5 are used
    if i == 5:
        break
# Series.append was removed in pandas 2.0, so the folds are concatenated once
preds = pd.concat(fold_preds) if fold_preds else pd.Series(dtype=object)
# Remove the NaN padding of the rows before casting to class ids
preds = preds.dropna()
preds = preds.astype(int)

pol_test_df['preds'] = preds.values
# NOTE(review): this replaces 0/1 in every column of the frame, not only in 'preds'
pol_test_df = pol_test_df.replace({0: 'attack', 1: 'support'})

for name, group in pol_test_df.groupby('topic'):
    print(name)
    print(classification_report(group['label'], group['preds'], labels=['attack', 'support']))

print(classification_report(pol_test_df['label'], pol_test_df['preds']))

pd.crosstab(pol_test_df['topic'], [pol_test_df['label'],pol_test_df['preds']])

In [None]:
# Political results with respect to author: predictions vs. the stance pair of both arguments
pd.crosstab(
    index=pol_test_df['preds'],
    columns=[pol_test_df['org_stance'], pol_test_df['response_stance']],
)

In [None]:
# Complete political results: the test predictions of all folds pooled together
print(classification_report(pol_test_df['label'], pol_test_df['preds']))

print(confusion_matrix(pol_test_df['label'], pol_test_df['preds']))

In [None]:
# Results with respect to the same org / same resp: do they always get the same label?

# Same org (NoDE, run 51)
# One org does not always get the same prediction (but often)
pd.crosstab(index=res['org'], columns=res[51])

In [None]:
# Same org, political dataset
# TODO: aggregate to get some useful insights
# (and maybe do it for every fold individually, because otherwise we could always
# predict one label for an org in one fold and another label in another fold)
org_vs_pred = pd.crosstab(index=pol_test_df['org'], columns=pol_test_df['preds'])
org_vs_pred.head()

In [None]:
# Same response, political dataset
resp_vs_pred = pd.crosstab(index=pol_test_df['response'], columns=pol_test_df['preds'])
resp_vs_pred.head()

In [None]:
# Agreement results with respect to topic
ag_pro = processors['agreement']('both')
splits_data = ag_pro.get_splits('../data')

# Get the test data (element 3 of every split) and the test predictions
ag_test_df = pd.concat(np.array(splits_data)[:,3])
eval_preds = pd.read_csv('../pytorch/res/agreement_new/eval_preds.csv')

# NOTE: eval_preds.iloc[:10,:-1].stack() no longer works here, because runs
# are appended to the output file; the rows are therefore parsed one by one.
fold_preds = []
for i, row in eval_preds.iterrows():
    # Drop the string cells matched by the pattern
    fold_preds.append(pd.Series(row.values[~row.str.contains('bert*', na=False, regex=True)]))
    # Only the rows up to index 9 are used
    if i == 9:
        break
# Series.append was removed in pandas 2.0, so the folds are concatenated once
preds = pd.concat(fold_preds) if fold_preds else pd.Series(dtype=object)
# Remove the NaN padding of the rows before casting to class ids
preds = preds.dropna()
preds = preds.astype(int)

ag_test_df['preds'] = preds.values
# NOTE(review): this replaces 0/1 in every column of the frame, not only in 'preds'
ag_test_df = ag_test_df.replace({0: 'agreement', 1: 'disagreement'})

print(classification_report(ag_test_df['label'], ag_test_df['preds']))

# One row per topic, one column per (label, prediction) combination
results_topic = pd.crosstab(ag_test_df['topic'], [ag_test_df['label'], ag_test_df['preds']])
results_topic['total'] = results_topic.sum(axis=1)
# Share of correctly classified pairs per topic
results_topic['correctness'] = results_topic.apply(lambda r: (r['agreement', 'agreement']+r['disagreement', 'disagreement'])/r['total'], axis=1)
results = results_topic.sort_values(by='correctness')
results

In [None]:
# Agreement results with respect to topic (topic-wise train/test split)
ag_pro = processors['agreement-topics']('both')
_, ag_test_df = ag_pro.get_dev_examples('../data')

# Test predictions; after transposing, the last row is dropped
eval_preds = pd.read_csv('../pytorch/res/agreement_topics_new/eval_preds.csv')
transposed_preds = eval_preds.transpose()
ag_test_df['preds'] = transposed_preds.iloc[:-1].values
ag_test_df = ag_test_df.replace({0: 'agreement', 1: 'disagreement'})

print(classification_report(ag_test_df['label'], ag_test_df['preds']))

# One row per topic, one column per (label, prediction) combination
results_topic = pd.crosstab(index=ag_test_df['topic'], columns=[ag_test_df['label'], ag_test_df['preds']])
print(results_topic)
results_topic['total'] = results_topic.agg([np.sum], axis=1)


def _share_correct(topic_row):
    # Correctly classified pairs of both classes divided by the topic total
    hits = topic_row['agreement', 'agreement'] + topic_row['disagreement', 'disagreement']
    return hits / topic_row['total']


results_topic['correctness'] = results_topic.apply(_share_correct, axis=1)
results = results_topic.sort_values(by='correctness')
results

# Results additional
- NoDE + procon
- Pol as/ru Topic 5-CV
- Agreement train/test (topics independent + removed duplicates)
- test with only the orgs as input and with only the response as input
   - Arguments likely to be attacked/supported
   - Attackful/ing or supportful/ing arguments
- changed order org/resp

In [None]:
# NoDE + procon, attack/support
# Fixed: input=both, seq_len=128, warmup_prop=0.1, seed=42-(42+30)
# model=base-uncased, epochs=4-5, batch_size=12,16, lr=2e-5,3e-5
eval_results = pd.read_csv('../pytorch/res/node_both_procon/eval_results.tsv', sep='\t')

# Accuracy statistics (mean, min, max, std) per hyperparameter setting
# Best setting 0.67+-0.05 (without procon 0.69+-0.02): performance decreased!
# (more seeds?, procon too different from debatepedia, ...)
setting_cols = ['_bert-model', '_num_epochs', '_batch_size', '_gradient_acc', '_learning_rate']
results = eval_results.groupby(setting_cols)['acc'].agg([np.mean, np.min, np.max, np.std])
best_setting = results['mean'].idxmax()
print("best mean", results.loc[best_setting]) # Best mean result

# Prepare the table for the thesis: underscores become dashes and a leading dash is removed
results = results.reset_index()
results = results.rename(columns=lambda name: re.sub("^-", "", name.replace("_", "-")))
results = results.replace({"bert-base-uncased": 'bbu', "bert-large-uncased": 'blu'})
# Fold the gradient accumulation factor into the batch size, then drop it
results['batch-size'] = results['batch-size'].mul(results['gradient-acc'])
results = results.drop(columns=["gradient-acc"])
print(results.columns)
results.iloc[:,:9].to_csv('../data/thesis/node_procon_results.csv', index=False)
results

In [None]:
# Pol ru topics-cv, related/unrelated
# Fixed: input=both, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5
eval_results = pd.read_csv('../pytorch/res/pol_ru_topics/eval_results.tsv', sep='\t')

# F1 statistics (mean, min, max, std) per hyperparameter setting
# 0.59 +- 0.04 (original BERT 0.67+-0.03) -> performance decreased!
# The task is harder; the network might remember topics/args
setting_cols = ['_bert-model', '_num_epochs', '_batch_size', '_gradient_acc', '_learning_rate']
results = eval_results.groupby(setting_cols)['f1'].agg([np.mean, np.min, np.max, np.std])
results

In [None]:
# Pol as topics-cv, attack/support
# Fixed: input=both, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5
eval_results = pd.read_csv('../pytorch/res/pol_as_topics/eval_results.tsv', sep='\t')

# F1 statistics (mean, min, max, std) per hyperparameter setting
# 0.63 +- 0.07 (original BERT 0.76+-0.05 and paper 0.82) -> performance drastically decreased!
# The task is harder; in the normal version the network might remember a lot?
setting_cols = ['_bert-model', '_num_epochs', '_batch_size', '_gradient_acc', '_learning_rate']
results = eval_results.groupby(setting_cols)['f1'].agg([np.mean, np.min, np.max, np.std])
results

In [None]:
# Agreement topic train/test, agreement/disagreement
# Fixed: input=both, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=3, batch_size=12, lr=2e-5
eval_results = pd.read_csv('../pytorch/res/agreement_topics_new/eval_results.tsv', sep='\t')

# Accuracy statistics (mean, min, max, std) per hyperparameter setting
# TODO: Fix numbers!
# 0.75 (original BERT 0.97, paper: 0.74) -> performance drastically decreased!
# The original task is too easy, because of many duplicates + remembering the distribution over topics
setting_cols = ['_bert-model', '_num_epochs', '_batch_size', '_gradient_acc', '_learning_rate']
results = eval_results.groupby(setting_cols)['acc'].agg([np.mean, np.min, np.max, np.std])
results

In [None]:
# NoDE only org, attack/support
# Fixed: input=org, seq_len=128, warmup_prop=0.1, seed=42-(42+30)
# model=base-uncased, epochs=4-5, batch_size=12,16, lr=2e-5,3e-5
eval_results = pd.read_csv('../pytorch/res/node_org/eval_results.tsv', sep='\t')

# Accuracy statistics (mean, min, max, std) per hyperparameter setting
# Best setting 0.64+-0.02 (with org and resp 0.69+-0.02): performance decreased!
# (only org is not enough, but carries some meaning)
setting_cols = ['_bert-model', '_num_epochs', '_batch_size', '_gradient_acc', '_learning_rate']
results = eval_results.groupby(setting_cols)['acc'].agg([np.mean, np.min, np.max, np.std])
best_setting = results['mean'].idxmax()
print("best mean", results.loc[best_setting]) # Best mean result

# Prepare the table for the thesis: underscores become dashes and a leading dash is removed
results = results.reset_index()
results = results.rename(columns=lambda name: re.sub("^-", "", name.replace("_", "-")))
results = results.replace({"bert-base-uncased": 'bbu', "bert-large-uncased": 'blu'})
# Fold the gradient accumulation factor into the batch size, then drop it
results['batch-size'] = results['batch-size'].mul(results['gradient-acc'])
results = results.drop(columns=["gradient-acc"])
print(results.columns)
results.iloc[:,:9].to_csv('../data/thesis/node_onlyorg_results.csv', index=False)
results

In [None]:
# NoDE only resp, attack/support
# Fixed: input=resp, seq_len=128, warmup_prop=0.1, seed=42-(42+30)
# model=base-uncased, epochs=4-5, batch_size=12,16, lr=2e-5,3e-5
eval_results = pd.read_csv('../pytorch/res/node_resp/eval_results.tsv', sep='\t')

# Accuracy statistics (mean, min, max, std) per hyperparameter setting
# Best setting 0.58+-0.06 (with org and resp 0.69+-0.02): worse than just using org (unexpected!)
setting_cols = ['_bert-model', '_num_epochs', '_batch_size', '_gradient_acc', '_learning_rate']
results = eval_results.groupby(setting_cols)['acc'].agg([np.mean, np.min, np.max, np.std])
best_setting = results['mean'].idxmax()
print("best mean", results.loc[best_setting]) # Best mean result

# Prepare the table for the thesis: underscores become dashes and a leading dash is removed
results = results.reset_index()
results = results.rename(columns=lambda name: re.sub("^-", "", name.replace("_", "-")))
results = results.replace({"bert-base-uncased": 'bbu', "bert-large-uncased": 'blu'})
# Fold the gradient accumulation factor into the batch size, then drop it
results['batch-size'] = results['batch-size'].mul(results['gradient-acc'])
results = results.drop(columns=["gradient-acc"])
print(results.columns)
results.iloc[:,:9].to_csv('../data/thesis/node_onlyresp_results.csv', index=False)
results

In [None]:
# Pol ru only org, related/unrelated
# Fixed: input=org, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5
eval_results = pd.read_csv('../pytorch/res/pol_ru_org/eval_results.tsv', sep='\t')

# F1 statistics (mean, min, max, std) per hyperparameter setting
# 0.61 +- 0.06 (original BERT 0.67+-0.03) -> better than random
setting_cols = ['_bert-model', '_num_epochs', '_batch_size', '_gradient_acc', '_learning_rate']
results = eval_results.groupby(setting_cols)['f1'].agg([np.mean, np.min, np.max, np.std])
results

In [None]:
# Pol ru only resp, related/unrelated
# Fixed: input=resp, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5
eval_results = pd.read_csv('../pytorch/res/pol_ru_resp/eval_results.tsv', sep='\t')

# F1 statistics (mean, min, max, std) per hyperparameter setting
# 0.62 +- 0.05 (original BERT 0.67+-0.03) -> better than random;
# no significant difference between just using org or just using response
setting_cols = ['_bert-model', '_num_epochs', '_batch_size', '_gradient_acc', '_learning_rate']
results = eval_results.groupby(setting_cols)['f1'].agg([np.mean, np.min, np.max, np.std])
results

In [None]:
# Pol as only org, attack/support
# Fixed: input=org, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5
eval_results = pd.read_csv('../pytorch/res/pol_as_org/eval_results.tsv', sep='\t')

# F1 statistics (mean, min, max, std) per hyperparameter setting
# 0.55 +- 0.15 (original BERT 0.76+-0.05) -> very high std!, performance quite bad
setting_cols = ['_bert-model', '_num_epochs', '_batch_size', '_gradient_acc', '_learning_rate']
results = eval_results.groupby(setting_cols)['f1'].agg([np.mean, np.min, np.max, np.std])
results

In [None]:
# Pol as only resp, attack/support
# Fixed: input=resp, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5
eval_results = pd.read_csv('../pytorch/res/pol_as_resp/eval_results.tsv', sep='\t')

# F1 statistics (mean, min, max, std) per hyperparameter setting
# 0.62 +- 0.06 (original BERT 0.76+-0.05) -> similar to ru? better than just using org?
setting_cols = ['_bert-model', '_num_epochs', '_batch_size', '_gradient_acc', '_learning_rate']
results = eval_results.groupby(setting_cols)['f1'].agg([np.mean, np.min, np.max, np.std])
results

In [None]:
# NoDE input reversed, attack/support
# Fixed: input=resp-org, seq_len=128, warmup_prop=0.1, seed=42-(42+30)
# model=base-uncased, epochs=4-5, batch_size=12,16, lr=2e-5,3e-5
eval_results = pd.read_csv('../pytorch/res/node_both_reversed/eval_results.tsv', sep='\t')

# Accuracy statistics (mean, min, max, std) per hyperparameter setting
# Best setting 0.62+-0.05 (with org and resp 0.69+-0.02): the order of org/resp matters!
# (maybe because of NSP?)
setting_cols = ['_bert-model', '_num_epochs', '_batch_size', '_gradient_acc', '_learning_rate']
results = eval_results.groupby(setting_cols)['acc'].agg([np.mean, np.min, np.max, np.std])
best_setting = results['mean'].idxmax()
print("best mean", results.loc[best_setting]) # Best mean result

# Prepare the table for the thesis: underscores become dashes and a leading dash is removed
results = results.reset_index()
results = results.rename(columns=lambda name: re.sub("^-", "", name.replace("_", "-")))
results = results.replace({"bert-base-uncased": 'bbu', "bert-large-uncased": 'blu'})
# Fold the gradient accumulation factor into the batch size, then drop it
results['batch-size'] = results['batch-size'].mul(results['gradient-acc'])
results = results.drop(columns=["gradient-acc"])
print(results.columns)
results.iloc[:,:9].to_csv('../data/thesis/node_resporg_results.csv', index=False)
results

In [None]:
# Pol ru reversed, related/unrelated
# Fixed: input=resp-org, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5
eval_results = pd.read_csv('../pytorch/res/pol_ru_resporg/eval_results.tsv', sep='\t')

# F1 statistics (mean, min, max, std) per hyperparameter setting
# 0.68 +- 0.03 (original BERT 0.67+-0.03) -> better performance than the other ordering?!
setting_cols = ['_bert-model', '_num_epochs', '_batch_size', '_gradient_acc', '_learning_rate']
results = eval_results.groupby(setting_cols)['f1'].agg([np.mean, np.min, np.max, np.std])
results

In [None]:
# Pol as reversed, attack/support
# Fixed: input=resp-org, seq_len=256, warmup_prop=0.1, seed=42
# model=base-uncased, epochs=5, batch_size=12, lr=2e-5
eval_results = pd.read_csv('../pytorch/res/pol_as_resporg/eval_results.tsv', sep='\t')

# F1 statistics (mean, min, max, std) per hyperparameter setting
# 0.79 +- 0.06 (original BERT 0.76+-0.05) -> better performance than the other ordering?!
setting_cols = ['_bert-model', '_num_epochs', '_batch_size', '_gradient_acc', '_learning_rate']
results = eval_results.groupby(setting_cols)['f1'].agg([np.mean, np.min, np.max, np.std])
results

# Domain adaptation etc.
- TODO: do some domain adaptation etc.

In [None]:
# Train on one dataset, evaluate on another (without finetuning)

In [None]:
# With finetuning (reusing the classification layer)

In [None]:
# With finetuning + use a new classification layer 