In [6]:
import scattertext as st
import re
from pprint import pprint
import pandas as pd
import numpy as np
import spacy.en
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

In [3]:
nlp = spacy.en.English()

Data is from:

Habernal, Ivan, and Iryna Gurevych. "Which argument is more convincing? Analyzing and predicting convincingness of Web arguments using bidirectional LSTM." Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL). 2016.
(https://github.com/UKPLab/acl2016-convincing-arguments/)

In [58]:
file_names = ['ban-plastic-water-bottles_no-bad-for-the-economy.csv', 'ban-plastic-water-bottles_yes-emergencies-only.csv', 'christianity-or-atheism-_atheism.csv', 'christianity-or-atheism-_christianity.csv', 'evolution-vs-creation_creation.csv', 'evolution-vs-creation_evolution.csv', 'firefox-vs-internet-explorer_it-has-a-cute-logo-oh-and-extensions-err-add-ons.csv', 'firefox-vs-internet-explorer_there-s-more-browsers-than-the-ie-firefox-is-an-animal.csv', 'gay-marriage-right-or-wrong_allowing-gay-marriage-is-right.csv', 'gay-marriage-right-or-wrong_allowing-gay-marriage-is-wrong.csv', 'human-growth-and-development-should-parents-use-spanking-as-an-option-to-discipline-_no.csv', 'human-growth-and-development-should-parents-use-spanking-as-an-option-to-discipline-_yes.csv', 'if-your-spouse-committed-murder-and-he-or-she-confided-in-you-would-you-turn-them-in-_no.csv', 'if-your-spouse-committed-murder-and-he-or-she-confided-in-you-would-you-turn-them-in-_yes.csv', 'india-has-the-potential-to-lead-the-world-_no-against.csv', 'india-has-the-potential-to-lead-the-world-_yes-for.csv', 'is-it-better-to-have-a-lousy-father-or-to-be-fatherless-_fatherless.csv', 'is-it-better-to-have-a-lousy-father-or-to-be-fatherless-_lousy-father.csv', 'is-porn-wrong-_no-is-is-not.csv', 'is-porn-wrong-_yes-porn-is-wrong.csv', 'is-the-school-uniform-a-good-or-bad-idea-_bad.csv', 'is-the-school-uniform-a-good-or-bad-idea-_good.csv', 'personal-pursuit-or-advancing-the-common-good-_advancing-the-common-good.csv', 'personal-pursuit-or-advancing-the-common-good-_personal-pursuit.csv', 'pro-choice-vs-pro-life_pro-choice.csv', 'pro-choice-vs-pro-life_pro-life.csv', 'should-physical-education-be-mandatory-in-schools-_no-.csv', 'should-physical-education-be-mandatory-in-schools-_yes-.csv', 'tv-is-better-than-books_books.csv', 'tv-is-better-than-books_tv.csv', 'william-farquhar-ought-to-be-honoured-as-the-rightful-founder-of-singapore_no-it-is-raffles-.csv', 'william-farquhar-ought-to-be-honoured-as-the-rightful-founder-of-singapore_yes-of-course-.csv']
url_base = 'https://raw.githubusercontent.com/UKPLab/acl2016-convincing-arguments/master/data/UKPConvArg1-Ranking-CSV/'

dfs = []
for fn in file_names:
    handle = urllib.request.urlopen(url_base + fn)
    df = pd.read_csv(io.BytesIO(handle.read()), sep='\t')
    argument, stance = fn.split('_')
    df['topic'] = argument.replace('-', ' ').strip()
    df['stance'] = stance.replace('-', ' ').replace('.csv', '').strip()
    dfs.append(df)
df = pd.concat(dfs).sort_values(by='rank')
df = df.ix[df['argument'].drop_duplicates().index] # dropping duplicate arguments

In [62]:
df['metadata'] = df['topic'] + ': ' + df['argument'] + ' ('+df['rank'].astype(str)+')'

# Three of the best (lowest ranking) arguments

In [57]:
for _, row in df.iloc[:3].iterrows():
    print('Score:',row['rank'])
    print('Topic:',row['topic'])
    print('Stance:',row['stance'])
    print('Argument:')
    
    print('\t'+row['argument'])
    print()

Score: 0.0042899999999999995
Topic: is the school uniform a good or bad idea
Stance: bad
Argument:
	The school my mother works at, plus the school district my cousin's 3 children are in, are utilizing school uniforms. One reason is to "reduce bullying", which in reality, doesn't even address the problem concerning bullying. The only good it does is that it gets rid of or reduces students being bullied because they aren't wearing a specific clothing label that they dictate is the IN thing to wear. While it's a problem, all it does is sweep the one basic type of bullying under the rug. Kids will find other reasons to bully others. It also infringes upon their basic rights to be individuals and to express their individuality.

Score: 0.0042899999999999995
Topic: is the school uniform a good or bad idea
Stance: good
Argument:
	According to the legacy educational resources, as fashion and trends change, students become more concerned with how they look and how they are perceived than they d

# Three of the worst (highest ranking) arguments

In [60]:
for _, row in df.iloc[-3:].iterrows():
    print('Score:',row['rank'])
    print('Topic:',row['topic'])
    print('Stance:',row['stance'])
    print('Argument:')
    
    print('\t'+row['argument'])
    print()

Score: 0.32533
Topic: is the school uniform a good or bad idea
Stance: good
Argument:
	This is very. Bad as the uniforms are also cost effective

Score: 0.33296
Topic: tv is better than books
Stance: books
Argument:
	If those who have actually read a book, then there is really no debate.

Score: 0.35163
Topic: if your spouse committed murder and he or she confided in you would you turn them in
Stance: no
Argument:
	All I want to say to the spouses of the people on this side of the debate, "Don't confide in your spouse and tell them that you killed somebody; they'll turn you in! And if they find out on their own, take them out before they turn you in!" ;)



# Scattertext helper functions

In [None]:
def draw_corpus(df, corpus, category, category_publish_name, other_category, category_col, extra='', scores=None, singleScoreMode=False, minimum_term_frequency=2):
    html = st.produce_scattertext_explorer(corpus, 
                                           category=category, 
                                           category_name=category_publish_name, 
                                           not_category_name=other_category,
                                           pmi_filter_thresold=2,
                                           minimum_term_frequency=minimum_term_frequency,
                                           metadata=df['metadata'],
                                           scores=scores,
                                           width_in_pixels=1000,
                                           singleScoreMode=singleScoreMode)
    file_name = category.lower() + '-' + other_category.lower() + extra + '.html'
    open(file_name, 'wb').write(html.encode('utf-8'))
    return IFrame(src=file_name, width = 1200, height=1000)

def draw_plot(df, category, category_publish_name, other_category, category_col, extra=''):
    # Scattertext can only do a one column vs. all analysis.  We're excluding any other speakrs
    category_vs_other_df = df[(df[category_col] == category) | (df[category_col] == other_category)]
    corpus = st.CorpusFromPandas(category_vs_other_df, 
                                 category_col = category_col, 
                                 text_col = 'argument',
                                 nlp = nlp).build()
    return draw_corpus(category_vs_other_df, corpus, category, other_category, category_col, extra=extra)

## Let's see how the best tenth and worst tenth of arguments differ, and put this in the is_persusasive field

In [79]:
print('Argument count', len(df))
lower_bound, upper_bound = np.array(df['rank'].quantile([1./10,9./10]))
extreme_df = df[np.logical_or(df['rank'] <= lower_bound, df['rank'] >= upper_bound)]
extreme_df['is_persuasive'] = (extreme_df['rank'] <= lower_bound).apply(lambda x: 'yes' if x else 'no')
print('Extremely good or bad count', len(extreme_df))

Argument count 31668
Extremely good or bad count 6414


In [80]:
extreme_df.iloc[0]
draw_plot(extreme_df, 'yes', 'Unpersuasive', 'is_persusasive', extra=''):

#id                                                      arg251309
rank                                                       0.00429
argument         The school my mother works at, plus the school...
topic                     is the school uniform a good or bad idea
stance                                                         bad
metadata         is the school uniform a good or bad idea: The ...
is_persuasive                                                  yes
Name: 2, dtype: object