In [1]:
%matplotlib inline
import imp
from pprint import pprint

import mpld3
import numpy as np
import pandas as pd
import scipy.stats as stats

import texttoideas
import texttoideas as TI
from texttoideas.TermDocMatrix import TermDocMatrix, build_from_category_spacy_doc_iter
from texttoideas.RudderChart import *
mpld3.enable_notebook()

In [2]:
# Materialize the convention speeches up front so they can be sliced and
# iterated more than once; later cells unpack each item as (category, text).
speeches = list(
    TI.iter_party_convention_speech(
        convention_speech_iter=TI.convention_speech_iter()
    )
)

category democrat
# speeches 123
category republican
# speeches 66


In [3]:
# Peek at the first and last three speeches: category plus the first 100
# characters of the text, stripped of surrounding whitespace.
for sample in (speeches[:3], speeches[-3:]):
    pprint([[cat, str(text)[:100].strip()] for cat, text in sample])

[[u'democrat',
  'Thank you. Thank you. Thank you. Thank you so much.Thank you.Thank you so much. Thank you. Thank you'],
 [u'democrat',
  'Thank you so much. Tonight, I am so thrilled and so honored and so proud to introduce the love of my'],
 [u'democrat',
  'Thank you. It is a singular honor to be here tonight. Eight years ago in Boston, I introduced you to']]
[[u'republican',
  'To Chairman Priebus and to my fellow Americans who have traveled from every corner of this great cou'],
 [u'republican',
  'Absolutely. Thank you, Mr.Chairman.\nWelcome, everyone, to my home state of Florida, and we are so h'],
 [u'republican',
  "I am thrilled to add Utah's voice in support for Mitt Romney. Let me tell you about the America I kn"]]


In [4]:
# Re-execute the texttoideas package so that edits made to it during
# development are picked up without restarting the kernel.
# NOTE(review): imp.reload only re-runs the module object it is given; names
# imported earlier with `from texttoideas... import ...` keep pointing at the
# old objects. Confirm this reload is still wanted in the final notebook.
imp.reload(TI)
# Build the matrix object used by every later cell from the
# (category, text) pairs in `speeches`.
term_doc_mat = TI.build_from_category_spacy_doc_iter(speeches)

In [5]:
# Sanity check of Fisher's exact test on a single term ("romney"): is the term
# over-represented in the convention corpus relative to the background counts?
# The contingency table is derived from the data rather than from numbers
# copied out of an earlier run, so it stays correct if the corpus changes.
df = term_doc_mat.get_term_freq_df()

# Terms without a space are treated as unigrams; multi-word terms are left out
# of the totals.
unigrams = [term for term in df.index if ' ' not in term]

corpus_unigram_total = int(df.loc[unigrams].sum().sum())
romney_corpus = int(df.loc['romney'].sum())
print(corpus_unigram_total)
print(romney_corpus)

dfbg = term_doc_mat._get_corpus_joined_to_background()
# reindex (not .loc) because the labels come from a different frame: a label
# missing from dfbg becomes a NaN row that sum() skips, as the old .ix did.
print('%d' % (dfbg.reindex(unigrams).sum().sum()))
print(dfbg.loc['romney'])

romney_background = dfbg.loc['romney']['background']
# Rows: corpus, background.  Columns: count of "romney", count of everything else.
# NOTE(review): the corpus row totals unigrams only, while the background row
# totals every row of dfbg['background'] -- confirm that asymmetry is intended.
table = [[romney_corpus, corpus_unigram_total - romney_corpus],
         [romney_background, dfbg['background'].sum() - romney_background]]
print(table)
# One-sided test: is "romney" more frequent in the corpus than in the background?
print(stats.fisher_exact(table, alternative='greater'))



137966
570
369016983229
corpus           570
background    695398
Name: romney, dtype: float64
[[570, 137396], [695398.0, 588123524789.0]]
(3508.6163308626865, 0.0)


In [None]:
# Score every corpus term against the background counts with three different
# methods, and order each result so the rows of interest come first.

# Scaled F-score (no rescaling applied), highest score first.
scaled_f_score_df = term_doc_mat.get_scaled_f_score_scores_vs_background(scaler_algo='none')
scaled_f_score_df = scaled_f_score_df.sort_values(by='scaled_f_score', ascending=False)

# Rudder score, lowest score first.
rudder_df = term_doc_mat.get_rudder_scores_vs_background()
rudder_df = rudder_df.sort_values(by='rudder', ascending=True)

# Log posterior mean ratio, highest ratio first.
postratio_df = term_doc_mat.get_posterior_mean_ratio_scores_vs_background()
postratio_df = postratio_df.sort_values(by='Log Posterior Mean Ratio', ascending=False)


In [None]:
# Per-term Fisher scores of the corpus against the background counts.  Kept in
# its own cell so the ranking cells below can be re-run without recomputing it.
fisher_df = term_doc_mat.get_fisher_scores_vs_background()

In [None]:
# Rank terms by corrected p-value (smallest first); ties are broken by the
# larger odds ratio.
fisher_df = fisher_df.sort_values(
    by=['Bonferroni-corrected p-values', 'Odds ratio'],
    ascending=[True, False]
)

In [None]:
# Index labels (terms) of the 20 best-ranked rows: smallest corrected p-value
# first, larger odds ratio breaking ties.
fisher_ranked = fisher_df.sort_values(by=['Bonferroni-corrected p-values', 'Odds ratio'],
                                      ascending=[True, False])
top_20_fisher = fisher_ranked.index[:20]

## Different methods for comparing 2012 convention speeches to background unigram counts

In [None]:

def get_text(x):
    """Format one score-frame row as ``term (corpus count)``.

    ``x`` is a row whose ``name`` is the term and which has ``'background'``
    and ``'corpus'`` entries.  A ``*`` is appended to the term when its
    background count is exactly zero.
    """
    unseen_marker = '*' if x['background'] == 0 else ''
    return x.name + '%s (%d)' % (unseen_marker, x['corpus'])
# One column per scoring method, each holding the formatted labels of that
# method's first 20 rows.
ranked_frames = [
    ('Top F-Score Terms (Freq)', scaled_f_score_df),
    ('Top Rudder Terms (Freq)', rudder_df),
    ('Top Fisher Terms (Freq)', fisher_df),
]
comparison_to_background_corpora = pd.DataFrame(dict(
    (title, list(frame.iloc[:20].apply(get_text, axis=1)))
    for title, frame in ranked_frames
))
# Show only the first 12 rows of the table.
comparison_to_background_corpora[:12]


In [None]:

# Start from the raw term frequencies and attach one score column per
# (method, party) pair.  Column names follow '<method> dem' / '<method> rep'.
df = term_doc_mat.get_term_freq_df()
df['fscore dem'] = np.array(term_doc_mat.get_scaled_f_scores('democrat', scaler_algo='percentile'))
df['fscore rep'] = np.array(term_doc_mat.get_scaled_f_scores('republican', scaler_algo='percentile'))
df['rudder dem'] = term_doc_mat.get_rudder_scores('democrat')
df['rudder rep'] = term_doc_mat.get_rudder_scores('republican')
# Each logistic-regression call returns three values; only the first (the
# per-term coefficients) is stored.  `acc` and `bl` are overwritten on every
# call and hold the values from the last one.
df['logregl2 dem'], acc, bl = term_doc_mat.get_logistic_regression_coefs_l2('democrat')
df['logregl2 rep'], acc, bl = term_doc_mat.get_logistic_regression_coefs_l2('republican')
df['logregl1 dem'], acc, bl = term_doc_mat.get_logistic_regression_coefs_l1('democrat')
# The 'rep' column must be fit for the republican category, like the other
# 'rep' columns above.
df['logregl1 rep'], acc, bl = term_doc_mat.get_logistic_regression_coefs_l1('republican')


In [None]:
print('Democratic Associations')


def _democrat_term_label(row):
    """Format a row of `df` as 'term (democrat frequency)'."""
    return row.name + ' (%s)' % (int(row['democrat freq']))


def _top_democrat_terms(score_column, ascending=True, n=20):
    """Return the labels of the first `n` rows of `df` ordered by `score_column`."""
    # Slice before formatting so only the rows that are shown get formatted.
    top_rows = df.sort_values(by=score_column, ascending=ascending).iloc[:n]
    return list(top_rows.apply(_democrat_term_label, axis=1))


# The scaled F-score column is named 'fscore dem' where it is created; rudder
# scores are ranked ascending, the other two descending.
pd.DataFrame({
    'Rudder (Freq)': _top_democrat_terms('rudder dem'),
    'F-Score (Freq)': _top_democrat_terms('fscore dem', ascending=False),
    'Ridge (Freq)': _top_democrat_terms('logregl2 dem', ascending=False)
})


In [None]:
# Development aid: re-execute the RudderChart module so edits to it are picked
# up without restarting the kernel.
imp.reload(texttoideas.RudderChart)
rudder_chart = texttoideas.RudderChart.RudderChart(term_doc_mat,
                                                   jitter=0.1)
plt.rcParams['figure.figsize'] = (10, 10)

# Hand-picked terms to label on the chart.  Every entry needs its own comma:
# adjacent string literals are silently concatenated into a single term.
words_to_annotate = [
    'the', 'america forward', 'auto',
    'pell', 'affordable', 'fight for',
    'israel', 'tax cut', 'security',
    'michelle', 'last week',
    'big government',
    'olympics', 'ann',
    'small business',
    'success',
    'boy',
    'girl',
    'restore',
    'grandfather',
    'grandmother',
    'daughter',
    'god bless',
    'ideology',
    'mr. president',
    'reagan',
    'church',
    'trickle down',
    'roosevelt',
    '10 million',
    'vice presidential',
    'creator',
    'reformer',
    'unemployment',
]

# Only the hand-picked words are annotated (no automatic top-word labels).
drawn_df, fig_html = rudder_chart.draw('democrat',
                                       num_top_words_to_annotate=0,
                                       words_to_annotate=words_to_annotate)

# Save the returned HTML; the context manager guarantees the file is flushed
# and closed.
with open('fig.html', 'w') as fig_file:
    fig_file.write(fig_html)

In [None]:
# Replace the matrix's private category index store with a fresh IndexStore
# and look up the two categories in a fixed order.
# NOTE(review): presumably getidx assigns the next free index the first time a
# value is seen, so this pins 'democrat' before 'republican' -- confirm against
# IndexStore.  This reaches into a private attribute of term_doc_mat.
term_doc_mat._category_idx_store = texttoideas.IndexStore()
term_doc_mat._category_idx_store.getidx('democrat')
# Last expression of the cell: its return value is what the notebook displays.
term_doc_mat._category_idx_store.getidx('republican')
