Skip to content

Commit

Permalink
Update to 0.0.2.48. Adding Mann Whitney.
Browse files Browse the repository at this point in the history
  • Loading branch information
JasonKessler committed Apr 24, 2019
1 parent 3f56dd4 commit 6ea7e64
Show file tree
Hide file tree
Showing 7 changed files with 259 additions and 18 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
[![Gitter Chat](https://img.shields.io/badge/GITTER-join%20chat-green.svg)](https://gitter.im/scattertext/Lobby)
[![Twitter Follow](https://img.shields.io/twitter/follow/espadrine.svg?style=social&label=Follow)](https://twitter.com/jasonkessler)

# Scattertext 0.0.2.47
# Scattertext 0.0.2.48

**Table of Contents**

Expand Down
50 changes: 50 additions & 0 deletions demo_alt_tokenization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import scattertext as st
import pandas as pd
import re


# Demo: build a semiotic square from a toy corpus whose documents are
# tokenized with a custom token-splitting regex instead of the default one.

# Nine tiny documents in three categories ('a', 'b', 'c'). The texts are full
# of apostrophes, hyphens, hash marks and digits so that the effect of the
# alternative tokenization is visible in the resulting terms.
data = [
    {'text': "I don't think you'll want to.", 'category': 'a'},
    {'text': "You'll have a didn't a-b #dfs .", 'category': 'a'},
    {'text': "You'll shoudn't #have a, didn't a-b #dfs .", 'category': 'a'},
    {'text': "Can't not get along to didn't.", 'category': 'b'},
    {'text': "Can't try aba-ba alo33ng to didn't.", 'category': 'b'},
    {'text': "Can't no't g'e't al33ong 3to5.", 'category': 'b'},
    {'text': "You haven't changed a b'it.", 'category': 'c'},
    {'text': "You haven't changed a b'it.", 'category': 'c'},
    {'text': "You haven't ch5ng3d a bit.", 'category': 'c'}
]

df = pd.DataFrame(data)

# Compile the splitter once rather than once per document; the pattern is a
# captured single space and is handed to the parser as `tok_splitter_re`.
tok_splitter_re = re.compile('( )')
df['parse'] = df.text.apply(
    lambda x: st.whitespace_nlp_with_sentences(x, tok_splitter_re=tok_splitter_re)
)
corpus = st.CorpusFromParsedDocuments(
    df, parsed_col='parse', category_col='category'
).build().get_unigram_corpus()

# Category 'a' is contrasted with 'b'; 'c' is the neutral category.
semiotic_square = st.SemioticSquare(
    corpus,
    category_a='a',
    category_b='b',
    neutral_categories=['c'],
    scorer=st.RankDifference(),
    labels={'not_a_and_not_b': 'Plot Descriptions',
            'a_and_b': 'Reviews',
            'a_and_not_b': 'Positive',
            'b_and_not_a': 'Negative',
            'a': '',
            'b': '',
            'not_a': '',
            'not_b': ''}
)

# NOTE: `pmi_filter_thresold` is spelled this way on purpose; it is the
# keyword name this script passes to the library and must not be "corrected".
html = st.produce_semiotic_square_explorer(semiotic_square,
                                           category_name='a',
                                           not_category_name='b',
                                           x_label='Fresh-Rotten',
                                           y_label='Plot-Review',
                                           num_terms_semiotic_square=20,
                                           minimum_term_frequency=0,
                                           pmi_filter_thresold=0,
                                           neutral_category_name='Plot Description')

fn = 'demo_alt_tokenization.html'
# Use a context manager so the file is flushed and closed deterministically.
with open(fn, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ' + fn + ' in Chrome or Firefox.')
54 changes: 54 additions & 0 deletions demo_mann_whitney.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import scattertext as st

'''
convention_df = st.SampleCorpora.ConventionData2012.get_data()
corpus = (st.CorpusFromPandas(convention_df,
category_col='party',
text_col='text',
nlp=st.whitespace_nlp_with_sentences)
.build()
.get_unigram_corpus())
term_scorer = st.MannWhitneyU(corpus).set_categories('democrat', ['republican'])
html = st.produce_frequency_explorer(
corpus,
category='democrat',
category_name='Democratic',
not_category_name='Republican',
y_label='Mann Whitney FDR-BH Z',
scores=term_scorer.get_score_df('fdr_bh').mwu_z,
metadata=convention_df['speaker'],
grey_threshold=0
)
file_name = 'demo_mann_whitney.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open %s in Chrome or Firefox.' % file_name)
'''

# Demo: score unigrams with the Mann-Whitney U scorer, contrasting the 'plot'
# category against 'fresh' and 'rotten', and plot the corrected z-scores
# against term frequency.
movie_df = st.SampleCorpora.RottenTomatoes.get_data()

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build()
corpus = corpus.get_unigram_corpus()

# 'fdr_bh' is the correction method handed to get_score_df.
score_df = (st.MannWhitneyU(corpus)
            .set_categories('plot', ['fresh', 'rotten'])
            .get_score_df('fdr_bh'))

# Sort once and show both ends of the ranking: the highest and the lowest
# z-scores.
ranked_df = score_df.sort_values(by='mwu_z', ascending=False)
print(ranked_df.head())
print(ranked_df.tail())

html = st.produce_frequency_explorer(
    corpus,
    category='plot',
    y_label='Mann Whitney FDR-BH Z',
    scores=score_df.mwu_z,
    grey_threshold=0
)

file_name = 'demo_mann_whitney.html'
# Use a context manager so the file is flushed and closed deterministically.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open %s in Chrome or Firefox.' % file_name)

38 changes: 38 additions & 0 deletions demo_two_axis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import scattertext as st

# Demo: two-axis plot in which both axes are Mann-Whitney z-scores.
# The x scores contrast 'fresh' with 'rotten'; the y scores contrast 'plot'
# with 'fresh' and 'rotten'.
movie_df = st.SampleCorpora.RottenTomatoes.get_data()

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build()
corpus = corpus.get_unigram_corpus()

# Multiple-comparison correction applied to both score frames. Benjamini-
# Hochberg ('fdr_bh') controls the false discovery rate, hence the name.
correction_method = 'fdr_bh'

x_score_df = (st.MannWhitneyU(corpus)
              .set_categories('fresh', ['rotten'])
              .get_score_df(correction_method))
y_score_df = (st.MannWhitneyU(corpus)
              .set_categories('plot', ['fresh', 'rotten'])
              .get_score_df(correction_method))

# Labels for the positions of the semiotic square drawn next to the plot.
labels = {'not_a_and_not_b': 'Reviews',
          'a_and_b': 'Plot Descriptions',
          'a_and_not_b': 'Negative',
          'b_and_not_a': 'Positive',
          'a': '',
          'b': '',
          'not_a': '',
          'not_b': ''}

# The statistic and p-value columns are the ones MannWhitneyU emits
# ('mwu_z' and 'mwu_p') rather than the Cohen's d defaults.
html = st.produce_two_axis_plot(corpus, x_score_df, y_score_df, 'fresh', 'plot',
                                x_tooltip_label='rotten-fresh',
                                y_tooltip_label='plot-review',
                                statistic_column='mwu_z',
                                p_value_column='mwu_p',
                                statistic_name='z',
                                semiotic_square_labels=labels)

fn = 'demo_two_axes.html'
# Use a context manager so the file is flushed and closed deterministically.
with open(fn, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ' + fn + ' in Chrome or Firefox.')
41 changes: 25 additions & 16 deletions scattertext/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import print_function

version = [0, 0, 2, 47]
version = [0, 0, 2, 48]
__version__ = '.'.join([str(e) for e in version])
import re
import warnings
Expand All @@ -19,6 +19,7 @@
from scattertext.termcompaction.AssociationCompactor import AssociationCompactor, TermCategoryRanker, \
AssociationCompactorByRank
from scattertext.termscoring.CohensD import CohensD, HedgesR
from scattertext.termscoring.MannWhitneyU import MannWhitneyU
from scattertext.diachronic.BubbleDiachronicVisualization import BubbleDiachronicVisualization
from scattertext.diachronic.DiachronicTermMiner import DiachronicTermMiner
from scattertext.characteristic.DenseRankCharacteristicness import DenseRankCharacteristicness
Expand Down Expand Up @@ -832,12 +833,13 @@ def round_to_1(x):
if use_term_significance:
kwargs['term_significance'] = term_scorer

color_func = '''(function(d) {
kwargs['y_label'] = kwargs.get('y_label', term_scorer.get_name())

kwargs['color_func'] = kwargs.get('color_func', '''(function(d) {
return (Math.abs(d.os) < %s)
? d3.interpolate(d3.rgb(230, 230, 230), d3.rgb(130, 130, 130))(Math.abs(d.os)/%s)
: d3.interpolateRdYlBu(d.y);
})''' % (grey_threshold, grey_threshold)

})''' % (grey_threshold, grey_threshold))

return produce_scattertext_explorer(corpus,
category=category,
Expand All @@ -853,10 +855,8 @@ def round_to_1(x):
rescale_y=y_axis_rescale,
sort_by_dist=False,
term_ranker=term_ranker,
color_func=color_func,
not_categories=not_categories,
x_label=kwargs.get('x_label', 'Log Frequency'),
y_label=kwargs.get('y_label', term_scorer.get_name()),
**kwargs)


Expand Down Expand Up @@ -1456,42 +1456,51 @@ def produce_two_axis_plot(corpus,
y_score_df,
x_label,
y_label,
effect_size_column='cohens_d',
statistic_column='cohens_d',
p_value_column='cohens_d_p',
statistic_name='d',
use_non_text_features=False,
pick_color=pick_color,
axis_scaler=scale_neg_1_to_1_with_zero_mean,
distance_measure=EuclideanDistance,
semiotic_square_labels=None,
x_tooltip_label=None,
y_tooltip_label=None,
**kwargs):
'''
:param corpus: Corpus
:param x_score_df: pd.DataFrame, contains effect_size_column, p_value_column. outputted by CohensDs
:param y_score_df: pd.DataFrame, contains effect_size_column, p_value_column. outputted by CohensDs
:param x_score_df: pd.DataFrame, contains statistic_column, p_value_column. e.g., as outputted by CohensD or MannWhitneyU
:param y_score_df: pd.DataFrame, contains statistic_column, p_value_column. e.g., as outputted by CohensD or MannWhitneyU
:param x_label: str
:param y_label: str
:param effect_size_column: str, column in x_score_df, y_score_df giving effect sizes, default cohens_d
:param statistic_column: str, column in x_score_df, y_score_df giving statistics, default cohens_d
:param p_value_column: str, column in x_score_df, y_score_df giving p-values, default cohens_d_p
:param statistic_name: str, name of the statistic as displayed in the tooltips, default d
:param use_non_text_features: bool, default False
:param pick_color: func, returns color, default is pick_color
:param axis_scaler: func, scaler default is scale_neg_1_to_1_with_zero_mean
:param distance_measure: DistanceMeasureBase, default EuclideanDistance
This is how parts of the square are populated
:param semiotic_square_labels: dict, semiotic square position labels
:param x_tooltip_label: str, if None, x_label
:param y_tooltip_label: str, if None, y_label
:param kwargs: dict, other arguments
:return: str, html
'''


if use_non_text_features:
terms = corpus.get_metadata()
else:
terms = corpus.get_terms()

axes = pd.DataFrame({'x': x_score_df.cohens_d, 'y': y_score_df.cohens_d}).loc[terms]
axes = pd.DataFrame({'x': x_score_df[statistic_column],
'y': y_score_df[statistic_column]}).loc[terms]
merged_scores = pd.merge(x_score_df, y_score_df, left_index=True, right_index=True).loc[terms]

x_tooltip_label = x_label if x_tooltip_label is None else x_tooltip_label
y_tooltip_label = y_label if y_tooltip_label is None else y_tooltip_label

def generate_term_metadata(term_struct):
x_p = term_struct[p_value_column + '_x']
y_p = term_struct[p_value_column + '_y']
Expand All @@ -1501,12 +1510,12 @@ def generate_term_metadata(term_struct):
y_p = term_struct[p_value_column + '_corr_y']
x_p = min(x_p, 1. - x_p)
y_p = min(y_p, 1. - y_p)
x_d = term_struct[effect_size_column + '_x']
y_d = term_struct[effect_size_column + '_y']
x_d = term_struct[statistic_column + '_x']
y_d = term_struct[statistic_column + '_y']

tooltip = '%s: d: %0.3f; p: %0.4f' % (x_label, x_d, x_p)
tooltip = '%s: %s: %0.3f; p: %0.4f' % (x_tooltip_label, statistic_name, x_d, x_p)
tooltip += '<br/>'
tooltip += '%s: d: %0.3f; p: %0.4f' % (y_label, y_d, y_p)
tooltip += '%s: %s: %0.3f; p: %0.4f' % (y_tooltip_label, statistic_name, y_d, y_p)
return {'tooltip': tooltip, 'color': pick_color(x_p, y_p, np.abs(x_d), np.abs(y_d))}

explanations = merged_scores.apply(generate_term_metadata, axis=1)
Expand Down
90 changes: 90 additions & 0 deletions scattertext/termscoring/MannWhitneyU.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import pandas as pd
import numpy as np
from scipy.stats import norm, mannwhitneyu, ranksums

from scattertext.termscoring.CorpusBasedTermScorer import CorpusBasedTermScorer


class MannWhitneyU(CorpusBasedTermScorer):
    '''
    Term scorer based on the one-sided Mann-Whitney U test.

    For every term, the row-normalized values of the category rows are
    compared with those of the non-category rows. The test is always run in
    the direction of the larger mean; the p-value, U statistic and z-score
    are then signed so that positive z-scores indicate association with the
    category and negative z-scores association with the other categories.

    Example:

    term_scorer = (MannWhitneyU(corpus).set_categories('Positive', ['Negative'], ['Plot']))
    html = st.produce_frequency_explorer(
        corpus,
        category='Positive',
        not_categories=['Negative'],
        neutral_categories=['Plot'],
        term_scorer=term_scorer,
        metadata=rdf['movie_name'],
        grey_threshold=0,
        show_neutral=True
    )
    file_name = 'rotten_fresh_mwu.html'
    open(file_name, 'wb').write(html.encode('utf-8'))
    IFrame(src=file_name, width=1300, height=700)
    '''

    def _set_scorer_args(self, **kwargs):
        # This scorer takes no scorer-specific arguments.
        pass

    def get_scores(self, *args):
        '''
        :param args: ignored
        :return: pd.Series, the `mwu_z` column of get_score_df() computed
            without any multiple-comparison correction
        '''
        return self.get_score_df()['mwu_z']

    @staticmethod
    def _one_sided_test(greater, lesser):
        '''
        Runs mannwhitneyu(greater, lesser, alternative='greater').

        When the reported p-value is exactly 0 or 1, norm.isf would return an
        infinite z-score. In that case the z-score is taken directly from the
        normal approximation of U (without tie correction) and the p-value is
        its one-sided upper-tail probability.

        :param greater: 1-d array, sample hypothesized to be larger
        :param lesser: 1-d array, sample hypothesized to be smaller
        :return: tuple (U statistic, p-value, z-score)
        '''
        mw = mannwhitneyu(greater, lesser, alternative='greater')
        # The result object is immutable, so work on local copies.
        statistic, pvalue = mw.statistic, float(mw.pvalue)
        if pvalue in (0, 1):
            # from https://stats.stackexchange.com/questions/116315/problem-with-mann-whitney-u-test-in-scipy
            n_greater, n_lesser = len(greater), len(lesser)
            m_u = n_greater * n_lesser / 2.
            sigma_u = np.sqrt(n_greater * n_lesser * (n_greater + n_lesser + 1) / 12.)
            z = float((statistic - m_u) / sigma_u)
            return statistic, float(norm.sf(z)), z
        return statistic, pvalue, float(norm.isf(pvalue))

    def get_score_df(self, correction_method=None):
        '''
        Computes Mann Whitney p-values and z-values for every term, optionally
        corrected for multiple comparisons. Falls back to the normal
        approximation when numerical limits are reached (p-value of exactly
        0 or 1).

        :param correction_method: str or None, passed as `method` to
            statsmodels.stats.multitest.multipletests. 'fdr_bh' is recommended.
            If None, no correction is applied.
        :return: pd.DataFrame indexed by term with columns
            `mwu` (signed U statistic), `mwu_p` (p-value; values below 0.5
            favor the category, values above 0.5 the other categories),
            `mwu_z` (z-score) and `valid` (False if the test could not be
            run for the term, in which case the other columns are 0).
            If correction_method is given, `mwu_p_corr` is added and `mwu_z`
            is recomputed from the corrected p-values.
        '''
        X = self._get_X().astype(np.float64)
        # Divide every row by its row sum (presumably turning per-document
        # counts into within-document proportions -- confirm against _get_X).
        X = X / X.sum(axis=1)
        cat_X, ncat_X = self._get_cat_and_ncat(X)

        # Transpose once so that row i holds the values of term i.
        cat_cols, ncat_cols = cat_X.T, ncat_X.T

        scores = []
        for i in range(cat_X.shape[1]):
            cat_list = cat_cols[i].A1
            ncat_list = ncat_cols[i].A1
            try:
                if cat_list.mean() > ncat_list.mean():
                    statistic, pvalue, z = self._one_sided_test(cat_list, ncat_list)
                    scores.append({'mwu': statistic, 'mwu_p': pvalue,
                                   'mwu_z': z, 'valid': True})
                else:
                    # Test the opposite direction and flip the sign of the
                    # statistic and z-score; the p-value is mirrored around
                    # 0.5 so that it stays comparable across terms.
                    statistic, pvalue, z = self._one_sided_test(ncat_list, cat_list)
                    scores.append({'mwu': -statistic, 'mwu_p': 1. - pvalue,
                                   'mwu_z': -z, 'valid': True})
            except ValueError:
                # mannwhitneyu rejected the input (e.g., older SciPy raises
                # when all values are identical); record a neutral, invalid row.
                scores.append({'mwu': 0, 'mwu_p': 0, 'mwu_z': 0, 'valid': False})

        score_df = pd.DataFrame(scores, index=self.corpus_.get_terms()).fillna(0)
        if correction_method is not None:
            from statsmodels.stats.multitest import multipletests
            valid_mask = score_df.valid
            valid_pvals = score_df.loc[valid_mask, 'mwu_p'].values
            # Correct the tail probability of each term, whichever side of
            # 0.5 its p-value falls on...
            tail_pvals = np.minimum(valid_pvals, 1. - valid_pvals)
            corrected = multipletests(tail_pvals, method=correction_method)[1]
            # ...then mirror the terms that favored the other categories back
            # to the upper half of the unit interval.
            favors_other = valid_pvals > 0.5
            corrected[favors_other] = 1. - corrected[favors_other]

            # Invalid terms receive the neutral p-value 0.5, i.e. z = 0.
            score_df['mwu_p_corr'] = 0.5
            score_df.loc[valid_mask, 'mwu_p_corr'] = corrected
            # NOTE(review): a corrected p-value of exactly 0 or 1 maps to an
            # infinite z-score here -- confirm downstream scalers cope.
            score_df['mwu_z'] = -norm.ppf(score_df['mwu_p_corr'])
        return score_df

    def get_name(self):
        '''
        :return: str, display name of the score
        '''
        return "Mann Whitney Z"
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from setuptools import setup, find_packages

setup(name='scattertext',
version='0.0.2.47',
version='0.0.2.48',
description='An NLP package to visualize interesting terms in text.',
url='https://github.com/JasonKessler/scattertext',
author='Jason Kessler',
Expand Down

0 comments on commit 6ea7e64

Please sign in to comment.