-
Notifications
You must be signed in to change notification settings - Fork 289
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Update to 0.0.2.48. Adding Mann Whitney.
- Loading branch information
1 parent
3f56dd4
commit 6ea7e64
Showing
7 changed files
with
259 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import re

import pandas as pd
import scattertext as st

# Toy corpus: three categories ('a', 'b', 'c') whose texts contain
# apostrophes, hyphens, digits and hash signs inside tokens.
data = [
    {'text': "I don't think you'll want to.", 'category': 'a'},
    {'text': "You'll have a didn't a-b #dfs .", 'category': 'a'},
    {'text': "You'll shoudn't #have a, didn't a-b #dfs .", 'category': 'a'},
    {'text': "Can't not get along to didn't.", 'category': 'b'},
    {'text': "Can't try aba-ba alo33ng to didn't.", 'category': 'b'},
    {'text': "Can't no't g'e't al33ong 3to5.", 'category': 'b'},
    {'text': "You haven't changed a b'it.", 'category': 'c'},
    {'text': "You haven't changed a b'it.", 'category': 'c'},
    {'text': "You haven't ch5ng3d a bit.", 'category': 'c'}
]

df = pd.DataFrame(data)

# Parse each document with a custom token-splitting regex (a single space)
# instead of the tokenizer's default splitter.
space_splitter = re.compile('( )')
df['parse'] = df.text.apply(
    lambda x: st.whitespace_nlp_with_sentences(x, tok_splitter_re=space_splitter)
)
corpus = st.CorpusFromParsedDocuments(
    df, parsed_col='parse', category_col='category'
).build().get_unigram_corpus()

semiotic_square = st.SemioticSquare(
    corpus,
    category_a='a',
    category_b='b',
    neutral_categories=['c'],
    scorer=st.RankDifference(),
    labels={'not_a_and_not_b': 'Plot Descriptions',
            'a_and_b': 'Reviews',
            'a_and_not_b': 'Positive',
            'b_and_not_a': 'Negative',
            'a': '',
            'b': '',
            'not_a': '',
            'not_b': ''}
)

# NOTE: `pmi_filter_thresold` is spelled exactly as in the original call;
# it is an API keyword, so it is left untouched here.
html = st.produce_semiotic_square_explorer(semiotic_square,
                                           category_name='a',
                                           not_category_name='b',
                                           x_label='Fresh-Rotten',
                                           y_label='Plot-Review',
                                           num_terms_semiotic_square=20,
                                           minimum_term_frequency=0,
                                           pmi_filter_thresold=0,
                                           neutral_category_name='Plot Description')

fn = 'demo_alt_tokenization.html'
# Use a context manager so the file is flushed and closed deterministically.
with open(fn, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ' + fn + ' in Chrome or Firefox.')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import scattertext as st

# Disabled alternative example (2012 convention speeches); kept for reference.
# It is a bare string expression and is never executed.
'''
convention_df = st.SampleCorpora.ConventionData2012.get_data()
corpus = (st.CorpusFromPandas(convention_df,
                              category_col='party',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences)
          .build()
          .get_unigram_corpus())
term_scorer = st.MannWhitneyU(corpus).set_categories('democrat', ['republican'])
html = st.produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    y_label='Mann Whitney FDR-BH Z',
    scores=term_scorer.get_score_df('fdr_bh').mwu_z,
    metadata=convention_df['speaker'],
    grey_threshold=0
)
file_name = 'demo_mann_whitney.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open %s in Chrome or Firefox.' % file_name)
'''

movie_df = st.SampleCorpora.RottenTomatoes.get_data()

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build()
corpus = corpus.get_unigram_corpus()

# Score 'plot' against 'fresh' and 'rotten', requesting the 'fdr_bh' correction.
score_df = st.MannWhitneyU(corpus).set_categories('plot', ['fresh', 'rotten']).get_score_df('fdr_bh')

# Sort once and show both ends of the ranking by corrected z-score.
ranked_df = score_df.sort_values(by='mwu_z', ascending=False)
print(ranked_df.head())
print(ranked_df.tail())

html = st.produce_frequency_explorer(
    corpus,
    category='plot',
    y_label='Mann Whitney FDR-BH Z',
    scores=score_df.mwu_z,
    grey_threshold=0
)

file_name = 'demo_mann_whitney.html'
# Use a context manager so the file is flushed and closed deterministically.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open %s in Chrome or Firefox.' % file_name)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build()
corpus = corpus.get_unigram_corpus()

# Multiple-comparison correction passed to get_score_df for both axes.
# NOTE: despite the variable name, 'fdr_bh' is a false-discovery-rate
# (Benjamini-Hochberg) correction rather than a family-wise error rate one.
fwer_method = 'fdr_bh'

# x-axis: 'fresh' scored against 'rotten'.
# y-axis: 'plot' scored against 'fresh' and 'rotten'.
x_score_df = st.MannWhitneyU(corpus).set_categories('fresh', ['rotten']).get_score_df(fwer_method)
y_score_df = st.MannWhitneyU(corpus).set_categories('plot', ['fresh', 'rotten']).get_score_df(fwer_method)

labels = {'not_a_and_not_b': 'Reviews',
          'a_and_b': 'Plot Descriptions',
          'a_and_not_b': 'Negative',
          'b_and_not_a': 'Positive',
          'a': '',
          'b': '',
          'not_a': '',
          'not_b': ''}

html = st.produce_two_axis_plot(corpus, x_score_df, y_score_df, 'fresh', 'plot',
                                x_tooltip_label='rotten-fresh',
                                y_tooltip_label='plot-review',
                                statistic_column='mwu_z',
                                p_value_column='mwu_p',
                                statistic_name='z',
                                semiotic_square_labels=labels)

fn = 'demo_two_axes.html'
# Use a context manager so the file is flushed and closed deterministically.
with open(fn, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ' + fn + ' in Chrome or Firefox.')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
import pandas as pd | ||
import numpy as np | ||
from scipy.stats import norm, mannwhitneyu, ranksums | ||
|
||
from scattertext.termscoring.CorpusBasedTermScorer import CorpusBasedTermScorer | ||
|
||
|
||
class MannWhitneyU(CorpusBasedTermScorer):
    '''
    Mann Whitney U test

    Scores each term by running a one-sided Mann-Whitney U test on its
    per-document normalized frequencies in the category documents versus
    the not-category documents.

    Example usage:

    term_scorer = (MannWhitneyU(corpus).set_categories('Positive', ['Negative'], ['Plot']))
    html = st.produce_frequency_explorer(
        corpus,
        category='Positive',
        not_categories=['Negative'],
        neutral_categories=['Plot'],
        term_scorer=term_scorer,
        metadata=rdf['movie_name'],
        grey_threshold=0,
        show_neutral=True
    )
    file_name = 'rotten_fresh_mwu.html'
    open(file_name, 'wb').write(html.encode('utf-8'))
    IFrame(src=file_name, width=1300, height=700)
    '''

    def _set_scorer_args(self, **kwargs):
        # This scorer takes no extra configuration.
        pass

    def get_scores(self, *args):
        '''
        :param args: ignored
        :return: the 'mwu_z' column of the uncorrected score data frame
        '''
        return self.get_score_df()['mwu_z']

    @staticmethod
    def _normal_apx(u, x, y):
        '''
        One-sided ("greater") p-value for U statistic `u` of sample `x`
        versus sample `y`, using the normal approximation without tie or
        continuity correction.
        '''
        # based on https://stats.stackexchange.com/questions/116315/problem-with-mann-whitney-u-test-in-scipy
        n_x, n_y = len(x), len(y)
        m_u = n_x * n_y / 2.
        sigma_u = np.sqrt(n_x * n_y * (n_x + n_y + 1) / 12.)
        z = (u - m_u) / sigma_u
        # Upper tail, because the test is run with alternative='greater'.
        return norm.sf(z)

    @classmethod
    def _greater_test(cls, larger, smaller):
        '''
        Runs mannwhitneyu(larger, smaller, alternative='greater') and returns
        (statistic, pvalue) as floats. When the reported p-value is exactly
        0 or 1, it is replaced by the normal approximation.
        '''
        mw = mannwhitneyu(larger, smaller, alternative='greater')
        # The result object is an immutable tuple, so work on local copies
        # instead of assigning to its attributes.
        statistic, pvalue = float(mw.statistic), float(mw.pvalue)
        if pvalue in (0, 1):
            pvalue = float(cls._normal_apx(statistic, larger, smaller))
        return statistic, pvalue

    def get_score_df(self, correction_method=None):
        '''
        Computes Mann Whitney corrected p, z-values.  Falls back to normal approximation when numerical limits are reached.

        :param correction_method: str or None, correction method from statsmodels.stats.multitest.multipletests
         'fdr_bh' is recommended.
        :return: pd.DataFrame indexed by term with columns 'mwu', 'mwu_p',
         'mwu_z' and 'valid'; when correction_method is given, 'mwu_p_corr'
         is added and 'mwu_z' is recomputed from it.
        '''
        X = self._get_X().astype(np.float64)
        # Normalize each row by its sum so terms are compared on relative frequency.
        X = X / X.sum(axis=1)
        cat_X, ncat_X = self._get_cat_and_ncat(X)

        cat_X_T, ncat_X_T = cat_X.T, ncat_X.T
        scores = []
        for i in range(cat_X.shape[1]):
            cat_list = cat_X_T[i].A1
            ncat_list = ncat_X_T[i].A1
            try:
                if cat_list.mean() > ncat_list.mean():
                    statistic, pvalue = self._greater_test(cat_list, ncat_list)
                    scores.append({'mwu': statistic,
                                   'mwu_p': pvalue,
                                   'mwu_z': norm.isf(pvalue),
                                   'valid': True})
                else:
                    # Test in the opposite direction, then mirror the result:
                    # p becomes 1 - p and the z-score changes sign.
                    statistic, pvalue = self._greater_test(ncat_list, cat_list)
                    scores.append({'mwu': -statistic,
                                   'mwu_p': 1 - pvalue,
                                   'mwu_z': -norm.isf(pvalue),
                                   'valid': True})
            except ValueError:
                # SciPy rejects degenerate input (e.g. empty samples, or all
                # values identical on older versions); flag the term as
                # invalid with neutral values.
                scores.append({'mwu': 0, 'mwu_p': 0, 'mwu_z': 0, 'valid': False})

        score_df = pd.DataFrame(scores, index=self.corpus_.get_terms()).fillna(0)
        if correction_method is not None:
            from statsmodels.stats.multitest import multipletests
            for method in ['mwu']:
                valid_pvals = score_df[score_df.valid].mwu_p
                # Fold p-values onto [0, 0.5] so both directions are corrected
                # as tail probabilities.
                valid_pvals_abs = np.min([valid_pvals, 1 - valid_pvals], axis=0)
                valid_pvals_abs_corr = multipletests(valid_pvals_abs, method=correction_method)[1]
                # Invalid terms keep 0.5, which maps to a z-score of 0 below.
                score_df[method + '_p_corr'] = 0.5

                # Unfold: p-values that were above 0.5 go back to the upper half.
                upper_half = (valid_pvals > 0.5).values
                valid_pvals_abs_corr[upper_half] = 1. - valid_pvals_abs_corr[upper_half]
                score_df.loc[score_df.valid, method + '_p_corr'] = valid_pvals_abs_corr
                score_df[method + '_z'] = -norm.ppf(score_df[method + '_p_corr'])
        return score_df

    def get_name(self):
        return "Mann Whitney Z"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters