Adding in preliminary support for some extensions to Scattertext.

Code for an easier solution to Issue #56 is present in `SpacyEntities`. Fixes to the TermDocMatrix modify-and-create-new functionality. The number of overlapping terms displayed can be limited through the `max_overlapping` argument. Custom background frequencies are easier.
JasonKessler · Apr 20, 2020 · da7ac7c · da7ac7c
1 parent 42646d7
commit da7ac7c
Show file tree

Hide file tree

Showing 11 changed files with 33 additions and 22 deletions.
diff --git a/demo_category_frequencies.py b/demo_category_frequencies.py
@@ -1,7 +1,6 @@
 import pandas as pd
 
 import scattertext as st
-import scattertext.interface.ProduceScattertextExplorer
 
 '''
 Sample genre frequencies from the Corpus of Contemporary American English via 
@@ -17,7 +16,7 @@
 
 term_cat_freq = st.TermCategoryFrequencies(df)
 
-html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
+html = st.produce_scattertext_explorer(
 	term_cat_freq,
 	category='SPOKEN',
 	category_name='Spoken',
@@ -42,7 +41,7 @@
 
 doc_term_cat_freq = st.TermCategoryFrequencies(df, document_category_df=document_df)
 
-html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
+html = st.produce_scattertext_explorer(
 	doc_term_cat_freq,
 	category='SPOKEN',
 	category_name='Spoken',

diff --git a/demo_compact.py b/demo_compact.py
@@ -1,5 +1,4 @@
 import scattertext as st
-import scattertext.interface.ProduceScattertextExplorer
 
 df = st.SampleCorpora.ConventionData2012.get_data().assign(
     parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)
@@ -9,7 +8,7 @@
     df, category_col='party', parsed_col='parse'
 ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
 
-html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
+html = st.produce_scattertext_explorer(
     corpus,
     category='democrat',
     category_name='Democratic',

diff --git a/demo_custom_topic_model.py b/demo_custom_topic_model.py
@@ -1,5 +1,4 @@
 import scattertext as st
-import scattertext.interface.ProduceScattertextExplorer
 
 convention_df = st.SampleCorpora.ConventionData2012.get_data()
 convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)
@@ -19,7 +18,7 @@
 	feats_from_spacy_doc=topic_feature_builder
 ).build()
 
-html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
+html = st.produce_scattertext_explorer(
 	topic_corpus,
 	category='democrat',
 	category_name='Democratic',

diff --git a/demo_dense_rank_difference.py b/demo_dense_rank_difference.py
@@ -1,5 +1,4 @@
 import scattertext as st
-import scattertext.interface.ProduceScattertextExplorer
 
 convention_df = st.SampleCorpora.ConventionData2012.get_data()
 corpus = (st.CorpusFromPandas(convention_df,
@@ -8,7 +7,7 @@
                               nlp=st.whitespace_nlp_with_sentences)
           .build().get_unigram_corpus())
 
-html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
+html = st.produce_scattertext_explorer(
     corpus,
     category='BARACK OBAMA',
     sort_by_dist=False,

diff --git a/demo_emoji.py b/demo_emoji.py
@@ -7,7 +7,6 @@
 import urllib.request
 
 import scattertext as st
-import scattertext.interface.ProduceScattertextExplorer
 from scattertext.termranking import OncePerDocFrequencyRanker
 
 try:
@@ -37,7 +36,7 @@
 	feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
 ).build()
 
-html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
+html = st.produce_scattertext_explorer(
 	corpus,
 	category='f',
 	category_name='Female',

diff --git a/demo_flashtext.py b/demo_flashtext.py
@@ -1,7 +1,6 @@
 from collections import Counter
 from flashtext import KeywordProcessor
 import scattertext as st
-import scattertext.interface.ProduceScattertextExplorer
 
 
 class FlashTextExtact(st.FeatsFromSpacyDoc):
@@ -48,7 +47,7 @@ def get_feats(self, doc):
 
 print(corpus.get_term_freq_df())
 
-html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
+html = st.produce_scattertext_explorer(
     corpus,
     category='democrat',
     category_name='Democratic',

diff --git a/demo_japanese.py b/demo_japanese.py
@@ -1,7 +1,6 @@
 import pandas as pd
 from urllib.request import urlopen
 import scattertext as st
-import scattertext.interface.ProduceScattertextExplorer
 
 
 def main():
@@ -15,7 +14,7 @@ def main():
 	corpus = st.CorpusFromParsedDocuments(df,
 	                                      category_col='title',
 	                                      parsed_col='text').build()
-	html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(corpus,
+	html = st.produce_scattertext_explorer(corpus,
                                                                                          category='Shisei',
                                                                                          category_name='Shisei',
                                                                                          not_category_name='Horadanshaku tabimiyage',

diff --git a/demo_names.py b/demo_names.py
@@ -0,0 +1,22 @@
+import scattertext as st
+
+# Load the ConventionData2012 sample data; add a 'parse' column via whitespace_nlp_with_sentences.
+df = st.SampleCorpora.ConventionData2012.get_data().assign(
+    parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)
+)
+
+corpus = st.CorpusFromParsedDocuments(
+    df, category_col='party', parsed_col='parse'
+).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
+
+html = st.produce_scattertext_explorer(
+    corpus,
+    category='democrat', category_name='Democratic', not_category_name='Republican',
+    minimum_term_frequency=0, pmi_threshold_coefficient=0,
+    width_in_pixels=1000, metadata=corpus.get_df()['speaker'],
+    transform=st.Scalers.dense_rank,
+    max_overlapping=3  # limits the number of overlapping terms displayed
+)
+with open('./demo_names.html', 'w', encoding='utf-8') as f:  # own file; do not clobber demo_compact.html
+    f.write(html)
+print('open ./demo_names.html in Chrome')
diff --git a/demo_nmf_topic_model.py b/demo_nmf_topic_model.py
@@ -3,7 +3,6 @@
 from sklearn.pipeline import Pipeline
 
 import scattertext as st
-import scattertext.interface.ProduceScattertextExplorer
 
 convention_df = st.SampleCorpora.ConventionData2012.get_data()
 convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)
@@ -30,7 +29,7 @@
 	feats_from_spacy_doc=topic_feature_builder
 ).build()
 
-html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
+html = st.produce_scattertext_explorer(
 	topic_corpus,
 	category='democrat',
 	category_name='Democratic',

diff --git a/demo_sentence_piece.py b/demo_sentence_piece.py
@@ -2,8 +2,6 @@
 import tempfile
 import sentencepiece as spm
 
-import scattertext.interface.ProduceScattertextExplorer
-
 convention_df = st.SampleCorpora.ConventionData2012.get_data()
 convention_df['parse'] = convention_df.text.apply(st.whitespace_nlp_with_sentences)
 
@@ -33,7 +31,7 @@ def train_sentence_piece_tokenizer(documents, vocab_size):
     feats_from_spacy_doc=st.FeatsFromSentencePiece(sp)
 ).build()
 
-html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
+html = st.produce_scattertext_explorer(
     corpus,
     category='democrat',
     category_name='Democratic',

diff --git a/demo_word_list_topic_model.py b/demo_word_list_topic_model.py
@@ -1,5 +1,4 @@
 import scattertext as st
-import scattertext.interface.ProduceScattertextExplorer
 from scattertext import RankDifference
 
 convention_df = st.SampleCorpora.ConventionData2012.get_data()
@@ -25,7 +24,7 @@
 	feats_from_spacy_doc=topic_feature_builder
 ).build()
 
-html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
+html = st.produce_scattertext_explorer(
 	topic_corpus,
 	category='democrat',
 	category_name='Democratic',