Skip to content

Commit

Permalink
Adding in preliminary support for some extensions to Scattertext.
Browse files Browse the repository at this point in the history
Code for an easier solution to Issue #56 is present in `SpacyEntities`. Fixes to the TermDocMatrix modify-and-create-new functionality. Limit the number of overlapping terms displayed through the max_overlapping argument. Custom background frequencies are easier.
  • Loading branch information
JasonKessler committed Apr 20, 2020
1 parent 42646d7 commit da7ac7c
Show file tree
Hide file tree
Showing 11 changed files with 33 additions and 22 deletions.
5 changes: 2 additions & 3 deletions demo_category_frequencies.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import pandas as pd

import scattertext as st
import scattertext.interface.ProduceScattertextExplorer

'''
Sample genre frequencies from the Corpus of Contemporary American English via
Expand All @@ -17,7 +16,7 @@

term_cat_freq = st.TermCategoryFrequencies(df)

html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
html = st.produce_scattertext_explorer(
term_cat_freq,
category='SPOKEN',
category_name='Spoken',
Expand All @@ -42,7 +41,7 @@

doc_term_cat_freq = st.TermCategoryFrequencies(df, document_category_df=document_df)

html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
html = st.produce_scattertext_explorer(
doc_term_cat_freq,
category='SPOKEN',
category_name='Spoken',
Expand Down
3 changes: 1 addition & 2 deletions demo_compact.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import scattertext as st
import scattertext.interface.ProduceScattertextExplorer

df = st.SampleCorpora.ConventionData2012.get_data().assign(
parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)
Expand All @@ -9,7 +8,7 @@
df, category_col='party', parsed_col='parse'
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))

html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
html = st.produce_scattertext_explorer(
corpus,
category='democrat',
category_name='Democratic',
Expand Down
3 changes: 1 addition & 2 deletions demo_custom_topic_model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import scattertext as st
import scattertext.interface.ProduceScattertextExplorer

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)
Expand All @@ -19,7 +18,7 @@
feats_from_spacy_doc=topic_feature_builder
).build()

html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
html = st.produce_scattertext_explorer(
topic_corpus,
category='democrat',
category_name='Democratic',
Expand Down
3 changes: 1 addition & 2 deletions demo_dense_rank_difference.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import scattertext as st
import scattertext.interface.ProduceScattertextExplorer

convention_df = st.SampleCorpora.ConventionData2012.get_data()
corpus = (st.CorpusFromPandas(convention_df,
Expand All @@ -8,7 +7,7 @@
nlp=st.whitespace_nlp_with_sentences)
.build().get_unigram_corpus())

html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
html = st.produce_scattertext_explorer(
corpus,
category='BARACK OBAMA',
sort_by_dist=False,
Expand Down
3 changes: 1 addition & 2 deletions demo_emoji.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import urllib.request

import scattertext as st
import scattertext.interface.ProduceScattertextExplorer
from scattertext.termranking import OncePerDocFrequencyRanker

try:
Expand Down Expand Up @@ -37,7 +36,7 @@
feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
).build()

html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
html = st.produce_scattertext_explorer(
corpus,
category='f',
category_name='Female',
Expand Down
3 changes: 1 addition & 2 deletions demo_flashtext.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from collections import Counter
from flashtext import KeywordProcessor
import scattertext as st
import scattertext.interface.ProduceScattertextExplorer


class FlashTextExtact(st.FeatsFromSpacyDoc):
Expand Down Expand Up @@ -48,7 +47,7 @@ def get_feats(self, doc):

print(corpus.get_term_freq_df())

html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
html = st.produce_scattertext_explorer(
corpus,
category='democrat',
category_name='Democratic',
Expand Down
3 changes: 1 addition & 2 deletions demo_japanese.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import pandas as pd
from urllib.request import urlopen
import scattertext as st
import scattertext.interface.ProduceScattertextExplorer


def main():
Expand All @@ -15,7 +14,7 @@ def main():
corpus = st.CorpusFromParsedDocuments(df,
category_col='title',
parsed_col='text').build()
html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(corpus,
html = st.produce_scattertext_explorer(corpus,
category='Shisei',
category_name='Shisei',
not_category_name='Horadanshaku tabimiyage',
Expand Down
22 changes: 22 additions & 0 deletions demo_names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import scattertext as st

# Demo script: build a compacted unigram corpus from the 2012 convention
# sample data and render a Scattertext explorer, passing max_overlapping=3
# to limit the number of overlapping terms displayed.

# Add a 'parse' column by running the whitespace tokenizer over each text.
df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)
)

# Categories come from the 'party' column; parsed documents from 'parse'.
corpus = st.CorpusFromParsedDocuments(
    df, category_col='party', parsed_col='parse'
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))

html = st.produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0, pmi_threshold_coefficient=0,
    width_in_pixels=1000, metadata=corpus.get_df()['speaker'],
    transform=st.Scalers.dense_rank,
    max_overlapping=3
)

# NOTE(review): the output file is named demo_compact.html although this
# script is demo_names.py -- presumably carried over from another demo.
# The path is kept unchanged here; confirm before renaming.
# Use a context manager so the handle is flushed and closed deterministically
# instead of relying on garbage collection.
with open('./demo_compact.html', 'w') as out_file:
    out_file.write(html)
print('open ./demo_compact.html in Chrome')
3 changes: 1 addition & 2 deletions demo_nmf_topic_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from sklearn.pipeline import Pipeline

import scattertext as st
import scattertext.interface.ProduceScattertextExplorer

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)
Expand All @@ -30,7 +29,7 @@
feats_from_spacy_doc=topic_feature_builder
).build()

html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
html = st.produce_scattertext_explorer(
topic_corpus,
category='democrat',
category_name='Democratic',
Expand Down
4 changes: 1 addition & 3 deletions demo_sentence_piece.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
import tempfile
import sentencepiece as spm

import scattertext.interface.ProduceScattertextExplorer

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df.text.apply(st.whitespace_nlp_with_sentences)

Expand Down Expand Up @@ -33,7 +31,7 @@ def train_sentence_piece_tokenizer(documents, vocab_size):
feats_from_spacy_doc=st.FeatsFromSentencePiece(sp)
).build()

html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
html = st.produce_scattertext_explorer(
corpus,
category='democrat',
category_name='Democratic',
Expand Down
3 changes: 1 addition & 2 deletions demo_word_list_topic_model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import scattertext as st
import scattertext.interface.ProduceScattertextExplorer
from scattertext import RankDifference

convention_df = st.SampleCorpora.ConventionData2012.get_data()
Expand All @@ -25,7 +24,7 @@
feats_from_spacy_doc=topic_feature_builder
).build()

html = scattertext.interface.ProduceScattertextExplorer.produce_scattertext_explorer(
html = st.produce_scattertext_explorer(
topic_corpus,
category='democrat',
category_name='Democratic',
Expand Down

0 comments on commit da7ac7c

Please sign in to comment.