Fixing CredTFIDF so that you can use it to visualize metadata (i.e., topics) in addition to terms.

Added flashtext-based functionality to let you visualize topics which have entries with spaces or otherwise contain multiple tokens. This is in PhraseFeatsFromTopicModel.
JasonKessler · Jul 19, 2020 · c7b791b · c7b791b
1 parent 8e68d81
commit c7b791b
Show file tree

Hide file tree

Showing 7 changed files with 126 additions and 30 deletions.
diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 [![Gitter Chat](https://img.shields.io/badge/GITTER-join%20chat-green.svg)](https://gitter.im/scattertext/Lobby)
 [![Twitter Follow](https://img.shields.io/twitter/follow/espadrine.svg?style=social&label=Follow)](https://twitter.com/jasonkessler)
 
-# Scattertext 0.0.2.66
+# Scattertext 0.0.2.67
 
 A tool for finding distinguishing terms in corpora, and presenting them in an 
 interactive, HTML scatter plot. Points corresponding to terms are selectively labeled
@@ -95,7 +95,7 @@ with `word_similarity_explorer`, and the tokenization and sentence boundary dete
 capabilities will be low-performance regular expressions. See `demo_without_spacy.py`
 for an example. 
 
-It is recommended you install `jieba`, `spacy`, `empath`, `astropy`, `gensim` and `umap-learn` in order to 
+It is recommended you install `jieba`, `spacy`, `empath`, `astropy`, `flashtext`, `gensim` and `umap-learn` in order to 
 take full advantage of Scattertext. 
 
 Scattertext should mostly work with Python 2.7, but it may not.  
@@ -1646,6 +1646,11 @@ topic_model = {
 }
 ```
 
+Note that these topics consist of terms which are *single* tokens. If you'd like to make use of terms which 
+consist of multiple tokens (e.g., have spaces in them), please use the `PhraseFeatsFromTopicModel` class
+instead of `FeatsFromTopicModel` in the example below. Note that `PhraseFeatsFromTopicModel` requires the 
+[`flashtext`](https://github.com/vi3k6i5/flashtext) library to be installed.  
+
 We can use the `FeatsFromTopicModel` class to transform this topic model into one which
 can be visualized using Scattertext. This is used just like any other feature builder,
 and we pass the topic model object into `produce_scattertext_explorer`.

diff --git a/scattertext/__init__.py b/scattertext/__init__.py
@@ -1,8 +1,6 @@
 from __future__ import print_function
 
-from scattertext.diachronic.TimeStructure import TimeStructure
-
-version = [0, 0, 2, 66]
+version = [0, 0, 2, 67]
 __version__ = '.'.join([str(e) for e in version])
 import re
 import numpy as np
@@ -117,6 +115,8 @@
 from scattertext.representations.CategoryEmbeddings import CategoryEmbeddingsResolver, EmbeddingAligner
 from scattertext.features.FeatsFromScoredLexicon import FeatsFromScoredLexicon
 from scattertext.features.SpacyEntities import SpacyEntities
+from scattertext.diachronic.TimeStructure import TimeStructure
+from scattertext.features.PhraseFeatsFromTopicModel import PhraseFeatsFromTopicModel
 
 
 def produce_scattertext_explorer(corpus,

diff --git a/scattertext/features/FeatsFromTopicModel.py b/scattertext/features/FeatsFromTopicModel.py
@@ -1,3 +1,4 @@
+from abc import ABC, abstractmethod
 from collections import Counter
 from re import split
 from sys import version_info
@@ -7,30 +8,10 @@
 from scattertext.ScatterChart import check_topic_model_string_format
 from scattertext.features.FeatsFromSpacyDoc import FeatsFromSpacyDoc
 
-
-class FeatsFromTopicModel(FeatsFromSpacyDoc):
-	def __init__(self,
-	             topic_model,
-	             use_lemmas=False,
-	             entity_types_to_censor=set(),
-	             tag_types_to_censor=set(),
-	             strip_final_period=False,
-	             **kwargs):
-		'''
-		Parameters
-		----------
-		topic_model : dict
-			{topicmodelname: [term1, term2, ....], ...}
-
-		Other parameters from FeatsFromSpacyDoc.__init__
-		'''
-		check_topic_model_string_format(topic_model)
+class FeatsFromTopicModelBase(ABC):
+	def __init__(self, topic_model):
 		self._topic_model = topic_model
 		self._lexicon_df = self._get_lexicon_df_from_topic_model(topic_model)
-		super(FeatsFromTopicModel, self).__init__(use_lemmas,
-		                                          entity_types_to_censor,
-		                                          tag_types_to_censor,
-		                                          strip_final_period)
 
 	def _get_lexicon_df_from_topic_model(self, topic_model):
 		return (pd.DataFrame(pd.Series(topic_model)
@@ -42,7 +23,7 @@ def _get_lexicon_df_from_topic_model(self, topic_model):
 		        .set_index('term'))
 
 	def _analyze(self, doc):
-		text_df = (pd.DataFrame(pd.Series(Counter(t for t in split(r"(\W)", doc.lower()) if t.strip())))
+		text_df = (pd.DataFrame(pd.Series(self._get_terms_from_doc(doc)))
 		           .join(self._lexicon_df)
 		           .dropna()
 		           .groupby('cat')
@@ -58,6 +39,40 @@ def get_doc_metadata(self, doc, prefix=''):
 			feature_counter[prefix + category] = int(score)
 		return feature_counter
 
+	@abstractmethod
+	def _get_terms_from_doc(self, doc):
+		pass
+
+class FeatsFromTopicModel(FeatsFromSpacyDoc, FeatsFromTopicModelBase):
+	def __init__(self,
+	             topic_model,
+	             use_lemmas=False,
+	             entity_types_to_censor=set(),
+	             tag_types_to_censor=set(),
+	             strip_final_period=False,
+	             **kwargs):
+		'''
+		Parameters
+		----------
+		topic_model : dict
+			{topicmodelname: [term1, term2, ....], ...}
+
+		Other parameters from FeatsFromSpacyDoc.__init__
+		'''
+		check_topic_model_string_format(topic_model)
+		self._topic_model = topic_model
+		self._lexicon_df = self._get_lexicon_df_from_topic_model(topic_model)
+		super(FeatsFromTopicModel, self).__init__(use_lemmas,
+		                                          entity_types_to_censor,
+		                                          tag_types_to_censor,
+		                                          strip_final_period)
+
+
+
+
+	def _get_terms_from_doc(self, doc):
+		return Counter(t for t in split(r"(\W)", doc.lower()) if t.strip())
+
 	def has_metadata_term_list(self):
 		return True
 

diff --git a/scattertext/features/PhraseFeatsFromTopicModel.py b/scattertext/features/PhraseFeatsFromTopicModel.py
@@ -0,0 +1,39 @@
+from collections import Counter
+from functools import reduce
+
+from scattertext import FeatsFromSpacyDoc
+from scattertext.features.FeatsFromTopicModel import FeatsFromTopicModelBase
+
+
+class PhraseFeatsFromTopicModel(FeatsFromTopicModelBase, FeatsFromSpacyDoc):
+    '''
+    This class allows you to make use of a topic model which has multi-token entries (i.e., terms in topics which
+    have spaces in them.)
+    It requires Flashtext to be installed.
+    '''
+    def __init__(self,
+                 topic_model,
+                 use_lemmas=False,
+                 entity_types_to_censor=set(),
+                 entity_types_to_use=None,
+                 tag_types_to_censor=set(),
+                 strip_final_period=False,
+                 keyword_processor_args = {'case_sensitive' :False}):
+        from flashtext import KeywordProcessor
+        self._keyword_processor = KeywordProcessor(**keyword_processor_args)
+        self._topic_model = topic_model
+        for keyphrase in reduce(lambda x, y: set(x) | set(y), topic_model.values()):
+            self._keyword_processor.add_keyword(keyphrase)
+        FeatsFromSpacyDoc.__init__(self, use_lemmas, entity_types_to_censor,
+                                   tag_types_to_censor, strip_final_period)
+        FeatsFromTopicModelBase.__init__(self, topic_model)
+
+
+    def get_top_model_term_lists(self):
+        return self._topic_model
+
+    def _get_terms_from_doc(self, doc):
+        return Counter(self._keyword_processor.extract_keywords(doc))
+
+    def get_feats(self, doc):
+        return Counter(self._get_terms_from_doc(str(doc)))
diff --git a/scattertext/termscoring/CredTFIDF.py b/scattertext/termscoring/CredTFIDF.py
@@ -103,7 +103,7 @@ def _get_score_df_from_category_Xs(self, tf_i_d_neg, tf_i_d_pos):
             'pos_cred_tfidf': pos_cred_tfidf,
             'neg_cred_tfidf': neg_cred_tfidf,
             'delta_cred_tf_idf': pos_cred_tfidf - neg_cred_tfidf
-        }, index=self.corpus_.get_terms())
+        }, index=self._get_index())
         return score_df
 
     def _set_scorer_args(self, **kwargs):

diff --git a/scattertext/test/test_phraseFeatsFromTopicModel.py b/scattertext/test/test_phraseFeatsFromTopicModel.py
@@ -0,0 +1,37 @@
+from collections import Counter
+from unittest import TestCase, mock
+from unittest.mock import patch, MagicMock
+import sys
+
+from scattertext.features.PhraseFeatsFromTopicModel import PhraseFeatsFromTopicModel
+
+
+class TestPhraseFeatsFromTopicModel(TestCase):
+    def test_get_doc_get_feats(self):
+        flashtext = MagicMock()
+        flashtext.KeywordProcessor().extract_keywords.return_value = ['A b', 'A b', 'C e F', 'B']
+        sys.modules["flashtext"] = flashtext
+
+        expected = Counter({'A b': 2, 'C e F': 1, 'B': 1})
+
+        actual = PhraseFeatsFromTopicModel(
+            topic_model={'Topic A': ['A b', 'b', 'C e F'],
+                         'Topic B': ['B', 'C e F']}
+        ).get_feats('A b A b C e F B')
+
+        self.assertEqual(expected, actual)
+
+
+    def test_get_doc_metadata(self):
+        flashtext = MagicMock()
+        flashtext.KeywordProcessor().extract_keywords.return_value = ['A b', 'A b', 'C e F', 'B']
+        sys.modules["flashtext"] = flashtext
+
+        expected = Counter({'Topic A': 3, 'Topic B': 2})
+
+        actual = PhraseFeatsFromTopicModel(
+            topic_model={'Topic A': ['A b', 'b', 'C e F'],
+                         'Topic B': ['B', 'C e F']}
+        ).get_doc_metadata('A b A b C e F B')
+
+        self.assertEqual(expected, actual)
diff --git a/setup.py b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages
 
 setup(name='scattertext',
-      version='0.0.2.66',
+      version='0.0.2.67',
       description='An NLP package to visualize interesting terms in text.',
       url='https://github.com/JasonKessler/scattertext',
       author='Jason Kessler',