Skip to content

Commit

Permalink
Fixing CredTFIDF so that you can use it to visualize metadata (i.e…
Browse files Browse the repository at this point in the history
…., topics) in addition to terms. Added flashtext-based functionality to let you visualize topics whose entries contain spaces or otherwise consist of multiple tokens. This is in PhraseFeatsFromTopicModel.
  • Loading branch information
JasonKessler committed Jul 19, 2020
1 parent 8e68d81 commit c7b791b
Show file tree
Hide file tree
Showing 7 changed files with 126 additions and 30 deletions.
9 changes: 7 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
[![Gitter Chat](https://img.shields.io/badge/GITTER-join%20chat-green.svg)](https://gitter.im/scattertext/Lobby)
[![Twitter Follow](https://img.shields.io/twitter/follow/espadrine.svg?style=social&label=Follow)](https://twitter.com/jasonkessler)

# Scattertext 0.0.2.66
# Scattertext 0.0.2.67

A tool for finding distinguishing terms in corpora, and presenting them in an
interactive, HTML scatter plot. Points corresponding to terms are selectively labeled
Expand Down Expand Up @@ -95,7 +95,7 @@ with `word_similarity_explorer`, and the tokenization and sentence boundary dete
capabilities will be low-performance regular expressions. See `demo_without_spacy.py`
for an example.

It is recommended you install `jieba`, `spacy`, `empath`, `astropy`, `gensim` and `umap-learn` in order to
It is recommended you install `jieba`, `spacy`, `empath`, `astropy`, `flashtext`, `gensim` and `umap-learn` in order to
take full advantage of Scattertext.

Scattertext should mostly work with Python 2.7, but it may not.
Expand Down Expand Up @@ -1646,6 +1646,11 @@ topic_model = {
}
```
Note that these topics consist of terms which are *single* tokens. If you'd like to make use of terms which
consist of multiple tokens (e.g., have spaces in them), please use `PhraseFeatsFromTopicModel`
instead of `FeatsFromTopicModel` in the example below. Note that `PhraseFeatsFromTopicModel` requires the
[`flashtext`](https://github.com/vi3k6i5/flashtext) library to be installed.
We can use the `FeatsFromTopicModel` class to transform this topic model into one which
can be visualized using Scattertext. This is used just like any other feature builder,
and we pass the topic model object into `produce_scattertext_explorer`.
Expand Down
6 changes: 3 additions & 3 deletions scattertext/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from __future__ import print_function

from scattertext.diachronic.TimeStructure import TimeStructure

version = [0, 0, 2, 66]
version = [0, 0, 2, 67]
__version__ = '.'.join([str(e) for e in version])
import re
import numpy as np
Expand Down Expand Up @@ -117,6 +115,8 @@
from scattertext.representations.CategoryEmbeddings import CategoryEmbeddingsResolver, EmbeddingAligner
from scattertext.features.FeatsFromScoredLexicon import FeatsFromScoredLexicon
from scattertext.features.SpacyEntities import SpacyEntities
from scattertext.diachronic.TimeStructure import TimeStructure
from scattertext.features.PhraseFeatsFromTopicModel import PhraseFeatsFromTopicModel


def produce_scattertext_explorer(corpus,
Expand Down
61 changes: 38 additions & 23 deletions scattertext/features/FeatsFromTopicModel.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from abc import ABC, abstractmethod
from collections import Counter
from re import split
from sys import version_info
Expand All @@ -7,30 +8,10 @@
from scattertext.ScatterChart import check_topic_model_string_format
from scattertext.features.FeatsFromSpacyDoc import FeatsFromSpacyDoc


class FeatsFromTopicModel(FeatsFromSpacyDoc):
def __init__(self,
topic_model,
use_lemmas=False,
entity_types_to_censor=set(),
tag_types_to_censor=set(),
strip_final_period=False,
**kwargs):
'''
Parameters
----------
topic_model : dict
{topicmodelname: [term1, term2, ....], ...}
Other parameters from FeatsFromSpacyDoc.__init__
'''
check_topic_model_string_format(topic_model)
class FeatsFromTopicModelBase(ABC):
def __init__(self, topic_model):
self._topic_model = topic_model
self._lexicon_df = self._get_lexicon_df_from_topic_model(topic_model)
super(FeatsFromTopicModel, self).__init__(use_lemmas,
entity_types_to_censor,
tag_types_to_censor,
strip_final_period)

def _get_lexicon_df_from_topic_model(self, topic_model):
return (pd.DataFrame(pd.Series(topic_model)
Expand All @@ -42,7 +23,7 @@ def _get_lexicon_df_from_topic_model(self, topic_model):
.set_index('term'))

def _analyze(self, doc):
text_df = (pd.DataFrame(pd.Series(Counter(t for t in split(r"(\W)", doc.lower()) if t.strip())))
text_df = (pd.DataFrame(pd.Series(self._get_terms_from_doc(doc)))
.join(self._lexicon_df)
.dropna()
.groupby('cat')
Expand All @@ -58,6 +39,40 @@ def get_doc_metadata(self, doc, prefix=''):
feature_counter[prefix + category] = int(score)
return feature_counter

@abstractmethod
def _get_terms_from_doc(self, doc):
    '''
    Extract the terms found in a document; must be overridden by subclasses.

    The result is wrapped in a pandas Series by `_analyze` and joined against
    the term-indexed lexicon built from the topic model.

    Parameters
    ----------
    doc
        The document to extract terms from.
        NOTE(review): the implementations visible in this change treat it as
        a plain string -- confirm against callers.

    Returns
    -------
    Term counts; the visible implementations return a collections.Counter
    mapping term -> count.
    '''
    pass

class FeatsFromTopicModel(FeatsFromSpacyDoc, FeatsFromTopicModelBase):
def __init__(self,
             topic_model,
             use_lemmas=False,
             entity_types_to_censor=None,
             tag_types_to_censor=None,
             strip_final_period=False,
             **kwargs):
    '''
    Parameters
    ----------
    topic_model : dict
        {topicmodelname: [term1, term2, ....], ...}
    use_lemmas : bool
        Passed through to FeatsFromSpacyDoc.__init__.
    entity_types_to_censor : set, optional
        Passed through to FeatsFromSpacyDoc.__init__. None (the default)
        is replaced by a new empty set.
    tag_types_to_censor : set, optional
        Passed through to FeatsFromSpacyDoc.__init__. None (the default)
        is replaced by a new empty set.
    strip_final_period : bool
        Passed through to FeatsFromSpacyDoc.__init__.
    **kwargs
        Accepted for call-site compatibility; not used by this constructor.
    '''
    check_topic_model_string_format(topic_model)

    # Build the empty sets per call: a `set()` default in the signature is
    # created once and would be shared by every instance that relies on it.
    if entity_types_to_censor is None:
        entity_types_to_censor = set()
    if tag_types_to_censor is None:
        tag_types_to_censor = set()

    self._topic_model = topic_model
    self._lexicon_df = self._get_lexicon_df_from_topic_model(topic_model)
    super(FeatsFromTopicModel, self).__init__(use_lemmas,
                                              entity_types_to_censor,
                                              tag_types_to_censor,
                                              strip_final_period)




def _get_terms_from_doc(self, doc):
    '''
    Count the single-token terms occurring in a document.

    The document is lower-cased and split on every non-word character; the
    separators themselves are kept as tokens, and pieces that are empty or
    consist only of whitespace are discarded.

    Parameters
    ----------
    doc : str

    Returns
    -------
    Counter, token -> number of occurrences
    '''
    term_counts = Counter()
    for piece in split(r"(\W)", doc.lower()):
        if piece.strip():
            term_counts[piece] += 1
    return term_counts

def has_metadata_term_list(self):
    '''
    Report that this feature builder provides a metadata term list.

    Returns
    -------
    bool, always True
    '''
    return True

Expand Down
39 changes: 39 additions & 0 deletions scattertext/features/PhraseFeatsFromTopicModel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from collections import Counter
from functools import reduce

from scattertext import FeatsFromSpacyDoc
from scattertext.features.FeatsFromTopicModel import FeatsFromTopicModelBase


class PhraseFeatsFromTopicModel(FeatsFromTopicModelBase, FeatsFromSpacyDoc):
    '''
    This class allows you to make use of a topic model which has multi-token entries (i.e., terms in topics which
    have spaces in them.)
    It requires Flashtext to be installed.
    '''

    def __init__(self,
                 topic_model,
                 use_lemmas=False,
                 entity_types_to_censor=None,
                 entity_types_to_use=None,
                 tag_types_to_censor=None,
                 strip_final_period=False,
                 keyword_processor_args=None):
        '''
        Parameters
        ----------
        topic_model : dict
            {topicmodelname: [phrase1, phrase2, ....], ...}
        use_lemmas : bool
            Passed through to FeatsFromSpacyDoc.__init__.
        entity_types_to_censor : set, optional
            Passed through to FeatsFromSpacyDoc.__init__. None (the default)
            is replaced by a new empty set.
        entity_types_to_use : optional
            Accepted but not used by this constructor.
            NOTE(review): confirm whether it should be forwarded.
        tag_types_to_censor : set, optional
            Passed through to FeatsFromSpacyDoc.__init__. None (the default)
            is replaced by a new empty set.
        strip_final_period : bool
            Passed through to FeatsFromSpacyDoc.__init__.
        keyword_processor_args : dict, optional
            Keyword arguments for flashtext's KeywordProcessor. None (the
            default) means {'case_sensitive': False}.
        '''
        # Imported here rather than at module level so that flashtext is
        # only needed when this class is actually instantiated.
        from flashtext import KeywordProcessor

        # Build mutable defaults per call so no state is shared between
        # instances through the function signature.
        if entity_types_to_censor is None:
            entity_types_to_censor = set()
        if tag_types_to_censor is None:
            tag_types_to_censor = set()
        if keyword_processor_args is None:
            keyword_processor_args = {'case_sensitive': False}

        self._keyword_processor = KeywordProcessor(**keyword_processor_args)
        self._topic_model = topic_model
        # Register phrases topic by topic in the order given. This keeps
        # registration order reproducible and, unlike reducing over
        # topic_model.values(), does not raise when there are no topics.
        for phrases in topic_model.values():
            for keyphrase in phrases:
                self._keyword_processor.add_keyword(keyphrase)
        FeatsFromSpacyDoc.__init__(self, use_lemmas, entity_types_to_censor,
                                   tag_types_to_censor, strip_final_period)
        FeatsFromTopicModelBase.__init__(self, topic_model)

    def get_top_model_term_lists(self):
        '''
        Returns
        -------
        dict, the topic model passed to the constructor
        '''
        return self._topic_model

    def _get_terms_from_doc(self, doc):
        '''
        Count the topic-model phrases the keyword processor finds in a document.

        Parameters
        ----------
        doc : str

        Returns
        -------
        Counter, phrase -> number of occurrences
        '''
        return Counter(self._keyword_processor.extract_keywords(doc))

    def get_feats(self, doc):
        '''
        Parameters
        ----------
        doc : any object; it is converted with str() before matching

        Returns
        -------
        Counter, phrase -> number of occurrences
        '''
        # _get_terms_from_doc already returns a fresh Counter, so no copy is needed.
        return self._get_terms_from_doc(str(doc))
2 changes: 1 addition & 1 deletion scattertext/termscoring/CredTFIDF.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def _get_score_df_from_category_Xs(self, tf_i_d_neg, tf_i_d_pos):
'pos_cred_tfidf': pos_cred_tfidf,
'neg_cred_tfidf': neg_cred_tfidf,
'delta_cred_tf_idf': pos_cred_tfidf - neg_cred_tfidf
}, index=self.corpus_.get_terms())
}, index=self._get_index())
return score_df

def _set_scorer_args(self, **kwargs):
Expand Down
37 changes: 37 additions & 0 deletions scattertext/test/test_phraseFeatsFromTopicModel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from collections import Counter
from unittest import TestCase, mock
from unittest.mock import patch, MagicMock
import sys

from scattertext.features.PhraseFeatsFromTopicModel import PhraseFeatsFromTopicModel


class TestPhraseFeatsFromTopicModel(TestCase):
    '''
    Tests for PhraseFeatsFromTopicModel that run against a mocked flashtext
    module, so the real library does not need to be installed.
    '''

    def setUp(self):
        # Stand-in for the flashtext module: any KeywordProcessor(...) call
        # returns the same mock, whose extract_keywords yields fixed matches.
        flashtext = MagicMock()
        flashtext.KeywordProcessor().extract_keywords.return_value = ['A b', 'A b', 'C e F', 'B']

        # patch.dict restores sys.modules when the test finishes, so the mock
        # does not leak into other tests running in the same process.
        patcher = patch.dict(sys.modules, {'flashtext': flashtext})
        patcher.start()
        self.addCleanup(patcher.stop)

    def _make_feature_builder(self):
        # Built per test so each test gets its own topic model dict.
        return PhraseFeatsFromTopicModel(
            topic_model={'Topic A': ['A b', 'b', 'C e F'],
                         'Topic B': ['B', 'C e F']}
        )

    def test_get_doc_get_feats(self):
        expected = Counter({'A b': 2, 'C e F': 1, 'B': 1})

        actual = self._make_feature_builder().get_feats('A b A b C e F B')

        self.assertEqual(expected, actual)

    def test_get_doc_metadata(self):
        expected = Counter({'Topic A': 3, 'Topic B': 2})

        actual = self._make_feature_builder().get_doc_metadata('A b A b C e F B')

        self.assertEqual(expected, actual)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from setuptools import setup, find_packages

setup(name='scattertext',
version='0.0.2.66',
version='0.0.2.67',
description='An NLP package to visualize interesting terms in text.',
url='https://github.com/JasonKessler/scattertext',
author='Jason Kessler',
Expand Down

0 comments on commit c7b791b

Please sign in to comment.