Skip to content

Commit

Permalink
0.0.2.74 Update. Made with an eye toward R compatibility. See demo_de…
Browse files Browse the repository at this point in the history
…ltajsd.py for new features.
  • Loading branch information
JasonKessler committed Dec 14, 2020
1 parent b6750bf commit cccbed7
Show file tree
Hide file tree
Showing 8 changed files with 183 additions and 40 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
[![Gitter Chat](https://img.shields.io/badge/GITTER-join%20chat-green.svg)](https://gitter.im/scattertext/Lobby)
[![Twitter Follow](https://img.shields.io/twitter/follow/espadrine.svg?style=social&label=Follow)](https://twitter.com/jasonkessler)

# Scattertext 0.0.2.73
# Scattertext 0.0.2.74

A tool for finding distinguishing terms in corpora, and presenting them in an
interactive, HTML scatter plot. Points corresponding to terms are selectively labeled
Expand Down
25 changes: 22 additions & 3 deletions demo_deltajsd.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from scattertext.termcompaction.AssociationCompactor import JSDCompactor

from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer, RankDifference
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()
Expand All @@ -14,6 +14,20 @@
nlp=whitespace_nlp_with_sentences
).build().get_unigram_corpus().compact(JSDCompactor(1000))

# Build the per-term metadata table handed to the explorer as `term_metadata_df`.
# Starts from the corpus term-frequency table and adds three derived columns.
# NOTE(review): presumably a pandas DataFrame whose 'democrat' / 'republican'
# columns hold per-category term counts -- confirm against get_term_freq_df.
term_etc_df = corpus.get_term_freq_df('').assign(
# Dense rank of each term's value in the 'democrat' column.
DemocraticRank=lambda df: dense_rank(df['democrat']),
# Dense rank of each term's value in the 'republican' column.
RepublicanRank=lambda df: dense_rank(df['republican']),
# RankDifference score computed from the two category columns.
RankDiff=lambda df: RankDifference().get_scores(df['democrat'], df['republican']),
)

# JavaScript source (kept as a Python string) for a function that renders the
# summary HTML for a term: "Term: <term>" followed by one "<br>label: value"
# entry per (label, key) pair below, each read from x.etc.<key> and formatted
# with toFixed(5). The keys match the columns added to term_etc_df above.
get_custom_term_html = '(function(x) {return "Term: " + x.term + "<span class=topic_preview>"' + ' '.join(
f''' + "<br>{name}: " + x.etc.{key}.toFixed(5)'''
for name, key in
[('Democratic Rank', 'DemocraticRank'),
('Republican Rank', 'RepublicanRank'),
('Rank Difference Score', 'RankDiff')]
) + '+ "</span>" ;})'

html = produce_frequency_explorer(
corpus,
category='democrat',
Expand All @@ -25,8 +39,13 @@
metadata=convention_df['speaker'],
term_scorer=DeltaJSDivergence(),
transform=dense_rank,
term_metadata_df=corpus.get_term_freq_df(''),
enable_term_category_description=False
term_metadata_df=term_etc_df,
get_custom_term_html=get_custom_term_html,
enable_term_category_description=False,
header_names={'upper': 'Top Dem. RankDiff',
'lower': 'Top GOP RankDiff'},
header_sorting_algos={'upper': '(function(a, b) {return b.etc.RankDiff - a.etc.RankDiff})',
'lower': '(function(a, b) {return a.etc.RankDiff - b.etc.RankDiff})'}
)

# Write the generated explorer page to disk as UTF-8 bytes. The context
# manager guarantees the file handle is flushed and closed even if the write
# raises, instead of relying on garbage collection to close it.
with open('./demo_JSDivergence.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
Expand Down
20 changes: 20 additions & 0 deletions scattertext/Scalers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,26 @@
def scale(vec, terms=None, other_vec=None):
    """Min-max scale ``vec`` so its smallest value maps to 0 and its largest to 1.

    Parameters
    ----------
    vec : array-like exposing ``.min()`` / ``.max()`` and elementwise arithmetic
    terms : unused by this scaler
    other_vec : unused by this scaler

    Returns
    -------
    ``(vec - min) / (max - min)``, computed elementwise.
    """
    lowest = vec.min()
    span = vec.max() - lowest
    shifted = vec - lowest
    return shifted / span

def scale_jointly(x, y):
    """Scale two vectors onto [0, 1] with a shared scale, keeping zero at 0.5.

    Positive values are divided by the joint maximum of ``x`` and ``y`` and
    land in (0.5, 1]; non-positive values are divided by the joint minimum
    and land in [0, 0.5]. Because both vectors share the same extrema, their
    scaled values remain directly comparable.

    Parameters
    ----------
    x, y : array-like supporting elementwise arithmetic and comparison
        (e.g. ``numpy.ndarray``), of matching shape.

    Returns
    -------
    tuple
        ``(scaled_x, scaled_y)``.
    """
    joint = np.array([x, y])
    scale_min = np.min(joint)
    # Was np.min: positive values must be normalised by the joint maximum.
    scale_max = np.max(joint)

    # When one side of zero is empty its extremum is 0 (or has the wrong
    # sign); dividing by it would turn every element into NaN. In that case
    # the corresponding mask below is all-False (or only selects zeros), so
    # any non-zero stand-in divisor yields the correct contribution of 0.
    pos_scale = scale_max if scale_max > 0 else 1.0
    neg_scale = scale_min if scale_min < 0 else -1.0

    def _rescale(vec):
        # Map to [-1, 1] around zero, then shift/shrink onto [0, 1].
        signed = vec / pos_scale * (vec > 0) + (vec / neg_scale) * (vec <= 0) * -1
        return (signed + 1) * 0.5

    return _rescale(x), _rescale(y)

def rotate_degrees(x, y, degrees):
    """Rotate the points ``(x, y)`` by an angle given in degrees.

    Converts ``degrees`` to radians and delegates to ``rotate_radians``.
    Note that ``rotate_radians`` takes its coordinates in ``(y, x)`` order,
    so the arguments are swapped in the call.

    Returns whatever ``rotate_radians`` returns for the converted angle.
    """
    radians = np.pi * (degrees) / 180
    return rotate_radians(y, x, radians)

def rotate_radians(y, x, radians):
    """Rotate the points ``(x, y)`` by ``radians`` using the 2-D rotation formulas.

    Parameters
    ----------
    y : array-like of y coordinates (note: ``y`` is the FIRST parameter)
    x : array-like of x coordinates
    radians : rotation angle in radians

    Returns
    -------
    tuple
        ``(x * cos - y * sin, x * sin + y * cos)`` as numpy arrays, i.e. the
        rotated x coordinates followed by the rotated y coordinates.
    """
    ys = np.array(y)
    xs = np.array(x)
    rotated_x = xs * np.cos(radians) - ys * np.sin(radians)
    rotated_y = xs * np.sin(radians) + ys * np.cos(radians)
    return (rotated_x, rotated_y)


def scale_neg_1_to_1_with_zero_mean_abs_max(vec):
max_abs = max(vec.max(), -vec.min())
Expand Down
20 changes: 17 additions & 3 deletions scattertext/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from scattertext.termscoring.DeltaJSDivergence import DeltaJSDivergence

version = [0, 0, 2, 73]
version = [0, 0, 2, 74]
__version__ = '.'.join([str(e) for e in version])
import re
import numpy as np
Expand Down Expand Up @@ -216,6 +216,9 @@ def produce_scattertext_explorer(corpus,
sort_doc_labels_by_name=False,
enable_term_category_description=True,
always_jump=True,
get_custom_term_html=None,
header_names=None,
header_sorting_algos=None,
return_data=False,
return_scatterplot_structure=False):
'''Returns html code of visualization.
Expand Down Expand Up @@ -442,6 +445,14 @@ def produce_scattertext_explorer(corpus,
Always jump to term contexts if a term is clicked
enable_term_category_description: bool, default True
List term/metadata statistics under category
get_custom_term_html: str, default None
Javascript function which displays term summary from term info
header_names: Dict[str, str], default None
Dictionary giving names of term lists shown to the right of the plot. Valid keys are
upper, lower and right.
header_sorting_algos: Dict[str, str], default None
Dictionary giving javascript sorting algorithms for panes. Valid keys are upper, lower
and right. Value is a JS function which takes the "data" object.
return_data : bool default False
Return a dict containing the output of `ScatterChartExplorer.to_dict` instead of
an html.
Expand Down Expand Up @@ -534,7 +545,7 @@ def produce_scattertext_explorer(corpus,
neutral_categories=neutral_categories,
extra_categories=extra_categories,
background_scorer=characteristic_scorer,
include_term_category_counts=include_term_category_counts
include_term_category_counts=include_term_category_counts,
)

if return_data:
Expand Down Expand Up @@ -590,7 +601,10 @@ def produce_scattertext_explorer(corpus,
show_corpus_stats=show_corpus_stats,
sort_doc_labels_by_name=sort_doc_labels_by_name,
enable_term_category_description=enable_term_category_description,
always_jump=always_jump)
always_jump=always_jump,
get_custom_term_html=get_custom_term_html,
header_names=header_names,
header_sorting_algos=header_sorting_algos)

if return_scatterplot_structure:
return scatterplot_structure
Expand Down
73 changes: 52 additions & 21 deletions scattertext/data/viz/scripts/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ buildViz = function (d3) {
showDiagonal = false,
useGlobalScale = false,
enableTermCategoryDescription = true,
getCustomTermHtml = null,
headerNames = null,
headerSortingAlgos = null
) {
//var divName = 'd3-div-1';
// Set the dimensions of the canvas / graph
Expand Down Expand Up @@ -971,6 +974,12 @@ buildViz = function (d3) {
if ('metalists' in fullData && info.term in fullData.metalists) {
termHtml = 'Topic: <b>' + info.term + '</b>';
}
console.log("HERE")
console.log(getCustomTermHtml)
if(getCustomTermHtml !== null) {
console.log("Making custom html")
termHtml = getCustomTermHtml(info);
}
d3.select('#' + divName + '-' + 'termstats')
.append('div')
.attr("class", "snippet_header")
Expand Down Expand Up @@ -2134,28 +2143,34 @@ buildViz = function (d3) {
return euclideanDistanceSortForNotCategory;
}

function pickScoreSortAlgo(category) {
function pickScoreSortAlgo(isTopPane) {
console.log("PICK SCORE ALGO")
console.log(category)
if (category == true) {
console.log(isTopPane)
if (isTopPane === true) {
if (headerSortingAlgos !== null && headerSortingAlgos['upper'] !== undefined)
return headerSortingAlgos['upper'];
return scoreSortForCategory;
} else {
if(headerSortingAlgos !== null && headerSortingAlgos['lower'] !== undefined)
return headerSortingAlgos['lower'];
return scoreSortForNotCategory;
}

}

function pickTermSortingAlgorithm(category) {
if (sortByDist) return pickEuclideanDistanceSortAlgo(category);
return pickScoreSortAlgo(category);
function pickTermSortingAlgorithm(isUpperPane) {
if (sortByDist) return pickEuclideanDistanceSortAlgo(isUpperPane);
return pickScoreSortAlgo(isUpperPane);
}

function showAssociatedWordList(data, word, header, isAssociatedToCategory, length = 14) {
function showAssociatedWordList(data, word, header, isUpperPane, length = 14) {
var sortedData = null;
var sortingAlgo = pickTermSortingAlgorithm(isAssociatedToCategory);
var sortingAlgo = pickTermSortingAlgorithm(isUpperPane);
console.log("showAssociatedWordList"); console.log(header); console.log("WORD"); console.log(word)
sortedData = data.filter(term => (term.display === undefined || term.display === true)).sort(sortingAlgo);
if (wordVecMaxPValue) {
function signifTest(x) {
if (isAssociatedToCategory)
if (isUpperPane)
return x.p >= 1 - minPVal;
return x.p <= minPVal;
}
Expand Down Expand Up @@ -2204,8 +2219,8 @@ buildViz = function (d3) {
function showTopTermsPane(data,
registerFigureBBox,
showAssociatedWordList,
catName,
notCatName,
upperHeaderName,
lowerHeaderName,
startingOffset) {
data = data.filter(term => (term.display === undefined || term.display === true));
//var catHeader = showCatHeader(startingOffset, catName, registerFigureBBox);
Expand All @@ -2217,7 +2232,7 @@ buildViz = function (d3) {
.attr('font-size', '12px')
.attr('font-weight', 'bolder')
.attr('font-decoration', 'underline')
.text(catName
.text(upperHeaderName
//"Top " + fullData['info']['category_name']
);
registerFigureBBox(catHeader);
Expand All @@ -2226,7 +2241,7 @@ buildViz = function (d3) {
word = wordListData.word;
var maxWidth = wordListData.maxWidth;

var notCatHeader = showNotCatHeader(startingOffset, word, notCatName);
var notCatHeader = showNotCatHeader(startingOffset, word, lowerHeaderName);
word = notCatHeader;
characteristicXOffset = catHeader.node().getBBox().x + maxWidth + 10;

Expand All @@ -2244,17 +2259,26 @@ buildViz = function (d3) {

var payload = Object();
if (showTopTerms) {
var upperHeaderName = "Top " + fullData['info']['category_name'];
var lowerHeaderName = "Top " + fullData['info']['not_category_name'];
if(headerNames !== null) {
if(headerNames.upper !== undefined)
upperHeaderName = headerNames.upper;
if(headerNames.lower !== undefined)
lowerHeaderName = headerNames.lower;
}
payload.topTermsPane = showTopTermsPane(
data,
registerFigureBBox,
showAssociatedWordList,
"Top " + fullData['info']['category_name'],
"Top " + fullData['info']['not_category_name'],
upperHeaderName,
lowerHeaderName,
width
);
payload.showTopTermsPane = showTopTermsPane;
payload.showAssociatedWordList = showAssociatedWordList;
payload.showWordList = showWordList;

/*var wordListData = topTermsPane.wordListData;
var word = topTermsPane.word;
var maxWidth = topTermsPane.maxWidth;
Expand Down Expand Up @@ -2853,15 +2877,21 @@ buildViz = function (d3) {
this.showWordList = payload.showWordList;


this.showAssociatedWordList = function (data, word, header, isAssociatedToCategory, length = 14) {
this.showAssociatedWordList = function (
data,
word,
header,
isUpperPane,
length = 14
) {
var sortedData = null;
if (!isAssociatedToCategory) {
if (!isUpperPane) {
sortedData = data.map(x => x).sort((a, b) => scores[a.i] - scores[b.i])
} else {
sortedData = data.map(x => x).sort((a, b) => scores[b.i] - scores[a.i])
}
console.log('sortedData');
console.log(isAssociatedToCategory);
console.log(isUpperPane);
console.log(sortedData.slice(0, length))
console.log(payload)
console.log(word)
Expand Down Expand Up @@ -2988,15 +3018,16 @@ buildViz = function (d3) {
this.showWordList = payload.showWordList;


this.showAssociatedWordList = function (data, word, header, isAssociatedToCategory, length = 14) {
this.showAssociatedWordList = function (data, word, header, isUpperPane, length = 14) {
var sortedData = null;
if (!isAssociatedToCategory) {
if (!isUpperPane) {
sortedData = data.map(x => x).sort((a, b) => scores[a.i] - scores[b.i])
} else {
sortedData = data.map(x => x).sort((a, b) => scores[b.i] - scores[a.i])
}
console.log("HEADERHEADER222")
console.log('sortedData');
console.log(isAssociatedToCategory);
console.log(isUpperPane);
console.log(sortedData.slice(0, length))
console.log(payload)
console.log(word)
Expand Down
39 changes: 35 additions & 4 deletions scattertext/test/test_HTMLVisualizationAssembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ def get_params(self, param_dict={}):
'null', 'false', 'false',
'"' + DEFAULT_D3_AXIS_VALUE_FORMAT + '"',
'"' + DEFAULT_D3_AXIS_VALUE_FORMAT + '"',
'false', '-1', 'true', 'false', 'true', 'false', 'false', 'false', 'true']
'false', '-1', 'true', 'false', 'true', 'false', 'false', 'false', 'true', 'null', 'null', 'null']
for i, val in param_dict.items():
params[i] = val
return 'buildViz(' + ','.join(params) + ');'
return 'buildViz(' + ',\n'.join(params) + ');\n'

def make_assembler(self):
scatterplot_structure = ScatterplotStructure(self.make_adapter())
Expand Down Expand Up @@ -402,7 +402,6 @@ def test_max_overlapping(self):
.call_build_visualization_in_javascript())
self.assertEqual(params, self.get_params({44: '10'}))


def test_show_corpus_stats(self):
visualization_data = self.make_adapter()
params = (ScatterplotStructure(visualization_data, show_corpus_stats=False)
Expand Down Expand Up @@ -442,8 +441,40 @@ def test_use_global_scale(self):
.call_build_visualization_in_javascript())
self.assertEqual(params, self.get_params({50: 'true'}))

def test_use_global_scale(self):
def test_enable_term_category_description(self):
    # Passing enable_term_category_description=False must change only the
    # entry at index 51 of the expected buildViz argument list, to 'false'.
    structure = ScatterplotStructure(
        self.make_adapter(),
        enable_term_category_description=False
    )
    js_call = structure.call_build_visualization_in_javascript()
    expected_call = self.get_params({51: 'false'})
    self.assertEqual(js_call, expected_call)

def test_get_custom_term_html(self):
    # The JS source passed as get_custom_term_html is expected to appear
    # verbatim (unquoted) at index 52 of the buildViz argument list.
    adapter = self.make_adapter()
    custom_js = '(function(x) {return "Term: " + x.term})'
    structure = ScatterplotStructure(adapter, get_custom_term_html=custom_js)
    js_call = structure.call_build_visualization_in_javascript()
    self.assertEqual(js_call, self.get_params({52: custom_js}))

def test_header_names(self):
    # header_names is expected to be emitted as a JSON-style object with
    # quoted string values at index 53 of the buildViz argument list.
    adapter = self.make_adapter()
    names = dict(upper='Upper Header Name', lower='Lower Header Name')
    structure = ScatterplotStructure(adapter, header_names=names)
    js_call = structure.call_build_visualization_in_javascript()
    expected_call = self.get_params(
        {53: '''{"upper": "Upper Header Name", "lower": "Lower Header Name"}'''}
    )
    self.assertEqual(js_call, expected_call)

def test_header_sorting_algos(self):
    # header_sorting_algos values are JS comparator sources; the expected
    # output at index 54 embeds them unquoted, with "lower" listed before
    # "upper" even though the input dict is given in the opposite order.
    adapter = self.make_adapter()
    sorting_algos = dict(
        upper='(function(a, b) {return b.s - a.s})',
        lower='(function(a, b) {return a.s - b.s})',
    )
    structure = ScatterplotStructure(adapter, header_sorting_algos=sorting_algos)
    js_call = structure.call_build_visualization_in_javascript()
    expected_call = self.get_params(
        {54: '''{"lower": (function(a, b) {return a.s - b.s}), "upper": (function(a, b) {return b.s - a.s})}'''}
    )
    self.assertEqual(js_call, expected_call)
Loading

0 comments on commit cccbed7

Please sign in to comment.