# `geom_hex()`

In [1]:
from math import sqrt

import numpy as np
import pandas as pd

from lets_plot import *

In [2]:
# Set up Lets-Plot HTML output for this notebook.
# NOTE(review): presumably this must run before any plot cell is displayed — confirm against the Lets-Plot docs.
LetsPlot.setup_html()

In [3]:
# Small synthetic dataset: three isolated points plus a close pair around (10, 10).
# The 'w' column is used as the `weight` aesthetic in the "Aesthetics" section.
data = dict(
    x=[-10, -10, 10, 9, 11],
    y=[-10, 10, -10, 9, 11],
    w=[2, 4, 8, 1, 1],
)

In [4]:
# Three points at the vertices of an equilateral triangle with side 1,
# used with stat='identity'; 'g' is the value mapped to fill/color.
identity_data = dict(
    x=[-.5, .5, 0],
    y=[0, 0, sqrt(3) / 2],
    g=[1, 1, 2],
)

In [5]:
# Load the mpg dataset from the lets-plot-docs repository (requires network access).
df = pd.read_csv("https://raw.githubusercontent.com/JetBrains/lets-plot-docs/refs/heads/master/data/mpg.csv")
print(df.shape)
# Preview the first rows as the cell output.
df.head()

(234, 12)


Unnamed: 0.1,Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
0,1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
1,2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
2,3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
3,4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
4,5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


## Basic example with default stat

In [6]:
# geom_hex() with no arguments, i.e. with its default stat, on city vs highway mileage.
ggplot(df, aes("cty", "hwy")) + geom_hex()

## Basic example with `'identity'` stat

In [7]:
# Use the points of identity_data as given (stat='identity') and map fill to the 'g' column.
ggplot(identity_data, aes('x', 'y', fill='g')) + geom_hex(stat='identity')

## Aesthetics

In [8]:
# One panel per aesthetic/parameter of geom_hex(); the red points (or weight labels)
# mark the raw observations. Every panel title repeats the exact setting it demonstrates.
gggrid([
    ggplot(data, aes('x', 'y')) + geom_hex(binwidth=[20, 20]) + geom_point(color='red') + ggtitle("Default"),
    ggplot(data, aes('x', 'y')) + geom_hex(width=.5, binwidth=[20, 20]) + geom_point(color='red') + ggtitle("width=.5"),
    ggplot(data, aes('x', 'y')) + geom_hex(height=0.5, binwidth=[20, 20]) + geom_point(color='red') + ggtitle("height=0.5"),
    ggplot(data, aes('x', 'y')) + geom_hex(aes(weight='w'), binwidth=[20, 20]) + geom_text(aes(label='w'), color='red') + ggtitle("weight='w'"),
    ggplot(data, aes('x', 'y')) + geom_hex(binwidth=[20, 20], alpha=.25) + geom_point(color='red') + ggtitle("alpha=.25"),
    # The title previously said "size=2" while the layer is drawn with size=1; the label now matches the code.
    ggplot(data, aes('x', 'y')) + geom_hex(binwidth=[20, 20], size=1, color='red', linetype='longdash') + geom_point(color='red') + ggtitle("size=1, color='red', linetype='longdash'"),
    ggplot(data, aes('x', 'y')) + geom_hex(aes(fill='..density..'), binwidth=[20, 20]) + geom_point(color='red') + ggtitle("fill='..density..'"),
], ncol=2)

## Parameters

### `bins`/`binwidth`

In [9]:
# Same plot three times: default binning, explicit `bins`, explicit `binwidth`.
gggrid([
    ggplot(df, aes("cty", "hwy")) + hex_layer
    for hex_layer in (
        geom_hex(),
        geom_hex(bins=[6, 7]),
        geom_hex(binwidth=[5, 5]),
    )
])

### `drop`

In [10]:
# First panel: default `drop`; second panel: drop=False.
gggrid([
    ggplot(df, aes("cty", "hwy")) + geom_hex(**hex_options)
    for hex_options in ({}, {'drop': False})
])

### `tooltips`

In [11]:
# Custom tooltip: a title built from x/y plus two lines for the computed count and density.
(
    ggplot(df, aes("cty", "hwy"))
    + geom_hex(
        tooltips=(
            layer_tooltips()
            .title("(^x, ^y)")
            .line("count|@..count..")
            .line("density|@..density..")
            .format("@..density..", ".3~f")
        )
    )
)

### `position`

In [12]:
# First panel: default position; second panel: the layer nudged by (2, 2).
gggrid([
    ggplot(df, aes("cty", "hwy")) + geom_hex(binwidth=[4, 4], **hex_options)
    for hex_options in ({}, {'position': position_nudge(x=2, y=2)})
])

### `orientation`

In [13]:
# First panel: default orientation; second panel: orientation='y'. Red points mark the raw data.
gggrid([
    ggplot(data, aes('x', 'y')) + geom_hex(binwidth=[20, 20], **hex_options) + geom_point(color='red')
    for hex_options in ({}, {'orientation': 'y'})
])

### `inherit_aes`

In [14]:
# The plot-level mapping sets color='g'; the second panel opts the layer out of it with inherit_aes=False.
gggrid([
    ggplot(identity_data, aes(color='g'))
    + geom_hex(aes('x', 'y', fill='g'), stat='identity', size=2, alpha=.3, **hex_options)
    for hex_options in ({}, {'inherit_aes': False})
])

### `manual_key`

In [15]:
# Demonstrates the `manual_key` parameter of geom_hex().
# NOTE(review): presumably this adds a custom legend entry labelled "Manual key" — confirm against the Lets-Plot docs.
ggplot(df, aes("cty", "hwy")) + geom_hex(manual_key="Manual key")

### `sampling`

In [16]:
# At least it works as well as it does for `geom_bin2d()`

def sampling_plot(sampling):
    """Return the cty/hwy hexbin plot of `df` drawn with the given sampling.

    The plot title is the string form of `sampling` with newlines replaced by spaces.
    """
    title = str(sampling).replace("\n", " ")
    hex_layer = geom_hex(sampling=sampling)
    return ggplot(df, aes("cty", "hwy")) + hex_layer + ggtitle(title)

# One panel per sampling strategy, each limited to 5 items.
gggrid(
    [
        sampling_plot(sampling)
        for sampling in (
            sampling_pick(5),
            sampling_random(5, seed=42),
            sampling_systematic(5),
        )
    ],
    ncol=2,
)

### `color_by`/`fill_by`

In [17]:
# Map the computed count to two custom paint aesthetics: 'paint_a' drives the border color,
# 'paint_b' drives the fill, and each gets its own gradient scale and colorbar.
(
    ggplot(df, aes("cty", "hwy"))
    + geom_hex(
        aes(paint_a="..count..", paint_b="..count.."),
        binwidth=[3, 3],
        size=1,
        color_by='paint_a',
        fill_by='paint_b',
    )
    + scale_gradient('paint_a', low="black", high="red", guide=guide_colorbar(title="count (border)"))
    + scale_gradient('paint_b', low="black", high="yellow", guide=guide_colorbar(title="count (figure)"))
)

## Change geometry

In [18]:
# Draw the same 'binhex' statistic twice: once as hexagons, once as points colored by count.
# The two gradients run in opposite directions so the points contrast with the hexagons.
(
    ggplot(df, aes("cty", "hwy"))
    + geom_hex(binwidth=[3, 3])
    + geom_point(aes(color='..count..'), stat='binhex', binwidth=[3, 3], shape=11, size=10)
    + scale_color_gradient(low="yellow", high="black", guide='none')
    + scale_fill_gradient(low="black", high="yellow", guide='none')
)

## With other layers

In [19]:
# Combine geom_hex() with points, marginal histograms, facets, fixed axis limits/breaks and a dark flavor.
(
    ggplot(df, aes("cty", "hwy"))
    + geom_hex(binwidth=[5, 5], color="white", size=.5)
    + geom_point(color="white", size=1.5)
    + ggmarginal('tr', layer=geom_histogram(binwidth=5, color="lightgray", fill="white"))
    + facet_grid(x="year")
    + scale_x_continuous(breaks=list(range(0, 40, 5)))
    + scale_y_continuous(breaks=list(range(0, 50, 5)))
    + xlim(0, 40)
    + ylim(0, 50)
    + theme_minimal()
    + flavor_darcula()
)

In [20]:
# The same hexbin layer under the default, flipped and polar coordinate systems.
gggrid([
    (
        ggplot(df, aes("cty", "hwy"))
        + geom_hex(binwidth=[5, 5])
        + ggtitle("Default coord")
    ),
    (
        ggplot(df, aes("cty", "hwy"))
        + geom_hex(binwidth=[5, 5])
        + coord_flip()
        + ggtitle("coord_flip()")
    ),
    (
        ggplot(df, aes("cty", "hwy"))
        + geom_hex(binwidth=[5, 5])
        + coord_polar()
        + ggtitle("coord_polar()")
    ),
])

## Tests

In [21]:
def get_grid_data(n, m):
    """Return the points of an n-by-m integer grid as a dict of 'x' and 'y' columns.

    Points are listed column by column: x runs over range(n) and, for each x,
    y runs over range(m).
    """
    points = [(col, row) for col in range(n) for row in range(m)]
    return {
        'x': [col for col, _ in points],
        'y': [row for _, row in points],
    }

def get_grid_plot(n, m, w, h):
    """Plot an n-by-m grid of points binned with geom_hex(binwidth=[w, h]).

    The raw grid points are overlaid in red and the title states the binwidth used.
    """
    title = f"binwidth=[{w}, {h}]"
    return (
        ggplot(get_grid_data(n, m), aes('x', 'y'))
        + geom_hex(binwidth=[w, h])
        + geom_point(color="red")
        + ggtitle(title)
    )

# Each tuple is (n, m, w, h): grid size followed by the binwidth to try on it.
gggrid(
    [
        get_grid_plot(*grid_args)
        for grid_args in (
            (2, 2, 1, 1),
            (2, 2, 2, 2),
            (4, 4, 2, 2),
            (4, 4, 1, 4),
            (4, 4, 4, 1),
        )
    ],
    ncol=2,
)

In [22]:
# Edge-case datasets, each listed once with the default stat and once with stat='identity'.
# Entries for the default stat carry no 'stat' key at all.
tests = [
    {'title': title + suffix, 'data': {'x': xs, 'y': ys}, **stat_option}
    for suffix, stat_option in (
        ("", {}),
        (", identity stat", {'stat': 'identity'}),
    )
    for title, xs, ys in (
        ("Empty data", [], []),
        ("One element", [0], [0]),
        ("NaN's in data", [0, 1, np.nan, None, 1, 1], [0, 1, 1, 1, np.nan, None]),
    )
]

# One panel per test case; cases without a 'stat' entry pass stat=None to geom_hex().
gggrid(
    [
        ggplot(case['data'], aes('x', 'y'))
        + geom_hex(stat=case.get('stat'))
        + ggtitle(case['title'])
        for case in tests
    ],
    ncol=2,
)

In [23]:
# TODO: Wrong plots
def get_test_plot(x, y):
    """Plot two points, the origin ('center') and (x, y) ('border'), binned with binwidth=[2, 2].

    The hexagon legend is hidden, the points are colored by their 'g' label, and the
    view is fixed to [-1.5, 1.5] on both axes with a 1:1 aspect ratio.
    """
    points = {'x': [0, x], 'y': [0, y], 'g': ['center', 'border']}
    fixed_view = coord_fixed(ratio=1, xlim=[-1.5, 1.5], ylim=[-1.5, 1.5])
    return (
        ggplot(points, aes('x', 'y'))
        + geom_hex(binwidth=[2, 2], show_legend=False)
        + geom_point(aes(color='g'))
        + scale_color_manual(values=["red", "green"])
        + fixed_view
    )

hh = 4/3
# Twelve positions for the second point, walking around the origin.
# NOTE(review): presumably these lie on the border of the hexagon centered at the origin — confirm.
gggrid(
    [
        get_test_plot(x, y)
        for x, y in (
            (0, hh),
            (0.5, 3 * hh / 4),
            (1, hh / 2),
            (1, 0),
            (1, -hh / 2),
            (0.5, -3 * hh / 4),
            (0, -hh),
            (-0.5, -3 * hh / 4),
            (-1, -hh / 2),
            (-1, 0),
            (-1, hh / 2),
            (-0.5, 3 * hh / 4),
        )
    ],
    ncol=3,
)

## Problems

### Is the value of the `bins`/`binwidth` parameters interpreted correctly?

In [24]:
# Four points used to check how `bins` and `binwidth` are interpreted.
test_data = dict(x=[1, 2, 3, 4], y=[1, 1, 4, 5])

In [25]:
def bins_plot(bins=None):
    """Plot `test_data` binned with geom_hex(bins=bins, drop=False).

    The raw points are overlaid in red and the title is the string form of `bins`.
    """
    return (
        ggplot(test_data, aes('x', 'y'))
        + geom_hex(bins=bins, drop=False)
        + geom_point(color='red')
        + ggtitle(str(bins))
    )

# One panel per `bins` setting.
gggrid(
    [
        bins_plot(bins)
        for bins in (
            [1, 1],  # TODO: Wrong plot
            [2, 2],
            [3, 5],
            [5, 3],
        )
    ],
    ncol=2,
)

In [26]:
def binwidth_plot(binwidth):
    """Plot `test_data` binned with geom_hex(binwidth=binwidth).

    The raw points are overlaid in red; the title lists the binwidth values
    as floats formatted to two decimal places.
    """
    rounded = [float(format(v, ".2f")) for v in binwidth]
    return (
        ggplot(test_data, aes('x', 'y'))
        + geom_hex(binwidth=binwidth)
        + geom_point(color='red')
        + ggtitle(str(rounded))
    )

# One panel per `binwidth` setting.
gggrid(
    [
        binwidth_plot(binwidth)
        for binwidth in (
            [5, 5],
            [2, 2],
            [5/3, 5/5],
            [5/5, 5/3],
        )
    ],
    ncol=2,
)

### Too much stretching of the hexagons

In [27]:
# Sparse data with the default `drop`: red points mark the raw observations.
(
    ggplot(data, aes('x', 'y'))
    + geom_hex(binwidth=[10, 10])
    + geom_point(color='red')
)

In the plot above you can see that the hexagons are too big.

**Reason**: The statistics are fine, but the geometry stretches the shapes using `ctx.getResolution()`. Because there are gaps in the data and hexagons with a zero `'count'` are skipped, the `resolution` comes out larger than it should be. `geom_bin2d()` does the same thing, but there it is not noticeable and causes no problem, since the rectangles still fit together after stretching instead of overlapping. Incidentally, `resolution` is sometimes calculated correctly if hexagons with a zero `'count'` value are simply not skipped:

In [28]:
# Same plot as before, but with drop=False.
(
    ggplot(data, aes('x', 'y'))
    + geom_hex(binwidth=[10, 10], drop=False)
    + geom_point(color='red')
)

**Possible solution**:

Don't stretch the geometry based on `resolution`, but use another way to figure out what size the hexagons should be so they fit together. This probably means using the width and height computed in the statistic. Hence, width and height need to be passed in some aesthetics. Options:

- `'width'`/`'height'`: if we use these aesthetics, first, the "normalization" (the intuition that a value of 1 corresponds to placing geometries edge to edge) disappears, and second, we have to rewrite the axis range estimation procedure for hexagons, since it uses `'width'`/`'height'` in a special way.

- `'binwidth'`/`'binheight'`: will not work, because the name `'binwidth'` is already occupied by a function parameter.

- Some new aesthetics - doubtful, but possible.

General disadvantage of this approach: the existing `geom_bin2d()` function works differently (it stretches shapes using `resolution`), so with `stat='identity'` the two geoms will behave differently, i.e. there will be a discrepancy in the API. On the other hand, this may be a reason to rework `geom_bin2d()`, since ggplot2 does not do any stretching either.