# Explicit Use of Statistics

Each statistical transformation (stat) has a geometry that uses it by default.

However, you can simply select an appropriate geometry and explicitly specify the desired stat for it.

Examples of how to do this can be found further in this notebook.

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal

from lets_plot import *
from lets_plot.mapping import as_discrete

In [2]:
# One-time lets-plot setup for this notebook. NOTE(review): presumably this is
# what lets the plots below render inline as HTML — confirm against the
# lets-plot documentation.
LetsPlot.setup_html()

In [3]:
# The mpg sample data set, read from the lets-plot-docs repository.
MPG_CSV_URL = "https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/mpg.csv"

df = pd.read_csv(MPG_CSV_URL)
# Printed explicitly: only the cell's last expression is displayed automatically.
print(df.shape)
df.head()

(234, 12)


Unnamed: 0.1,Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
0,1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
1,2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
2,3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
3,4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
4,5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


## Identity

Many geometries use the `'identity'` stat by default and display the data as is.

In [4]:
# One row per `drv` value, holding the smallest and largest `cty` in that group.
minmax_cty_df = (
    df.groupby("drv")["cty"]
    .agg(["min", "max"])
    .reset_index()
)

# No stat is passed here, so the layer relies on the geometry's default.
(
    ggplot(minmax_cty_df, aes(x="drv"))
    + geom_errorbar(aes(ymin="min", ymax="max"))
)

If you want to use a geometry that has its own default stat, but with values you have already computed yourself, prepare the data and use the `'identity'` stat explicitly.

In [5]:
# Precompute the per-`drv` mean of `cty`; the bars below plot these values as is.
mean_cty_df = (
    df.groupby("drv")["cty"]
    .mean()
    .to_frame()
    .reset_index()
)

(
    ggplot(mean_cty_df, aes("drv", "cty"))
    + geom_bar(stat='identity')
    + ylab("mean cty")
)

## One Variable

### Discrete

#### `'count'`

In [6]:
# Both panels share the data and the x mapping; only the layer differs.
drv_base = ggplot(df, aes("drv"))

gggrid([
    drv_base + geom_bar() + ggtitle("geom_bar()"),
    drv_base + geom_lollipop(stat='count') + ggtitle("geom_lollipop(stat='count')"),
])

### Continuous

#### `'bin'`

In [7]:
# One shared base plot; each panel adds its own layer and title.
cty_base = ggplot(df, aes("cty"))

histogram_panel = cty_base + geom_histogram() + ggtitle("geom_histogram()")
lollipop_panel = (
    cty_base
    + geom_lollipop(stat='bin')
    + ggtitle("geom_lollipop(stat='bin')")
)

gggrid([histogram_panel, lollipop_panel])

#### `'density'`

In [8]:
cty_base = ggplot(df, aes("cty"))

# Left: the geometry with its default stat. Right: the same stat drawn as steps.
density_default = cty_base + geom_density() + ggtitle("geom_density()")
density_as_step = (
    cty_base
    + geom_step(stat='density', n=50)
    + ggtitle("geom_step(stat='density')")
)

gggrid([density_default, density_as_step])

#### `'dotplot'`

In [9]:
cty_base = ggplot(df, aes("cty"))

dotplot_default = cty_base + geom_dotplot() + ggtitle("geom_dotplot()")
# The lollipop layer maps y to the stat's computed '..count..' variable.
dotplot_as_lollipop = (
    cty_base
    + geom_lollipop(aes(y='..count..'), stat='dotplot')
    + ggtitle("geom_lollipop(stat='dotplot')")
)

gggrid([dotplot_default, dotplot_as_lollipop])

#### `'ecdf'`

In [10]:
# Same base plot for both panels: stat_ecdf() versus a line layer using that stat.
cty_base = ggplot(df, aes("cty"))

gggrid([
    cty_base + stat_ecdf() + ggtitle("stat_ecdf()"),
    cty_base + geom_line(stat='ecdf') + ggtitle("geom_line(stat='ecdf')"),
])

#### `'qq'`

In [11]:
# The `sample` aesthetic is mapped per layer, so the base plot carries only the data.
qq_base = ggplot(df)

qq_default = qq_base + geom_qq(aes(sample="cty")) + ggtitle("geom_qq()")
qq_as_step = (
    qq_base
    + geom_step(aes(sample="cty"), stat='qq')
    + ggtitle("geom_step(stat='qq')")
)

gggrid([qq_default, qq_as_step])

#### `'qq_line'`

In [12]:
qq_base = ggplot(df)

# Left: Q-Q points with the default Q-Q line on top.
qq_with_line = (
    qq_base
    + geom_qq(aes(sample="cty"))
    + geom_qq_line(aes(sample="cty"))
    + ggtitle("geom_qq_line()")
)

# Right: the same stat drawn as a filled area, added before the points layer.
qq_with_area = (
    qq_base
    + geom_area(aes(sample="cty"), stat='qq_line', size=0, fill="#fee0d2")
    + geom_qq(aes(sample="cty"))
    + coord_cartesian(ylim=[5, 35])
    + ggtitle("geom_area(stat='qq_line')")
)

gggrid([qq_with_line, qq_with_area])

## Two Variables

### Both Discrete

#### `'count2d'`

In [13]:
# Both axes are treated as discrete; `cyl` additionally gets an explicit ordering.
cyl_year_base = ggplot(df, aes(as_discrete("cyl", order=1), as_discrete("year")))

pie_panel = cyl_year_base + geom_pie(aes(fill="drv")) + ggtitle("geom_pie()")

# Point size is driven by the stat's computed '..count..'; the size legend is hidden.
count2d_panel = (
    cyl_year_base
    + geom_point(
        aes(group="drv", color="drv", size='..count..'),
        stat='count2d',
        alpha=.25,
    )
    + scale_size(range=[2, 30], guide='none')
    + ggtitle("geom_point(stat='count2d')")
)

gggrid([pie_panel, count2d_panel])

#### `'sum'`

In [14]:
cty_hwy_base = ggplot(df, aes("cty", "hwy"))

sum_default = cty_hwy_base + stat_sum() + ggtitle("stat_sum()")
# Tiles are filled by the stat's computed '..n..' variable.
sum_as_tiles = (
    cty_hwy_base
    + geom_tile(aes(fill='..n..'), stat='sum', size=0)
    + ggtitle("geom_tile(stat='sum')")
)

gggrid([sum_default, sum_as_tiles])

### One Discrete, One Continuous

#### `'boxplot'` and `'boxplot_outlier'`

In [15]:
drv_cty_base = ggplot(df, aes("drv", "cty"))

boxplot_default = drv_cty_base + geom_boxplot() + ggtitle("geom_boxplot()")

# The second panel stacks several layers that all reuse the 'boxplot' stat,
# plus a points layer fed by 'boxplot_outlier'.
boxplot_by_parts = (
    drv_cty_base
    + geom_ribbon(stat='boxplot')
    + geom_ribbon(aes(ymin='..lower..', ymax='..upper..'), stat='boxplot')
    + geom_ribbon(aes(ymin='..middle..', ymax='..middle..'), stat='boxplot', size=1.5)
    + geom_linerange(stat='boxplot')
    + geom_point(stat='boxplot_outlier')
    + ggtitle("geom_ribbon(stat='boxplot')\n"
              "geom_linerange(stat='boxplot')\n"
              "geom_point(stat='boxplot_outlier')")
)

gggrid([boxplot_default, boxplot_by_parts])

#### `'densityridges'`

In [16]:
cty_drv_base = ggplot(df, aes("cty", "drv"))

ridges_default = cty_drv_base + geom_area_ridges() + ggtitle("geom_area_ridges()")

# y is remapped to the stat's computed '..height..', and the plot is faceted by `drv`.
ridges_as_steps = (
    cty_drv_base
    + geom_step(aes(y='..height..'), stat='densityridges', n=50)
    + facet_grid(y="drv")
    + ggtitle("geom_step(stat='densityridges')")
)

gggrid([ridges_default, ridges_as_steps])

#### `'summary'`

In [17]:
# Shared base plot: stat_summary() versus a crossbar layer using the same stat.
drv_cty_base = ggplot(df, aes("drv", "cty"))

gggrid([
    drv_cty_base + stat_summary() + ggtitle("stat_summary()"),
    drv_cty_base + geom_crossbar(stat='summary') + ggtitle("geom_crossbar(stat='summary')"),
])

#### `'ydensity'`

In [18]:
drv_cty_base = ggplot(df, aes("drv", "cty"))

violin_default = drv_cty_base + geom_violin() + ggtitle("geom_violin()")

# x is remapped to the stat's computed '..violinwidth..', so the plot is faceted
# by `drv` and the x-axis label is set back to "drv" by hand.
violin_as_lollipops = (
    drv_cty_base
    + geom_lollipop(
        aes(x='..violinwidth..'),
        stat='ydensity',
        n=25,
        dir='h',
        fatten=1,
    )
    + facet_grid(x="drv")
    + xlab("drv")
    + ggtitle("geom_lollipop(stat='ydensity')")
)

gggrid([violin_default, violin_as_lollipops])

#### `'ydotplot'`

In [19]:
drv_cty_base = ggplot(df, aes("drv", "cty"))

ydotplot_default = drv_cty_base + geom_ydotplot() + ggtitle("geom_ydotplot()")

# x is remapped to the stat's computed '..count..'; one facet column per `drv`.
ydotplot_as_lollipops = (
    drv_cty_base
    + geom_lollipop(aes(x='..count..'), stat='ydotplot', dir='h', fatten=1.5)
    + facet_grid(x="drv")
    + ggtitle("geom_lollipop(stat='ydotplot')")
)

gggrid([ydotplot_default, ydotplot_as_lollipops])

### Both Continuous

#### `'bin2d'`

In [20]:
cty_hwy_base = ggplot(df, aes("cty", "hwy"))

bin2d_default = cty_hwy_base + geom_bin2d() + ggtitle("geom_bin2d()")
# Points are colored by the stat's computed '..count..' variable.
bin2d_as_points = (
    cty_hwy_base
    + geom_point(aes(color='..count..'), stat='bin2d')
    + ggtitle("geom_point(stat='bin2d')")
)

gggrid([bin2d_default, bin2d_as_points])

#### `'contour'` and `'contourf'`

In [21]:
def get_contour_data(n: int = 50) -> pd.DataFrame:
    """Sample a correlated bivariate normal density on a regular grid.

    The density has zero mean, unit variances and covariance 0.75. It is
    evaluated at every node of an ``n`` x ``n`` grid spanning [-1, 1] on
    both axes.

    Parameters
    ----------
    n : int, default 50
        Number of grid nodes along each axis.

    Returns
    -------
    pandas.DataFrame
        ``n * n`` rows in long format with columns ``x`` and ``y`` (node
        coordinates) and ``z`` (density value at that node).
    """
    axis = np.linspace(-1, 1, n)
    grid_x, grid_y = np.meshgrid(axis, axis)
    density = multivariate_normal(mean=np.zeros(2), cov=[[1, .75], [.75, 1]])
    # pdf() takes the coordinate pair on the last axis, hence the (n, n, 2) stack.
    z = density.pdf(np.dstack((grid_x, grid_y)))
    return pd.DataFrame(dict(x=grid_x.ravel(), y=grid_y.ravel(), z=z.ravel()))

contour_df = get_contour_data()
contour_base = ggplot(contour_df, aes("x", "y", z="z"))

# (layer, title) for each panel, in the order they are passed to gggrid.
contour_panels = [
    (geom_contour(), "geom_contour()"),
    (geom_point(stat='contour', alpha=.2), "geom_point(stat='contour')"),
    (geom_contourf(aes(fill='..level..'), show_legend=False), "geom_contourf()"),
    (
        geom_point(aes(color='..level..'), stat='contourf', alpha=.2, show_legend=False),
        "geom_point(stat='contourf')",
    ),
]

gggrid(
    [
        contour_base + panel_layer + coord_fixed(ratio=.75) + ggtitle(panel_title)
        for panel_layer, panel_title in contour_panels
    ],
    ncol=2,
)

#### `'density2d'` and `'density2df'`

In [22]:
cty_hwy_base = ggplot(df, aes("cty", "hwy"))

# (layer, title) for each panel, in the order they are passed to gggrid.
density2d_panels = [
    (geom_density2d(), "geom_density2d()"),
    (geom_point(stat='density2d', alpha=.2), "geom_point(stat='density2d')"),
    (geom_density2df(aes(fill='..level..'), show_legend=False), "geom_density2df()"),
    (
        geom_point(aes(color='..level..'), stat='density2df', alpha=.2, show_legend=False),
        "geom_point(stat='density2df')",
    ),
]

gggrid(
    [
        cty_hwy_base + panel_layer + coord_fixed(ratio=.5) + ggtitle(panel_title)
        for panel_layer, panel_title in density2d_panels
    ],
    ncol=2,
)

#### `'qq2'`

In [23]:
# Shared base plot: geom_qq2() versus a step layer using the same stat.
cty_hwy_base = ggplot(df, aes("cty", "hwy"))

gggrid([
    cty_hwy_base + geom_qq2() + ggtitle("geom_qq2()"),
    cty_hwy_base + geom_step(stat='qq2') + ggtitle("geom_step(stat='qq2')"),
])

#### `'qq2_line'`

In [24]:
cty_hwy_base = ggplot(df, aes("cty", "hwy"))

# Left: Q-Q points with the default Q-Q line on top.
qq2_with_line = (
    cty_hwy_base
    + geom_qq2()
    + geom_qq2_line()
    + ggtitle("geom_qq2_line()")
)

# Right: the same stat drawn as a filled area, added before the points layer.
qq2_with_area = (
    cty_hwy_base
    + geom_area(stat='qq2_line', size=0, fill="#fee0d2")
    + geom_qq2()
    + coord_cartesian(ylim=[7, 58])
    + ggtitle("geom_area(stat='qq2_line')")
)

gggrid([qq2_with_line, qq2_with_area])

#### `'smooth'`

In [25]:
# Both panels start from the same scatter plot of the raw observations.
scatter_base = ggplot(df, aes("cty", "hwy")) + geom_point()

smooth_default = scatter_base + geom_smooth() + ggtitle("geom_smooth()")
smooth_as_pointrange = (
    scatter_base
    + geom_pointrange(stat='smooth', n=50, color="magenta", fatten=3)
    + ggtitle("geom_pointrange(stat='smooth')")
)

gggrid([smooth_default, smooth_as_pointrange])

#### `'summarybin'`

In [26]:
cty_hwy_base = ggplot(df, aes("cty", "hwy"))

summary_bin_default = cty_hwy_base + stat_summary_bin() + ggtitle("stat_summary_bin()")
summary_bin_as_crossbar = (
    cty_hwy_base
    + geom_crossbar(stat='summarybin')
    + ggtitle("geom_crossbar(stat='summarybin')")
)

gggrid([summary_bin_default, summary_bin_as_crossbar])