# Histogram, Line, and Area Charts with Altair

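A histogram shows the frequency distribution of a numeric variable: the data points are grouped into consecutive ranges (class intervals, or bins), and each bin is drawn as a rectangle whose width spans the interval and whose height is the number of observations that fall in it. This notebook builds histograms, line charts, and area charts with Altair.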

In [1]:
# Libraries used throughout the notebook
import altair as alt
import pandas as pd

# Display at most 10 columns when a DataFrame is rendered
pd.set_option("display.max_columns", 10)

## Automobile Dataset

We will use the Automobile Data Set [https://archive.ics.uci.edu/ml/datasets/automobile] from the UCI Machine Learning Repository [https://archive-beta.ics.uci.edu/]. It includes categorical and continuous variables. 

Defining the headers

In [2]:
# Column names, in file order, for the headerless data file read in the next cell
headers = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration",
    "num_doors", "body_style", "drive_wheels", "engine_location",
    "wheel_base", "length", "width", "height", "curb_weight",
    "engine_type", "num_cylinders", "engine_size", "fuel_system",
    "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
    "city_mpg", "highway_mpg", "price",
]

In [3]:
# Read the raw file: it has no header row (names come from `headers`)
# and "?" is treated as a missing value.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
    names=headers,
    header=None,
    na_values="?",
)
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,...,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,...,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,...,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,...,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,...,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,...,115.0,5500.0,18,22,17450.0


In [4]:
# Show the dtype pandas assigned to each column of df
df.dtypes

symboling              int64
normalized_losses    float64
make                  object
fuel_type             object
aspiration            object
num_doors             object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_cylinders         object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object

## Histograms

The most common graph for displaying frequency distributions is a histogram.

Let's work with three variables: `length`, `width`, and `height`.

In [5]:
# Keep only the three body-dimension columns: length, width, and height
dimens = df.loc[:, ['length', 'width', 'height']]
dimens.head()

Unnamed: 0,length,width,height
0,168.8,64.1,48.8
1,168.8,64.1,48.8
2,171.2,65.5,52.4
3,176.6,66.2,54.3
4,176.6,66.4,54.3


In [6]:
# Bar chart of row counts per length value (no binning requested yet)
(
    alt.Chart(dimens)
    .mark_bar()
    .encode(
        x="length:Q",
        y=alt.Y("count()"),
    )
    .properties(title={"text": "Car Length", "fontSize": 16})
)

Including the `bin` parameter

In [7]:
# Same chart, now with bin=True so that length is binned on the x axis
(
    alt.Chart(dimens)
    .mark_bar()
    .encode(
        x=alt.X("length:Q", bin=True),
        y=alt.Y("count()"),
    )
    .properties(title={"text": "Car Length", "fontSize": 16})
)

Increasing the number of bins

In [8]:
# Raise the bin count by allowing up to 20 bins (maxbins)
(
    alt.Chart(dimens)
    .mark_bar()
    .encode(
        x=alt.X("length:Q", bin=alt.Bin(maxbins=20)),
        y=alt.Y("count()"),
    )
    .properties(title={"text": "Car Length", "fontSize": 16})
)

In [9]:
# Range of the length column; used to choose the bin extent below
print('Min(Length) =', df['length'].min())
print('Max(Length) =', df['length'].max())

Min(Length) = 141.1
Max(Length) = 208.1


Using `extent` and `step`

In [10]:
# Using extent and step: bin the range 140-210 in steps of 5
(
    alt.Chart(dimens)
    .mark_bar()
    .encode(
        x=alt.X("length:Q", bin=alt.Bin(extent=[140, 210], step=5)),
        y=alt.Y("count()"),
    )
    .properties(title={"text": "Car Length", "fontSize": 16})
)

In [11]:
# Using extent and step: bin the range 140-210 in steps of 10,
# and set an explicit chart size
(
    alt.Chart(dimens)
    .mark_bar()
    .encode(
        x=alt.X("length:Q", bin=alt.Bin(extent=[140, 210], step=10)),
        y=alt.Y("count()"),
    )
    .properties(
        title={"text": "Car Length", "fontSize": 16},
        width=500,
        height=300,
    )
)

### Histogram with Mean 

Defining the chart and assigning it to the variable baseH

In [12]:
# Base chart bound to dimens; the bar and rule layers below are derived from it
baseH = alt.Chart(data=dimens)

Assigning a variable to the bar chart

In [13]:
# Histogram layer: wheat-colored bars over 140-210 in steps of 10
barH = (
    baseH
    .mark_bar(color='wheat')
    .encode(
        x=alt.X("length:Q", bin=alt.Bin(extent=[140, 210], step=10)),
        y=alt.Y("count()"),
    )
    .properties(title={"text": "Car Length", "fontSize": 16})
)

Creating a line chart with the mean

In [14]:
# Rule layer: a black rule at the mean length, drawn with size 4
meanH = (
    baseH
    .mark_rule(color='black')
    .encode(
        alt.X('mean(length):Q'),
        size=alt.value(4),
    )
)

barH + meanH

In [15]:
# Layer the mean rule on top of the histogram (equivalent to barH + meanH)
alt.layer(barH, meanH)

### Histogram with Min, Mean, and Max 

Creating a line chart with the min

In [16]:
# Rule layer: a black rule at the minimum length, drawn with size 1
minH = (
    baseH
    .mark_rule(color='black')
    .encode(
        alt.X('min(length):Q'),
        size=alt.value(1),
    )
)

Creating a line chart with the max

In [17]:
# Rule layer: a black rule at the maximum length, drawn with size 1
maxH = (
    baseH
    .mark_rule(color='black')
    .encode(
        alt.X('max(length):Q'),
        size=alt.value(1),
    )
)

Layered Charts

barH + minH + meanH + maxH

In [18]:
# Layer the histogram with the min, mean, and max rules, in that order
# (equivalent to barH + minH + meanH + maxH)
alt.layer(barH, minH, meanH, maxH)

### Histograms in rows

width

In [19]:
# Histogram of width for all cars
(
    alt.Chart(dimens)
    .mark_bar(color='darksalmon')
    .encode(
        x=alt.X("width:Q", bin=True),
        y=alt.Y("count()"),
    )
    .properties(title={"text": "Car Width", "fontSize": 16})
)

width by fuel_type

In [20]:
# Histogram of width, faceted into one row per fuel_type
# (reads from df because dimens has no fuel_type column)
(
    alt.Chart(df)
    .mark_bar(color='darksalmon')
    .encode(
        x=alt.X("width:Q", bin=True),
        y=alt.Y("count()"),
        row=alt.Row('fuel_type:N'),
    )
    .properties(title={"text": "Car Width", "fontSize": 16})
)

Notice that to create the last chart we used the `df` DataFrame instead of `dimens`, because `dimens` does not contain the `fuel_type` variable.

### Histograms in columns

height

In [21]:
# Histogram of height for all cars
(
    alt.Chart(dimens)
    .mark_bar(color='skyblue')
    .encode(
        x=alt.X("height:Q", bin=True),
        y=alt.Y("count()"),
    )
    .properties(title={"text": "Car Height", "fontSize": 16})
)

height by fuel_type

In [22]:
# Histogram of height, faceted into one column per fuel_type
# (reads from df because dimens has no fuel_type column)
(
    alt.Chart(df)
    .mark_bar(color='skyblue')
    .encode(
        x=alt.X("height:Q", bin=True),
        y=alt.Y("count()"),
        column=alt.Column('fuel_type:N'),
    )
    .properties(title={"text": "Car Height", "fontSize": 16})
)

Notice that to create the last chart we used the `df` DataFrame instead of `dimens`, because `dimens` does not contain the `fuel_type` variable.

### Overlapping Histograms

Two standard conventions for storing data in a DataFrame are long-form and wide-form.
- **Wide-form data** has one row per independent variable, with metadata recorded in the row and column labels.
- **Long-form data** has one row per observation, with metadata recorded within the table as values.

wide-form

In [23]:
# Wide-form: one column per dimension (first five rows)
dimens.head(5)

Unnamed: 0,length,width,height
0,168.8,64.1,48.8
1,168.8,64.1,48.8
2,171.2,65.5,52.4
3,176.6,66.2,54.3
4,176.6,66.4,54.3


long-form

In [24]:
# Long-form: one row per (dimension, value) pair
pd.melt(dimens, var_name='dimension', value_name='value')

Unnamed: 0,dimension,value
0,length,168.8
1,length,168.8
2,length,171.2
3,length,176.6
4,length,176.6
...,...,...
610,height,55.5
611,height,55.5
612,height,55.5
613,height,55.5


`transform_fold`: The fold transform is, in short, a way to convert wide-form data to long-form data directly without any preprocessing.

Overlapping Histograms: length, width, and height

In [25]:
# Overlapping Histograms: length, width, and height
# transform_fold reshapes the three wide-form columns into long-form
# ('dimension', 'value') pairs inside the chart, so no pandas melt is needed.
alt.Chart(dimens).transform_fold(
    ['length', 'width', 'height'],
    as_=['dimension', 'value']
).mark_bar(
    opacity=0.7,
    binSpacing=0
).encode(
    alt.X('value:Q', bin=alt.Bin(maxbins=100)),
    # stack=None is required for the bars to overlap: with a color field,
    # bar marks are stacked on top of each other by default.
    alt.Y('count()', stack=None),
    alt.Color('dimension:N')
).properties(
    width=500,
    title={'text': ['Overlapping Histograms', '(length, width, and height)'],
           'fontSize': 20}
)

## Line Graphs

Line Graph of length

In [26]:
# Line chart of row counts per length value
(
    alt.Chart(df)
    .mark_line()
    .encode(
        x="length:Q",
        y=alt.Y("count()"),
    )
    .properties(title={"text": "Car Length", "fontSize": 16})
)

Line Graph with Points

In [27]:
# Line chart with a dark red point overlaid at every vertex
(
    alt.Chart(dimens)
    .mark_line(point=alt.OverlayMarkDef(color="darkred"))
    .encode(
        x="length:Q",
        y=alt.Y("count()"),
    )
    .properties(title={"text": "Car Length", "fontSize": 16})
)

In [28]:
# Line chart with dark red points; the point size is set to 80
(
    alt.Chart(dimens)
    .mark_line(point=alt.OverlayMarkDef(color="darkred", size=80))
    .encode(
        x="length:Q",
        y=alt.Y("count()"),
    )
    .properties(title={"text": "Car Length", "fontSize": 16})
)

### Overlapping Line Charts

In [29]:
# Overlapping Line Charts
# transform_fold reshapes length/width/height into long-form
# ('dimension', 'value') pairs; one line is drawn per dimension.
alt.Chart(dimens).transform_fold(
    ['length', 'width', 'height'],
    as_=['dimension', 'value']
).mark_line(
    opacity=0.7,
).encode(
    alt.X('value:Q'),
    alt.Y('count()'),
    alt.Color('dimension:N')
).properties(
    # The title previously read 'Overlapping Areas', copied from the area-chart cells
    title={'text': 'Overlapping Line Charts',
           'fontSize': 16}
)

## Area Graphs

Area chart with a line overlay (no step interpolation yet)

In [30]:
# Area chart with a line overlay; no `interpolate` is set in this cell,
# so the outline is not stepped yet
(
    alt.Chart(dimens)
    .mark_area(opacity=0.7, color="silver", line=True)
    .encode(
        x="length:Q",
        y=alt.Y("count()"),
    )
    .properties(title={"text": "Car Length", "fontSize": 16})
)

Filled Step Chart with interpolation

In [31]:
# Filled step chart: same area chart, now with interpolate='step'
(
    alt.Chart(dimens)
    .mark_area(
        opacity=0.7,
        color="silver",
        interpolate='step',
        line=True,
    )
    .encode(
        x="length:Q",
        y=alt.Y("count()"),
    )
    .properties(title={"text": "Car Length", "fontSize": 16})
)

### Overlapping Area Graphs

Overlapping Area Charts

In [32]:
# Overlapping Area Charts (step interpolation, with a line overlay)
# transform_fold reshapes length/width/height into long-form
# ('dimension', 'value') pairs; one area is drawn per dimension.
alt.Chart(dimens).transform_fold(
    ['length', 'width', 'height'],
    as_=['dimension', 'value']
).mark_area(
    # No mark-level color here: the color encoding below decides the fill,
    # so the former color="silver" never took effect.
    opacity=0.7,
    line=True,
    interpolate='step'
).encode(
    alt.X('value:Q'),
    # stack=None is required for the areas to overlap: with a color field,
    # area marks are stacked on top of each other by default.
    alt.Y('count()', stack=None),
    alt.Color('dimension:N')
).properties(
    width=500,
    title={'text': 'Overlapping Areas',
           'fontSize': 16}
)

Overlapping Area Charts without the line overlay

In [33]:
# Overlapping Area Charts (step interpolation, no line overlay)
# transform_fold reshapes length/width/height into long-form
# ('dimension', 'value') pairs; one area is drawn per dimension.
alt.Chart(dimens).transform_fold(
    ['length', 'width', 'height'],
    as_=['dimension', 'value']
).mark_area(
    # No mark-level color here: the color encoding below decides the fill,
    # so the former color="silver" never took effect.
    opacity=0.7,
    interpolate='step'
).encode(
    alt.X('value:Q'),
    # stack=None is required for the areas to overlap: with a color field,
    # area marks are stacked on top of each other by default.
    alt.Y('count()', stack=None),
    alt.Color('dimension:N')
).properties(
    width=500,
    title={'text': 'Overlapping Areas',
           'fontSize': 16}
)

## References

- https://altair-viz.github.io/