# More on Plotly Express

In [1]:
# pandas handles the tabular data; Plotly Express draws every figure below
import pandas as pd
pd.set_option('display.max_columns', 10)  # cap the number of columns shown when a DataFrame is displayed
import plotly.express as px
import plotly.io as pio
# Default renderer(s) used by fig.show() throughout the notebook
pio.renderers.default = "plotly_mimetype+notebook_connected"

In [2]:
import warnings
# NOTE: this silences *all* warnings for the rest of the session,
# including deprecation notices that may be worth reading.
warnings.filterwarnings('ignore')

## Automobile Dataset

We will use the Automobile Data Set [https://archive.ics.uci.edu/ml/datasets/automobile] from the UCI Machine Learning Repository [https://archive-beta.ics.uci.edu/]. It includes categorical and continuous variables. 

In [3]:
# Column names to assign to the dataset, in file order
headers = [
    "symboling", "normalized_losses", "make", "fuel_type",
    "aspiration", "num_doors", "body_style", "drive_wheels",
    "engine_location", "wheel_base", "length", "width",
    "height", "curb_weight", "engine_type", "num_cylinders",
    "engine_size", "fuel_system", "bore", "stroke",
    "compression_ratio", "horsepower", "peak_rpm", "city_mpg",
    "highway_mpg", "price",
]

In [4]:
# The file has no header row, so column names come from `headers`;
# "?" entries are read as missing values.
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
df = pd.read_csv(DATA_URL, header=None, names=headers, na_values="?")
df.head(3)

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,...,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,...,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,...,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,...,154.0,5000.0,19,26,16500.0


### Analyzing Missing Values

`Plotly` does not handle missing values well (for example, the `size` argument of `px.scatter` rejects them), so we are going to remove the rows that contain them.

In [5]:
# Count the missing values in each column
df.isna().sum()

symboling             0
normalized_losses    41
make                  0
fuel_type             0
aspiration            0
num_doors             2
body_style            0
drive_wheels          0
engine_location       0
wheel_base            0
length                0
width                 0
height                0
curb_weight           0
engine_type           0
num_cylinders         0
engine_size           0
fuel_system           0
bore                  4
stroke                4
compression_ratio     0
horsepower            2
peak_rpm              2
city_mpg              0
highway_mpg           0
price                 4
dtype: int64

In [6]:
# Drop every row that contains at least one missing value.
# The result is reassigned instead of using inplace=True, which avoids
# mutating the DataFrame object in place.
df = df.dropna()
# Confirm that no missing values remain
df.isnull().sum()

symboling            0
normalized_losses    0
make                 0
fuel_type            0
aspiration           0
num_doors            0
body_style           0
drive_wheels         0
engine_location      0
wheel_base           0
length               0
width                0
height               0
curb_weight          0
engine_type          0
num_cylinders        0
engine_size          0
fuel_system          0
bore                 0
stroke               0
compression_ratio    0
horsepower           0
peak_rpm             0
city_mpg             0
highway_mpg          0
price                0
dtype: int64

## Box, violin, and strip plots

### Boxplots

A boxplot, also called a box-and-whisker plot, summarizes data with five numbers:
- min value
- quartile q1 (25%)
- quartile q2 or median (50%)
- quartile q3 (75%)
- max value

In [7]:
# Price distribution for each manufacturer
fig_x1 = px.box(
    df,
    x="make",
    y="price",
    width=1000,
    title="Price by Make",
)
fig_x1.show()

In [8]:
# Price distribution for each body style, one color per body style
fig_x2 = px.box(
    df,
    x="body_style",
    y="price",
    color="body_style",
    width=800,
    title="Price by Body Style",
)
fig_x2.show()

In [9]:
# Same box plot, with points='all' to draw the individual observations as well
fig_x2 = px.box(
    df,
    x="body_style",
    y="price",
    color="body_style",
    points="all",
    width=800,
    title="Price by Body Style",
)
fig_x2.show()

### Strip Plots

In [10]:
# Strip plot of price for each body style
fig_p = px.strip(
    df,
    x="body_style",
    y="price",
    color="body_style",
    width=800,
    title="Price by Body Style",
)
fig_p.show()

### Violin Plots

Violin plots combine box plots and kernel density graphs. They provide much more information than box plots. For instance, they reveal whether a data distribution is bimodal.

In [11]:
# Violin plot of price for each body style
fig_v = px.violin(
    df,
    x="body_style",
    y="price",
    color="body_style",
    width=800,
    title="Price by Body Style",
)
fig_v.show()

In [12]:
# Violin plot with points='all' to draw the individual observations as well
fig_v = px.violin(
    df,
    x="body_style",
    y="price",
    color="body_style",
    points="all",
    width=800,
    title="Price by Body Style",
)
fig_v.show()

In [13]:
# box=True draws a box plot inside each violin
fig_v = px.violin(
    df,
    x="body_style",
    y="price",
    color="body_style",
    box=True,
    width=800,
    title="Price by Body Style",
)
fig_v.show()

In [14]:
# Violin + inner box plot (box=True) + individual observations (points='all')
fig_v = px.violin(
    df,
    x="body_style",
    y="price",
    color="body_style",
    box=True,
    points="all",
    width=800,
    title="Price by Body Style",
)
fig_v.show()

## Scatterplots

In [15]:
# Basic scatterplot: city mileage against highway mileage
fig_st = px.scatter(
    df,
    x="city_mpg",
    y="highway_mpg",
    width=800,
    title="City (MPG) vs Highway (MPG)",
)
fig_st.show()

In [16]:
# Marker size and marker color both encode the price
fig_st = px.scatter(
    df,
    x="city_mpg",
    y="highway_mpg",
    size="price",
    color="price",
    width=800,
    title="City (MPG) vs Highway (MPG) with Price",
)
fig_st.show()

In [17]:
# Box plots as marginals on both axes
fig_st = px.scatter(
    df,
    x="city_mpg",
    y="highway_mpg",
    size="price",
    color="price",
    marginal_x="box",
    marginal_y="box",
    width=700,
    height=500,
    title="City (MPG) vs Highway (MPG) with Price",
)
fig_st.show()

In [18]:
# Violin plots as marginals on both axes
fig_st = px.scatter(
    df,
    x="city_mpg",
    y="highway_mpg",
    size="price",
    color="price",
    marginal_x="violin",
    marginal_y="violin",
    width=700,
    height=500,
    title="City (MPG) vs Highway (MPG) with Price",
)
fig_st.show()

In [19]:
# Histograms as marginals on both axes
fig_st = px.scatter(
    df,
    x="city_mpg",
    y="highway_mpg",
    size="price",
    color="price",
    marginal_x="histogram",
    marginal_y="histogram",
    width=700,
    height=500,
    title="City (MPG) vs Highway (MPG) with Price",
)
fig_st.show()

In [20]:
# Mixed marginals: a violin plot along x and a box plot along y
fig_st = px.scatter(
    df,
    x="city_mpg",
    y="highway_mpg",
    size="price",
    color="price",
    marginal_x="violin",
    marginal_y="box",
    width=700,
    height=500,
    title="City (MPG) vs Highway (MPG) with Price",
)
fig_st.show()

## Wide-Form vs. Long-Form Data in Plotly

There are some standard conventions for storing column-oriented data in a DataFrame. We will see long-form and wide-form data.
- **Long-form data** has one row per observation and one column per variable. This format is sometimes called "tidy." It helps store multivariate data.
- **Wide-form data** has one row per value of the first variable and one column per value of the second variable. It helps store 2-dimensional data. It is a crosstab.

Suppose we want to work only with two variables: `body_style` and `num_doors`

In [21]:
# Wide-form example: a crosstab with one row per num_doors value
# and one column per body_style value
wideF = pd.crosstab(index=df["num_doors"], columns=df["body_style"])
wideF

body_style,convertible,hardtop,hatchback,sedan,wagon
num_doors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
four,0,0,10,68,17
two,2,5,46,11,0


In [22]:
# Long-form example: melt the crosstab so that each
# (num_doors, body_style) pair becomes its own row
longF = pd.melt(wideF.reset_index(), id_vars=["num_doors"])
longF

Unnamed: 0,num_doors,body_style,value
0,four,convertible,0
1,two,convertible,2
2,four,hardtop,0
3,two,hardtop,5
4,four,hatchback,10
5,two,hatchback,46
6,four,sedan,68
7,two,sedan,11
8,four,wagon,17
9,two,wagon,0


### Graphs using wide-form data

In [23]:
# Bar chart from wide-form data with x and y given explicitly
fig_wd = px.bar(
    wideF,
    x=wideF.index,
    y=wideF.columns,
    width=800,
    title="Number of doors vs body style",
)
fig_wd.show()

In [24]:
# With wide-form data, the x and y parameters can be omitted
fig_wd = px.bar(
    wideF,
    width=800,
    title="Number of doors vs body style",
)
fig_wd.show()

In [25]:
# facet_col: one subplot column per body style
fig_wd = px.bar(
    wideF,
    facet_col="body_style",
    width=1000,
    title="Number of doors vs body style",
)
fig_wd.show()

In [26]:
# facet_row: one subplot row per body style
fig_wd = px.bar(
    wideF,
    facet_row="body_style",
    width=500,
    height=1000,
    title="Number of doors vs body style",
)
fig_wd.show()

### Graphs using long-form data

In [27]:
# Display the long-form table again before plotting it
longF

Unnamed: 0,num_doors,body_style,value
0,four,convertible,0
1,two,convertible,2
2,four,hardtop,0
3,two,hardtop,5
4,four,hatchback,10
5,two,hatchback,46
6,four,sedan,68
7,two,sedan,11
8,four,wagon,17
9,two,wagon,0


In [28]:
# Using the long-form data, you do need to specify x and y parameters.
# Without them, px.bar treats longF as wide-form data and raises a ValueError.
# The error is caught and printed so the demonstration stays visible without
# stopping "Run All" before the remaining cells execute.
try:
    fig_lg = px.bar(longF,
                width=800, title='Number of doors vs body style')
    fig_lg.show()
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Plotly Express cannot process wide-form data with columns of different type.

In [None]:
# Bar chart from long-form data: x and y must be named explicitly
fig_lg = px.bar(
    longF,
    x="body_style",
    y="value",
    width=800,
    title="Number of doors vs body style",
)
fig_lg.show()

In [None]:
# color: one bar color per num_doors value
fig_lg = px.bar(
    longF,
    x="body_style",
    y="value",
    color="num_doors",
    width=800,
    title="Number of doors vs body style",
)
fig_lg.show()

In [None]:
# Line chart from long-form data, one colored line per num_doors value
fig_ln = px.line(
    longF,
    x="body_style",
    y="value",
    color="num_doors",
    width=800,
    title="Number of doors vs body style",
)
fig_ln.show()

In [None]:
# line_dash works like color, but varies the dash pattern instead
fig_ln = px.line(
    longF,
    x="body_style",
    y="value",
    line_dash="num_doors",
    width=800,
    title="Number of doors vs body style",
)
fig_ln.show()

In [None]:
# color and line_dash together: both encode num_doors
fig_ln = px.line(
    longF,
    x="body_style",
    y="value",
    color="num_doors",
    line_dash="num_doors",
    width=800,
    title="Number of doors vs body style",
)
fig_ln.show()

## Tooltips

Plotly brings interactive tooltips. 
- `hover_data`: lets you add or remove fields from the tooltip by setting them to True/False
- `labels`: lets you rename the columns as they appear in the tooltip (and in the axis titles and legend)
- `hover_name`: shows the value of this column in bold at the top of the tooltip

In [None]:
# Average price per make, split by fuel type.
# Hover over the bars to see fuel_type, make, and price.
fig_h = px.histogram(
    df,
    x="make",
    y="price",
    color="fuel_type",
    histfunc="avg",
    width=1000,
    title="Average of Price with Make and Fuel Type",
)
fig_h.show()

In [None]:
# Setting fuel_type to False in hover_data removes it from the tooltip,
# so hovering shows only make and price.
fig_h = px.histogram(
    df,
    x="make",
    y="price",
    color="fuel_type",
    histfunc="avg",
    hover_data={"fuel_type": False},
    width=1000,
    title="Average of Price with Make and Fuel Type",
)
fig_h.show()

When you hover the mouse over the data points, you will see the values of make and price; you won't see fuel_type because we set it to False.

In [None]:
# labels replaces the raw column names with display names
fig_h = px.histogram(
    df,
    x="make",
    y="price",
    color="fuel_type",
    histfunc="avg",
    hover_data={"fuel_type": False},
    labels={"price": "Price", "make": "Make", "fuel_type": "Fuel Type"},
    width=1000,
    title="Average of Price with Make and Fuel Type",
)
fig_h.show()

We modified the labels. Now we have Price and Make with capital letters. 

Notice we also changed fuel_type to Fuel Type. You can see that in the legend.

Let's create a bar chart using the same variables.

In [None]:
# Bar chart with the same variables, tooltip settings, and labels
fig_b = px.bar(
    df,
    x="make",
    y="price",
    color="fuel_type",
    hover_data={"fuel_type": False},
    labels={"price": "Price", "make": "Make", "fuel_type": "Fuel Type"},
    width=1000,
    title="Car Price with Make and Fuel Type",
)
fig_b.show()

Notice all the labels have capital letters. 

Now we want to visualize the car price with one decimal place. Let's see!

In [None]:
# The ':.1f' entry in hover_data formats price with one decimal place in the tooltip
fig_b = px.bar(
    df,
    x="make",
    y="price",
    color="fuel_type",
    hover_data={"fuel_type": False, "price": ":.1f"},
    labels={"price": "Price", "make": "Make", "fuel_type": "Fuel Type"},
    width=1000,
    title="Car Price with Make and Fuel Type",
)
fig_b.show()

Hover the mouse over the data points to see the price value with one decimal place. 

Finally, we want to add one more piece of information: the horsepower value of each car.

In [None]:
# hover_name puts each car's horsepower value at the top of its tooltip
fig_b = px.bar(
    df,
    x="make",
    y="price",
    color="fuel_type",
    hover_data={"fuel_type": False, "price": ":.1f"},
    labels={"price": "Price", "make": "Make", "fuel_type": "Fuel Type"},
    hover_name="horsepower",
    width=1000,
    title="Car Price with Make and Fuel Type",
)
fig_b.show()

When you hover over a data point, you will see a number in the top left corner of the tooltip. That is the horsepower value for the car.

## Color Related Parameters

In [None]:
# color_discrete_sequence: supply the bar colors as a list of colors
fig_b = px.bar(
    df,
    x="make",
    y="price",
    color="fuel_type",
    color_discrete_sequence=["green", "orange"],
    labels={"price": "Price", "make": "Make", "fuel_type": "Fuel Type"},
    width=1000,
    title="Car Price with Make and Fuel Type",
)
fig_b.show()

In [None]:
# color_discrete_map: assign a color to each fuel_type value with a dictionary
fig_b = px.bar(
    df,
    x="make",
    y="price",
    color="fuel_type",
    color_discrete_map={"gas": "grey", "diesel": "brown"},
    labels={"price": "Price", "make": "Make", "fuel_type": "Fuel Type"},
    width=1000,
    title="Car Price with Make and Fuel Type",
)
fig_b.show()

In [None]:
# color_continuous_scale: price is numeric, so the bars are colored along a
# continuous color scale (here 'ice_r') rather than with discrete colors
fig_b = px.bar(
    df,
    x="make",
    y="price",
    color="price",
    color_continuous_scale="ice_r",
    labels={"price": "Price", "make": "Make"},
    width=1000,
    title="Car Price vs Make",
)
fig_b.show()

## Reference

- https://plotly.com/python/plotly-express/