## Plotly Module To Work With Statistical Based Charts

### Importing The Necessary Libraries

In [140]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from plotly.offline import iplot
import plotly as py
import plotly.tools as tls
import cufflinks as cf

### Sets Plotly And Cufflinks As Offline Mode

In [2]:
py.offline.init_notebook_mode(connected = True)
cf.go_offline()

# Histrogram Charts

    '''
        In statistics, a histogram is representation of the distribution of numerical data, where the 
        data are binned and the count for each bin is represented. 
        
        More generally, in plotly a histogram is an aggregated bar chart, with several possible 
        aggregation functions (e.g. sum, average, count...). Also, the data to be binned can be
        numerical data but also categorical or date data.
    
    '''

### Histogram with Plotly Express

In [141]:
import plotly.express as px
df1 = px.data.tips()
fig = px.histogram(df1, x="total_bill")
fig.show()

In [6]:
import plotly.express as px
df1 = px.data.tips()
# Here we use a column with categorical data
fig = px.histogram(df1, x="day")
fig.show()

### Choosing the number of bins

    '''
        By default, the number of bins is chosen so that this number is comparable to the 
        typical number of samples in a bin. 
        This number can be customized, as well as the range of values.

    '''


In [7]:
import plotly.express as px
df1 = px.data.tips()
fig = px.histogram(df1, x="total_bill", nbins=20)
fig.show()

### Aspect of the histogram plot

In [82]:
df1 = px.data.tips()
fig = px.histogram(df1, x="total_bill", color = 'sex',
                   title='Histogram of bills',
                   labels={'total_bill':'total bill'}, # can specify one label per df1 column
                   opacity=0.8, # opacity 80%
                   log_y=True, # represent bars with log scale
                   color_discrete_sequence=['indianred', 'blue'] # color of histogram bars
                   )
fig.show()

### Using histfunc

    '''
        For each bin of x, one can compute a function of data using histfunc. The argument of histfunc 
        is the dataframe column given as the y argument. 
        
        Below the plot shows that the average tip increases with the total bill.

    '''

In [83]:
df1 = px.data.tips()
fig = px.histogram(df1, x="total_bill", y="tip", histfunc='avg')
fig.show()

### Visualizing the distribution

    '''
        With the marginal keyword, a subplot is drawn alongside the histogram, visualizing the distribution. 
        See the distplot pagefor more examples of combined statistical representations.

    '''

In [85]:
df1 = px.data.tips()
fig = px.histogram(df1, x="total_bill", color="sex", marginal="rug", # can be `box`, `violin`
                         hover_data=df1.columns)
fig.show()

### Wokring With Randomly Generated DataFrame Data Sets

In [16]:
import numpy as np
import random 
import pandas as pd

In [17]:
np.random.seed(1234)

In [34]:
age = np.round(np.random.uniform(low = 21, high = 75, size = 100), 0) # rounded the age to 0 decimal point
salary = np.round(np.random.normal(loc = 3000, scale = 1000, size = 100), 4) # rounded the salart value up to 4 decimal point
binary_gender = random.choices(['Female', 'Male'], k = 100) # generating 100 male or females value as list item

In [36]:
# now creating the DataFrame
df2 = pd.DataFrame(data = dict(Age = age, Salary = salary, Gender = binary_gender ))
df2.head()

Unnamed: 0,Age,Salary,Gender
0,65.0,2387.4219,Female
1,32.0,2939.5123,Female
2,27.0,4092.6378,Female
3,60.0,953.9074,Female
4,44.0,3419.0311,Female


In [46]:
### Now Plotting The Age Column
fig = px.histogram(data_frame = df2, x = df2['Age'], nbins = 50, title = 'Age Distribution Hist')
fig.show()

In [56]:
import plotly.graph_objects as go
trace = go.Histogram(x = df2['Age'])
data = [trace]
layout = {'title' : 'Distribution Of Ages',
         'xaxis':{'title': 'Ages'}
         }
iplot({'data' : data, 'layout':layout})

# Type of normalization

    '''
        The default mode is to represent the count of samples in each bin. With the histnorm argument, 
        it is also possible to represent the percentage or fraction of samples in 
        each bin (histnorm='percent' or probability), or a density histogram (the sum of all bar 
        areas equals the total number of sample points, density), or a probability 
        density histogram (the sum of all bar areas equals 1, probability density).

    '''

### Frequency Distribution(Normalized Histrogram)

In [47]:
fig = px.histogram(df2, x=df2['Age'], histnorm='probability density')
fig.show()

In [48]:
fig = px.histogram(df2, x=df2['Age'], histnorm='density')
fig.show()

### OverLay Fashion

In [62]:
# filtering out the male and female candidates
female = df2[df2['Gender'] == 'Female']
male = df2[df2['Gender'] == 'Male']

In [65]:
trace0 = go.Histogram(x = female.Age, name = 'Female Age')
trace1 = go.Histogram(x = male.Age, name = 'Male Age')
data1 =  [trace0 , trace1]
data1

[Histogram({
     'name': 'Female Age',
     'x': array([65., 32., 27., 60., 44., 73., 69., 57., 56., 73., 47., 42., 24., 67.,
                 27., 53., 41., 55., 40., 70., 45., 58., 42., 62., 33., 58., 58., 40.,
                 42., 24., 65., 25., 65., 29., 36., 56., 50., 46., 40., 42., 31., 71.,
                 50., 42., 36., 60., 70., 48., 65., 49., 43., 23., 27., 54., 25., 66.])
 }),
 Histogram({
     'name': 'Male Age',
     'x': array([68., 42., 27., 52., 32., 43., 48., 55., 57., 59., 65., 71., 45., 42.,
                 30., 46., 37., 68., 51., 28., 74., 40., 42., 74., 33., 63., 56., 39.,
                 55., 50., 59., 56., 28., 73., 26., 33., 59., 30., 50., 53., 44., 66.,
                 42., 30.])
 })]

In [69]:
trace0 = go.Histogram(x = female.Age, name = 'Female Age')
trace1 = go.Histogram(x = male.Age, name = 'Male Age', opacity = 0.8)
data1 =  [trace0 , trace1]

layout = {'title': "Ages By Genderwise",
         'xaxis': {'title': 'Ages'},
         'barmode': 'overlay'}

iplot({'data': data1, 'layout':layout})

In [86]:
# specifying the data set we need
x0 = female.Age
x1 = male.Age

#creating the Figure Object
fig = go.Figure()


#adding the traces
fig.add_trace(go.Histogram(x=x0, name = 'Female Age'))
fig.add_trace(go.Histogram(x=x1, name = 'Male Age'))

# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

### Stacked Fashion

In [71]:
trace0 = go.Histogram(x = female.Age, name = 'Female Age')
trace1 = go.Histogram(x = male.Age, name = 'Male Age', opacity = 0.8)
data1 =  [trace0 , trace1]

layout = {'title': "Ages By Genderwise",
         'xaxis': {'title': 'Ages'},
         'barmode': 'stack'}

iplot({'data': data1, 'layout':layout})

### Specifying The Bins Size

In [74]:
trace0 = go.Histogram(x = female.Age, name = 'Female Age', opacity = 0.4,
                     xbins = dict(start = 20, end = 80, size = 5)) #xbins incremented by 5
trace1 = go.Histogram(x = male.Age, name = 'Male Age', opacity = 0.8,
                     xbins = dict(start = 20, end = 80, size = 5)) #xbins incremented by 5
data1 =  [trace0 , trace1]

layout = {'title': "Ages By Genderwise",
         'xaxis': {'title': 'Ages'},
         'barmode': 'stack'}

iplot({'data': data1, 'layout':layout})

In [77]:
trace0 = go.Histogram(x = female.Salary, name = 'Female Salary',
                     xbins = dict(start = 500, end = 5500, size = 200)) #xbins incremented by 5
trace1 = go.Histogram(x = male.Salary, name = 'Male Salary', opacity = 0.8,
                     xbins = dict(start = 500, end = 5500, size = 200)) #xbins incremented by 5
data1 =  [trace0 , trace1]

layout = {'title': "Salary By Genderwise",
         'xaxis': {'title': 'Salaries'},
         'barmode': 'stack'}

iplot({'data': data1, 'layout':layout})

### Styling The Histrogram

In [88]:
x0 = female.Salary
x1 = male.Salary

fig = go.Figure()
fig.add_trace(go.Histogram(
    x=x0,
    histnorm='percent',
    name='Female Salary', # name used in legend and hover labels
    xbins=dict( # bins used for histogram
        start=-500,
        end=5500,
        size=200
    ),
    marker_color='#EB89B5',
    opacity=0.75
))
fig.add_trace(go.Histogram(
    x=x1,
    histnorm='percent',
    name='Male Salary',
    xbins=dict(
        start=500,
        end=5500,
        size=200
    ),
    marker_color='#330C73',
    opacity=0.75
))

fig.update_layout(
    title_text='Salary Genderwise', # title of plot
    xaxis_title_text='Salaries', # xaxis label
    yaxis_title_text='Count', # yaxis label
    bargap=0.2, # gap between bars of adjacent location coordinates
    bargroupgap=0.1 # gap between bars of the same location coordinates
)

fig.show()

### Cumulative Histogram

In [89]:
x1 = male.Salary
fig = go.Figure(data=[go.Histogram(x=x1, cumulative_enabled=True)])

fig.show()

### Specify Aggregation Function

In [90]:
x = ["Apples","Apples","Apples","Oranges", "Bananas"]
y = ["5","10","3","10","5"]

fig = go.Figure()
fig.add_trace(go.Histogram(histfunc="count", y=y, x=x, name="count"))
fig.add_trace(go.Histogram(histfunc="sum", y=y, x=x, name="sum"))

fig.show()

### Custom Binning

    '''
        For custom binning along x-axis, use the attribute nbinsx. Please note that the 
        autobin algorithm will choose a 'nice' round bin size that may result in somewhat 
        fewer than nbinsx total bins. Alternatively, you can set the exact values 
        for xbins along with "autobinx" = False.
    
    '''

In [91]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

x = ['1970-01-01', '1970-01-01', '1970-02-01', '1970-04-01', '1970-01-02',
     '1972-01-31', '1970-02-13', '1971-04-19']

fig = make_subplots(rows=3, cols=2)

trace0 = go.Histogram(x=x, nbinsx=4)
trace1 = go.Histogram(x=x, nbinsx = 8)
trace2 = go.Histogram(x=x, nbinsx=10)
trace3 = go.Histogram(x=x,
                      xbins=dict(
                      start='1969-11-15',
                      end='1972-03-31',
                      size='M18'), # M18 stands for 18 months
                      autobinx=False
                     )
trace4 = go.Histogram(x=x,
                      xbins=dict(
                      start='1969-11-15',
                      end='1972-03-31',
                      size='M4'), # 4 months bin size
                      autobinx=False
                      )
trace5 = go.Histogram(x=x,
                      xbins=dict(
                      start='1969-11-15',
                      end='1972-03-31',
                      size= 'M2'), # 2 months
                      autobinx = False
                      )

fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)
fig.append_trace(trace2, 2, 1)
fig.append_trace(trace3, 2, 2)
fig.append_trace(trace4, 3, 1)
fig.append_trace(trace5, 3, 2)

fig.show()

# Distribution Charts

In [111]:
import plotly.express as px
df3 = px.data.tips()
fig = px.histogram(df3, x="total_bill", y="tip", color="sex", marginal="rug", #or violin, box
                   hover_data=df3.columns)
fig.show()

### Working Figure Factory Package In Plotly 

    '''
    
        The distplot figure factory displays a combination of statistical representations of numerical data, 
        such as histogram, kernel density estimation or normal curve, and rug plot.

    '''

In [93]:
import plotly.figure_factory as ff


In [104]:
##checking how many function we have here
help(ff)

Help on package plotly.figure_factory in plotly:

NAME
    plotly.figure_factory

PACKAGE CONTENTS
    _2d_density
    _annotated_heatmap
    _bullet
    _candlestick
    _county_choropleth
    _dendrogram
    _distplot
    _facet_grid
    _gantt
    _ohlc
    _quiver
    _scatterplot
    _streamline
    _table
    _ternary_contour
    _trisurf
    _violin
    utils

FUNCTIONS
    create_2d_density(x, y, colorscale='Earth', ncontours=20, hist_color=(0, 0, 0.5), point_color=(0, 0, 0.5), point_size=2, title='2D Density Plot', height=600, width=600)
        **deprecated**, use instead
        :func:`plotly.express.density_heatmap`.
        
        :param (list|array) x: x-axis data for plot generation
        :param (list|array) y: y-axis data for plot generation
        :param (str|tuple|list) colorscale: either a plotly scale name, an rgb
            or hex color, a color tuple or a list or tuple of colors. An rgb
            color is of the form 'rgb(x, y, z)' where x, y, z belong to 

In [94]:
age = np.round(np.random.uniform(low = 21, high = 75, size = 100), 0) # rounded the age to 0 decimal point
salary = np.round(np.random.normal(loc = 3000, scale = 1000, size = 100), 4) # rounded the salart value up to 4 decimal point
binary_gender = random.choices(['Female', 'Male'], k = 100) # generating 100 male or females value as list item

In [95]:
# now creating the DataFrame
df2 = pd.DataFrame(data = dict(Age = age, Salary = salary, Gender = binary_gender ))
df2.head()

Unnamed: 0,Age,Salary,Gender
0,31.0,2777.4477,Male
1,56.0,3223.1075,Male
2,71.0,3051.6486,Male
3,32.0,3499.1673,Female
4,70.0,2188.752,Male


In [102]:
salary = df2.Salary.values.tolist() # df2.Salary cut the dataframe into Salary Series then df2.Salary.values makes it a ndarray
                                    # then df2.Salary.values.tolist() to makes it list of items
#salary

### Fuction Used -->
'''
    create_distplot(hist_data, group_labels, bin_size=1.0, curve_type='kde', colors=None, rug_text=None, 
    histnorm='probability density', show_hist=True, show_curve=True, show_rug=True)
'''

In [109]:
fig = ff.create_distplot(hist_data = [df2.Salary.values.tolist()],
                    group_labels = ['Salary Distribution'], # group_labels arguments need to be provided or u get TypeError
                    bin_size = [200])
iplot(fig)

### Plot Multiple Datasets

In [112]:
# filtering out the male and female candidates
female = df2[df2['Gender'] == 'Female']
male = df2[df2['Gender'] == 'Male']

In [123]:
hist_data = [female.Salary.values.tolist(), male.Salary.values.tolist()]
group_labels = ["Female Salary", "Male Salary"]
bins_sizes = [200, 200]
color = ['blue', 'rgba(55, 150, 200, 0.8)']
fig = ff.create_distplot(hist_data= hist_data, group_labels = group_labels, bin_size = bins_sizes, colors = color)
fig.show()

###

### Plot Normal Curve

In [129]:
hist_data = [female.Salary.values.tolist(), male.Salary.values.tolist()]
group_labels = ["Female Salary", "Male Salary"]
bins_sizes = [200, 200]
color = ['blue', 'rgba(55, 150, 200, 0.8)']
fig = ff.create_distplot(hist_data= hist_data, group_labels = group_labels, bin_size = bins_sizes, 
                         colors = color, curve_type = 'normal')
fig.layout.update({'title': 'Normally Fitted Chart'})
fig.show()

### Plot Only Curve and Rug

In [125]:
import plotly.figure_factory as ff
import numpy as np

x1 = np.random.randn(200) - 1
x2 = np.random.randn(200)
x3 = np.random.randn(200) + 1

hist_data = [x1, x2, x3]

group_labels = ['Group 1', 'Group 2', 'Group 3']
colors = ['#333F44', '#37AA9C', '#94F3E4']

# Create distplot with curve_type set to 'normal'
fig = ff.create_distplot(hist_data, group_labels, show_hist=False, colors=colors) #show_rug = False also make the rugplot not shown

# Add title
fig.update_layout(title_text='Curve and Rug Plot')
fig.show()

### Plot Only Hist and Rug With Different Bins Sizes

In [128]:

x1 = np.random.randn(200) - 1
x2 = np.random.randn(200)
x3 = np.random.randn(200) + 1

hist_data = [x1, x2, x3]

group_labels = ['Group 1', 'Group 2', 'Group 3']
colors = ['#835AF1', '#7FA6EE', '#B8F7D4']

# Create distplot with curve_type set to 'normal'
fig = ff.create_distplot(hist_data, group_labels, colors=colors,
                         bin_size=[0.3, 0.2, 0.1],
                         show_curve=False, )

# Add title
fig.update_layout(title_text='Hist and Rug Plot')
fig.show()

# Box Charts

    '''
        A box plot is a statistical representation of numerical data through their quartiles.
        The ends of the box represent the lower and upper quartiles, while the 
        median (second quartile) is marked by a line inside the box. 
    
    '''

### Box Plot with plotly.express

In [130]:
import plotly.express as px
df3 = px.data.tips()
fig = px.box(df3, y="total_bill")
fig.show()

In [131]:
df3 = px.data.tips()
fig = px.box(df3, x="time", y="total_bill")
fig.show()

### Display the underlying data

    '''
        With the points argument, display underlying data points with either all points (all), outliers only
        (outliers, default),or none of them (False).
    '''

In [132]:
df3 = px.data.tips()
fig = px.box(df3, x="time", y="total_bill", points="all")
fig.show()

### Choosing The Algorithm For Computing Quartiles

'''
    
    By default, quartiles for box plots are computed using the linear method.
    
    However, you can also choose to use an exclusive or an inclusive algorithm to compute quartiles.
    

    The exclusive algorithm uses the median to divide the ordered dataset into two halves. If the 
    sample is odd, it does not include the median in either half. 
    Q1 is then the median of the lower half and Q3 is the median of the upper half.

    
    The inclusive algorithm also uses the median to divide the ordered dataset into two halves, 
    but if the sample is odd,it includes the median in both halves. 
    Q1 is then the median of the lower half and Q3 the median of the upper half.


'''

In [133]:
df3 = px.data.tips()

fig = px.box(df3, x="day", y="total_bill", color="smoker")
fig.update_traces(quartilemethod="exclusive") # or "inclusive", or "linear" by default
fig.show()

### Difference Between Quartile Algorithms

    '''
        It can sometimes be difficult to see the difference between the linear, inclusive, 
        and exclusive algorithms for computing quartiles. 
        
        In the following example, the same dataset is visualized using each of the three different 
        quartile computation algorithms.
    
    '''

In [134]:
data = [1,2,3,4,5,6,7,8,9]
df = pd.DataFrame(dict(
    linear=data,
    inclusive=data,
    exclusive=data
)).melt(var_name="quartilemethod")


fig = px.box(df, y="value", facet_col="quartilemethod", color="quartilemethod",
             boxmode="overlay", points='all')

fig.update_traces(quartilemethod="linear", jitter=0, col=1)
fig.update_traces(quartilemethod="inclusive", jitter=0, col=2)
fig.update_traces(quartilemethod="exclusive", jitter=0, col=3)

fig.show()