In [1]:
import math
import statistics
import numpy as np
import scipy.stats 
import pandas as pd

### Variance
Variance quantifies the spread of the data: it shows numerically how far the data points are from the mean.

In [7]:
# Base sample shared by the variance, standard-deviation and skewness cells.
x = [8.0, 1, 2.5, 4, 28.0]
# The same sample with a missing value (nan) spliced in just before the 4.
x_with_nan = [*x[:3], math.nan, *x[3:]]

In [8]:
# Sample variance computed with the standard-library statistics module.
var_ = statistics.variance(x)
var_

123.2

In [9]:
# A nan in the input propagates: the result is nan rather than an error.
statistics.variance(x_with_nan)

nan

In [10]:
# ddof=1 selects the sample variance (divide by n - 1, Bessel's correction);
# NumPy's default, ddof=0, gives the population variance (divide by n).
var_ = np.var(x, ddof=1)
# Equivalent ndarray method once x is wrapped in an array: np.array(x).var(ddof=1)
var_

123.19999999999999

In [11]:
# NumPy array versions of both samples
y = np.array(x)
y_with_nan = np.array(x_with_nan)

# pandas Series versions of both samples
z = pd.Series(x)
z_with_nan = pd.Series(x_with_nan)

In [12]:
# Same sample variance, this time computed from the ndarray.
np.var(y, ddof=1)

123.19999999999999

In [13]:
# pandas skips nan by default, so the result matches the nan-free sample.
z_with_nan.var(ddof=1)

123.19999999999999

### Standard Deviation
The sample standard deviation is another measure of data spread. It’s connected to the sample variance, as standard deviation, 𝑠, is "the positive square root of the sample variance".

The standard deviation is often more convenient than the variance because it has the same unit as the data points

In [15]:
# The standard deviation is the positive square root of the variance held in var_.
std_ = var_ ** 0.5
std_

11.099549540409285

In [16]:
# Sample standard deviation straight from the standard-library statistics module.
std_ = statistics.stdev(x)
# Echo the value so this cell displays its result, like the other cells in this section.
std_


In [17]:
# NumPy function form; ddof=1 gives the sample standard deviation.
np.std(y, ddof=1)

11.099549540409285

In [18]:
# Equivalent ndarray method.
y.std(ddof=1)

11.099549540409285

In [19]:
# A nan in the array propagates to the result.
y_with_nan.std(ddof=1)

nan

In [20]:
# np.nanstd ignores nan entries, recovering the value of the nan-free sample.
np.nanstd(y_with_nan, ddof=1)

11.099549540409285

In [22]:
# pd.Series objects also have the method .std(), which skips nan by default
# (skipna=True), so no special handling is needed.
z_with_nan.std(ddof=1)

11.099549540409285

### Skewness
1. Skewness measures the asymmetry of a data sample.
2. Usually, negative skewness values indicate that there’s a dominant tail on the left side, as in the first set in the figure below. Positive skewness values correspond to a longer or fatter tail on the right side, as in the second set in the figure below.
![image.png](https://files.realpython.com/media/py-stats-07.92abf9f362b0.png)
3. If the skewness is close to 0 (for example, between −0.5 and 0.5), then the dataset is considered quite symmetrical.

In [27]:
# Rebuild the NumPy arrays from the sample lists.
y = np.array(x)
y_with_nan = np.array(x_with_nan)
# bias=False enables the corrections for statistical bias.
scipy.stats.skew(y, bias=False)

1.9470432273905927

In [24]:
# With its default settings scipy.stats.skew propagates nan to the result.
scipy.stats.skew(y_with_nan, bias=False)

nan

In [28]:
# Wrap both samples as pandas Series and use the built-in .skew() method.
z = pd.Series(x)
z_with_nan = pd.Series(x_with_nan)
z.skew()

1.9470432273905924

In [29]:
# Series.skew() ignores nan by default, so this matches the nan-free result.
z_with_nan.skew()

1.9470432273905924

### Percentiles

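The p-th percentile is a single cut-off value below which p% of the observations fall — it is not the average of the lowest p% of items. When the cut-off falls between two data points, NumPy and pandas interpolate linearly by default, so the result is not always an element of the dataset (for example, the 5% quantile computed below is -3.44).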

In [42]:
# New, larger sample for the percentile examples (rebinds the name x used earlier).
x = [
    -5.0, -1.1, 0.1,
    2.0, 8.0, 12.8,
    21.0, 25.8, 41.0,
]

In [43]:
# Wrap the new sample in an ndarray (rebinding y) and take the 25th percentile,
# i.e. the first quartile.
y = np.array(x)
np.percentile(y, 25)

0.1

In [44]:
# 50th percentile (second quartile).
np.percentile(y, 50)

8.0

In [45]:
# The median is the same as the 50th percentile (second quartile).
np.median(y)

8.0

In [46]:
# Pass a sequence of percentiles to compute several in one call.
np.percentile(y, [25, 50, 75])

array([ 0.1,  8. , 21. ])

In [47]:
# Build a copy of y with a nan inserted at index 2, to demonstrate the
# nan-ignoring functions in the following cells.
y_with_nan = np.insert(y, 2, np.nan)
y_with_nan

array([-5. , -1.1,  nan,  0.1,  2. ,  8. , 12.8, 21. , 25.8, 41. ])

In [48]:
# np.nanpercentile ignores nan entries, so the quartiles match the nan-free sample.
np.nanpercentile(y_with_nan, [25, 50, 75])

array([ 0.1,  8. , 21. ])

In [49]:
# NumPy: another way is to use quantile() and nanquantile(), which take
# fractions in [0, 1] instead of percentages.
np.quantile(y, [0.25, 0.5, 0.75])

array([ 0.1,  8. , 21. ])

In [53]:
# nan-ignoring counterpart of np.quantile().
np.nanquantile(y_with_nan, [0.25, 0.5, 0.75])

array([ 0.1,  8. , 21. ])

In [54]:
# pd.Series objects expose the same computation through the .quantile() method.
z = pd.Series(y)
z_with_nan = pd.Series(y_with_nan)
z.quantile(0.05)

-3.44

In [55]:
# Series.quantile() ignores nan by default and accepts a list of quantiles,
# returning a Series indexed by the requested quantile.
z_with_nan.quantile([0.25, 0.5, 0.75])

0.25     0.1
0.50     8.0
0.75    21.0
dtype: float64

### Ranges
The range of data is the difference between the maximum and minimum element in the dataset.

In [56]:
# Show the four containers used in the range examples, one after another.
print(y, y_with_nan, z, z_with_nan, sep="\n")

[-5.  -1.1  0.1  2.   8.  12.8 21.  25.8 41. ]
[-5.  -1.1  nan  0.1  2.   8.  12.8 21.  25.8 41. ]
0    -5.0
1    -1.1
2     0.1
3     2.0
4     8.0
5    12.8
6    21.0
7    25.8
8    41.0
dtype: float64
0    -5.0
1    -1.1
2     NaN
3     0.1
4     2.0
5     8.0
6    12.8
7    21.0
8    25.8
9    41.0
dtype: float64


In [57]:
# np.ptp ("peak to peak") returns max - min, i.e. the range.
np.ptp(y)

46.0

In [58]:
# np.ptp is applied here to the pandas Series as well.
np.ptp(z)

46.0

In [59]:
# Caution: np.ptp does not skip nan, so a single missing value makes the range nan.
# A nan-ignoring alternative is np.nanmax(y_with_nan) - np.nanmin(y_with_nan).
np.ptp(y_with_nan)

nan

In [60]:
# The Series with a nan also yields nan here; z_with_nan.max() - z_with_nan.min()
# would skip the nan instead.
np.ptp(z_with_nan)

nan

### Summary of Descriptive Statistics

In [62]:
# scipy.stats.describe() bundles several descriptive statistics into one result.
# ddof=1 requests the sample variance; bias=False corrects skewness and kurtosis
# for statistical bias.
print(y)
result = scipy.stats.describe(y, ddof=1, bias=False)
result

[-5.  -1.1  0.1  2.   8.  12.8 21.  25.8 41. ]


DescribeResult(nobs=9, minmax=(-5.0, 41.0), mean=11.622222222222222, variance=228.75194444444446, skewness=0.9249043136685094, kurtosis=0.14770623629658886)

In [64]:
# Same summary for the array containing a nan; nan_policy='omit' makes describe() ignore it.
print(y_with_nan)
result = scipy.stats.describe(y_with_nan, ddof=1, bias=False, nan_policy='omit')
result

[-5.  -1.1  nan  0.1  2.   8.  12.8 21.  25.8 41. ]


DescribeResult(nobs=9, minmax=(masked_array(data=-5.,
             mask=False,
       fill_value=1e+20), masked_array(data=41.,
             mask=False,
       fill_value=1e+20)), mean=11.622222222222222, variance=228.75194444444446, skewness=masked_array(data=0.92490431,
             mask=False,
       fill_value=1e+20), kurtosis=0.14770623629658886)

Returned results:
- nobs: the number of observations or elements in your dataset
- minmax: the tuple with the minimum and maximum values of your dataset
- mean: the mean of your dataset
- variance: the variance of your dataset
- skewness: the skewness of your dataset
- kurtosis: the kurtosis of your dataset

In [65]:
# pandas Series objects also have a .describe() method.
result = z.describe()
result

count     9.000000
mean     11.622222
std      15.124548
min      -5.000000
25%       0.100000
50%       8.000000
75%      21.000000
max      41.000000
dtype: float64