In [1]:
import math
import statistics
import numpy as np
import scipy.stats 
import pandas as pd

### Basic

In [4]:
# Sample dataset used by the examples in this notebook
x = [8, 1, 2.5, 4, 28]
# The same values with a missing value (nan) inserted at index 3
x_with_nan = [8.0, 1, 2.5, math.nan, 4, 28.0]

[8.0, 1, 2.5, nan, 4, 28.0]

In [5]:
x

[8, 1, 2.5, 4, 28]

In [6]:
x_with_nan

[8.0, 1, 2.5, nan, 4, 28.0]

In [8]:
math.isnan(np.nan)

True

In [9]:
np.isnan(x_with_nan[3])

True

In [11]:
# NumPy arrays built from the plain Python lists
y = np.array(x)
y_with_nan = np.array(x_with_nan)

# pandas Series built from the same lists
z = pd.Series(x)
z_with_nan = pd.Series(x_with_nan)

In [12]:
y

array([ 8. ,  1. ,  2.5,  4. , 28. ])

In [13]:
y_with_nan

array([ 8. ,  1. ,  2.5,  nan,  4. , 28. ])

In [14]:
z

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64

In [15]:
z_with_nan

0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64

### Mean: 

It’s the sum of all the elements 𝑥ᵢ divided by the number of items in the dataset 𝑥.

In [18]:
mean_ = statistics.fmean(x)
mean_

8.7

In [19]:
mean_ = y.mean()
mean_

8.7

In [20]:
y_with_nan.mean() # ndarray.mean() does not ignore nan, so the result is nan

nan

In [21]:
np.nanmean(y_with_nan) 

8.7

In [22]:
z.mean()

8.7

In [23]:
z_with_nan.mean() # Series.mean() ignores nan by default

8.7

### Weighted Mean:

You define one weight 𝑤ᵢ for each data point 𝑥ᵢ of the dataset 𝑥, where 𝑖 = 1, 2, …, 𝑛 and 𝑛 is the number of items in 𝑥. 

Then, you multiply each data point by the corresponding weight, sum all the products, and divide the obtained sum by the sum of the weights: Σᵢ(𝑤ᵢ𝑥ᵢ) / Σᵢ𝑤ᵢ.


The weighted mean is very handy when you need the mean of a dataset containing items that occur with given relative frequencies.

In [27]:
#  For example, say that you have a set in which 20% of all items are equal to 2, 50% of the items are equal to 4, and the remaining 30% of the items are equal to 8. You can calculate the mean of such a set like this:
0.2 * 2 + 0.5 * 4 + 0.3 * 8


4.8

In [29]:
x = [8.0, 1, 2.5, 4, 28.0]
w = [0.1, 0.2, 0.3, 0.25, 0.15]
# Every data point needs exactly one weight; a longer weight list would
# otherwise inflate the denominator without contributing to the numerator.
assert len(x) == len(w), "x and w must have the same length"
# Weighted mean: sum of the weight * value products divided by the sum of the weights.
wmean = sum(w_i * x_i for x_i, w_i in zip(x, w)) / sum(w)
wmean

6.95

In [31]:
# np.average() computes the weighted mean of NumPy arrays and pandas Series alike.
y = np.array(x)
z = pd.Series(x)
w = np.array(w)
wmean = np.average(y, weights=w)
wmean

6.95

In [32]:
wmean = np.average(z, weights=w)
wmean

6.95

In [36]:
# Another way: the same formula written with NumPy's vectorized array operations
(w * y).sum() / w.sum()

6.95

In [41]:
# Be careful if your dataset contains nan values:
# nan propagates through both formulas, even though its weight here is 0.0.
w = np.array([0.1, 0.2, 0.3, 0.0, 0.2, 0.1])
weighted_total = (w * y_with_nan).sum()
a = weighted_total / w.sum()
print(a)
a = np.average(y_with_nan, weights=w)
print(a)

nan
nan


### Harmonic Mean

The harmonic mean is the reciprocal of the mean of the reciprocals of all items in the dataset: 𝑛 / Σᵢ(1/𝑥ᵢ), where 𝑖 = 1, 2, …, 𝑛 and 𝑛 is the number of items in the dataset 𝑥.

In [42]:
x

[8.0, 1, 2.5, 4, 28.0]

In [43]:
hmean = len(x) / sum(1 / item for item in x)
hmean

2.7613412228796843

In [46]:
hmean = statistics.harmonic_mean(x)
hmean

2.7613412228796843

In [47]:
statistics.harmonic_mean([1, 0, 2])

0

In [None]:
# harmonic_mean() rejects negative values. Catch the expected error and show
# it, so the notebook still runs top to bottom (Restart & Run All).
try:
    statistics.harmonic_mean([1, 2, -2])
except statistics.StatisticsError as exc:
    hmean_error = repr(exc)
else:
    hmean_error = None  # no error was raised
hmean_error

In [49]:
scipy.stats.hmean(y) # or 'z'

2.7613412228796843

### Geometric Mean

The geometric mean is the 𝑛-th root of the product of all 𝑛 elements 𝑥ᵢ in a dataset 𝑥: ⁿ√(Πᵢ𝑥ᵢ), where 𝑖 = 1, 2, …, 𝑛.

![image.png](https://files.realpython.com/media/py-stats-02.ec1ca0f9a9ac.png)

In [50]:
# Geometric mean by hand: multiply every item, then take the n-th root.
gmean = math.prod(x) ** (1 / len(x))
gmean

4.677885674856041

In [51]:
gmean = statistics.geometric_mean(x)
gmean

4.67788567485604

In [52]:
gmean = statistics.geometric_mean(y_with_nan)
gmean

nan

In [53]:
# If there’s a zero or negative number among your data, 
# then statistics.geometric_mean() will raise the statistics.StatisticsError

In [54]:
scipy.stats.gmean(y)

4.67788567485604

### Median

The sample median is the middle element of a sorted dataset.

If the number of elements 𝑛 of the dataset is odd, then the median is the value at the middle position: (𝑛 + 1)/2. If 𝑛 is even, then the median is the arithmetic mean of the two values in the middle, that is, the items at the positions 0.5𝑛 and 0.5𝑛 + 1.


In [59]:
# Manual median: the middle item for odd n, the mean of the two middle items for even n.
ordered = sorted(x)
n = len(ordered)
mid = n // 2
median_ = ordered[mid] if n % 2 else 0.5 * (ordered[mid - 1] + ordered[mid])
median_


4

In [60]:
statistics.median_low(x)

4

In [61]:
statistics.median_high(x)

4

In [63]:
statistics.median_low(x[:-1]) # [1, 2.5, 4, 8.0]

2.5

In [64]:
statistics.median_high(x[:-1])

4

In [67]:
sorted(x_with_nan)

[1, 2.5, 4, 8.0, nan, 28.0]

In [69]:
statistics.median(x_with_nan)

6.0

In [70]:
statistics.median_low(x_with_nan)

4

In [72]:
statistics.median_high(x_with_nan)

8.0

In [74]:
median_ = np.median(y)
median_

4.0

In [75]:
median_ = np.median(y[:-1])
median_

3.25

In [80]:
# However, if there’s a nan value in your dataset, 
# then np.median() issues the RuntimeWarning and returns nan. 
# If this behavior is not what you want, 
# then you can use nanmedian() to ignore all nan values

np.nanmedian(y_with_nan)

4.0

In [79]:
# pandas Series objects have the method .median() that ignores nan values by default
z.median()

4.0

In [78]:
z_with_nan.median()

4.0

### Mode

The sample mode is the value in the dataset that occurs most frequently. For example, in the set that contains the points 2, 3, 2, 8, and 12, the number 2 is the mode because it occurs twice, unlike the other items that occur only once.

In [86]:
u = [2, 3, 2, 8, 2, 8, 12]
# Pair each distinct value with its number of occurrences: (count, value).
# max() compares the count first, so the most frequent value wins
# (ties are broken by the larger value).
count_value_pairs = [(u.count(value), value) for value in set(u)]
mode_ = max(count_value_pairs)[1]
mode_

2

In [87]:
mode_ = statistics.mode(u)
mode_

2

In [89]:
v = [12, 15, 12, 15, 21, 15, 12]
# statistics.mode(v) returns only the first mode encountered (12);
# before Python 3.8 it raised StatisticsError for multimodal data.
statistics.multimode(v) # 12 and 15 are tied for the highest count (3 each), so both are returned

[12, 15]

In [90]:
statistics.mode([2, math.nan, 2])

2

In [91]:
statistics.multimode([2, math.nan, 2])

[2]

In [92]:
statistics.mode([2, math.nan, 0, math.nan, 5])

nan

In [93]:
statistics.multimode([2, math.nan, 0, math.nan, 5, 5])

[nan, 5]

In [101]:
# work with series
u, v, w = pd.Series(u), pd.Series(v), pd.Series([2, 3,  math.nan, math.nan])
u.mode()

0    2
dtype: int64

In [102]:
v.mode()

0    12
1    15
dtype: int64

In [103]:
w.mode() # ignore nan by default!!

0    2.0
1    3.0
dtype: float64

In [104]:
w.mode(dropna=False) # count nan in !!

0   NaN
dtype: float64

In [105]:
# I don't use scipy.stats.mode() here because it returns only
# a single value, so it cannot report every mode
# when several items are tied for the highest count.