In [1]:
import math
import statistics
import numpy as np
import scipy
import scipy.stats
import pandas as pd

In [2]:
print("numpy version", np.__version__)
print("scipy version", scipy.__version__)
print("pandas version", pd.__version__)

numpy version 1.18.5
scipy version 1.5.0
pandas version 1.1.3


In [3]:
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = [8.0, 1, 2.5, math.nan, 4, 28.0]
print(x, len(x))
print(x_with_nan, len(x_with_nan))

[8.0, 1, 2.5, 4, 28.0] 5
[8.0, 1, 2.5, nan, 4, 28.0] 6


In [4]:
y, y_with_nan = np.array(x), np.array(x_with_nan)
z, z_with_nan = pd.Series(x), pd.Series(x_with_nan)
print(y)
print(y_with_nan)
print(z)
print(z_with_nan)

[ 8.   1.   2.5  4.  28. ]
[ 8.   1.   2.5  nan  4.  28. ]
0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64
0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64


## Measure of Tendency

In [5]:
print("x:", x)
mean_native = sum(x) / len(x)
print("mean native:", mean_native)
mean_stats = statistics.mean(x)
print("mean statistics:", mean_stats)

x: [8.0, 1, 2.5, 4, 28.0]
mean native: 8.7
mean statistics: 8.7


In [6]:
print("x with nan:", x_with_nan)
mean_with_nan_native = sum(x_with_nan) / len(x_with_nan)
print("x with nan native:", mean_with_nan_native)
mean_with_nan_stats = statistics.mean(x_with_nan)
print("x with nan stats:", mean_with_nan_stats)
mean_with_nan_np = np.mean(y_with_nan)
print("mean with numpy:", mean_with_nan_np)

x with nan: [8.0, 1, 2.5, nan, 4, 28.0]
x with nan native: nan
x with nan stats: nan
mean with numpy: nan


In [7]:
mean_ignoring_nan_np = np.nanmean(x_with_nan)
print("mean ignoring nan numpy:", mean_ignoring_nan_np)

mean ignoring nan numpy: 8.7


In [8]:
mean_ignoring_nan_pd = z.mean()
print("mean ignoring nan pandas", mean_ignoring_nan_pd)

mean ignoring nan pandas 8.7


In [9]:
arr = [2, 2, 4, 4, 4, 4, 4, 8, 8, 8]
print(arr, len(arr))
print("mean:", np.mean(arr))
print("weighted mean:", 0.2*2 + 0.5*4 + 0.3*8)

[2, 2, 4, 4, 4, 4, 4, 8, 8, 8] 10
mean: 4.8
weighted mean: 4.8


In [10]:
arr_x = [8., 1, 2.5, 4]
arr_w = [.1, .2, .3, .25]

print(
    "weighted mean:",
    sum(w*x for x, w in zip(arr_x, arr_w)) / sum(arr_w)
)

weighted mean: 3.2352941176470584


In [11]:
np_x, np_w = np.array(arr_x), np.array(arr_w)
np.average(np_x, weights=np_w)

3.2352941176470584

## Harmonic Name

In [17]:
x += [100]
print(x)
print("arithmatic mean:", sum(x) / len(x))
print("harmonic mean:", len(x) / sum(1/i for i in x))

[8.0, 1, 2.5, 4, 28.0, 100, 100]
arithmatic mean: 34.785714285714285
harmonic mean: 3.8236441669918064


In [18]:
scipy.stats.hmean(x)

3.8236441669918064

In [19]:
x

[8.0, 1, 2.5, 4, 28.0, 100, 100]

In [23]:
gmean = 1

for item in x:
    gmean *= item

gmean = gmean**(1/len(x))
print(gmean)
print(scipy.stats.gmean(x))

# ** sama dengan pangkat

11.221099960393236
11.221099960393238


## Median

In [27]:
x += [1000]

In [28]:
print(x, len(x), sorted(x))
print("median np:", np.median(x))

if len(x) % 2:
    med = sorted(x)[round(0.5 *(len(x)-1))]
else:
    ordered_x, index = sorted(x), round(0.5 * len(x))
    med = 0.5 * (ordered_x[index-1] + ordered_x[index])
    
print("median native:", med)

[8.0, 1, 2.5, 4, 28.0, 100, 100, 1000] 8 [1, 2.5, 4, 8.0, 28.0, 100, 100, 1000]
median np: 18.0
median native: 18.0


In [29]:
statistics.median(x), statistics.median_high(x), statistics.median_low(x)

(18.0, 28.0, 8.0)

In [33]:
sorted(x_with_nan), statistics.median(x_with_nan), statistics.median_low(x_with_nan), statistics.median_high(x_with_nan)

([1, 2.5, 4, 8.0, nan, 28.0], 6.0, 4, 8.0)

In [37]:
print(z)
print(z.median())
print(z_with_nan.sort_values())
print(z_with_nan.mean())

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64
4.0
1     1.0
2     2.5
4     4.0
0     8.0
5    28.0
3     NaN
dtype: float64
8.7


## Mode

In [43]:
print(x)
print([x.count(i) for i in x])

[8.0, 1, 2.5, 4, 28.0, 100, 100, 1000]
[1, 1, 1, 1, 1, 2, 2, 1]


In [46]:
mode_ = scipy.stats.mode(x)
print(mode_.mode, mode_.count)

[100.] [2]


In [47]:
series_x = pd.Series(x)
series_x.mode()

0    100.0
dtype: float64

In [48]:
print(z)
print(z.mode())

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64
0     1.0
1     2.5
2     4.0
3     8.0
4    28.0
dtype: float64


## Variance

In [49]:
x

[8.0, 1, 2.5, 4, 28.0, 100, 100, 1000]

In [57]:
n = len(x)
mean_x = sum(x) / len(x)
print("mean:", mean_x)
variance_x = sum((point - mean_x)**2 for point in x) / (n-1)
print("variance:", variance_x) #using basic variance
print("statistics:", statistics.variance(x)) #using statistics
print("numpy:", np.var(np.array(x), ddof=1)) #using numpy
print("pandas:", series_x.var()) #using pandas

mean: 155.4375
variance: 118226.38839285714
statistics: 118226.38839285714
numpy: 118226.38839285714
pandas: 118226.38839285714


## Standard Deviation

In [66]:
print("std native:", variance_x**0.5) #using basic standard deviation
print("std np:", np.std(x, ddof=1)) #using numpy
print("std statistics:", statistics.stdev(x)) #using statistics
print("std pandas:", series_x.std()) #using pandas
print("std scipy:", scipy.std(x, ddof=1)) #using scipy

std native: 343.8406438931517
std np: 343.8406438931517
std statistics: 343.8406438931517
std pandas: 343.8406438931517
std scipy: 343.8406438931517


  print("std scipy:", scipy.std(x, ddof=1)) #using scipy


## Skewness

In [70]:
print(x)
n = len(x)
mean_x = sum(x) / n
variance_x = sum((point - mean_x)**2 for point in x) / (n-1)
std_x = variance_x**0.5

skew_x = sum((item - mean_x)**3 for item in x)* n / ((n-1)* (n-2)* std_x**3) #using basic skewness
print(skew_x)
print(scipy.stats.skew(x, bias=False)) #using scipy
print(series_x.skew())

[8.0, 1, 2.5, 4, 28.0, 100, 100, 1000]
2.7460927194584865
2.746092719458487
2.746092719458487


## Percentile

In [74]:
print(x)
print(statistics.quantiles(x))

[8.0, 1, 2.5, 4, 28.0, 100, 100, 1000]
[2.875, 18.0, 100.0]


In [83]:
print(x)
print(np.percentile(x, 75))
print(np.percentile(x, 50))
print(np.percentile(x, 25))
print(np.quantile(x, .5))
print(np.median(x))

[8.0, 1, 2.5, 4, 28.0, 100, 100, 1000]
100.0
18.0
3.625
18.0
18.0


In [84]:
print(x_with_nan)
print(np.nanpercentile(x_with_nan, 75))
print(np.nanpercentile(x_with_nan, 50))
print(np.nanpercentile(x_with_nan, 25))
print(np.nanquantile(x_with_nan, .75))
print(np.median(x_with_nan))

[8.0, 1, 2.5, nan, 4, 28.0]
8.0
4.0
2.5
8.0
nan


In [86]:
q1 = np.quantile(x, .25)
q3 = np.quantile(x, .75)
interquantile = q3 - q1
print(q1, q3, interquantile)

3.625 100.0 96.375
