### 1. mean
$\mu = \frac{1}{N} \displaystyle\sum_{i=1}^{N} x{_i}$

In [1]:
import numpy as np
import scipy as sp
import pandas as pd

In [2]:
data = np.array([2, 3, 3, 4, 4, 4, 4, 5, 5, 6])
np.mean(data)

4.0

### sample variance
$ \sigma^{2} = \frac{1}{N-1} \displaystyle\sum_{i=1}^{N} (x{_i} - \mu)^{2}$

In [3]:
np.var(data, ddof=1)

1.3333333333333333

### standard diviation
$ \sigma = \sqrt{{\sigma}^{2}} = \sqrt{\frac{1}{N-1} \displaystyle\sum_{i=1}^{N} (x{_i} - \mu)^{2}} $

In [4]:
np.std(data, ddof=1)

1.1547005383792515

### Standardization
mean 0, std(variance) 1

In [5]:
data - np.mean(data), np.mean(data - np.mean(data))

(array([-2., -1., -1.,  0.,  0.,  0.,  0.,  1.,  1.,  2.]), 0.0)

In [6]:
data / np.std(data, ddof=1), np.std(data / np.std(data, ddof=1), ddof=1)

(array([1.73205081, 2.59807621, 2.59807621, 3.46410162, 3.46410162,
        3.46410162, 3.46410162, 4.33012702, 4.33012702, 5.19615242]),
 1.0)

### statistic etc

In [7]:
np.amax(data), np.amin(data)

(6, 2)

In [8]:
np.median(data)

4.0

### quantile

In [9]:
from scipy import stats

In [10]:
data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
stats.scoreatpercentile(data, 25)

3.0

In [11]:
stats.scoreatpercentile(data, 75)

7.0

### multivariate data

In [12]:
df = pd.DataFrame({'species': ['A', 'A', 'A', 'B', 'B', 'B'],
                   'length' : [2, 3, 4, 6, 8, 10]})
df

Unnamed: 0,species,length
0,A,2
1,A,3
2,A,4
3,B,6
4,B,8
5,B,10


### Statistics by group

In [13]:
# mean by group
df.groupby('species').mean()

Unnamed: 0_level_0,length
species,Unnamed: 1_level_1
A,3.0
B,8.0


In [14]:
# standard diviation by group
df.groupby('species').std(ddof=1)

Unnamed: 0_level_0,length
species,Unnamed: 1_level_1
A,1.0
B,2.0


In [15]:
df.groupby('species').describe()

Unnamed: 0_level_0,length,length,length,length,length,length,length,length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
A,3.0,3.0,1.0,2.0,2.5,3.0,3.5,4.0
B,3.0,8.0,2.0,6.0,7.0,8.0,9.0,10.0


### crosstabulation table

In [16]:
shoes = pd.DataFrame({'store': ['tokyo', 'tokyo', 'osaka', 'osaka'],
                     'color': ['blue',  'red', 'blue',  'red'],
                     'sales': [10, 15, 13, 9]})
shoes

Unnamed: 0,store,color,sales
0,tokyo,blue,10
1,tokyo,red,15
2,osaka,blue,13
3,osaka,red,9


In [17]:
pd.pivot_table(data=shoes, values='sales', columns='color', index='store')

color,blue,red
store,Unnamed: 1_level_1,Unnamed: 2_level_1
osaka,13,9
tokyo,10,15


### Covariance
covariance formula\
$ Cov(x,y) = \frac{1}{N}\displaystyle\sum_{i=1}^{N}(x_{i}-\mu_{x})(y_{i}-\mu_{y}) $\
$ Cov(x,y) = \frac{1}{N-1}\displaystyle\sum_{i=1}^{N}(x_{i}-\mu_{x})(y_{i}-\mu_{y}) $

### variance covariance matrix
$Cov(x, y)\begin{bmatrix}\sigma_{x}^{2}&Cov(x, y)\\Cov(x, y)&\sigma_{y}^{2}\\ \end{bmatrix}$

In [18]:
cov_data = pd.DataFrame({'x': [18.5, 18.7, 19.1, 19.7, 21.5, 21.7, 21.8, 22.0, 23.4, 23.8],
                         'y': [34, 39, 41, 38, 45, 41, 52, 44, 44, 49]})
cov_data

Unnamed: 0,x,y
0,18.5,34
1,18.7,39
2,19.1,41
3,19.7,38
4,21.5,45
5,21.7,41
6,21.8,52
7,22.0,44
8,23.4,44
9,23.8,49


In [19]:
# covariance
cov_data.cov(ddof=0)

Unnamed: 0,x,y
x,3.2816,6.906
y,6.906,25.21


In [20]:
# covariance ddof=1
cov_data.cov(ddof=1)

Unnamed: 0,x,y
x,3.646222,7.673333
y,7.673333,28.011111


### Pearson's Correlation Coefficient
$ \rho_{xy} = \frac{Cov(x,y)}{\sqrt{{\sigma_{x}^{2}\sigma_{y}^{2}}}} $
### Correlation Matrix
$Cov(x, y)\begin{bmatrix}1&\rho(x, y)\\\rho(x, y)&2\\ \end{bmatrix}$

In [21]:
# correlation coefficient
stats.pearsonr(cov_data.x, cov_data.y)[0]

# p-value
stats.pearsonr(cov_data.x, cov_data.y)[1]

0.010859034985841936

In [22]:
cov_data.corr()

Unnamed: 0,x,y
x,1.0,0.759272
y,0.759272,1.0
