In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

## Read data

In [2]:
# Load the tab-separated samples. A sixth placeholder column ('none') is named
# and immediately discarded -- presumably each row ends with a trailing tab;
# verify against the data file.
data = pd.read_csv(
    'atml-data.txt',
    sep='\t',
    names=['X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'none'],
).drop('none', axis=1)
data.head()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5
0,-0.365507,-0.417066,-1.710587,-1.457053,-1.31821
1,-0.285047,0.498189,-2.202331,-1.701304,-1.87288
2,-0.018177,-0.059014,0.686316,0.696371,1.291809
3,-2.214677,-2.405553,-4.122834,-3.088534,-2.493587
4,2.294686,2.728682,3.486602,4.187874,3.850728


## Calculate mean

$$
\mu_j = \frac{1}{n} \sum_{k=1}^n x_j^{(k)}, ~~~~~ \forall \mu_j \in \mathbf{\mu}
$$

We use [pandas.DataFrame.mean](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.mean.html) to compute it.

In [3]:
# Per-column sample mean: axis=0 reduces over the rows (the samples).
mean = data.mean(axis=0)
mean

X_1    0.772915
X_2    0.920877
X_3    0.126194
X_4    0.909165
X_5    0.661482
dtype: float64

## Calculate covariance

$$
\mathbf{\Sigma} = \frac{1}{n-1} \sum_{k=1}^{n} (\mathbf{X}^{(k)} - \mu)(\mathbf{X}^{(k)} - \mu)^T
$$

## Using $numpy$ (mean and covariance)

In [4]:
# Same column-wise mean as above, this time through NumPy. axis=0 averages
# over the rows, leaving one value per variable X_1..X_5.
mean = np.mean(data, axis=0)
mean

X_1    0.772915
X_2    0.920877
X_3    0.126194
X_4    0.909165
X_5    0.661482
dtype: float64

In [5]:
# Sample covariance matrix. rowvar=False tells NumPy that each COLUMN is a
# variable and each row an observation; np.cov normalises by n-1 by default.
cov = np.cov(data, rowvar=False)
cov

array([[ 1.04306317,  0.80588233,  1.16113565,  1.35940949,  0.90060135],
       [ 0.80588233,  1.72203546,  1.73666803,  1.54732501,  1.55312727],
       [ 1.16113565,  1.73666803,  2.87426042,  2.70179449,  2.75686331],
       [ 1.35940949,  1.54732501,  2.70179449,  2.99116633,  2.73713501],
       [ 0.90060135,  1.55312727,  2.75686331,  2.73713501,  2.90998099]])

## 2.1) $X_2$ and $X_2 | \mathbf{X}_{1,3,4,5} = (x_1, x_3, x_4, x_5)'$

$X_2$

In [6]:
# Marginal parameters of X_2 (zero-based position 1).
# `mean` is indexed by the labels 'X_1'..'X_5', so the position must be taken
# with .iloc; plain mean[1] relies on a positional fallback that newer pandas
# versions deprecate and then remove.
print("μ  = ", mean.iloc[1])
print("σ² = ", cov[1, 1])

μ  =  0.920876815859
σ² =  1.72203546301


To calculate the conditional distribution $X_{i|j} \sim \mathcal{N}(\mu_{i|j}, \Sigma_{i|j})$, we can use the following identities:

$$
\mu_{i|j} = \mu_i + \Sigma_{ij}\Sigma_{jj}^{-1}(x_j - \mu_j)
$$
$$
\Sigma_{i|j} = \Sigma_{ii} - \Sigma_{ij}\Sigma_{jj}^{-1} \Sigma_{ji}
$$

In [7]:
def conditional_mean(mean, cov, x_j, i, j):
    r"""
    Calculate the conditional mean $\mu_{i|j}$ of a multivariate normal.

    Implements
    $\mu_{i|j} = \mu_i + \Sigma_{ij}\Sigma_{jj}^{-1}(x_j - \mu_j)$.

    Parameters
    ----------
    mean : pandas.Series or array-like
        Mean vector of all variables.
    cov : array-like
        Square covariance matrix of all variables (ndarray, DataFrame or
        nested list).
    x_j : pandas.Series or array-like
        Observed values of the conditioning variables, ordered like `j`.
    i, j : sequence of int
        ZERO-based positions of the target and the conditioning variables.

    Returns
    -------
    The conditional mean of the variables in `i`: a Series labelled like
    `mean` when `mean` is a pandas object, otherwise an ndarray.
    """
    def by_position(values, positions):
        # Always select by POSITION. Integer keys passed to a label-indexed
        # Series via [] depend on a fallback that pandas has deprecated.
        if hasattr(values, 'iloc'):
            return values.iloc[positions]
        return np.asarray(values)[positions]

    cov = np.asarray(cov)

    μ_i = by_position(mean, i)
    μ_j = by_position(mean, j)

    Σ_ij = cov[np.ix_(i, j)]
    Σ_jj = cov[np.ix_(j, j)]

    # Solve Σ_jj z = (x_j - μ_j) rather than forming the inverse explicitly:
    # same result, better numerical behaviour.
    z = np.linalg.solve(Σ_jj, np.asarray(x_j - μ_j))

    return μ_i + Σ_ij @ z

# The text numbers the variables from 1 (X_1..X_5); subtracting 1 yields the
# zero-based positions that conditional_mean / conditional_cov expect.
i = np.array([2]) - 1
j = np.array([1, 3, 4, 5]) - 1

# Condition on the values observed in row 3. A single .iloc[row, cols] call
# selects purely by position; the chained data.iloc[3][j] relied on integer
# keys falling back to positions on a label-indexed Series.
x_j = data.iloc[3, j]

print('X_1,3,4,5 = \n', x_j)
print()
print('μ_{2|1,3,4,5,} =', conditional_mean(mean, cov, i=i, j=j, x_j=x_j))

X_1,3,4,5 = 
 X_1   -2.214677
X_3   -4.122834
X_4   -3.088534
X_5   -2.493587
Name: 3, dtype: float64

μ_{2|1,3,4,5,} = X_2   -2.058864
dtype: float64


In [8]:
def conditional_cov(cov, i, j):
    r"""
    Calculate the conditional covariance $\Sigma_{i|j}$ of a multivariate normal.

    Implements
    $\Sigma_{i|j} = \Sigma_{ii} - \Sigma_{ij}\Sigma_{jj}^{-1}\Sigma_{ji}$.

    Parameters
    ----------
    cov : array-like
        Square covariance matrix of all variables (ndarray, DataFrame or
        nested list).
    i, j : sequence of int
        ZERO-based positions of the target and the conditioning variables.

    Returns
    -------
    numpy.ndarray
        Matrix of shape (len(i), len(i)).
    """
    cov = np.asarray(cov)

    Σ_ii = cov[np.ix_(i, i)]
    Σ_jj = cov[np.ix_(j, j)]
    Σ_ij = cov[np.ix_(i, j)]
    Σ_ji = cov[np.ix_(j, i)]

    # Σ_jj^{-1} Σ_ji is obtained by solving a linear system instead of
    # forming the inverse explicitly (same result, numerically preferable).
    return Σ_ii - Σ_ij @ np.linalg.solve(Σ_jj, Σ_ji)

# Conditional variance of X_2 given X_1, X_3, X_4, X_5 (i and j set above).
print('Σ_{2|1,3,4,5} =', conditional_cov(cov, i, j))

Σ_{2|1,3,4,5} = [[ 0.30794075]]


For

$X_2 | \mathbf{X}_{1,3,4,5} = (x_1, x_3, x_4, x_5)'$
$X_2 | \mathbf{X}_{1,3,4,5} = (-2.214677, -4.122834, -3.088534, -2.493587)'$

We have

$$μ_{2|1,3,4,5} = -2.05886390512$$
$$Σ_{2|1,3,4,5} = 0.307940750928$$


## 2.2) $\mathbf{X}_{1,4}$ and $\mathbf{X}_{1,4} | \mathbf{X}_{2,3,5} = (x_2, x_3, x_5)'.$

$\mathbf{X}_{1,4}$

In [9]:
# Marginal of (X_1, X_4): pick rows/columns 0 and 3 of the full covariance
# with np.ix_, the same sub-matrix selection the conditional helpers use.
cov_x14 = cov[np.ix_([0, 3], [0, 3])]

# `mean` is label-indexed ('X_1'..'X_5'), so positions are taken with .iloc;
# mean[0] would rely on a deprecated integer-as-position fallback.
print("μ  = ", [mean.iloc[0], mean.iloc[3]])
print("σ² = \n", cov_x14)

μ  =  [0.77291470502578141, 0.90916460120595821]
σ² = 
 [[ 1.04306317  1.35940949]
 [ 1.35940949  2.99116633]]


To calculate the conditional distribution $X_{i|j} \sim \mathcal{N}(\mu_{i|j}, \Sigma_{i|j})$, we can use the following identities:

$$
\mu_{i|j} = \mu_i + \Sigma_{ij}\Sigma_{jj}^{-1}(x_j - \mu_j)
$$
$$
\Sigma_{i|j} = \Sigma_{ii} - \Sigma_{ij}\Sigma_{jj}^{-1} \Sigma_{ji}
$$

In [10]:
# Targets X_1, X_4 and conditioning variables X_2, X_3, X_5, converted from
# the 1-based numbering of the text to zero-based positions.
i = np.array([1, 4]) - 1
j = np.array([2, 3, 5]) - 1

# Condition on the values observed in row 3. A single .iloc[row, cols] call
# selects purely by position; the chained data.iloc[3][j] relied on integer
# keys falling back to positions on a label-indexed Series.
x_j = data.iloc[3, j]

print('X_2,3,5 = \n', x_j)
print()
print('μ_{1,4|2,3,5} = \n', conditional_mean(mean, cov, i=i, j=j, x_j=x_j))

X_2,3,5 = 
 X_2   -2.405553
X_3   -4.122834
X_5   -2.493587
Name: 3, dtype: float64

μ_{1,4|2,3,5} = 
 X_1   -1.690034
X_4   -2.566434
dtype: float64


In [11]:
# Conditional covariance of (X_1, X_4) given (X_2, X_3, X_5). The label now
# names the quantity actually computed from the current i and j.
print('Σ_{1,4|2,3,5} = \n', conditional_cov(cov, i=i, j=j))

Σ_{2|1,3,4,5} = 
 [[ 0.40276335  0.38531681]
 [ 0.38531681  0.37073028]]


For

$\mathbf{X}_{1,4} | \mathbf{X}_{2,3,5} = (x_2, x_3, x_5)'$
$\mathbf{X}_{1,4} | \mathbf{X}_{2,3,5} = (-2.405553, -4.122834, -2.493587)'$

We have

$$μ_{1,4|2,3,5} = [-1.690034, -2.566434]$$
$$Σ_{1,4|2,3,5} = \begin{bmatrix} 0.40276335 & 0.38531681 \\ 0.38531681 & 0.37073028 \end{bmatrix}$$