In [None]:
import pandas as pd
import numpy as np
from scipy.interpolate import make_interp_spline
import matplotlib.pyplot as plt
import decimal

## Covariance
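Covariance measures how two numeric features vary together. The functions below compute the population covariance, i.e. they normalize by $n$ rather than $n - 1$.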

In [None]:
def cov(a, b):
    """Return the population covariance of two equally sized numeric samples.

    The result is the mean of the products of deviations,
    ``mean((a - mean(a)) * (b - mean(b)))``, so it is normalized by ``n``
    rather than ``n - 1``.

    The data is centered *before* multiplying. The algebraically equivalent
    shortcut ``mean(a * b) - mean(a) * mean(b)`` subtracts two nearly equal
    large numbers when the means are large compared to the spread, which
    wipes out the significant digits of the result.

    Args:
        a: Sample exposing ``.mean()`` and elementwise arithmetic
            (for example a NumPy array).
        b: Second sample with the same length as ``a``.

    Returns:
        The covariance of ``a`` and ``b`` as a scalar.
    """
    return ((a - a.mean()) * (b - b.mean())).mean()


def cov_matrix(mtrx):
    """Return the population covariance matrix of the columns of ``mtrx``.

    Rows are treated as observations and columns as features. Entry
    ``(i, j)`` of the result is the covariance between column ``i`` and
    column ``j``, normalized by the number of rows ``n`` (not ``n - 1``).

    The columns are centered before the matrix product. The equivalent
    shortcut ``X.T @ X / n - outer(means, means)`` subtracts two nearly
    equal large matrices when the column means are large compared to the
    spread, which destroys the precision of the result.

    Args:
        mtrx: 2-D array of shape ``(n_observations, n_features)``.

    Returns:
        Square matrix of shape ``(n_features, n_features)``.
    """
    n = len(mtrx)
    # Broadcasting subtracts each column's mean from every row.
    centered = mtrx - mtrx.mean(axis=0)
    return centered.T @ centered / n

### Covariance of 2 features

In [None]:
# Exclusive (lower, upper) bounds used to discard out-of-range rows.
h_bound = (120, 200)
w_bound = (30, 120)

df = pd.read_csv('dataset/height-weight-sex-train.csv')
# All four comparisons are strict. A single combined mask keeps exactly the
# rows that survive filtering on Height first and on Weight second.
in_range = (
    (df['Height'] > h_bound[0]) & (df['Height'] < h_bound[1])
    & (df['Weight'] > w_bound[0]) & (df['Weight'] < w_bound[1])
)
df = df[in_range]

# Column 0 holds Height, column 1 holds Weight.
mtrx = df[['Height', 'Weight']].to_numpy()
heights, weights = mtrx[:, 0], mtrx[:, 1]
print('Cov: {}'.format(cov(heights, weights)))

### Covariance matrix of multiple features

In [None]:
# Columns of the exchange-rate table whose pairwise covariances are plotted.
targets = ['KRW=X', 'CNY=X', 'JPY=X', 'VND=X', 'SGD=X', 'EUR=X']

df = pd.read_csv('./dataset/dollar_exchange_rates.csv')
df = df[['Date'] + targets]

# Fit a degree-5 spline through the rows that have a value for every target,
# using the row index as the x coordinate.
ys = df[targets].dropna()
xs = df['Date'].iloc[1:]
model = make_interp_spline(ys.index, ys, k=5)

# Evaluate the spline at the index of every row except the first, then
# compute the covariance matrix of the resulting series.
mtrx = model(xs.index)
df = pd.DataFrame(cov_matrix(mtrx), index=targets, columns=targets)

plt.matshow(df, cmap='plasma')
# Write each cell's value on top of the heat map in scientific notation.
for (row, col), value in np.ndenumerate(df):
    label = '{:0.1e}'.format(decimal.Decimal(value))
    plt.text(col, row, label, ha='center', va='center')
plt.xticks(range(len(df.columns)), df.columns)
plt.yticks(range(len(df.index)), df.index)
plt.colorbar()
plt.title('Covariance Matrix of dollar exchange rates', pad=40)
plt.show()