In [100]:
import pandas as pd
import random
import numpy as np

# Number of synthetic rows generated for the dataset D below.
n_samples = 100


def make_sample(threshold=0.7, low=0.7, high=1.00, rng=random):
    """Draw one synthetic row ``[a, b, c]``.

    ``a`` and ``b`` are coupled through a single uniform "gate" draw: when the
    gate is at or above ``threshold`` both are sampled (independently) from
    ``[low, high]``; otherwise both stay at exactly ``0.0``. ``c`` is an
    unrelated uniform draw from ``[0, 1)``.

    Parameters
    ----------
    threshold : float
        Gate value at or above which ``a`` and ``b`` become non-zero.
        With the default 0.7 that happens for roughly 30% of the rows.
    low, high : float
        Bounds passed to ``rng.uniform`` for the non-zero ``a`` and ``b``.
    rng : object
        Source of randomness exposing ``random()`` and ``uniform(lo, hi)``.
        Defaults to the stdlib ``random`` module (global state); pass a
        ``random.Random(seed)`` instance for an isolated, reproducible stream.

    Returns
    -------
    list of float
        ``[a, b, c]``.
    """
    # Draw order (gate, c, then a, b) is kept fixed so that a given seed
    # always yields the same rows.
    gate, c = rng.random(), rng.random()

    a = b = 0.0
    if gate >= threshold:
        a = rng.uniform(low, high)
        b = rng.uniform(low, high)

    return [a, b, c]


# Seed the stdlib RNG so that Restart & Run All regenerates the same dataset
# (make_sample draws from the global `random` module).
random.seed(42)

# D has one row per sample and three columns (a, b, c).
D = np.array([
    make_sample() for _ in range(n_samples)
])

# Small fixed alternative dataset, left disabled; swapping it in for the
# random D above gives values that are easy to check by hand.
#D = np.array([
#    [1, 1, 0],
#    [1, 1, 0],
#    [1, 1, 0],
#    [1, 1, 0],
#    [0, 0, 0],
#    [0, 0, 0],
#    [0, 0, 1],
#    [0, 0, 0],
#])

print(D)

[[0.         0.         0.20279804]
 [0.         0.         0.26659362]
 [0.97461808 0.87085627 0.64935699]
 [0.         0.         0.80917789]
 [0.         0.         0.39408086]
 [0.88535204 0.73449614 0.26694594]
 [0.         0.         0.80142334]
 [0.9058259  0.70668602 0.98130111]
 [0.         0.         0.67622192]
 [0.97748264 0.7622089  0.29642848]
 [0.         0.         0.35934301]
 [0.88081806 0.87977171 0.84738835]
 [0.         0.         0.85105455]
 [0.         0.         0.98756972]
 [0.         0.         0.39791323]
 [0.         0.         0.09584853]
 [0.         0.         0.0310824 ]
 [0.70344101 0.81862751 0.14644124]
 [0.         0.         0.03111017]
 [0.         0.         0.14229571]
 [0.81876016 0.89576015 0.00534713]
 [0.         0.         0.99622351]
 [0.         0.         0.75837427]
 [0.79057889 0.92734651 0.80164685]
 [0.         0.         0.68111347]
 [0.75791098 0.73357793 0.95433952]
 [0.84170574 0.78732444 0.37656612]
 [0.         0.         0.36

# Covariance

![Covariance formula](img/covariance_formula.png)

In [101]:
def mean(X):
    """Return the arithmetic mean of the values in ``X``.

    ``X`` must be iterable and support ``len()``. An empty ``X`` raises
    ``ZeroDivisionError``.
    """
    total = sum(X)
    count = len(X)
    return total / count

def variations(X):
    """Return each element's deviation from the mean of ``X`` as a NumPy array.

    Element ``i`` of the result is ``X[i] - mean(X)``.
    """
    center = mean(X)
    deviations = []
    for value in X:
        deviations.append(value - center)
    return np.array(deviations)


def covariance(col1, col2, norm=True):
    """Return the covariance of two equally long sequences of numbers.

    Computes ``sum((x - mean(col1)) * (y - mean(col2)))`` over paired
    elements and, when ``norm`` is true, divides by ``len(col1) - 1``
    (the sample covariance).

    Parameters
    ----------
    col1, col2 : sequence of numbers
        Paired observations; both must have the same length.
    norm : bool
        If true (default), divide by ``n - 1``. If false, return the raw sum
        of products of deviations. Note that with ``norm=True`` and a single
        observation the denominator is zero.

    Raises
    ------
    ValueError
        If ``col1`` and ``col2`` differ in length.
    """
    # Without this check a length-1 column would be silently broadcast by
    # NumPy against the longer one and yield a meaningless result.
    if len(col1) != len(col2):
        raise ValueError(
            "covariance requires columns of equal length, got %d and %d"
            % (len(col1), len(col2))
        )

    num = sum(variations(col1) * variations(col2))

    denom = float(len(col1) - 1) if norm else 1

    return num / denom


# Column-major copy of the data: cols[k] holds column k of D for all samples.
cols = D.T.copy()

print(cols)
print(covariance(cols[0], cols[1]))
print(covariance(cols[1], cols[2]))
print(covariance(cols[0], cols[2]))

# Cross-check the (0, 1) pair against pandas' covariance matrix.
df = pd.DataFrame(D)
print(df[[0, 1]].cov())

[[0.         0.         0.97461808 0.         0.         0.88535204
  0.         0.9058259  0.         0.97748264 0.         0.88081806
  0.         0.         0.         0.         0.         0.70344101
  0.         0.         0.81876016 0.         0.         0.79057889
  0.         0.75791098 0.84170574 0.         0.73044896 0.
  0.         0.         0.         0.         0.         0.92845929
  0.81030306 0.         0.7427007  0.         0.9762614  0.
  0.         0.         0.         0.         0.         0.75336061
  0.81506827 0.         0.80620927 0.         0.76161235 0.71715692
  0.         0.         0.         0.         0.74665591 0.
  0.         0.         0.         0.         0.         0.84475073
  0.         0.94030903 0.         0.         0.74149957 0.
  0.86767668 0.75059396 0.         0.91813713 0.         0.
  0.         0.77250141 0.83741276 0.         0.75024676 0.91403305
  0.         0.         0.         0.         0.         0.
  0.         0.         0.  

The results above can be cross-checked with an [online covariance calculator](https://www.thecalculator.co/math/Covariance-Calculator-705.html).

# Correlation

![Correlation formula](img/correlation_formula.png)

In [102]:
import math

def stdev(X, norm=False):
    """Return the spread of ``X`` around its mean.

    NOTE: with the default ``norm=False`` this is *not* the conventional
    standard deviation. It returns ``sqrt(sum((x - mean(X)) ** 2))`` with no
    division by the sample count. ``correlation`` pairs it with
    ``covariance(..., norm=False)``, where the missing normalisation cancels.

    Parameters
    ----------
    X : sequence of numbers
        Observations.
    norm : bool
        If true, divide the sum of squared deviations by ``len(X) - 1``
        before taking the root, giving the sample standard deviation.
        Defaults to false, which keeps the unnormalised behaviour.

    Raises
    ------
    ValueError
        If ``norm`` is true and ``X`` has fewer than two observations.
    """
    avg = mean(X)
    squared_sum = sum([(x - avg) ** 2 for x in X])
    if norm:
        if len(X) < 2:
            raise ValueError("stdev with norm=True needs at least two observations")
        squared_sum = squared_sum / (len(X) - 1)
    return math.sqrt(squared_sum)


def correlation(X, Y):
    """Return the Pearson correlation coefficient of ``X`` and ``Y``.

    Divides the unnormalised covariance (``norm=False``) by the product of
    the two ``stdev`` values.
    """
    spread = stdev(X) * stdev(Y)
    co_moment = covariance(X, Y, norm=False)
    return co_moment / spread


# Show the column-major data, then the correlation of each pair of columns.
print(cols)
for left, right in ((0, 1), (1, 2), (0, 2)):
    print(correlation(cols[left], cols[right]))


[[0.         0.         0.97461808 0.         0.         0.88535204
  0.         0.9058259  0.         0.97748264 0.         0.88081806
  0.         0.         0.         0.         0.         0.70344101
  0.         0.         0.81876016 0.         0.         0.79057889
  0.         0.75791098 0.84170574 0.         0.73044896 0.
  0.         0.         0.         0.         0.         0.92845929
  0.81030306 0.         0.7427007  0.         0.9762614  0.
  0.         0.         0.         0.         0.         0.75336061
  0.81506827 0.         0.80620927 0.         0.76161235 0.71715692
  0.         0.         0.         0.         0.74665591 0.
  0.         0.         0.         0.         0.         0.84475073
  0.         0.94030903 0.         0.         0.74149957 0.
  0.86767668 0.75059396 0.         0.91813713 0.         0.
  0.         0.77250141 0.83741276 0.         0.75024676 0.91403305
  0.         0.         0.         0.         0.         0.
  0.         0.         0.  