In [48]:
import numpy as np
import pandas as pd 
import seaborn as sn

def covariance(x_arr, y_arr):
    """
    Sample covariance (divisor ``n - 1``) of two equally long numeric sequences.

    x_arr, y_arr: sequences of numbers (lists, tuples, NumPy arrays,
        pandas Series, ...). Values are paired by position, so label-indexed
        inputs such as a filtered pandas Series are handled correctly.

    Returns the covariance as a float. When fewer than two observations are
    supplied the sample covariance is undefined and NaN is returned (the same
    value ``np.cov`` reports), rather than a misleading ``-0.0`` for empty
    input or a divide-by-zero warning for a single observation.

    Raises AssertionError if the two inputs differ in length.
    """
    # Convert once so the arithmetic is vectorized and purely positional.
    x_values = np.asarray(x_arr, dtype=float)
    y_values = np.asarray(y_arr, dtype=float)
    assert len(x_values) == len(y_values), "x_arr and y_arr must have the same length"

    n_obs = len(x_values)
    if n_obs < 2:
        # Dividing by (n - 1) is meaningless here; signal "undefined" explicitly.
        return float("nan")

    # Sum of products of deviations from the means, with Bessel's correction.
    x_dev = x_values - x_values.mean()
    y_dev = y_values - y_values.mean()
    return np.dot(x_dev, y_dev) / (n_obs - 1)

# Two paired sample series of equal length (10 values each) that can be
# passed to covariance() as a quick manual check.
x_array = [2, 3, 5, 7, 8, 9, 14, 16, 18, 19]
y_array = [2, 3, 6, 7, 9, 11, 15, 16, 18, 19]

In [49]:
def covariance_matrix(data):
    """
    Pairwise covariance matrix of several equally long datasets.

    data: list of float lists that are equal in length; each inner list is
        treated as one variable.

    Returns a list of lists where entry [i][j] is
    covariance(data[i], data[j]), so the result is square with one row and
    one column per dataset.
    """
    size = len(data)
    return [
        [covariance(data[row], data[col]) for col in range(size)]
        for row in range(size)
    ]

# Small hand-checkable example: four variables (one per inner list) with
# three observations each. The first variable is constant, so every
# covariance involving it is 0.
dataset = [[1, 1, 1], [1, 2, 1], [1, 3, 2], [1, 4, 3]]

# Covariance matrix from the hand-written implementation.
matrix = covariance_matrix(dataset)
matrix

[[0.0, 0.0, 0.0, 0.0],
 [0.0, 0.3333333333333333, 0.5, 0.6666666666666666],
 [0.0, 0.5, 1.0, 1.5],
 [0.0, 0.6666666666666666, 1.5, 2.3333333333333335]]

In [50]:
# Reference result from NumPy on the same data (np.cov treats each row as a
# variable by default), for comparison with covariance_matrix(dataset).
cov = np.cov(dataset)
cov

array([[0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.33333333, 0.5       , 0.66666667],
       [0.        , 0.5       , 1.        , 1.5       ],
       [0.        , 0.66666667, 1.5       , 2.33333333]])

In [55]:
# Load the pigeon racing results to test the covariance functions on real data.
# The path is relative to the notebook's working directory.
df = pd.read_csv('dataset/pigeon-racing.csv')
# Drop the columns that are not used below; the displayed frame keeps
# Color, Sex and Speed.
df = df.drop(columns=['Pos', 'Breeder', 'Pigeon', 'Name', 'Ent', 'Arrival', 'To Win', 'Eligible'])
df

Unnamed: 0,Color,Sex,Speed
0,BCWF,H,172.155
1,SIWF,H,163.569
2,BB,H,163.442
3,BBSP,H,163.392
4,BC,H,163.366
...,...,...,...
395,BB,H,90.901
396,SIL,H,87.817
397,BBSP,H,83.929
398,BC,H,78.286


In [56]:
# Encode the categorical Color and Sex columns as integers so they can be
# used in the covariance computations below.
#
# NOTE: this cell overwrites df in place. Re-running it without reloading the
# CSV fails with a KeyError, because 'Sex' then holds 0/1 instead of 'H'/'C'.

# Sex uses a fixed two-value coding.
sex_id_dict = {'H': 0, 'C': 1}

# Each distinct color gets the next integer id, in order of first appearance.
unique_colors = df.Color.unique()
color_id_dict = {color: color_id for color_id, color in enumerate(unique_colors)}

df['Sex'] = [sex_id_dict[sex] for sex in df['Sex']]
df['Color'] = [color_id_dict[color] for color in df['Color']]
df

Unnamed: 0,Color,Sex,Speed
0,0,0,172.155
1,1,0,163.569
2,2,0,163.442
3,3,0,163.392
4,4,0,163.366
...,...,...,...
395,2,0,90.901
396,20,0,87.817
397,3,0,83.929
398,4,0,78.286


In [74]:
# Turn every DataFrame column into a plain list so that each column is one
# variable (one row of the input) for the covariance routines.
pigeon_dataset = [df[x].tolist() for x in df.columns]
# NumPy reference covariance matrix for the pigeon data.
np.cov(pigeon_dataset)

array([[ 2.51989411e+01, -2.01315789e-02, -8.19805669e+00],
       [-2.01315789e-02,  2.20488722e-02, -1.52153885e-02],
       [-8.19805669e+00, -1.52153885e-02,  6.36975135e+02]])

In [75]:
# Same data through the hand-written implementation; the output agrees with
# the np.cov result computed in the previous cell.
covariance_matrix(pigeon_dataset)

[[25.198941102756834, -0.020131578947368365, -8.198056691729322],
 [-0.020131578947368365, 0.022048872180451203, -0.01521538847117778],
 [-8.198056691729322, -0.01521538847117778, 636.9751350821049]]