In [38]:
import pandas as pd 
import process
import numpy as np 
# Jerome path : r'C:\Users\33640\OneDrive\Documents\GitHub\Portfolio_clustering_project\Data\DataBase.csv'
# Nail path : '/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project/Data/DataBase.csv'
df = pd.read_csv(r'/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project/Data/DataBase.csv')

df.set_index('ticker', inplace=True)

df.columns = pd.to_datetime(df.columns.str[1:], format='%Y%m%d').strftime('%d/%m/%Y')

df_cleaned = df.fillna(0) # Utilisez la méthode fillna(0) pour remplacer les NaN par 0

df_cleaned = df_cleaned.transpose() ## WE WANT COLUMNS TO BE VECTOR OF RETURN FOR A GIVEN TICKER

We denote by $\bm{X}$ the $d \times n$ matrix of observations \textit{i.e.}

$$
\bm{X} = 
\begin{bmatrix}
    r_{1}^{(1)} & r_{1}^{(2)} & \ldots & r_{1}^{(n)} \\
    r_{2}^{(1)} & r_{2}^{(2)} & \ldots & r_{2}^{(n)} \\
    \vdots & \vdots & \ddots & \vdots \\
    r_{d}^{(1)} & r_{d}^{(2)} & \ldots & r_{d}^{(n)} \\
\end{bmatrix} 
= \begin{bmatrix} \mathbf{r}^{(1)} | ... |\mathbf{r}^{(n)}\end{bmatrix}
\in \mathbb{R}^{d\times n}
$$

where:
- $d$ corresponds to the number of days
- $n$ corresponds to the number of stocks

In general, a standard sample covariance can be generalized to include some arbitrary weight profile assigned along the time dimension. In particular, it is expressed in the following form

\begin{equation}
\bm{S_W} := \frac{1}{d} \bm{X}'W\bm{X} \in \mathbb{R}^{n\times n}
\end{equation}\\

The EWA-SC as defined can be written as a weighted sample covariance matrix if we define the matrix of weighted $W$ to be: 

\begin{align*}
    W_{t,k} =
    \begin{cases}
        d\frac{1 - \beta}{1-\beta^d} \beta^{d-t} & \text{if } t = k \\
        0 & \text{otherwise}
    \end{cases}
\end{align*}

If we define the auxiliary observation matrix: 

\begin{align}
    \tilde{\bm{{X}}} := \bm{W}^\frac{1}{2} \bm{X}
\end{align}

then we can see that the EWA-SC can be expressed in a similar form as the standard uniformly weighted sample covariance. The advantage of recasting the EWA-SC in this way is that many of the refinements for the standard sample covariance that have been developed over the years are at our disposal; including shrinkage.

In [65]:
######################### 1. We start by randomizing the auxiliary observation matrix  ̃X from Equation (5) along the time axis #########################
def auxilary_matrix(lookback_window, beta, df_cleaned):

    ## 1. We extract the data corresponding to the returns of our assets (columns) during these d days (lines)
    X = df_cleaned.iloc[lookback_window[0]:lookback_window[1],:] ## shape days * number of stocks
    days = len(lookback_window)
    ## 2. We slightly adjust the matrix of observations to get the auxiliary matrix that puts more weight on recent dates

    W = np.sqrt(np.diag(days * (1 - beta) * beta**(np.arange(lookback_window[0], lookback_window[1])[::-1]) / (1 - beta**days)))  # Compute the weight matrix
    X_tilde = pd.DataFrame(index=X.index, columns=X.columns, data=np.dot(W, X)).transpose()

    ## 3. We randomize the auxiliary matrix of observations according to the time axis
    # Randomized_X = X_tilde.transpose().sample(frac=1, axis=1, random_state=42) ## we transpose X as we want to have daily observations of the whole dataset !

    return X_tilde

# ---------------------------------------------------------------- TESTS ----------------------------------------------------------------

lookback_window = [0, 250]
beta = 0.999
X_tilde = auxilary_matrix(lookback_window=lookback_window, beta=beta, df_cleaned=df_cleaned)
X_tilde

Unnamed: 0_level_0,03/01/2000,04/01/2000,05/01/2000,06/01/2000,07/01/2000,10/01/2000,11/01/2000,12/01/2000,13/01/2000,14/01/2000,...,13/12/2000,14/12/2000,15/12/2000,18/12/2000,19/12/2000,20/12/2000,21/12/2000,22/12/2000,26/12/2000,27/12/2000
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AA,-0.011517,0.008873,0.042103,-0.010359,-0.014262,-0.028395,0.020025,-0.004652,-0.016146,-0.017838,...,0.034874,-0.001960,-0.008022,0.017513,0.037941,-0.048421,0.011866,0.068684,-0.007205,-0.009101
ABM,-0.008114,0.010908,-0.005474,0.000000,0.002735,0.000000,-0.008239,0.002774,-0.025375,0.011020,...,0.016225,-0.004050,0.002036,0.014203,0.008063,-0.006056,0.000000,0.049824,-0.027828,0.004050
ABT,-0.006285,-0.011297,0.009822,0.028791,0.025283,-0.018639,0.009610,-0.019301,-0.008172,0.000000,...,0.021859,-0.045522,0.028346,0.033244,0.025102,-0.023995,-0.012887,-0.016992,0.024946,-0.005136
ADI,-0.031854,-0.039107,0.012812,-0.024516,0.029780,0.042609,-0.028274,0.030739,-0.018126,0.050640,...,-0.094595,-0.024545,0.019716,-0.053733,0.034488,-0.003677,-0.053792,0.011478,0.012177,0.066282
ADM,0.000000,0.004662,-0.014069,0.009459,0.004645,-0.004623,-0.014022,0.018560,0.000000,0.027020,...,0.000000,0.009963,-0.009631,0.028360,0.004588,-0.009286,0.041145,-0.004512,0.013480,0.017626
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XLY,-0.025285,-0.014062,-0.014714,0.004271,0.031310,-0.003632,-0.007748,-0.007340,0.000000,-0.003213,...,-0.018872,0.000638,-0.007848,0.012292,-0.026661,0.020222,0.031583,0.012827,-0.010346,0.040522
XOM,-0.014701,-0.006446,0.033362,0.046735,-0.010357,-0.004612,0.002640,0.002658,0.018390,-0.018345,...,0.010703,-0.014460,-0.004434,0.024842,0.012233,-0.023881,-0.002942,0.022554,0.019953,-0.012039
XRX,0.030175,-0.042082,0.019011,0.007010,0.020713,-0.036513,-0.014022,0.007118,0.020973,-0.023224,...,0.033381,0.082212,0.030360,0.019749,0.048210,-0.040751,-0.110404,0.050606,-0.025312,0.038231
YUM,-0.029100,-0.010510,0.000000,-0.001516,-0.019955,0.036576,0.001505,-0.013576,0.003000,-0.024305,...,-0.026649,-0.027346,-0.007425,0.045462,0.008853,-0.048442,-0.047343,-0.009730,-0.009812,0.021511


We split the (randomized) auxiliary observations into $K$ non-overlapping folds **of equal size** represented as $\{\mathcal{I}_k | \mathcal{I}_k \subset \{1, ..., K\}\}_{k=1}^K$. Each set indexed by $\mathcal{I}_k$ works as a **"test" fold**, while the remaining observations' indices constitute a **"training" fold**,

In [40]:
from sklearn.model_selection import ShuffleSplit

######################### 2. We then split the (randomized) auxiliary observations into K non-overlapping folds of equal size #########################
def shuffle_split(data, K):
    # Initialize ShuffleSplit
    shuffle_split = ShuffleSplit(n_splits=K, test_size=0.2, random_state=42) 
    # test_size=0.2 : 20% des données pour l'ensemble de test, 80% pour l'ensemble d'entraînement.

    # Create empty list to store splits
    splits = []

    # Perform shuffling and splitting
    for train_index, test_index in shuffle_split.split(data.columns):
        train_fold = [data.columns[i] for i in train_index]
        test_fold = [data.columns[i] for i in test_index]
        splits.append((train_fold, test_fold)) ## attention à cette structure

    return splits

######################### 3. For each K fold configuration, we estimate the sample eigenvectors from the training set #########################
def eigen_sample(data, train_fold): ## we train the data on this test fold

    X_tilde_train = data.loc[:, train_fold]

    # Calculer la moyenne de l'ensemble d'entraînement
    mean_train = np.mean(X_tilde_train, axis=1)

    # Centrer les données d'entraînement
    centered_train_data = X_tilde_train.sub(mean_train, axis=0)

    # Calculer la matrice de covariance des données d'entraînement
    cov_matrix_train = np.dot(centered_train_data.T, centered_train_data) ## size number of assets * number of assets

    # Calculer les vecteurs et valeurs propres de la matrice de covariance
    _, eigenvectors_train = np.linalg.eigh(cov_matrix_train) ## .eigh and not .eig so that the eigenvalues are real 

    return eigenvectors_train


# ---------------------------------------------------------------- TESTS ----------------------------------------------------------------
days = 250
beta = 0.999
X_tilde = auxilary_matrix(days=days, beta=beta, df_cleaned=df_cleaned)
splits = shuffle_split(data=X_tilde, K=20)
eigenvector = eigen_sample(data=X_tilde, train_fold=splits[0][0])
eigenvector

array([[ 0.          ,  0.          ,  0.          , ..., -0.0009107837,
        -0.0518278123,  0.0294296205],
       [ 0.0875112458,  0.1732103799, -0.1759859212, ..., -0.0059831926,
         0.0028385124,  0.0206274048],
       [ 0.1785220296,  0.0187886882, -0.100921593 , ..., -0.0017788415,
        -0.0401733558,  0.0076795619],
       ...,
       [ 0.0068613387,  0.0143269608,  0.0190191802, ...,  0.0380196689,
         0.0328856314,  0.0335989447],
       [ 0.0095523052,  0.0212216391, -0.0149336477, ..., -0.0174692402,
        -0.047172594 ,  0.0424402826],
       [-0.0035457566,  0.0136947715,  0.0272468738, ...,  0.0057789542,
         0.0011852905,  0.0070006282]])

We consider a fixed exponential decay rate $\beta \in (0, 1)$ an its associated EWA-SC $\bm{E}$. Remember we denoted $\bm{\Sigma}$ the "true" and unobserved covariance matrix. Both these matrices are symmetric and thus admit the following spectral decomposition: 

\begin{equation}
\bm{E} = \sum_{i=1}^{n} \hat{\lambda}_i \hat{u}_i \hat{u}_i', \quad \text{and} \quad \bm{\Sigma} = \sum_{i=1}^{n} \lambda_i u_i u_i',
\end{equation}

where $(\hat{\lambda}_1, \ldots, \hat{\lambda}_n; \hat{u}_1, \ldots, \hat{u}_n)$  denotes a system of sample eigenvalues and eigenvectors of $\bm{E}$, and $(\lambda_1, \ldots, \lambda_n; u_1, \ldots, u_n)$ denotes a system of eigenvalues and eigenvectors of the "true" covariance $\bm{\Sigma}$. The eigenvalues are assumed to be sorted in ascending order.

To correct the bias previously mentioned, we consider a specific framework where the sample eigenvalues should be corrected while retaining the sample eigenvectors of the original matrix. This is mathematically tantamount to write:

\begin{equation}
\hat{\bm{\Sigma}} = \sum_{i=1}^{n} \xi_i \hat{u}_i \hat{u}_i',
\end{equation}

where $\bm{\xi} = (\xi_i)_{i=1,...,n}$  is an $n$-dimensional vector that we have to obtain. This framework is somewhat reasonable as, in absence of any **a priori** knowledge about the structure of the covariance matrix, the most natural guess that we have about the population eigenvectors is the sample eigenvectors that we observe. 

- For each $K$ fold configuration, we estimate the sample eigenvectors from the training set and then estimate an N-dimensional vector of out-of-sample variances using the test set and the sample eigenvectors. 

- Finally, we average the out-of-sample variance estimates over $K$ to give us the bias-corrected eigenvalue of the ith sample eigenvector portfolio denoted as $\xi^{\dagger}_i$ for all $i$.

These two last steps are equivalent to introducing the $K$-fold cross-validation estimator:

$$
\xi^{\dagger}_i := \frac{1}{K} \sum_{k=1}^K \sum_{t \in \mathcal{I}_k}  \frac{1}{\lvert \mathcal{I}_k \rvert} \left(\hat{u}_i[k]'\tilde{x}_t \right)^2, \quad \text{for } i = 1, \ldots, n,
$$

where: 
- $\lvert \mathcal{I}_k \rvert$ denotes the cardinality of the kth test set such that each of them is approximately equal in size, that is, $K \lvert \mathcal{I}_k \rvert \approx d$
- Here, $\hat{u}_i[k]$ is the $i$-th sample eigenvector of a sample covariance matrix that is obtained from the training fold, and $\tilde{x}$ is a sample vector of the auxiliary observation matrix from the test fold.

In [42]:
def intra_fold_loss(data, test_fold, sample_eigenvector_i, beta): ## we test the data on this test fold

    ## 1. get the fold cardinality 
    fold_cardinality = len(test_fold)

    ## 2. sample vector of the auxiliary observation matrix from the test fold (inspired from the code above)

    days = len(test_fold)
    X = data.loc[:,test_fold].transpose()

    ## 2. We slightly adjust the matrix of observations to get the auxiliary matrix that puts more weight on recent dates

    W = np.sqrt(np.diag(days * (1 - beta) * beta**(np.arange(days)[::-1]) / (1 - beta**days)))  # Compute the weight matrix
    X_tilde = pd.DataFrame(index=X.index, columns=X.columns, data=np.dot(W, X)).transpose()

    res = (np.dot(sample_eigenvector_i, X_tilde) ** 2) / fold_cardinality
    result = np.sum(res)

    return result

# ---------------------------------------------------------------- TESTS ----------------------------------------------------------------
beta = 0.99
data = X_tilde
sample_eigenvector_i = eigenvector[0]
X = X_tilde.loc[:, splits[0][1]]
test_fold = splits[0][1]
intra_loss = intra_fold_loss(data=data, test_fold=splits[0][1], sample_eigenvector_i=sample_eigenvector_i, beta=beta)
intra_loss

0.0005555442500824901

In [51]:
def average_loss_i(data, splits, index, beta):

    res = 0 ## to stock the overall loss

    for (train_fold, test_fold) in splits:

        ## sur chaque fold, on calcule les sample eigenvectors à partir du training fold correspondant

        sample_eigenvector_i = eigen_sample(data=data, train_fold=train_fold)[:, index] ## on ne garde que l'eigenvector correspondant au bon index

        ## sur chaque fold, on calcule la perte au sein du fold à partir de l'échantillon de test

        res = res + intra_fold_loss(data=data, test_fold=test_fold, sample_eigenvector_i=sample_eigenvector_i, beta=beta)

    res = res / len(splits) ## we average by the number of folds (which corresponds to the lengths of the splits)

    return res

# ---------------------------------------------------------------- TEST ----------------------------------------------------------------

In [44]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [57]:
np.set_printoptions(precision=10)

from tqdm import tqdm

def eigenvalue_estimator(df_cleaned, days, K, beta):

    data = auxilary_matrix(days=days, beta=beta, df_cleaned=df_cleaned)
    
    splits = shuffle_split(data=data, K=K)

    number_of_stocks = data.shape[0]

    x = np.zeros(number_of_stocks)  # initialisation de x

    for i in tqdm(range(number_of_stocks), desc='Calcul en cours', unit='itération'):
        x[i] = average_loss_i(data=data, splits=splits, index=i, beta=beta)   
                       
    return x

# ---------------------------------------------------------------- TEST ----------------------------------------------------------------
eigenvalue_estimator = eigenvalue_estimator(df_cleaned=df_cleaned, days=250, K=10, beta=0.99)

Calcul en cours: 100%|██████████| 695/695 [15:06<00:00,  1.30s/itération]


In [58]:
days = 250
beta = 0.99

## compute the sample exponential moving average correlation matrix
X = df_cleaned.iloc[0:days,:]
W = np.diag(days * (1 - beta) * beta**(np.arange(days)[::-1]) / (1 - beta**days))  # Compute the weight matrix, no sqrt as we want the real matrix
res1 = np.dot(X.T, W)  # Produit matriciel de X' et W
S = np.dot(res1, X)

## compute the eigenvectors of S

_, eigenvectors = np.linalg.eigh(S)

## computes the estimator 

# Tailles des matrices
num_eigenvalues = eigenvalue_estimator.shape[0]
num_features = eigenvectors.shape[0]

# Initialisation de Sigma avec des zéros
Sigma = np.zeros((num_features, num_features), dtype=np.complex128)

# Parcourir chaque vecteur propre et valeur propre
for i in range(num_eigenvalues):
    xi_dagger = eigenvalue_estimator[i]  # Conjugue de xi
    ui = eigenvectors[:, i]  # i-ème vecteur propre

    # Calcul du produit externe xi^† * ui * ui^† et addition à Sigma
    Sigma += xi_dagger * np.outer(ui, ui) 

# Sigma est maintenant la somme des produits xi^† * ui * ui^†
Sigma = pd.DataFrame(index=df_cleaned.columns, columns=df_cleaned.columns, data=np.real(Sigma))

In [59]:
Sigma

ticker,AA,ABM,ABT,ADI,ADM,ADX,AEE,AEG,AEM,AEP,...,XLI,XLK,XLP,XLU,XLV,XLY,XOM,XRX,YUM,ZTR
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AA,0.000787,0.000039,0.000035,-0.000014,0.000065,0.000037,0.000037,0.000021,-0.000020,0.000054,...,6.266226e-05,0.000006,3.454808e-05,0.000036,0.000040,8.020644e-05,3.763173e-05,0.000024,0.000083,0.000005
ABM,0.000039,0.000653,0.000012,0.000107,0.000020,0.000038,-0.000003,0.000010,-0.000041,0.000003,...,3.317305e-05,0.000070,-6.500309e-08,0.000011,0.000031,3.803996e-05,-1.201594e-05,0.000021,0.000032,0.000009
ABT,0.000035,0.000012,0.000617,-0.000130,0.000028,-0.000007,0.000042,-0.000004,0.000024,0.000046,...,4.331745e-07,-0.000069,5.233501e-05,0.000020,-0.000005,9.316122e-06,3.125942e-05,-0.000056,0.000054,-0.000006
ADI,-0.000014,0.000107,-0.000130,0.001731,0.000024,0.000149,-0.000054,0.000055,-0.000198,-0.000083,...,1.457453e-04,0.000501,-8.325466e-05,0.000005,0.000160,1.135020e-04,-8.086467e-05,0.000329,0.000109,0.000045
ADM,0.000065,0.000020,0.000028,0.000024,0.000640,-0.000004,0.000048,0.000021,-0.000014,0.000037,...,3.317216e-05,0.000011,1.997473e-05,0.000016,0.000017,4.550718e-05,5.039204e-06,-0.000021,0.000064,-0.000012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XLY,0.000080,0.000038,0.000009,0.000114,0.000046,0.000034,0.000026,0.000028,-0.000012,0.000034,...,6.513814e-05,0.000084,1.867435e-05,0.000026,0.000065,5.092263e-04,1.197855e-07,-0.000002,0.000083,0.000011
XOM,0.000038,-0.000012,0.000031,-0.000081,0.000005,-0.000003,0.000026,-0.000007,0.000036,0.000032,...,-3.169793e-06,-0.000051,2.112992e-05,0.000020,-0.000009,1.197855e-07,4.676391e-04,0.000006,0.000010,-0.000003
XRX,0.000024,0.000021,-0.000056,0.000329,-0.000021,0.000057,-0.000043,0.000003,-0.000052,-0.000066,...,1.762688e-05,0.000160,-4.681676e-05,0.000010,0.000031,-1.784730e-06,5.884007e-06,0.001302,0.000022,0.000028
YUM,0.000083,0.000032,0.000054,0.000109,0.000064,0.000038,0.000018,0.000022,-0.000029,0.000039,...,5.529586e-05,0.000065,3.391345e-05,0.000028,0.000056,8.327773e-05,1.005705e-05,0.000022,0.000703,0.000014
