In [64]:
import yfinance as yf
import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture

In [3]:
# Data preparation : construction of datasets that consists in the returns of 200 S&P500 stocks over the period 2001-2023

stock_symbols = ["AAPL", "MSFT", "AMZN", "NVDA", "GOOGL", "TSLA", "META", "BRK.B", "UNH", "JPM", "JNJ", "XOM", "V", "PG", "AVGO", "LLY", "HD", "MA", "CVX", "MRK", "ABBV", "PEP", "COST", "ADBE", "KO", "WMT", "BAC", "CSCO", "MCD", "TMO", "CRM", "ACN", "PFE", "NFLX", "CMCSA", "ABT", "LIN", "AMD", "ORCL", "DHR", "WFC", "DIS", "TXN", "PM", "CAT", "INTC", "NEE", "UNP", "INTU", "COP", "VZ", "LOW", "NKE", "QCOM", "UPS", "IBM", "BA", "BMY", "HON", "SPGI", "AMAT", "RTX", "AMGN", "GE", "SBUX", "GS", "MS", "DE", "NOW", "PLD", "MDT", "ELV", "ISRG", "BLK", "BKNG", "ADP", "MDLZ", "LMT", "T", "TJX", "AXP", "SCHW", "ADI", "CVS", "MMC", "GILD", "LRCX", "SYK", "VRTX", "C", "AMT", "ETN", "CI", "CB", "ZTS", "REGN", "SLB", "MO", "TMUS", "FI", "EOG", "BDX", "MU", "PGR", "SO", "BSX", "CME", "PANW", "PYPL", "EQIX", "DUK", "ITW", "KLAC", "SNPS", "CSX", "ATVI", "AON", "SHW", "CL", "NOC", "CDNS", "ICE", "APD", "FCX", "FDX", "TGT", "HUM", "WM", "MMM", "MCK", "MPC", "ORLY", "HCA", "NXPI", "USB", "EMR", "PXD", "PH", "PNC", "CMG", "MCO", "APH", "ROP", "MAR", "NSC", "GM", "F", "GD", "PSX", "MCHP", "FTNT", "CARR", "EW", "MSI", "AJG", "TT", "DXCM", "ADM", "TDG", "AZO", "VLO", "CCI", "PSA", "OXY", "SRE", "ECL", "TEL", "AIG", "ON", "PCAR", "MNST", "ANET", "CHTR", "GIS", "ADSK", "STZ", "NUE", "MSCI", "KMB", "CTAS", "COF", "TFC", "JCI", "AFL", "D", "HEC", "IDXX", "AEP", "WMB", "WELL", "MET", "EXC", "HLT", "IQV", "PAYX", "EL", "CTVA", "O", "TRV", "ROST"]
start_date1 = "2001-01-01"
end_date1 = "2023-07-01"
data = pd.DataFrame(yf.download(tickers=stock_symbols, start=start_date1, end=end_date1))
log_returns = pd.DataFrame(np.log(data["Close"]/data["Open"]).transpose())

[*********************100%***********************]  200 of 200 completed

ERROR:yfinance:
2 Failed downloads:
ERROR:yfinance:['HEC', 'BRK.B']: Exception('%ticker%: No timezone found, symbol may be delisted')





In [4]:
# First month data

start1 = "2023-01-01"
end1 = "2023-02-01"
data = pd.DataFrame(yf.download(tickers=stock_symbols, start=start1, end=end1))
X_1 = pd.DataFrame(np.log(data["Close"]/data["Open"]).transpose())
X_1 = X_1.dropna()

[*********************100%***********************]  200 of 200 completed

ERROR:yfinance:
2 Failed downloads:
ERROR:yfinance:['HEC', 'BRK.B']: Exception('%ticker%: No timezone found, symbol may be delisted')





In [151]:
def multiple_clusterings(n_repeat, data, model):
  '''
  ## Type of data ##

  n_repeat : integer --> number of time we apply the clustering method
  model : sklearn model we use --> e.g. GaussianMixture()
  data : pd.DataFrame --> data we want to fit to the model

  -------------

  ## Output ##

  Outputs a pandas DataFrame object of shape (len(data.index), n_repeat)

  -------------

  ## Genera idea ##

  The idea is here to train the model on the dataset data multiple time (here n_repeat time)
  and create a DataFrame whose columns are the cluster labels of each stock and whose rows are
  the label of a given stock for each clustering method

  '''
  Y = pd.DataFrame(index=data.index)

  for i in range(n_repeat):
    model.fit(data)
    predicted_labels = model.predict(data)
    data_with_clusters = pd.DataFrame(predicted_labels, index=data.index)
    y_i = "Clustering n°%i" % (i+1)
    Y[y_i] = data_with_clusters

  return Y


In [169]:
def consensus_function(Y, n=5):


  '''
  ## Type of data ##

  Y : pd.DataFrame as output of the multiple_clusterings function

  -------------

  ## Output ##

  Consensus_matrix : pandas DataFrame of shape (len(Y.index), len(Y.index))

  -------------

  ## General idea ##

  Based on the output Y of multiple clusterings of the same dataset, create a
  symmetric consensus matrix with rows and columns corresponding to the symbols of
  the stocks we consider. The coefficients of this matrix have values 1 (if two stocks
  are in the same cluster in more than 75% of the realized clusterings given in Y) and
  0 if not.

  '''

  number_of_stocks = len(Y.index)
  number_of_clusterings = len(Y.columns)

  Consensus_matrix = np.zeros((number_of_stocks, number_of_stocks))
  for i in range(number_of_stocks):
    for j in range(i):
      compteur = 0
      for k in range(number_of_clusterings):
        name_k = "Clustering n°%d"%int(k+1)
        if (Y.loc[Y.index.tolist()[i]][name_k] == Y.loc[Y.index.tolist()[j]][name_k]):
          compteur = compteur + 1
      if compteur >= 0.75*n:
        Consensus_matrix[i][j] = 1
      else:
        Consensus_matrix[i][j] = 0

  return Consensus_matrix

The issue is that we're not considering a certain **degree of membership**. We could have three stocks X, Y, and Z such that their values in the consensus matrix are 1, and at the same time, X and Y have been in the same cluster 100% of the time, whereas Y and Z have only been in the same cluster 75% of the time. Here, we are assuming that the consensus function establishes a transitive relationship.


In [8]:
def same_cluster(stock_1, stock_2):
  return int((M.loc[stock_1][stock_2] == 1 or M.loc[stock_1][stock_2] == 1))

In [244]:
def consensus_clustering(n_repeat, data, n_components, cov_type):

  '''
  ## Type of data ##

  n_repeat : integer --> number of time we apply the clustering method
  model : sklearn model we use --> e.g. GaussianMixture()
  data : pd.DataFrame --> data we want to fit to the model

  -------------

  ## Output ##

  Outputs a pandas DataFrame object of shape (len(data.index), n_repeat)

  -------------

  %%%%% STEP 1 %%%%%%

  The idea is here to train the model on the dataset data multiple time (here n_repeat time)
  and create a DataFrame whose columns are the cluster labels of each stock and whose rows are
  the label of a given stock for each clustering method

  '''

  model = GaussianMixture(n_components=n_components, covariance_type=cov_type)
  stock_symbols = data.index.tolist()
  number_of_stocks = len(data.index)


  Y = pd.DataFrame(index=stock_symbols)

  for i in range(n_repeat):
    model.fit(data)
    predicted_labels = model.predict(data)
    data_with_clusters = pd.DataFrame(predicted_labels, index=stock_symbols)
    y_i = "Clustering n°%i" % (i+1)
    Y[y_i] = data_with_clusters


  '''

   %%%%% STEP 2 %%%%%%

  Based on the output Y of multiple clusterings of the same dataset, create a
  symmetric consensus matrix with rows and columns corresponding to the symbols of
  the stocks we consider. The coefficients of this matrix have values 1 (if two stocks
  are in the same cluster in more than 75% of the realized clusterings given in Y) and
  0 if not.

  '''

  ############## We first compute the consensus_matrix ###############

  consensus_matrix = np.zeros((number_of_stocks, number_of_stocks))

  for i in range(number_of_stocks):
    for j in range(i):
      compteur = 0
      for k in range(n_repeat):
        name_k = "Clustering n°%d"%int(k+1)
        if (Y.loc[Y.index.tolist()[i]][name_k] == Y.loc[Y.index.tolist()[j]][name_k]):
          compteur = compteur + 1
      if compteur >= 0.75*n_repeat:
        consensus_matrix[i][j] = 1
      else:
        consensus_matrix[i][j] = 0

  consensus_matrix = pd.DataFrame(consensus_matrix, index=stock_symbols, columns=stock_symbols)

  '''

  %%%%% STEP 3 %%%%%%

  Based on the consensus matrix that we just created, we compute the final clustering
  that corresponds to this consensus.

  '''

  ############### We then compute the final clustering from this consensus matrix ################

  clusters = np.array([])
  cluster_len = []
  nb_iterations = 0 ## way to count and control the number of iterations

  while len(stock_symbols) > 0 and nb_iterations < n_components:
    stock_ref = stock_symbols[0]
    cluster_i = np.array([])
    size_cluster_i = 0

    for stock in stock_symbols:
      if (same_cluster(stock, stock_ref) == 1): ## we should not impose further conditions as the consensus relation should be transitive within a cluster
        cluster_i = np.append(cluster_i, np.array([stock]))
        size_cluster_i += 1

    cluster_len.append(size_cluster_i)

    clusters = np.hstack((clusters, cluster_i))

    stock_symbols = np.setdiff1d(stock_symbols, cluster_i) ## we remove the stocks that have already been attributed to a cluster
    nb_iterations += 1

  final_clustering = np.array(np.split(clusters, np.cumsum(cluster_len)[:-1]))
  index=["Cluster n°%i"%(i+1) for i in range(n_components)]

  return pd.DataFrame(final_clustering, index=index, columns=["Cluster composition n°1"])

Maintenant que nous avons une idée d'un premier cluster, nous fixons la méthode suivante:
-on crée une base de données sur le mois 2023-02-01 à 2023-03-01

-on prend les valeurs moyennes des retours des actifs sur cette période

-on fait la moyenne de ces moyennes au sein des clusters du mois précédent

-on initialise les moyennes de chaque mélange gaussien selon les valeurs
trouvées à l'étape précédente

-on applique le modèle de mélange gaussien pour trouver les nouveaux clusters

In [247]:
Y = consensus_clustering(5, X_2, 5, "spherical")
Y

  final_clustering = np.array(np.split(clusters, np.cumsum(cluster_len)[:-1]))


Unnamed: 0,Cluster composition n°1
Cluster n°1,"[AAPL, ACN, ADI, ADSK, AEP, AVGO, AXP, BLK, CA..."
Cluster n°2,"[ABBV, ABT, AFL, AIG, AJG, AMGN, AZO, BAC, BDX..."
Cluster n°3,"[ADBE, C, DHR, DIS, FI, HLT, ICE, MA, MAR, MS,..."
Cluster n°4,"[ADM, CI, ELV, HUM, ISRG, MRK, ORLY, PGR, TMUS..."
Cluster n°5,"[ADP, AMT, APD, ATVI, CSX, EL, MMC, NSC, ORCL,..."


In [178]:
# second month data

start2 = "2023-02-01"
end2 = "2023-03-01"
data2 = pd.DataFrame(yf.download(tickers=stock_symbols, start=start2, end=end2))
X_2 = pd.DataFrame(np.log(data2["Close"]/data2["Open"]).transpose())
X_2 = X_2.dropna()
new_stock_list_2 = np.array(X_2.index)

[*********************100%***********************]  200 of 200 completed

ERROR:yfinance:
2 Failed downloads:
ERROR:yfinance:['HEC', 'BRK.B']: Exception('%ticker%: No timezone found, symbol may be delisted')





In [134]:
# creating a dataframe of the previous clusters
index = ["Cluster n°%i"%(i+1) for i in range(5)]

df_clusters_names = pd.DataFrame(final_clustering, index=index, columns=["Cluster Composition"])
df_clusters_names

Unnamed: 0,Cluster Composition
Cluster n°1,"[AAPL, ACN, ADBE, ADI, ADSK, AEP, AVGO, AXP, B..."
Cluster n°2,"[ABBV, ABT, AFL, AIG, AJG, AMGN, ATVI, AZO, BA..."
Cluster n°3,"[ADM, AON, BDX, CI, ELV, HUM, ISRG, MRK, ORLY,..."
Cluster n°4,[ADP]
Cluster n°5,"[AMAT, AMD, AMZN, ANET, COF, FTNT, KLAC, LRCX,..."


In [136]:
# Average values for each clusters
'''
we compute the average of the stock return during the second period for each cluster (given thecomposition we obtained at t-1).
'''
mean = []
for i in range(5): ## change the value 10 if the number of clusters is not the same
  labels_i = df_clusters_names.iloc[i][0]
  mean_i = X_2.mean(axis=1)[labels_i].mean()
  mean.append(mean_i)

In our setting, this is tantamount to assume that the $K$ Gaussian regimes we introduced have the following form $\mathcal{N}_d(\mu_1 \mathbf{1}, (\sigma_1)^2 \mathbb{I}_d), ..., \mathcal{N}_d(\mu_K\mathbf{1}, (\sigma_K)^2 \mathbb{I}_d)$ with $\mu_k \in \mathbb{R}$ and $\sigma_k \in \mathbb{R}^{+*}$ for each $k \in \begin{Bmatrix} 1, ..., K \end{Bmatrix}$. In other words, there is one mean per cluster. Thus, if we want to initialize the means for each one of the 11 clusters, we have to take the average of the value of the stocks in each cluster.

⟹ What about the weights --> should'nt we weight the importance of each stock in each cluster ?

In [138]:
init_mean=np.zeros((5, 19))
for i in range(5): ## we range in the number of clusters (11) and for each cluster
  y_i = mean[i] * np.ones(19) ## there are 19 days !!
  init_mean[i] = y_i

init_mean = pd.DataFrame(init_mean, index=df_clusters_names.index)

In [139]:
init_mean.shape

(5, 19)

In [140]:
n_components = 5
cov_type = "spherical"
init_mean=init_mean

# means_initarray-like of shape (n_components, n_features),  here n_components = 11,  n_features : 198

model2 = GaussianMixture(n_components=n_components, covariance_type=cov_type, means_init=init_mean)

model2.fit(X_2)

Y_2 = pd.DataFrame(model2.predict(X_2), index=new_stock_list_2)

In [141]:
Y_2.iloc[:, 0].max() ## there are 11 clusters

4

In [143]:
df_clusters_names_2 = pd.DataFrame(index = df_clusters_names.index, columns = ["Cluster composition n°2"])
for i in range(5):
  g_i = Y_2[Y_2.iloc[:, 0] == i].index.tolist()
  df_clusters_names_2.iloc[i][0] = g_i

df_clusters_names_2

Unnamed: 0,Cluster composition n°2
Cluster n°1,"[ADBE, ADSK, AMAT, AMD, ANET, DIS, ECL, EW, F,..."
Cluster n°2,"[ABT, ACN, AMZN, BDX, BLK, BSX, CARR, CHTR, CM..."
Cluster n°3,"[AFL, AIG, BMY, CI, CME, COP, CTVA, CVX, DE, D..."
Cluster n°4,"[ABBV, ADM, ADP, AEP, AJG, AMGN, AMT, AON, APD..."
Cluster n°5,"[AAPL, ADI, APH, AVGO, AXP, BA, BAC, BKNG, C, ..."


In [144]:
comparison_df = pd.concat([df_clusters_names_2, x], axis=1)
comparison_df

Unnamed: 0,Cluster composition n°2,Cluster composition n°1
Cluster n°1,"[ADBE, ADSK, AMAT, AMD, ANET, DIS, ECL, EW, F,...","[AAPL, ACN, ADBE, ADI, ADSK, AEP, AVGO, AXP, B..."
Cluster n°2,"[ABT, ACN, AMZN, BDX, BLK, BSX, CARR, CHTR, CM...","[ABBV, ABT, AFL, AIG, AJG, AMGN, ATVI, AZO, BA..."
Cluster n°3,"[AFL, AIG, BMY, CI, CME, COP, CTVA, CVX, DE, D...","[ADM, AON, BDX, CI, ELV, HUM, ISRG, MRK, ORLY,..."
Cluster n°4,"[ABBV, ADM, ADP, AEP, AJG, AMGN, AMT, AON, APD...",[ADP]
Cluster n°5,"[AAPL, ADI, APH, AVGO, AXP, BA, BAC, BKNG, C, ...","[AMAT, AMD, AMZN, ANET, COF, FTNT, KLAC, LRCX,..."
