In [1]:
import pandas as pd 
import numpy as np 

import os 
import sys 

module_path = os.path.abspath(os.path.join(os.getcwd(), '../Main_modules'))
sys.path.append(module_path)

import warnings 
import process 
from PyFolioC_classes import PyFolioC
warnings.filterwarnings("ignore") ## so that there are no polluting warnings as output of this cell



In [2]:
# Jerome path : r'C:\Users\33640\OneDrive\Documents\GitHub\Portfolio_clustering_project\Data\DataBase.csv'
# Nail path : '/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project/Data/DataBase.csv'
# NOTE(review): the path below is absolute and user-specific — point it to your own copy of DataBase.csv.
df = pd.read_csv('/Users/khelifanail/Documents/GitHub/Portfolio_clustering_project/Data/DataBase.csv').set_index('ticker')

# Column labels: drop the first character, parse the remainder as YYYYMMDD,
# then re-format the dates as DD/MM/YYYY strings.
df.columns = pd.to_datetime(df.columns.str[1:], format='%Y%m%d').strftime('%d/%m/%Y')

# Drop every row (ticker) holding at least one NaN (fillna(0) would replace the NaN by 0 instead),
# then transpose: WE WANT COLUMNS TO BE VECTOR OF RETURN FOR A GIVEN TICKER
df_cleaned = df.dropna().transpose()

# Display the row located at position 5025.
df_cleaned.iloc[5025]

ticker
AA     0.006530
ABM   -0.010814
ABT    0.004153
ADI    0.000168
ADM    0.008015
         ...   
XLY    0.003114
XOM    0.010951
XRX    0.002444
YUM    0.000596
ZTR    0.000000
Name: 31/12/2019, Length: 663, dtype: float64

In [3]:
##################################################################### PARAMETERS #####################################################################
# Data and windows
historical_data = df_cleaned
lookback_window = [3190, 3265]  ## new lookback_window
evaluation_window = 5

# Clustering / covariance estimation
number_of_repetitions = 10
number_of_clusters = 24
cov_method = 'SPONGE'
sigma = 0.01  ## sigma was varied here
eta = 0.01
beta = 0.9

# Allocation
markowitz_type = 'expected_returns'
tc = 0.0001

K = 4  # Number of folds for the cross validation

##################################################################### PORTFOLIO ######################################################################
portfolio = PyFolioC(
    number_of_repetitions=number_of_repetitions,
    historical_data=historical_data,
    lookback_window=lookback_window,
    evaluation_window=evaluation_window,
    number_of_clusters=number_of_clusters,
    sigma=sigma,
    eta=eta,
    EWA_cov=True,
    beta=beta,
    short_selling=True,
    cov_method=cov_method,
    markowitz_type=markowitz_type,
    transaction_cost_rate=tc,
)
## Reference ranges (start:stop) noted for the different periods:
##   year 2008-2009 ==> 2007:2262 if evaluation_window == 2
##   year 2008-2009 ==> 2007:2265 if evaluation_window == 5
##   year 2012-2013 ==> 3016:3265
##   year 2018-2019 ==> 4524:4774
##   year 2016-2019 ==> 4021:4774
##   year 2010-2020 ==> 2512:5279 (we go until 5277 to have a multiple of 5 for the difference)
##   year 2013-2019 ==> 3265:5025, i.e. 352 windows of 5

In [6]:
def calculate_mean_correlation(df):
    """
    Calculate the average pairwise correlation between the columns of a DataFrame.

    The correlation matrix is obtained with ``df.corr()``. Only the entries strictly
    above the diagonal are averaged, so every pair of columns is counted once and the
    self-correlations are left out.

    Pairs whose correlation is undefined (NaN, e.g. when a column is constant over the
    window) are ignored, so that one such column does not turn the whole average
    into NaN.

    Args:
        df (pd.DataFrame): The DataFrame containing the data to analyze.
    Returns:
        float: The average correlation over the column pairs with a defined
        correlation, or 0 when there is no such pair (for instance a single column).
    Raises:
        ValueError: If ``df`` is empty.
    """
    if df.empty:
        raise ValueError("The DataFrame is empty. Please provide a valid DataFrame.")

    correlation_values = df.corr().to_numpy()

    # Strict upper triangle (k=1): each unordered pair of columns exactly once.
    row_idx, col_idx = np.triu_indices(correlation_values.shape[0], k=1)
    pair_correlations = correlation_values[row_idx, col_idx]

    # Discard undefined correlations instead of letting them propagate into the mean.
    pair_correlations = pair_correlations[~np.isnan(pair_correlations)]

    if pair_correlations.size == 0:
        return 0.0
    return float(pair_correlations.mean())

def get_most_corr_cluster(portfolio, lookback_window, df_cleaned, number=1, strat='correlation'):

    '''
    Return the number-th most correlated cluster of the portfolio.

    For every entry of ``portfolio.cluster_composition``, the columns listed under its
    'tickers' key are taken from ``df_cleaned``, restricted to the rows
    ``lookback_window[0]:lookback_window[1]`` (positional), and scored with
    ``calculate_mean_correlation``. Clusters are then ranked by that score.

    Args:
        portfolio: object exposing a ``cluster_composition`` mapping
            (cluster name -> mapping with a 'tickers' entry).
        lookback_window: pair (start, stop) of row positions in ``df_cleaned``.
        df_cleaned (pd.DataFrame): one column per ticker.
        number (int): rank of the cluster to return, 1 being the most correlated one.
        strat (str): currently unused; kept so existing calls keep working.
    Returns:
        tuple: (cluster name, mean correlation, tickers of the cluster).
    Raises:
        ValueError: if ``number`` is smaller than 1.
        IndexError: if ``number`` is larger than the number of clusters.
    '''

    # -0 == 0 would silently select the *least* correlated cluster, and a negative
    # rank would index from the wrong end of the ranking.
    if number < 1:
        raise ValueError("number must be a positive integer (1 = most correlated cluster).")

    window_start, window_stop = lookback_window[0], lookback_window[1]

    mean_corr_list = []

    for name, cluster in portfolio.cluster_composition.items():
        tickers = cluster['tickers']
        tickers_df = df_cleaned[tickers].iloc[window_start:window_stop, :]
        mean_corr = calculate_mean_correlation(tickers_df)
        mean_corr_list.append((name, mean_corr, tickers))

    def _ranking_key(entry):
        # NaN compares False with everything, which makes the sort order arbitrary;
        # rank an undefined score (NaN != NaN) below every defined one.
        score = entry[1]
        return float('-inf') if score != score else score

    sorted_cluster_corr_list = sorted(mean_corr_list, key=_ranking_key)
    most_corr_cluster = sorted_cluster_corr_list[-number]

    return most_corr_cluster

def most_corr_returns(portfolio, lookback_window, evaluation_window, df_cleaned, number=1):
    """
    Weighted returns of the number-th most correlated cluster over the evaluation window.

    The cluster is selected with ``get_most_corr_cluster``. The evaluation window is the
    ``evaluation_window`` rows of ``df_cleaned`` starting at position ``lookback_window[1]``.
    For each ticker of the cluster, its column over that window is multiplied by
    ``portfolio.consolidated_weight[ticker].values[0]`` and the products are summed.

    Args:
        portfolio: object exposing ``cluster_composition`` and ``consolidated_weight``.
        lookback_window: pair (start, stop) of row positions in ``df_cleaned``.
        evaluation_window (int): number of rows to evaluate after the lookback window.
        df_cleaned (pd.DataFrame): one column per ticker.
        number (int): rank of the cluster to use, 1 being the most correlated one.
    Returns:
        pd.DataFrame: a single column named after the cluster, indexed like the
        evaluation rows of ``df_cleaned``. When fewer than ``evaluation_window`` rows
        remain after ``lookback_window[1]``, only the available rows are returned.
    """
    # tuple of length 3: (cluster name, average correlation, cluster composition)
    cluster_name, _, ticker_list = get_most_corr_cluster(portfolio, lookback_window, df_cleaned, number)

    # Explicit positional slicing of the evaluation rows.
    evaluation_start = lookback_window[1]
    evaluation_data = df_cleaned.iloc[evaluation_start:evaluation_start + evaluation_window]

    # Size the zero buffer from the rows actually available, so that a window running
    # past the end of the data does not fail with a shape mismatch.
    most_corr_cluster_returns = pd.DataFrame(
        index=evaluation_data.index,
        columns=[cluster_name],
        data=np.zeros((len(evaluation_data), 1))
    )

    for ticker in ticker_list:
        weight = portfolio.consolidated_weight[ticker].values[0]
        most_corr_cluster_returns[cluster_name] += evaluation_data[ticker] * weight

    return most_corr_cluster_returns

def most_corr_PnL(consolidated_portfolio, lookback_window, evaluation_window, df_cleaned, number=1):
    """
    Cumulative return of the number-th most correlated cluster over the evaluation window.

    The per-row returns come from ``most_corr_returns``; they are compounded as
    ``prod(1 + r) - 1`` and the value reached on the last row is returned.

    Args:
        consolidated_portfolio: portfolio object forwarded to ``most_corr_returns``.
        lookback_window: pair (start, stop) of row positions in ``df_cleaned``.
        evaluation_window (int): number of rows to evaluate after the lookback window.
        df_cleaned (pd.DataFrame): one column per ticker.
        number (int): rank of the cluster to use, 1 being the most correlated one.
    Returns:
        The compounded return at the end of the evaluation window (a scalar).
    """
    most_corr_return = most_corr_returns(consolidated_portfolio, lookback_window, evaluation_window, df_cleaned, number)

    cumulative_returns = (1 + most_corr_return).cumprod() - 1

    # Purely positional access (last row, first column): `.iloc[-1][0]` would look up
    # the *label* 0 in a Series indexed by the cluster name, which only works through a
    # deprecated positional fallback and fails for integer cluster names other than 0.
    return cumulative_returns.iloc[-1, 0]


In [7]:
# Cumulative PnL of the most correlated cluster (default number=1) over the evaluation window.
x = most_corr_PnL(
    consolidated_portfolio=portfolio,
    lookback_window=lookback_window,
    evaluation_window=evaluation_window,
    df_cleaned=df_cleaned,
)

In [8]:
# Compound the values of portfolio.portfolio_return['return']: running product of (1 + r), minus 1.
non_adjusted_returns = np.cumprod(1 + portfolio.portfolio_return['return']) - 1

In [13]:
# Ratio between the most correlated cluster's cumulative PnL (x) and the last value of non_adjusted_returns.
x/non_adjusted_returns[-1]

0.003704408560364042