# <a id="table-of-contents"></a>Table of Contents

1. [Portfolio_marginal_attributes](#portfolio_marginal_attributes)
2. [Portfolio_attributes](#portfolio_attributes)

---

In [None]:
import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt


from plotly.subplots import make_subplots
from scipy.stats import trim_mean
from scipy.stats import kurtosis, skew

In [None]:
# MANUAL INPUTS

# CSV file names for each portfolio variant and for the candidate on its own.
portfolio_with_candidate = ["SPY_history.csv", "XLU_history.csv", "XLF_history.csv"]
portfolio_without_candidate = ["XLF_history.csv", "SPY_history.csv"]
candidate = ["XLU_history.csv"]

# Weights keyed by return-column name ("Return_<ticker>"); the keys must match
# the return columns present in the loaded data.
weights_with_candidate = dict(
    Return_SPY=0.3,  # SPY: 30%
    Return_XLU=0.5,  # XLU (the candidate): 50%
    Return_XLF=0.2,  # XLF: 20%
)

# Weights for the portfolio once the candidate is excluded.
weights_without_candidate = dict(Return_SPY=0.6, Return_XLF=0.4)

# Number of rows to retain when the frames are truncated.
observations_to_keep = 1500

In [None]:
def process_portfolio_data(securities, weights_with_candidate, data_folder=None):
    """
    Load per-security price histories and build one combined return table.

    For every file in ``securities`` the CSV is read, its ``Date`` column is
    parsed to datetimes, every column is suffixed with the security name (the
    part of the file name before the first underscore), and a
    ``Return_<name>`` column is added as the row-over-row percentage change of
    ``Close/Last_<name>``.

    Args:
        securities (list[str]): CSV file names, e.g. ``"SPY_history.csv"``.
        weights_with_candidate (dict): Maps ``Return_<name>`` column names to
            weights. Keys without a matching column are ignored, and the
            remaining weights are rescaled so that they sum to 1.
        data_folder (str, optional): Folder that holds the CSV files.
            Defaults to ``~/Desktop/Trading/Data``.

    Returns:
        pd.DataFrame: The suffixed frames placed side by side, plus
        ``portfolio_uniform_daily_return`` (plain mean of the ``Return_``
        columns), ``portfolio_weighted_daily_return`` (weighted sum of the
        ``Return_`` columns; NaN when no usable weight exists) and
        ``Original_Index`` (a copy of the row index).
    """
    if data_folder is None:
        data_folder = os.path.join(os.path.expanduser("~/Desktop/Trading"), "Data")

    data_frames = []
    for csv in securities:
        data = pd.read_csv(os.path.join(data_folder, csv))
        data['Date'] = pd.to_datetime(data['Date'])

        # "SPY_history.csv" -> "SPY"; used to tag every column of this file.
        security_name = csv.split('_')[0]
        data = data.add_suffix(f'_{security_name}')

        # NOTE(review): pct_change compares each row with the row above it, so
        # the sign convention depends on the row order of the CSV -- confirm
        # the files are in the intended chronological order.
        data[f'Return_{security_name}'] = data[f'Close/Last_{security_name}'].pct_change()
        data_frames.append(data)

    # NOTE(review): the frames are joined on their row index, not on the date
    # columns; this presumably relies on every file covering the same dates in
    # the same order -- verify against the data.
    combined_data = pd.concat(data_frames, axis=1)

    return_columns = [col for col in combined_data.columns if col.startswith("Return_")]
    combined_data["portfolio_uniform_daily_return"] = combined_data[return_columns].mean(axis=1)

    # Normalize over the weights that actually apply to this frame. Dividing
    # by the total of *all* supplied weights would leave the applied weights
    # summing to less than 1 whenever a weighted security is not loaded.
    applicable_weights = {
        col: weight
        for col, weight in weights_with_candidate.items()
        if col in combined_data.columns
    }
    total_weight = sum(applicable_weights.values())

    if total_weight:
        combined_data["portfolio_weighted_daily_return"] = sum(
            combined_data[col] * (weight / total_weight)
            for col, weight in applicable_weights.items()
        )
    else:
        # No usable weights: the weighted return is undefined rather than 0.
        combined_data["portfolio_weighted_daily_return"] = np.nan

    # Remember the load-time row position so later cells can sort on it.
    combined_data['Original_Index'] = combined_data.index

    return combined_data


In [None]:
# Process the data for the different portfolios, each with its own weights.
portfolio_with_candidate_df = process_portfolio_data(portfolio_with_candidate, weights_with_candidate)

# The portfolio without the candidate must use the weights defined for it;
# reusing weights_with_candidate here would weight SPY/XLF as if XLU were held.
portfolio_without_candidate_df = process_portfolio_data(portfolio_without_candidate, weights_without_candidate)

# The candidate is analysed on its own, so it carries the whole weight.
candidate_weights = {f"Return_{csv.split('_')[0]}": 1.0 for csv in candidate}
candidate_df = process_portfolio_data(candidate, candidate_weights)

In [None]:
# Inspect the column labels of the with-candidate frame (notebook cell output).
portfolio_with_candidate_df.columns

In [None]:
# Preview the first row of the with-candidate frame (notebook cell output).
portfolio_with_candidate_df.head(1)

In [None]:
# Keep only the per-security return columns of the with-candidate portfolio.
# The column names come from the manual weights so that this selection follows
# the portfolio definition instead of repeating hard-coded tickers.
# (portfolio_with_candidate_df is already a DataFrame, so it is used as-is.)
combined_returns_df = portfolio_with_candidate_df[
    [col for col in weights_with_candidate if col in portfolio_with_candidate_df.columns]
]


In [None]:
# Helper that orders a frame by its Original_Index column, largest first
def sort_by_original_index(data):
    """Return a copy of ``data`` sorted by ``Original_Index`` in descending order."""
    descending = data.sort_values('Original_Index', ascending=False)
    return descending

# Re-order all three frames so the largest Original_Index comes first
(
    portfolio_with_candidate_df,
    portfolio_without_candidate_df,
    candidate_df,
) = map(
    sort_by_original_index,
    (portfolio_with_candidate_df, portfolio_without_candidate_df, candidate_df),
)

In [None]:
# Helper that keeps only the leading rows of a frame
def drop_excess_observations(dataframe, observations_to_keep):
    """
    Keep the leading ``observations_to_keep`` rows and renumber them from 0.

    Args:
        dataframe (pd.DataFrame): Frame to shorten; it is not modified.
        observations_to_keep (int): How many leading rows to retain.

    Returns:
        pd.DataFrame: New frame holding the retained rows with a fresh
        0-based index (the previous index is discarded).
    """
    leading_rows = dataframe.head(observations_to_keep)
    return leading_rows.reset_index(drop=True)



# Truncate each frame to the configured number of leading rows.
# NOTE(review): the *_truncated frames are not referenced again in the cells
# visible here -- confirm whether later analysis should use them.
(
    portfolio_with_candidate_df_truncated,
    portfolio_without_candidate_df_truncated,
    candidate_df_truncated,
) = (
    drop_excess_observations(frame, observations_to_keep)
    for frame in (portfolio_with_candidate_df, portfolio_without_candidate_df, candidate_df)
)


In [None]:
def calculate_weighted_returns(dataframe, weights_with_candidate, weights_without_candidate):
    """
    Add weighted portfolio return columns for two weighting scenarios.

    The input frame is copied first, so the caller's frame is left untouched.
    Weights are applied exactly as given (no rescaling), and weight keys that
    are not columns of the frame are skipped.

    Args:
        dataframe (pd.DataFrame): Frame holding the security return columns.
        weights_with_candidate (dict): Column name -> weight, candidate included.
        weights_without_candidate (dict): Column name -> weight, candidate excluded.

    Returns:
        pd.DataFrame: Copy of the input with ``weighted_return_with_candidate``
        and ``weighted_return_without_candidate`` appended.
    """
    result = dataframe.copy()

    scenarios = (
        ("weighted_return_with_candidate", weights_with_candidate),
        ("weighted_return_without_candidate", weights_without_candidate),
    )

    for target_column, scenario_weights in scenarios:
        # Start from 0 so that a scenario with no matching column yields 0.
        running_total = 0
        for security, weight in scenario_weights.items():
            if security in result.columns:
                running_total = running_total + result[security] * weight
        result[target_column] = running_total

    return result


In [None]:
# Attach both weighted-return columns to a copy of the with-candidate frame
combined_returns_df = calculate_weighted_returns(
    portfolio_with_candidate_df,
    weights_with_candidate,
    weights_without_candidate,
)


In [None]:
# Overlay the histograms (with KDE curves) of the two weighted-return columns
plt.figure(figsize=(12, 6))

histogram_specs = (
    ("weighted_return_with_candidate", "blue", "With Candidate"),
    ("weighted_return_without_candidate", "green", "Without Candidate"),
)
for column_name, shade, legend_text in histogram_specs:
    sns.histplot(
        combined_returns_df[column_name].dropna(),
        bins=50, kde=True, color=shade, label=legend_text, alpha=0.4
    )

# Titles, axis labels, legend and grid
plt.title("Comparison of Portfolio Weighted Return Distributions")
plt.xlabel("Daily Weighted Return")
plt.ylabel("Frequency")
plt.legend(fontsize=10)
plt.grid(True)
plt.show()


## Portfolio_marginal_attributes <a id="portfolio_marginal_attributes"></a>

[Back to Table of Contents](#table-of-contents)

In [None]:
def calculate_weighted_statistics(dataframe, portfolio_list, weights_dictionary):
    """
    Combine per-security mean, standard deviation, skewness and kurtosis
    into weight-averaged figures for a portfolio.

    Security names are taken from the file names in ``portfolio_list`` (text
    before the first underscore). Their weights are looked up under
    ``Return_<name>`` (missing keys count as 0) and rescaled to sum to 1, so
    any un-invested (cash) share is excluded. Each statistic is computed on
    the ``Close/Last_<name>`` column with NaNs dropped, then multiplied by
    the rescaled weight and summed.

    NOTE(review): the statistics are taken on the Close/Last columns, not on
    the Return_ columns -- confirm this is intended.

    Args:
        dataframe (pd.DataFrame): Frame holding the ``Close/Last_<name>`` columns.
        portfolio_list (list): File names such as ``"SPY_history.csv"``.
        weights_dictionary (dict): ``Return_<name>`` -> weight.

    Returns:
        dict | None: ``weighted_mean``, ``weighted_std_dev``,
        ``weighted_skewness`` and ``weighted_kurtosis``; ``None`` when the
        total weight of the listed securities is not positive.

    Raises:
        ValueError: If ``portfolio_list`` is not a list.
    """
    if not isinstance(portfolio_list, list):
        raise ValueError("Expected portfolio_list to be a list of filenames.")

    # 'SPY_history.csv' -> 'SPY', and the matching weight key 'Return_SPY'
    tickers = [filename.split('_')[0] for filename in portfolio_list]
    weight_keys = [f"Return_{ticker}" for ticker in tickers]

    invested_weights = {key: weights_dictionary.get(key, 0) for key in weight_keys}
    total_invested_weight = sum(invested_weights.values())

    # Nothing invested: rescaling would divide by zero, so bail out early.
    if not total_invested_weight > 0:
        print("Warning: Total invested weight is zero. Cannot rescale weights.")
        return None

    normalized_weights = {
        key: weight / total_invested_weight for key, weight in invested_weights.items()
    }
    print("Normalized Weights:", normalized_weights)

    weighted_statistics = {
        'weighted_mean': 0,
        'weighted_std_dev': 0,
        'weighted_skewness': 0,
        'weighted_kurtosis': 0
    }

    for ticker, weight_key in zip(tickers, weight_keys):
        close_col = f"Close/Last_{ticker}"

        # Securities without a price column are reported and skipped.
        if close_col not in dataframe.columns:
            print(f"Warning: Column {close_col} not found in the DataFrame.")
            continue

        closes = dataframe[close_col].dropna()
        share = normalized_weights.get(weight_key, 0)

        weighted_statistics['weighted_mean'] += closes.mean() * share
        weighted_statistics['weighted_std_dev'] += closes.std() * share
        weighted_statistics['weighted_skewness'] += skew(closes) * share
        weighted_statistics['weighted_kurtosis'] += kurtosis(closes) * share

    return weighted_statistics


In [None]:
def plot_weighted_return_distributions(returns_with_candidate, returns_without_candidate, stats):
    """
    Draw overlaid histograms with kernel density curves for the
    "With Candidate" and "Without Candidate" return series.

    Each legend entry shows the series label followed by its mean, standard
    deviation, kurtosis and skew, formatted to four decimals.

    Args:
        returns_with_candidate (pd.Series): Returns of the portfolio that
            includes the candidate; NaNs are dropped before plotting.
        returns_without_candidate (pd.Series): Returns of the portfolio that
            excludes the candidate; NaNs are dropped before plotting.
        stats (dict): Keyed by "With Candidate" / "Without Candidate"; each
            value is a dict with "Mean", "Std Dev", "Kurtosis" and "Skew".
    """
    series_by_label = {
        "With Candidate": returns_with_candidate.dropna(),
        "Without Candidate": returns_without_candidate.dropna()
    }

    plt.figure(figsize=(12, 6))

    # One translucent histogram + KDE per series, each in its own colour
    for shade, label in zip(["blue", "green"], series_by_label):
        summary = stats[label]
        legend_entry = (
            f"{label}\n"
            f"Mean: {summary['Mean']:.4f}\n"
            f"Std Dev: {summary['Std Dev']:.4f}\n"
            f"Kurtosis: {summary['Kurtosis']:.4f}\n"
            f"Skew: {summary['Skew']:.4f}"
        )
        sns.histplot(
            series_by_label[label],
            bins=50,
            kde=True,
            color=shade,
            label=legend_entry,
            alpha=0.4,
        )

    # Legend, titles, axis labels and grid
    plt.legend(fontsize=10)
    plt.title("Portfolio Weighted Return Distributions")
    plt.xlabel("Daily Weighted Return")
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()


In [None]:
# Weighted daily return series of the two portfolios, keyed by legend label
weighted_returns_data = {
    "With Candidate": portfolio_with_candidate_df["portfolio_weighted_daily_return"],
    "Without Candidate": portfolio_without_candidate_df["portfolio_weighted_daily_return"]
}

# Summary statistics per series, computed on the NaN-free observations.
# NOTE(review): np.std uses its default ddof here -- confirm that is the
# intended convention for the figures shown in the legend.
stats = {
    label: {
        "Mean": np.mean(observed),
        "Std Dev": np.std(observed),
        "Kurtosis": kurtosis(observed),
        "Skew": skew(observed)
    }
    for label, observed in (
        (name, series.dropna()) for name, series in weighted_returns_data.items()
    )
}

# Plot both distributions with their statistics in the legend
plot_weighted_return_distributions(
    returns_with_candidate=weighted_returns_data["With Candidate"],
    returns_without_candidate=weighted_returns_data["Without Candidate"],
    stats=stats
)


In [None]:
def trimmed_std_dev(data, trim_percent=0.02):
    """
    Calculate the trimmed standard deviation of a portfolio's average daily return.

    The portfolio return is the row-wise mean of every ``Return_`` column.
    After dropping NaNs, the lowest and highest ``trim_percent`` share of the
    observations are discarded and the sample standard deviation (ddof=1) of
    the remainder is returned.

    Side effect: the row-wise mean is also written to ``data`` as the
    ``portfolio_uniform_daily_return`` column.

    Parameters:
    - data (pd.DataFrame): DataFrame containing daily returns of all securities
    - trim_percent (float): Fraction of observations to trim from each end,
      with 0 <= trim_percent < 0.5 (default 0.02, i.e. 2%)

    Returns:
    - float: Trimmed standard deviation of the portfolio (NaN when fewer than
      two observations remain after trimming)

    Raises:
    - ValueError: If ``trim_percent`` is outside [0, 0.5) or ``data`` has no
      ``Return_`` columns
    """
    # Trimming half or more from each end would leave nothing to measure.
    if not 0 <= trim_percent < 0.5:
        raise ValueError("trim_percent must satisfy 0 <= trim_percent < 0.5")

    # Select only return columns (avoid including other numerical data)
    return_columns = [col for col in data.columns if col.startswith("Return_")]
    if not return_columns:
        raise ValueError("No return columns found in the dataset!")

    # Compute the portfolio's daily return (average of all securities' daily returns)
    data["portfolio_uniform_daily_return"] = data[return_columns].mean(axis=1)

    # Extract the portfolio daily returns as an array, ignoring missing days
    portfolio_returns = data["portfolio_uniform_daily_return"].dropna().values

    trim_count = int(len(portfolio_returns) * trim_percent)
    sorted_returns = np.sort(portfolio_returns)

    # Use an explicit upper bound: a "-trim_count" stop would turn into "-0"
    # for small samples and silently discard every observation.
    trimmed_returns = sorted_returns[trim_count:len(sorted_returns) - trim_count]

    # Compute and return standard deviation of the trimmed dataset
    return np.std(trimmed_returns, ddof=1)


In [None]:
# Trimmed standard deviation (2% cut from each tail) for every return set
trimmed_std_portfolio_without_candidate = trimmed_std_dev(portfolio_without_candidate_df, 0.02)
trimmed_std_portfolio_with_candidate = trimmed_std_dev(portfolio_with_candidate_df, 0.02)
trimmed_std_candidate = trimmed_std_dev(candidate_df, 0.02)

# Report the three figures
for caption, figure in (
    ("Trimmed Std Dev (Portfolio without Candidate):", trimmed_std_portfolio_without_candidate),
    ("Trimmed Std Dev (Candidate Security):", trimmed_std_candidate),
    ("Trimmed Std Dev (Portfolio with Candidate):", trimmed_std_portfolio_with_candidate),
):
    print(caption, figure)

In [None]:
def gini_mean_difference(data, column):
    """
    Calculate the Gini mean difference for a specified column in a DataFrame.

    The result is the mean of ``|x_i - x_j|`` over all ``n * n`` ordered pairs
    of non-missing values (pairs of a value with itself are included and
    contribute 0). It is computed from the sorted values, so no ``n x n``
    difference matrix is built.

    Args:
        data (pd.DataFrame): The DataFrame containing the data.
        column (str): The column name for which to calculate the Gini mean difference.

    Returns:
        float: The Gini mean difference; NaN when the column has no
        non-missing values.
    """
    values = np.sort(data[column].dropna().to_numpy(dtype=float))
    n = values.size
    if n == 0:
        return np.nan

    # In sorted order the value at position k is the larger element of k
    # pairs and the smaller element of (n - 1 - k) pairs, so the sum of
    # |x_i - x_j| over unordered pairs is sum((2k - n + 1) * x_k).
    pair_coefficients = 2 * np.arange(n) - n + 1
    unordered_pair_sum = np.dot(pair_coefficients, values)

    # Every unordered pair appears twice among the n * n ordered pairs.
    return 2.0 * unordered_pair_sum / (n * n)


In [None]:
# Gini mean difference of the uniform (equal-weight) daily return per frame
uniform_return_column = 'portfolio_uniform_daily_return'
gini_portfolio_with_candidate = gini_mean_difference(portfolio_with_candidate_df, uniform_return_column)
gini_portfolio_without_candidate = gini_mean_difference(portfolio_without_candidate_df, uniform_return_column)
gini_candidate = gini_mean_difference(candidate_df, uniform_return_column)

# Report the three figures
print(f'Gini Mean Difference (without candidate): {gini_portfolio_without_candidate}')
print(f'Gini Mean Difference (candidate): {gini_candidate}')
print(f'Gini Mean Difference (with candidate): {gini_portfolio_with_candidate}')

TODO: Double-check the stationarity of the Gini mean difference and the trimmed standard deviation.

In [None]:
def plot_clustered_bar_chart_with_labels(gini_values, trimmed_std_values, labels):
    """
    Draw a clustered bar chart comparing two statistics per portfolio.

    Each portfolio gets a pair of adjacent bars -- Gini mean difference and
    trimmed standard deviation -- and every bar is annotated with its height
    formatted to four decimals.

    Args:
        gini_values (list): Gini mean differences, one per portfolio.
        trimmed_std_values (list): Trimmed standard deviations, one per portfolio.
        labels (list): Portfolio names used as x-axis tick labels; its length
            sets the number of bar groups.
    """
    bar_width = 0.35
    half_width = bar_width / 2
    group_centers = np.arange(len(labels))  # one x position per portfolio

    plt.figure(figsize=(10, 6))
    gini_bars = plt.bar(
        group_centers - half_width, gini_values,
        width=bar_width, label='Gini Mean Difference', color='skyblue',
    )
    std_bars = plt.bar(
        group_centers + half_width, trimmed_std_values,
        width=bar_width, label='Trimmed Std Dev', color='lightcoral',
    )

    # Titles, axis labels, tick labels, legend and horizontal grid lines
    plt.title('Comparison of Portfolio Statistics')
    plt.xlabel('Portfolios')
    plt.ylabel('Values')
    plt.xticks(group_centers, labels)
    plt.legend()
    plt.grid(axis="y", linestyle="--", alpha=0.5)

    # Write each bar's value just above it, centred horizontally
    for bar in (*gini_bars, *std_bars):
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        plt.text(bar_center, bar_height, f'{bar_height:.4f}', ha='center', va='bottom', fontsize=10)

    plt.tight_layout()
    plt.show()

# One row per portfolio: (label, Gini mean difference, trimmed std dev)
chart_rows = [
    ("With Candidate", gini_portfolio_with_candidate, trimmed_std_portfolio_with_candidate),
    ("Without Candidate", gini_portfolio_without_candidate, trimmed_std_portfolio_without_candidate),
    ("Candidate Only", gini_candidate, trimmed_std_candidate),
]
labels, gini_values, trimmed_std_values = map(list, zip(*chart_rows))

# Draw the clustered bar chart
plot_clustered_bar_chart_with_labels(gini_values, trimmed_std_values, labels)


## Portfolio_attributes <a id="portfolio_attributes"></a>

[Back to Table of Contents](#table-of-contents)

Shape of the Distribution (Skewness, Kurtosis, Standard Deviation, Mean):
Since you're working with returns, which are essentially first-differenced prices, you're correct that they've already been detrended to some extent. This makes the assumption of mean-variance stationarity less critical.

Even though financial returns can still exhibit non-stationary behavior (e.g., volatility clustering), their distributional properties (like skewness and kurtosis) are relatively stable over time if calculated over a large enough sample.

In this case, you don't necessarily need to account for stationarity explicitly unless your analysis spans vastly different market conditions (like a bull market vs. a bear market).

You're absolutely right that for metrics like beta and covariance, which depend on relationships between securities, stationarity is more crucial. If the underlying data isn't stationary, these metrics could fluctuate unpredictably over time, making them unreliable.


You're absolutely correct that if your focus is on the stability of variance, covariance, and beta, the concept of mean stationarity isn't particularly relevant. Don't use the Dickey-Fuller or augmented Dickey-Fuller (ADF) test.

In [None]:
# Select only the "Close/Last_" columns
close_columns = [col for col in portfolio_with_candidate_df.columns if col.startswith('Close/Last_')]

# Row-over-row percentage changes of the closing prices.
# NOTE(review): these are computed on the frame in its current row order,
# whereas the Return_ columns were computed when the data was loaded; if the
# frame has been re-sorted since then the two use opposite directions --
# confirm which order is chronological.
returns = portfolio_with_candidate_df[close_columns].pct_change()

# Drop rows with NaN values (e.g. the first row produced by pct_change)
returns = returns.dropna()

# 1. Correlation Matrix
correlation_matrix = returns.corr()

# 2. Covariance Matrix
covariance_matrix = returns.cov()

# 3. Beta Matrix - same shape as the correlation and covariance matrices.
# Entry (row, col) is the beta of `row` measured against `col`:
#     cov(row, col) / var(col)
# so the diagonal is 1 and every cell is filled, not only the benchmark column.
variances = pd.Series(np.diag(covariance_matrix.values), index=covariance_matrix.columns)
beta_matrix = covariance_matrix.div(variances, axis="columns")

# Betas of every security against the benchmark (the first close column)
benchmark = close_columns[0]
betas = beta_matrix[benchmark].to_dict()

In [None]:
# Define a conversion factor for covariance (e.g., multiplying by 1e6 for 'mu' units)
covariance_conversion_factor = 1e6  # Adjust this based on what "mu" represents

# Apply conversion factor to covariance matrix
covariance_matrix_mu = covariance_matrix * covariance_conversion_factor

# Store matrices in a dictionary for dynamic plotting (title -> matrix)
matrices = {
    "Correlation Matrix": correlation_matrix,
    "Covariance Matrix (mu)": covariance_matrix_mu,  # Use converted covariance matrix
    "Beta Matrix": beta_matrix
}

num_matrices = len(matrices)

# One subplot per matrix, side by side with equal widths
fig = make_subplots(
    rows=1, cols=num_matrices,
    subplot_titles=list(matrices.keys()),
    column_widths=[1 / num_matrices] * num_matrices,
    shared_yaxes=True,
    shared_xaxes=True
)

# Horizontal anchor of each colorbar, spread evenly from left to right
x_positions = np.linspace(0.15, 0.85, num_matrices)

# Add each matrix as a heatmap
for i, (title, matrix) in enumerate(matrices.items(), start=1):
    # Ignore NaN cells when choosing the colorbar ticks; a plain min()/max()
    # returns NaN as soon as the matrix contains a single NaN.
    lowest = float(np.nanmin(matrix.values))
    highest = float(np.nanmax(matrix.values))

    fig.add_trace(
        go.Heatmap(
            z=matrix.values,
            x=matrix.columns,
            y=matrix.index,  # rows of z correspond to the matrix index
            colorscale="RdBu",
            colorbar=dict(
                title=title.split()[0],  # Use first word of title (Correlation, Covariance, Beta)
                tickvals=[lowest, 0, highest],
                yanchor="top",
                y=-0.25,  # Anchor the colorbar below the heatmap
                x=x_positions[i - 1],  # Place it under its own heatmap
                xanchor="center",
                orientation="v"  # Make the colorbar vertical
            ),
            text=matrix.values.round(2),
            texttemplate="%{text}",
            showscale=True,
            hoverinfo="skip"
        ),
        row=1, col=i
    )

# Update layout
fig.update_layout(
    title="Correlation, Covariance (mu), and Beta Matrices",
    height=750,  # Extra height to leave room for the colorbars
    showlegend=False,
    title_x=0.5
)

# Show the interactive heatmap
fig.show()
