In [1]:
import pandas as pd

In [2]:
# Load the dataset
proxy_time_series_df_alex = pd.read_csv('data/proxy_time_series_alex.csv')

# Define a function to calculate R-squared
def calculate_r_squared(actual, predicted):
    ssr = ((actual - predicted) ** 2).sum()  # Sum of Squared Residuals
    tss = ((actual - actual.mean()) ** 2).sum()  # Total Sum of Squares
    return 1 - (ssr / tss)

# Calculate R^2 for a specific date
def r_squared_for_date(data, date, actual_col, predicted_col):
    date_data = data[data['Date'] == date]
    if not date_data.empty:
        actual = date_data[actual_col]
        predicted = date_data[predicted_col]
        return calculate_r_squared(actual, predicted)
    else:
        raise ValueError(f"No data found for the specified date: {date}")

# Calculate R^2 for the entire dataset
def r_squared_for_dataset(data, actual_col, predicted_col):
    actual = data[actual_col]
    predicted = data[predicted_col]
    return calculate_r_squared(actual, predicted)

# Calculate R^2 for a specific ticker and method
def calculate_r2_for_ticker(data, ticker, actual_col, predicted_col):
    # Filter the data for the specified ticker
    ticker_data = data[data["Ticker"] == ticker]
    if ticker_data.empty:
        raise ValueError(f"No data found for the specified ticker: {ticker}")
    
    # Calculate R^2
    r2 = calculate_r_squared(ticker_data[actual_col], ticker_data[predicted_col])
    return r2



In [3]:
# Specify columns for actual and predicted spreads
actual_col = "Actual_Spread"
predicted_col = "Proxy_Intersection"


specific_date = '2010-09-15' 
r2_specific_date = r_squared_for_date(proxy_time_series_df_alex, specific_date, actual_col, predicted_col)
print(f"R^2 CSRA for {specific_date}: {r2_specific_date}")


r2_entire_dataset = r_squared_for_dataset(proxy_time_series_df_alex, actual_col, predicted_col)
print(f"R^2 CSRA for the entire dataset: {r2_entire_dataset}")

# Compare to CSRA_Community
predicted_col = "Proxy_CSRA_Community"
 
r2_specific_date = r_squared_for_date(proxy_time_series_df_alex, specific_date, actual_col, predicted_col)
print(f"R^2 CSRA_Community for {specific_date}: {r2_specific_date}")


r2_entire_dataset = r_squared_for_dataset(proxy_time_series_df_alex, actual_col, predicted_col)
print(f"R^2 CSRA_Community for the entire dataset: {r2_entire_dataset}")

R^2 CSRA for 2010-09-15: -0.131787502095023
R^2 CSRA for the entire dataset: -0.054790981187530896
R^2 CSRA_Community for 2010-09-15: -0.2640007033215077
R^2 CSRA_Community for the entire dataset: -1.9054669834930165


In [14]:
ticker = "AAUK"  
actual_col = "Actual_Spread"  
predicted_col = "Proxy_CSRA"  # Replace with your desired proxy method (e.g., "Proxy_CSRA" or "Proxy_CSRA_Community")

r2_ticker = calculate_r2_for_ticker(proxy_time_series_df_alex, ticker, actual_col, predicted_col)
print(f"R^2 for {ticker} ({predicted_col}): {r2_ticker}")

R^2 for AAUK (Proxy_CSRA): -3.3918826854063227


In [5]:
# Loop over all tickers to calculate R^2
def calculate_r2_for_all_tickers(data, actual_col, predicted_col):
    ticker_r2_results = {}
    unique_tickers = data["Ticker"].unique()  # Get all unique tickers

    for ticker in unique_tickers:
        try:
            # Filter data for each ticker and calculate R^2
            ticker_data = data[data["Ticker"] == ticker]
            r2 = calculate_r_squared(ticker_data[actual_col], ticker_data[predicted_col])
            ticker_r2_results[ticker] = r2  # Store the result
        except Exception as e:
            print(f"Error calculating R^2 for ticker {ticker}: {e}")
            ticker_r2_results[ticker] = None  # Store None if there's an error

    return ticker_r2_results

# Specify the columns
actual_col = "Actual_Spread"  
predicted_col = "Proxy_Intersection"  # Replace with your desired proxy method (e.g., "Proxy_CSRA" or "Proxy_CSRA_Community")

# Calculate R^2 for all tickers
r2_results = calculate_r2_for_all_tickers(proxy_time_series_df_alex, actual_col, predicted_col)

# Display the results
for ticker, r2 in r2_results.items():
    print(f"R^2 for {ticker}: {r2}")


R^2 for AAUK: -0.5320478845059791
R^2 for ABE: -3.659662570565895
R^2 for ABHLTD: -1.0930331717795418
R^2 for ACAFP: 0.46342624595744586
R^2 for ACAFP-CIB: -5.484440979297272
R^2 for ACCOR: -0.8350824582429979
R^2 for ACEA: -337.9886840970632
R^2 for ADIG: -28.14384111925619
R^2 for AEGON: 0.6303897403136484
R^2 for AF: -3.832322131202476
R^2 for AF-AirFrance: -3.7680731256001145
R^2 for AGASSA: -1.0227860385655858
R^2 for AGS: -22.028368024599718
R^2 for AGS-Intl: -224.49366350665767
R^2 for AIRLIQ: -34.639122430666006
R^2 for AKZO: -0.09875602850682519
R^2 for AKZO-ICILD: -1686.9428278627058
R^2 for ALCLCT: -3.28703587816549
R^2 for ALINDR: -96.35306587496324
R^2 for ALPHBK: -5.537245527686435
R^2 for ALSTOM: -1.0171671880194886
R^2 for ALT: -108.15221160984966
R^2 for ALYD: -118.58135356654746
R^2 for ALZSE: -0.7269735199804246
R^2 for AMROBK: 0.7467415707382156
R^2 for ANGLIA: 0.2576816468145303
R^2 for ARMLL: -3.6156132716860174
R^2 for ARMLL-FRA: -4.362723104590749
R^2 for ARMLL-

In [6]:
# Define a function to calculate R^2 for each bucket on a given date, including item counts
def calculate_r2_by_bucket(data, metadata, date, actual_col, predicted_col, bucket_cols):
    # Filter the main data for the specified date
    date_data = data[data["Date"] == date]
    
    if date_data.empty:
        raise ValueError(f"No data found for the specified date: {date}")

    # Merge the main data with metadata to get bucket information
    merged_data = date_data.merge(metadata, on="Ticker", how="inner")
    
    # Group data by the specified bucket columns
    grouped = merged_data.groupby(bucket_cols)

    # Calculate R^2 for each bucket and include the number of items
    bucket_r2_results = {}
    for bucket, group in grouped:
        actual = group[actual_col]
        predicted = group[predicted_col]
        bucket_size = len(group)  # Number of items in the bucket
        try:
            r2 = calculate_r_squared(actual, predicted)
            bucket_r2_results[bucket] = {"R^2": r2, "Count": bucket_size}
        except ZeroDivisionError:
            bucket_r2_results[bucket] = {"R^2": None, "Count": bucket_size}  # Handle cases with no variance in actual values

    return bucket_r2_results

metadata = pd.read_csv('data/metadata.csv') 
bucket_cols = ["Sector", "Country", "AverageRating"]  # Define the bucket columns
actual_col = "Actual_Spread"
predicted_col = "Proxy_CSRA"  
date = "2010-09-14" 


r2_results_by_bucket = calculate_r2_by_bucket(
    data=proxy_time_series_df_alex,
    metadata=metadata,
    date=date,
    actual_col=actual_col,
    predicted_col=predicted_col,
    bucket_cols=bucket_cols
)
for bucket, results in r2_results_by_bucket.items():
    r2 = results["R^2"]
    count = results["Count"]
    print(f"Bucket {bucket}: R^2 = {r2}, Count = {count}")



Bucket ('Basic Materials', 'Austria', 12): R^2 = -inf, Count = 1
Bucket ('Basic Materials', 'Belgium', 9): R^2 = -inf, Count = 1
Bucket ('Basic Materials', 'Finland', 10): R^2 = -510.4894363942954, Count = 2
Bucket ('Basic Materials', 'France', 10): R^2 = -inf, Count = 1
Bucket ('Basic Materials', 'France', 17): R^2 = -inf, Count = 1
Bucket ('Basic Materials', 'Germany', 6): R^2 = -1121.199297212437, Count = 2
Bucket ('Basic Materials', 'Germany', 10): R^2 = -inf, Count = 1
Bucket ('Basic Materials', 'Germany', 14): R^2 = -inf, Count = 1
Bucket ('Basic Materials', 'Germany', 17): R^2 = -inf, Count = 1
Bucket ('Basic Materials', 'Luxembourg', 9): R^2 = 0.7024238652592537, Count = 2
Bucket ('Basic Materials', 'Luxembourg', 18): R^2 = -inf, Count = 1
Bucket ('Basic Materials', 'Netherlands', 9): R^2 = -inf, Count = 1
Bucket ('Basic Materials', 'Netherlands', 10): R^2 = -inf, Count = 1
Bucket ('Basic Materials', 'Norway', 18): R^2 = -inf, Count = 1
Bucket ('Basic Materials', 'Switzerland',

  return 1 - (ssr / tss)
