In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from scipy.stats import ks_2samp, wasserstein_distance,energy_distance
import pickle
from scipy.spatial.distance import cdist, pdist

In [2]:
# Load the dictionary stored in compatibility_analysis.pkl.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open a pickle file that you created yourself.
with open('compatibility_analysis.pkl', 'rb') as pickle_file:
    saved_data = pickle.load(pickle_file)

# Expose each stored entry as its own notebook-level variable.
embeddings_1, embeddings_2 = saved_data['embeddings_1'], saved_data['embeddings_2']
cols1_h, cols2_h = saved_data['cols1_h'], saved_data['cols2_h']
compatibility_score = saved_data['compatibility_score']

# Brief confirmation of what was restored.
print("✅ Data loaded successfully.")
print(f"↪️ Compatibility score: {compatibility_score:.2f}")
print(f"↪️ Columns in Dataset 1: {len(cols1_h)}")
print(f"↪️ Columns in Dataset 2: {len(cols2_h)}")

✅ Data loaded successfully.
↪️ Compatibility score: 0.78
↪️ Columns in Dataset 1: 9
↪️ Columns in Dataset 2: 9


In [4]:
def convert_columns_to_numeric(df, file_label=""):
    """Return a copy of ``df`` with every non-numeric column coerced to numeric.

    The original dtypes and a summary of the columns whose dtype changed are
    printed. Values that cannot be parsed become NaN (``errors='coerce'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is not modified.
    file_label : str, optional
        Label used in the printed header.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the converted columns.
    """
    print(f"\nProcessing {file_label}")
    print("Original dtypes:")
    print(df.dtypes)

    # Work on a copy: assigning into `df` directly would silently overwrite the
    # caller's frame, so re-running the cell could never recover the raw values.
    result = df.copy()

    converted_cols = {}
    for col in result.columns:
        original_dtype = result[col].dtype
        if pd.api.types.is_numeric_dtype(result[col]):
            continue  # already numeric, nothing to do

        result[col] = pd.to_numeric(result[col], errors='coerce')
        new_dtype = result[col].dtype
        if new_dtype != original_dtype:
            converted_cols[col] = (original_dtype, new_dtype)

    if converted_cols:
        print("\nColumns converted:")
        for col, (orig, new) in converted_cols.items():
            print(f" - {col}: {orig} -> {new}")
    else:
        print("\nNo columns were converted.")

    return result

# Locations of the two CSV files to compare; replace them with your own paths.
# NOTE(review): these are absolute, machine-specific paths, so the notebook will
# not run elsewhere without editing them.
file1_path = '/home/g7/Desktop/Thesis I/Datasets/Ingestor_Datasets/DF_1.csv'
file2_path = '/home/g7/Desktop/Thesis I/Datasets/Ingestor_Datasets/DF_2.csv'

# Read both files first, then coerce any non-numeric columns in each.
df1, df2 = pd.read_csv(file1_path), pd.read_csv(file2_path)

df1, df2 = (
    convert_columns_to_numeric(df1, "File 1"),
    convert_columns_to_numeric(df2, "File 2"),
)


Processing File 1
Original dtypes:
open            float64
high            float64
low             float64
close           float64
volume          float64
vwap            float64
timestamp         int64
transactions      int64
otc             float64
dtype: object

No columns were converted.

Processing File 2
Original dtypes:
open            float64
high            float64
low             float64
close           float64
volume          float64
vwap            float64
timestamp         int64
transactions      int64
otc             float64
dtype: object

No columns were converted.


In [18]:
def run_ks_tests(df1, df2):
    """Run a two-sample Kolmogorov-Smirnov test for every column pair.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames whose columns are compared pairwise.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(ks_stat_matrix, p_value_matrix)``, both indexed by ``df1.columns``
        with ``df2.columns`` as columns. A cell is missing (``None``) when
        either column is non-numeric or has no non-NaN values.
    """
    ks_stat_matrix = pd.DataFrame(index=df1.columns, columns=df2.columns)
    p_value_matrix = pd.DataFrame(index=df1.columns, columns=df2.columns)

    def _usable_samples(frame):
        # Clean every column once, not once per pair; None marks an unusable one.
        samples = {}
        for col in frame.columns:
            samples[col] = None
            # Same numeric guard as the other pairwise metrics in this notebook:
            # ks_2samp cannot compare text with numbers.
            if not pd.api.types.is_numeric_dtype(frame[col]):
                continue
            values = frame[col].dropna()
            if len(values) > 0:
                samples[col] = values
        return samples

    samples1 = _usable_samples(df1)
    samples2 = _usable_samples(df2)

    for col1, series1 in samples1.items():
        for col2, series2 in samples2.items():
            if series1 is None or series2 is None:
                ks_stat_matrix.loc[col1, col2] = None
                p_value_matrix.loc[col1, col2] = None
                continue

            # Perform the KS test and store both outputs
            stat, p_value = ks_2samp(series1, series2)
            ks_stat_matrix.loc[col1, col2] = stat
            p_value_matrix.loc[col1, col2] = p_value

    return ks_stat_matrix, p_value_matrix


In [29]:
# Pairwise KS statistics and p-values: rows are df1 columns, columns are df2 columns.
ks_stat_matrix, p_value_matrix = run_ks_tests(df1, df2)

In [21]:
def run_wasserstein_test(df1, df2):
    """Compute the Wasserstein distance between z-scored columns, pairwise.

    Each numeric column has its NaNs dropped and is standardised with its own
    mean and sample standard deviation before the distance is computed.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames whose columns are compared pairwise.

    Returns
    -------
    pandas.DataFrame
        Matrix indexed by ``df1.columns`` with ``df2.columns`` as columns.
        A cell is missing (``None``) when either column is non-numeric, has
        no non-NaN values, or cannot be z-scored (zero or undefined standard
        deviation, e.g. a constant or single-value column).
    """
    wasserstein_matrix = pd.DataFrame(index=df1.columns, columns=df2.columns)

    def _standardized(frame):
        # Standardise each column once instead of once per column pair.
        prepared = {}
        for col in frame.columns:
            prepared[col] = None
            if not pd.api.types.is_numeric_dtype(frame[col]):
                continue
            values = frame[col].dropna()
            if len(values) == 0:
                continue
            spread = values.std()
            # A zero or NaN std makes the z-score undefined; skip the column
            # rather than dividing by zero and feeding NaNs to scipy.
            if not np.isfinite(spread) or spread == 0:
                continue
            prepared[col] = (values - values.mean()) / spread
        return prepared

    scaled1 = _standardized(df1)
    scaled2 = _standardized(df2)

    for col1, s1 in scaled1.items():
        for col2, s2 in scaled2.items():
            if s1 is None or s2 is None:
                wasserstein_matrix.loc[col1, col2] = None
                continue

            # Calculate Wasserstein distance
            wasserstein_matrix.loc[col1, col2] = wasserstein_distance(s1, s2)

    return wasserstein_matrix


In [None]:
# Pairwise Wasserstein distances between the z-scored columns of df1 and df2.
# NOTE(review): this cell shows no execution count, yet wasserstein_distance_matrix
# is consumed by the normalisation cell further down; run it first on a fresh kernel.
wasserstein_distance_matrix = run_wasserstein_test(df1, df2)

In [23]:
def calculate_psi(expected, actual, bins=2):
    """Population Stability Index of ``actual`` relative to ``expected``.

    Parameters
    ----------
    expected, actual : pandas.Series or array-like
        Reference and comparison samples; NaNs are dropped.
    bins : int or sequence, optional
        If an integer, that many quantile bins are derived from ``expected``.
        Otherwise the value is passed to ``np.histogram`` as-is.

    Returns
    -------
    float
        The PSI, or ``np.nan`` when a sample is empty, ``expected`` is
        constant (cannot be binned), or a histogram has no counts.
    """
    expected = pd.Series(expected).dropna().to_numpy(dtype=float)
    actual = pd.Series(actual).dropna().to_numpy(dtype=float)

    if expected.size == 0 or actual.size == 0:
        return np.nan  # Nothing to compare

    if isinstance(bins, (int, np.integer)):
        bin_edges = np.percentile(expected, np.linspace(0, 100, bins + 1))
        bin_edges = np.unique(bin_edges)  # Remove duplicates
        if len(bin_edges) < 2:
            return np.nan  # Cannot bin a constant column
        # The quantile edges only span `expected`. np.histogram silently drops
        # values outside the outer edges, so widen them to cover `actual`;
        # otherwise shifted data is ignored and PSI is NaN or understated.
        bin_edges[0] = min(bin_edges[0], actual.min())
        bin_edges[-1] = max(bin_edges[-1], actual.max())
    else:
        bin_edges = bins

    expected_bins = np.histogram(expected, bins=bin_edges)[0]
    actual_bins = np.histogram(actual, bins=bin_edges)[0]

    if expected_bins.sum() == 0 or actual_bins.sum() == 0:
        return np.nan  # Avoid divide by zero

    expected_dist = expected_bins / expected_bins.sum()
    actual_dist = actual_bins / actual_bins.sum()

    # Add small value to avoid log(0) or divide-by-zero
    epsilon = 1e-6
    psi = np.sum((expected_dist - actual_dist) * np.log((expected_dist + epsilon) / (actual_dist + epsilon)))

    return psi

def run_psi_tests(df1, df2, bins=4, epsilon=1e-4):
    """Build the matrix of PSI values for every (df1 column, df2 column) pair.

    The df1 column is the "expected" sample and the df2 column the "actual"
    one. A cell is ``None`` when either column is non-numeric or has no
    non-NaN values. A NaN result from ``calculate_psi`` is stored as
    ``epsilon`` instead.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    psi_matrix = pd.DataFrame(index=df1.columns, columns=df2.columns)

    for row_name in df1.columns:
        for col_name in df2.columns:
            cell = None  # stays None unless a PSI can be computed

            if is_numeric(df1[row_name]) and is_numeric(df2[col_name]):
                reference = df1[row_name].dropna()
                candidate = df2[col_name].dropna()

                if len(reference) > 0 and len(candidate) > 0:
                    score = calculate_psi(reference, candidate, bins=bins)
                    cell = epsilon if np.isnan(score) else score

            psi_matrix.loc[row_name, col_name] = cell

    return psi_matrix



In [31]:
# Pairwise PSI with df1 columns as the expected sample, using 4 quantile bins.
psi_matrix = run_psi_tests(df1, df2, bins=4)

In [25]:
def run_energy_distance(df1, df2):
    """Compute the squared energy distance between z-scored columns, pairwise.

    The stored value estimates ``2*E|X-Y| - E|X-X'| - E|Y-Y'|`` on the
    standardised samples. It is obtained by squaring
    ``scipy.stats.energy_distance``, which returns the square root of that
    quantity. This estimate is never negative and is zero for identical
    samples. The previous hand-rolled version averaged the cross term over all
    pairs but the within terms over distinct pairs only, so identical samples
    scored below zero, and it allocated a full n-by-m distance matrix per pair.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames whose columns are compared pairwise.

    Returns
    -------
    pandas.DataFrame
        Matrix indexed by ``df1.columns`` with ``df2.columns`` as columns.
        A cell is missing (``None``) when either column is non-numeric, has
        no non-NaN values, or cannot be z-scored (zero or undefined standard
        deviation).
    """
    energy_matrix = pd.DataFrame(index=df1.columns, columns=df2.columns)

    def _standardized(frame):
        # Standardise each column once instead of once per column pair.
        prepared = {}
        for col in frame.columns:
            prepared[col] = None
            if not pd.api.types.is_numeric_dtype(frame[col]):
                continue
            values = frame[col].dropna()
            if len(values) == 0:
                continue
            spread = values.std()
            # A zero or NaN std makes the z-score undefined; skip the column
            # rather than dividing by zero.
            if not np.isfinite(spread) or spread == 0:
                continue
            prepared[col] = ((values - values.mean()) / spread).to_numpy()
        return prepared

    scaled1 = _standardized(df1)
    scaled2 = _standardized(df2)

    for col1, s1 in scaled1.items():
        for col2, s2 in scaled2.items():
            if s1 is None or s2 is None:
                energy_matrix.loc[col1, col2] = None
                continue

            # Square scipy's value to keep the scale of the original formula.
            energy_matrix.loc[col1, col2] = energy_distance(s1, s2) ** 2

    return energy_matrix


In [32]:
# Pairwise energy distances between the z-scored columns of df1 and df2.
energy_dist_matrix = run_energy_distance(df1, df2)

In [33]:
def normalize_matrix(matrix, higher_is_better=True, epsilon=1e-6):
    """Turn a metric matrix into scores where larger means better.

    With ``higher_is_better=True`` the matrix is divided by its overall
    maximum when that maximum is positive; otherwise an unchanged copy is
    returned. With ``higher_is_better=False`` every entry ``x`` becomes
    ``1 / (x + epsilon)``. The input matrix is never modified.
    """
    scores = matrix.copy()

    if not higher_is_better:
        # Reciprocal so that small distances become large scores.
        return 1 / (scores + epsilon)

    # Scale by the largest entry; skipped when there is no positive maximum.
    peak = scores.max().max()
    if peak > 0:
        return scores / peak
    return scores

In [34]:
# Convert every metric into a "larger means more similar" score.
# Distances and divergences (KS statistic, Wasserstein, PSI, energy) are
# inverted; the KS p-value is already larger for more similar samples.
normalized_ks_stat_matrix = normalize_matrix(ks_stat_matrix, higher_is_better=False)
normalized_p_value_matrix = normalize_matrix(p_value_matrix, higher_is_better=True)
normalized_wasserstein_matrix = normalize_matrix(wasserstein_distance_matrix, higher_is_better=False)
# PSI is 0 for identical distributions and grows with drift, so lower is
# better; it was previously normalised as if higher were better.
normalized_psi_matrix = normalize_matrix(psi_matrix, higher_is_better=False)
normalized_energy_matrix = normalize_matrix(energy_dist_matrix, higher_is_better=False)