In [82]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from scipy.stats import ks_2samp, wasserstein_distance,energy_distance
import pickle
from scipy.spatial.distance import cdist, pdist

In [34]:
with open('compatibility_analysis.pkl', 'rb') as f:
    saved_data = pickle.load(f)

# Unpack
embeddings_1 = saved_data['embeddings_1']
embeddings_2 = saved_data['embeddings_2']
cols1_h = saved_data['cols1_h']
cols2_h = saved_data['cols2_h']
compatibility_score = saved_data['compatibility_score']

print("‚úÖ Data loaded successfully.")
print(f"‚Ü™Ô∏è Compatibility score: {compatibility_score:.2f}")
print(f"‚Ü™Ô∏è Columns in Dataset 1: {len(cols1_h)}")
print(f"‚Ü™Ô∏è Columns in Dataset 2: {len(cols2_h)}")

‚úÖ Data loaded successfully.
‚Ü™Ô∏è Compatibility score: 0.78
‚Ü™Ô∏è Columns in Dataset 1: 9
‚Ü™Ô∏è Columns in Dataset 2: 9


In [89]:
# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(embeddings_1, embeddings_2)

# Set similarity threshold
SIMILARITY_THRESHOLD = 0.85
Matching_THRESHOLD = 0.95
match_count = 0

# Map only columns above the threshold
closest_matches = {}
for i, row in enumerate(similarity_matrix):
    j_best = np.argmax(row)
    best_score = row[j_best]
    if best_score >= SIMILARITY_THRESHOLD:
        col_1 = cols1_h[i]
        col_2 = cols2_h[j_best]
        closest_matches[col_1] = (col_2, best_score)
    if best_score >= Matching_THRESHOLD:
        match_count += 1
        print(f"üîó Match Found: {cols1_h[i]} ‚ÜîÔ∏è {cols2_h[j_best]} (similarity: {best_score:.2f})")

Match_score = match_count/len(closest_matches)

# Report matches
print(f"\nüîç Closest Column Matches (threshold ‚â• {SIMILARITY_THRESHOLD}):")
if closest_matches:
    for col1, (col2, score) in closest_matches.items():
        print(f"- {col1} ‚Üí {col2} (similarity: {score:.2f})")
else:
    print("No matches above threshold.")

üîó Match Found: open ‚ÜîÔ∏è open (similarity: 1.00)
üîó Match Found: high ‚ÜîÔ∏è high (similarity: 1.00)
üîó Match Found: low ‚ÜîÔ∏è low (similarity: 1.00)
üîó Match Found: close ‚ÜîÔ∏è close (similarity: 1.00)
üîó Match Found: volume ‚ÜîÔ∏è volume (similarity: 1.00)
üîó Match Found: vwap ‚ÜîÔ∏è vwap (similarity: 1.00)
üîó Match Found: timestamp ‚ÜîÔ∏è timestamp (similarity: 1.00)
üîó Match Found: transactions ‚ÜîÔ∏è transactions (similarity: 1.00)
üîó Match Found: otc ‚ÜîÔ∏è otc (similarity: 1.00)

üîç Closest Column Matches (threshold ‚â• 0.85):
- open ‚Üí open (similarity: 1.00)
- high ‚Üí high (similarity: 1.00)
- low ‚Üí low (similarity: 1.00)
- close ‚Üí close (similarity: 1.00)
- volume ‚Üí volume (similarity: 1.00)
- vwap ‚Üí vwap (similarity: 1.00)
- timestamp ‚Üí timestamp (similarity: 1.00)
- transactions ‚Üí transactions (similarity: 1.00)
- otc ‚Üí otc (similarity: 1.00)


In [None]:
def convert_columns_to_numeric(df, file_label=""):
    print(f"\nProcessing {file_label}")
    print("Original dtypes:")
    print(df.dtypes)

    converted_cols = {}
    for col in df.columns:
        original_dtype = df[col].dtype
        if not pd.api.types.is_numeric_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], errors='coerce')
            new_dtype = df[col].dtype
            if new_dtype != original_dtype:
                converted_cols[col] = (original_dtype, new_dtype)

    if converted_cols:
        print("\nColumns converted:")
        for col, (orig, new) in converted_cols.items():
            print(f" - {col}: {orig} -> {new}")
    else:
        print("\nNo columns were converted.")

    return df

# Replace these with your actual CSV file paths
file1_path = '/home/g7/Desktop/Thesis I/Datasets/Ingestor_Datasets/DF_1.csv'
file2_path = '/home/g7/Desktop/Thesis I/Datasets/Ingestor_Datasets/DF_2.csv'

df1 = pd.read_csv(file1_path)
df2 = pd.read_csv(file2_path)

df1 = convert_columns_to_numeric(df1, "File 1")
df2 = convert_columns_to_numeric(df2, "File 2")


Processing File 1
Original dtypes:
open            float64
high            float64
low             float64
close           float64
volume          float64
vwap            float64
timestamp         int64
transactions      int64
otc             float64
dtype: object

No columns were converted.

Processing File 2
Original dtypes:
open            float64
high            float64
low             float64
close           float64
volume          float64
vwap            float64
timestamp         int64
transactions      int64
otc             float64
dtype: object

No columns were converted.


In [40]:
def run_ks_tests(df1, df2, column_mapping):
    ks_stats = []
    p_values = []

    for col1, (col2, _) in column_mapping.items():
        if col1 not in df1.columns or col2 not in df2.columns:
            continue

        if not pd.api.types.is_numeric_dtype(df1[col1]) or not pd.api.types.is_numeric_dtype(df2[col2]):
            continue

        series1 = df1[col1].dropna()
        series2 = df2[col2].dropna()

        if len(series1) == 0 or len(series2) == 0:
            continue

        stat, p_value = ks_2samp(series1, series2)

        ks_stats.append(stat)
        p_values.append(p_value)

    avg_ks = sum(ks_stats) / len(ks_stats) if ks_stats else None
    avg_p = sum(p_values) / len(p_values) if p_values else None

    return avg_ks, avg_p

In [41]:
avg_ks, avg_p = run_ks_tests(df1, df2, closest_matches)
print(f"üìä Avg KS Statistic: {avg_ks:.4f}")
print(f"üìä Avg p-value: {avg_p:.4f}")

üìä Avg KS Statistic: 0.7248
üìä Avg p-value: 0.0000


In [45]:
def run_wasserstein_test(df1, df2, column_mapping):
    distances = []

    for col1, (col2, _) in column_mapping.items():
        if col1 not in df1.columns or col2 not in df2.columns:
            continue

        if not pd.api.types.is_numeric_dtype(df1[col1]) or not pd.api.types.is_numeric_dtype(df2[col2]):
            continue

        series1 = df1[col1].dropna()
        series2 = df2[col2].dropna()

        if len(series1) == 0 or len(series2) == 0:
            continue

        # Normalize both series with z-score
        s1 = (series1 - series1.mean()) / series1.std()
        s2 = (series2 - series2.mean()) / series2.std()

        dist = wasserstein_distance(s1, s2)
        distances.append(dist)

    avg_dist = sum(distances) / len(distances) if distances else None
    return avg_dist

In [46]:
avg_wasserstein = run_wasserstein_test(df1, df2, closest_matches)
print(f"üìè Average Wasserstein Distance: {avg_wasserstein:.4f}")

üìè Average Wasserstein Distance: 0.3260


In [64]:
def calculate_psi(expected, actual, bins=2):
    expected = expected.dropna()
    actual = actual.dropna()

    if isinstance(bins, int):
        bin_edges = np.percentile(expected, np.linspace(0, 100, bins + 1))
        bin_edges = np.unique(bin_edges)  # Remove duplicates
        if len(bin_edges) < 2:
            return np.nan  # Cannot bin a constant column
    else:
        bin_edges = bins

    expected_bins = np.histogram(expected, bins=bin_edges)[0]
    actual_bins = np.histogram(actual, bins=bin_edges)[0]

    if expected_bins.sum() == 0 or actual_bins.sum() == 0:
        return np.nan  # Avoid divide by zero

    expected_dist = expected_bins / expected_bins.sum()
    actual_dist = actual_bins / actual_bins.sum()

    # Add small value to avoid log(0) or divide-by-zero
    epsilon = 1e-6
    psi = np.sum((expected_dist - actual_dist) * np.log((expected_dist + epsilon) / (actual_dist + epsilon)))

    return psi

def run_psi_tests(df1, df2, column_mapping, bins=4, epsilon=1e-4):
    psi_values = []

    for col1, (col2, _) in column_mapping.items():
        if col1 not in df1.columns or col2 not in df2.columns:
            continue

        if not pd.api.types.is_numeric_dtype(df1[col1]) or not pd.api.types.is_numeric_dtype(df2[col2]):
            continue

        series1 = df1[col1].dropna()
        series2 = df2[col2].dropna()

        if len(series1) == 0 or len(series2) == 0:
            continue

        psi = calculate_psi(series1, series2, bins=bins)
        psi_values.append(psi if not np.isnan(psi) else epsilon)

    avg_psi = sum(psi_values) / len(psi_values) if psi_values else None
    return avg_psi


In [66]:
avg_psi = run_psi_tests(df1, df2, closest_matches, bins=4)
print(f"üìä Average PSI (with fallback for NaNs): {avg_psi:.4f}")


üìä Average PSI (with fallback for NaNs): 0.3414


In [85]:
def run_energy_distance(df1, df2, column_mapping):
    cols1 = []
    cols2 = []

    for col1, (col2, _) in column_mapping.items():
        if col1 not in df1.columns or col2 not in df2.columns:
            continue
        if not pd.api.types.is_numeric_dtype(df1[col1]) or not pd.api.types.is_numeric_dtype(df2[col2]):
            continue

        s1 = df1[col1].dropna()
        s2 = df2[col2].dropna()

        min_len = min(len(s1), len(s2))
        if min_len == 0:
            continue

        # Z-score normalize both series
        s1 = (s1 - s1.mean()) / s1.std()
        s2 = (s2 - s2.mean()) / s2.std()

        cols1.append(s1.iloc[:min_len].to_numpy())
        cols2.append(s2.iloc[:min_len].to_numpy())

    if not cols1 or not cols2:
        return None

    X = np.array(cols1).T  # shape: (n_samples, n_features)
    Y = np.array(cols2).T

    # Compute energy distance
    d_xy = cdist(X, Y).mean()
    d_xx = pdist(X).mean() if len(X) > 1 else 0
    d_yy = pdist(Y).mean() if len(Y) > 1 else 0

    energy_dist = 2 * d_xy - d_xx - d_yy
    return energy_dist

In [86]:
energy_dist = run_energy_distance(df1, df2, closest_matches)
print(f"‚ö° Energy Distance (multivariate): {energy_dist:.4f}")


‚ö° Energy Distance (multivariate): 0.1861


In [106]:
def combine_scores(ks_stat, p_value, wasserstein, psi, energy, compatibility_score, match_score, weights=None):
    
    if weights is None:
        weights = {
            'ks': 1.0,
            'p': 1.0,
            'wasserstein': 1.0,
            'psi': 1.0,
            'energy': 1.0,
            'compatibility_score': 1.5,
            'match_score': 4.0
        }

    scores = {
        'ks': ks_stat,
        'p': 1 - p_value,
        'wasserstein': min(wasserstein / 1.0, 1.0),
        'psi': min(psi / 0.25, 1.0),
        'energy': min(energy / 1.0, 1.0),
        'compatibility_score': compatibility_score,
        'match_score': match_score
    }

    weighted_sum = sum(scores[k] * weights[k] for k in scores)
    total_weight = sum(weights.values())
    final_score = weighted_sum / total_weight

    print(f"\nüîç Final Score: {final_score:.4f}")

    if final_score >= 0.80:
        return "Compatible"
    elif final_score >= 0.60:
        return "Borderline"
    else:
        return "Incompatible"

In [107]:
Decision = combine_scores(ks_stat=avg_ks, p_value=avg_p, wasserstein=avg_wasserstein, psi=avg_psi, energy=energy_dist, compatibility_score=compatibility_score, match_score=Match_score)
print(f"\nüîç Final Decision: {Decision}")


üîç Final Score: 0.8004

üîç Final Decision: Compatible
