In [2]:
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import pickle
from scipy.stats import ks_2samp, wasserstein_distance
from scipy.spatial.distance import cdist, pdist
from scipy.stats import hmean

In [3]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(f"Using device: {device}")

Using device: cuda


In [4]:
def preprocess_keyword(keyword):
    """Normalize a column name so names can be compared as plain text.

    The name is lower-cased, stripped of surrounding whitespace, and the
    separator characters ``_``, ``-``, ``#`` and ``@`` are each replaced by a
    single space.

    NOTE: this cell defines ``preprocess_keyword`` a second time further down,
    and that later definition is the one in effect once the cell has run.
    This version applies the same normalization so the two definitions can
    never disagree.

    Args:
        keyword: Column name to normalize.

    Returns:
        The normalized name.
    """
    normalized = keyword.lower().strip()
    # Stripping happens before the replacements, so a leading/trailing
    # separator character still turns into a space in the result.
    for separator in ("_", "-", "#", "@"):
        normalized = normalized.replace(separator, " ")
    return normalized

# Load the tokenizer and the encoder from the same pretrained checkpoint,
# then move the encoder onto the selected device.
_finbert_checkpoint = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(_finbert_checkpoint)
model = AutoModel.from_pretrained(_finbert_checkpoint)
model = model.to(device)

def generate_embeddings(keywords, tokenizer, model, device):
    """Encode ``keywords`` in one batch and return one vector per keyword.

    Args:
        keywords: Texts handed to ``tokenizer`` as a single batch
            (presumably a list of strings; confirm against callers).
        tokenizer: Callable invoked with padding, truncation,
            ``return_tensors="pt"`` and ``max_length=128``; its result must
            support ``.to(device)`` and be unpackable with ``**``.
        model: Callable whose output exposes ``last_hidden_state``.
        device: Device the tokenized batch is moved to before the forward pass.

    Returns:
        A NumPy array holding, for each keyword, the hidden state at
        sequence position 0 (the ``[CLS]`` slot for BERT-style tokenizers).
    """
    encoded = tokenizer(
        keywords,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=128,
    )
    encoded = encoded.to(device)

    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        forward_result = model(**encoded)

    first_position = forward_result.last_hidden_state[:, 0, :]
    return first_position.cpu().numpy()

def preprocess_keyword(s: str) -> str:
    """Return ``s`` lower-cased and stripped, with separators turned into spaces.

    The characters ``_``, ``-``, ``#`` and ``@`` are each replaced by one
    space. Stripping is applied before the replacement, so a separator at
    either end of the name remains as a space in the result.
    """
    separators_to_space = str.maketrans({"_": " ", "-": " ", "#": " ", "@": " "})
    trimmed = s.lower().strip()
    return trimmed.translate(separators_to_space)

def extract_columns(csv_path: str):
    """Read only the header of a CSV and return its column names twice.

    Args:
        csv_path: Path of the CSV file; ``nrows=0`` means no data rows are read.

    Returns:
        ``(preprocessed, raw)``: two lists in header order, where
        ``preprocessed[k]`` is ``preprocess_keyword(raw[k])``.
    """
    header_only = pd.read_csv(csv_path, nrows=0)
    raw_names = [name for name in header_only.columns]
    cleaned_names = list(map(preprocess_keyword, raw_names))
    return cleaned_names, raw_names

def build_subset_mapping(src_pre, tgt_pre):
    """Map each source name to the longest target name that contains it.

    A target qualifies when the source name is a substring of it and the two
    are not equal. Sources with no qualifying target are left out. When
    several qualifying targets share the greatest length, the one appearing
    first in ``tgt_pre`` is kept.

    Args:
        src_pre: Iterable of (preprocessed) source names.
        tgt_pre: Iterable of (preprocessed) target names.

    Returns:
        Dict of source name -> chosen target name.
    """
    mapping = {}
    for name in src_pre:
        longest = None
        for candidate in tgt_pre:
            if candidate == name or name not in candidate:
                continue
            # Strict '>' keeps the earliest candidate on ties.
            if longest is None or len(candidate) > len(longest):
                longest = candidate
        if longest is not None:
            mapping[name] = longest
    return mapping

In [5]:
# Load the pickled classifier. The file is opened in a `with` block so the
# handle is closed as soon as unpickling finishes (or fails).
# NOTE: pickle.load can execute arbitrary code while loading; only use a
# model.pkl that comes from a trusted source.
with open('model.pkl', 'rb') as model_file:
    xgb_model = pickle.load(model_file)

# Fresh, unfitted scaler; it is fitted later in this cell on the pair matrix.
scaler = StandardScaler()

# Input CSVs whose headers are compared.
path1 = '../../Datasets/Ingestor_Datasets/Dataset1.csv'
path2 = '../../Datasets/Ingestor_Datasets/Dataset2.csv'

# Header names of each file: preprocessed form and raw form, in header order.
cols1_pp, cols1_raw = extract_columns(path1)
cols2_pp, cols2_raw = extract_columns(path2)

# For each preprocessed name, the longest name in the other file containing it.
map1_pp = build_subset_mapping(cols1_pp, cols2_pp)
map2_pp = build_subset_mapping(cols2_pp, cols1_pp)

# Preprocessed name -> raw name (a later duplicate key overwrites an earlier one).
pp_to_raw1 = {pp_name: raw_name for pp_name, raw_name in zip(cols1_pp, cols1_raw)}
pp_to_raw2 = {pp_name: raw_name for pp_name, raw_name in zip(cols2_pp, cols2_raw)}

# Translate the preprocessed-name mappings into raw-name renames.
rename1 = {
    pp_to_raw1[short_name]: pp_to_raw2[long_name]
    for short_name, long_name in map1_pp.items()
    if short_name in pp_to_raw1 and long_name in pp_to_raw2
}
rename2 = {
    pp_to_raw2[short_name]: pp_to_raw1[long_name]
    for short_name, long_name in map2_pp.items()
    if short_name in pp_to_raw2 and long_name in pp_to_raw1
}

# Harmonized headers: renamed where a rename exists, raw name otherwise.
cols1_h = [rename1[raw] if raw in rename1 else raw for raw in cols1_raw]
cols2_h = [rename2[raw] if raw in rename2 else raw for raw in cols2_raw]

# One embedding row per harmonized column name of each file.
embeddings_1 = generate_embeddings(cols1_h, tokenizer, model, device)
embeddings_2 = generate_embeddings(cols2_h, tokenizer, model, device)

# Build every (file-1 column, file-2 column) combination: the feature row is
# the file-1 embedding followed by the file-2 embedding, and column_pairs
# records the matching names in the same order.
column_pairs = []
pair_embeddings = []
for i, e1 in enumerate(embeddings_1):
    for j, e2 in enumerate(embeddings_2):
        column_pairs.append((cols1_h[i], cols2_h[j]))
        pair_embeddings.append(np.concatenate((e1, e2)))
pair_embeddings = np.vstack(pair_embeddings)

# NOTE(review): the scaler is fitted on the pair matrix being scored
# (fit_transform on a freshly created StandardScaler), not loaded from
# training. Confirm the classifier was trained on features scaled this way.
pair_scaled = scaler.fit_transform(pair_embeddings)

# Predicted label for every column pair; label 1 marks a compatible pair.
preds = xgb_model.predict(pair_scaled)

compatible_pairs = [
    (name_a, name_b)
    for (name_a, name_b), label in zip(column_pairs, preds)
    if label == 1
]

# Distinct columns of each file that take part in at least one compatible pair.
matched_A = {pair[0] for pair in compatible_pairs}
matched_B = {pair[1] for pair in compatible_pairs}

# Share of each file's columns that found a partner.
coverage_A = len(matched_A) / len(cols1_h)
coverage_B = len(matched_B) / len(cols2_h)

# Harmonic mean of the two coverages; 0.0 when either side matched nothing.
if coverage_A and coverage_B:
    compatibility_score = hmean([coverage_A, coverage_B])
else:
    compatibility_score = 0.0

In [6]:
def convert_columns_to_numeric(df, file_label=""):
    """Return the numeric view of ``df`` after coercing non-numeric columns.

    Every column that is not already numeric is passed through
    ``pd.to_numeric(..., errors='coerce')``, so values that cannot be parsed
    become NaN. The coercion is applied to an internal copy: the DataFrame
    passed in by the caller is never modified.

    Args:
        df: Input DataFrame. Left untouched.
        file_label: Accepted for call-site compatibility; not used by the
            function.

    Returns:
        A new DataFrame with the columns that have a numeric dtype after
        coercion, in their original order.

    Side effects:
        Prints the input dtypes, one line per column whose dtype changed,
        and the number of numeric columns returned.
    """
    print(df.dtypes)

    # Work on a copy so the caller's frame keeps its original (string) data.
    working = df.copy()

    converted_cols = {}
    for col in working.columns:
        if pd.api.types.is_numeric_dtype(working[col]):
            continue
        original_dtype = working[col].dtype
        working[col] = pd.to_numeric(working[col], errors='coerce')
        new_dtype = working[col].dtype
        if new_dtype != original_dtype:
            converted_cols[col] = (original_dtype, new_dtype)

    for col, (orig, new) in converted_cols.items():
        print(f" - {col}: {orig} -> {new}")

    numeric_df = working.select_dtypes(include=[np.number])
    print(f"\nReturning {len(numeric_df.columns)} numeric columns.")
    return numeric_df


# Keep an untouched copy of each file for the final merge ...
df1_Full = pd.read_csv(path1)
df2_Full = pd.read_csv(path2)

# ... and derive the numeric-only frames from separate copies, so the
# coercion inside convert_columns_to_numeric cannot alter the *_Full frames.
df1 = convert_columns_to_numeric(df1_Full.copy(), "File 1")
df2 = convert_columns_to_numeric(df2_Full.copy(), "File 2")

strike      float64
last        float64
mark        float64
bid         float64
bid_size      int64
ask         float64
ask_size      int64
volume        int64
dtype: object

Returning 8 numeric columns.
open_interest           int64
implied_volatility    float64
delta                 float64
gamma                 float64
theta                 float64
vega                  float64
rho                   float64
dtype: object

Returning 7 numeric columns.


In [7]:
def run_ks_tests(df1, df2):
    """Run a two-sample Kolmogorov-Smirnov test for every column pair.

    For each column of ``df1`` and each column of ``df2`` the NaN values are
    dropped and ``scipy.stats.ks_2samp`` is applied to the two samples.
    A cell is set to ``None`` when either column is not numeric (the same
    guard the other metric functions in this notebook use) or when either
    column has no non-NaN values.

    Args:
        df1: DataFrame whose columns become the rows of the result.
        df2: DataFrame whose columns become the columns of the result.

    Returns:
        ``(ks_stat_matrix, p_value_matrix)``: two DataFrames indexed by
        ``df1.columns`` with columns ``df2.columns``.
    """
    ks_stat_matrix = pd.DataFrame(index=df1.columns, columns=df2.columns)
    p_value_matrix = pd.DataFrame(index=df1.columns, columns=df2.columns)

    is_numeric = pd.api.types.is_numeric_dtype

    for col1 in df1.columns:
        # Loop-invariant work for the df1 column is done once, not per pair.
        col1_numeric = is_numeric(df1[col1])
        series1 = df1[col1].dropna() if col1_numeric else None

        for col2 in df2.columns:
            if not col1_numeric or not is_numeric(df2[col2]):
                ks_stat_matrix.loc[col1, col2] = None
                p_value_matrix.loc[col1, col2] = None
                continue

            series2 = df2[col2].dropna()

            if len(series1) == 0 or len(series2) == 0:
                ks_stat_matrix.loc[col1, col2] = None
                p_value_matrix.loc[col1, col2] = None
                continue

            stat, p_value = ks_2samp(series1, series2)

            ks_stat_matrix.loc[col1, col2] = stat
            p_value_matrix.loc[col1, col2] = p_value

    return ks_stat_matrix, p_value_matrix

def run_wasserstein_test(df1, df2):
    """Compute the Wasserstein distance between z-scored column pairs.

    For every pair (column of ``df1``, column of ``df2``) the NaN values are
    dropped, each sample is standardized with its own mean and standard
    deviation, and ``scipy.stats.wasserstein_distance`` is applied. A cell is
    ``None`` when either column is non-numeric or has no non-NaN values.

    Returns:
        DataFrame indexed by ``df1.columns`` with columns ``df2.columns``.
    """
    def _zscore(sample):
        # Standardize with the sample's own mean and (pandas default) std.
        return (sample - sample.mean()) / sample.std()

    wasserstein_matrix = pd.DataFrame(index=df1.columns, columns=df2.columns)
    is_numeric = pd.api.types.is_numeric_dtype

    for left in df1.columns:
        for right in df2.columns:
            distance = None
            if is_numeric(df1[left]) and is_numeric(df2[right]):
                left_values = df1[left].dropna()
                right_values = df2[right].dropna()
                if len(left_values) != 0 and len(right_values) != 0:
                    distance = wasserstein_distance(
                        _zscore(left_values), _zscore(right_values)
                    )
            wasserstein_matrix.loc[left, right] = distance

    return wasserstein_matrix

def calculate_psi(expected, actual, bins=2):
    """Population Stability Index of ``actual`` relative to ``expected``.

    Args:
        expected: Reference sample (pandas Series); NaNs are dropped.
        actual: Comparison sample (pandas Series); NaNs are dropped.
        bins: If an ``int``, that many bins are cut at percentiles of
            ``expected`` (duplicate edges are merged). Anything else is used
            directly as the bin edges.

    Returns:
        The PSI as a float, or ``np.nan`` when fewer than two distinct
        percentile edges exist or when either sample has no value inside
        the bins.

    Note:
        Both samples are histogrammed over the same edges, so values of
        ``actual`` lying outside the edge range are not counted.
    """
    expected = expected.dropna()
    actual = actual.dropna()

    bin_edges = bins
    if isinstance(bins, int):
        percentile_points = np.linspace(0, 100, bins + 1)
        bin_edges = np.unique(np.percentile(expected, percentile_points))
        if len(bin_edges) < 2:
            return np.nan

    expected_counts, _ = np.histogram(expected, bins=bin_edges)
    actual_counts, _ = np.histogram(actual, bins=bin_edges)

    expected_total = expected_counts.sum()
    actual_total = actual_counts.sum()
    if expected_total == 0 or actual_total == 0:
        return np.nan

    expected_dist = expected_counts / expected_total
    actual_dist = actual_counts / actual_total

    # epsilon only guards the logarithm against empty bins.
    epsilon = 1e-6
    log_ratio = np.log((expected_dist + epsilon) / (actual_dist + epsilon))
    return np.sum((expected_dist - actual_dist) * log_ratio)

def run_psi_tests(df1, df2, bins=4, epsilon=1e-4):
    """Compute ``calculate_psi`` for every (df1 column, df2 column) pair.

    The ``df1`` column is the reference sample and the ``df2`` column the
    comparison sample. A cell is ``None`` when either column is non-numeric
    or has no non-NaN values. When ``calculate_psi`` returns NaN the cell is
    filled with ``epsilon`` instead.

    Args:
        df1: DataFrame whose columns become the rows of the result.
        df2: DataFrame whose columns become the columns of the result.
        bins: Forwarded to ``calculate_psi``.
        epsilon: Value stored in place of a NaN PSI.

    Returns:
        DataFrame indexed by ``df1.columns`` with columns ``df2.columns``.
    """
    psi_matrix = pd.DataFrame(index=df1.columns, columns=df2.columns)
    is_numeric = pd.api.types.is_numeric_dtype

    for ref_col in df1.columns:
        for cmp_col in df2.columns:
            cell = None
            if is_numeric(df1[ref_col]) and is_numeric(df2[cmp_col]):
                reference = df1[ref_col].dropna()
                comparison = df2[cmp_col].dropna()
                if len(reference) != 0 and len(comparison) != 0:
                    score = calculate_psi(reference, comparison, bins=bins)
                    cell = epsilon if np.isnan(score) else score
            psi_matrix.loc[ref_col, cmp_col] = cell

    return psi_matrix

def run_energy_distance(df1, df2):
    """Compute an energy-distance style statistic for every column pair.

    For each pair the NaN values are dropped and both samples are z-scored.
    The cell value is ``2 * mean|x - y| - mean|x - x'| - mean|y - y'|``, where
    the cross term averages all pairs from ``cdist`` and the within-sample
    terms average the distinct pairs from ``pdist`` (0 for a one-element
    sample). A cell is ``None`` when either column is non-numeric or has no
    non-NaN values.

    Returns:
        DataFrame indexed by ``df1.columns`` with columns ``df2.columns``.
    """
    energy_matrix = pd.DataFrame(index=df1.columns, columns=df2.columns)
    is_numeric = pd.api.types.is_numeric_dtype

    for left in df1.columns:
        for right in df2.columns:
            if not (is_numeric(df1[left]) and is_numeric(df2[right])):
                energy_matrix.loc[left, right] = None
                continue

            x = df1[left].dropna()
            y = df2[right].dropna()
            if len(x) == 0 or len(y) == 0:
                energy_matrix.loc[left, right] = None
                continue

            # z-score, then shape as (n, 1) so scipy treats each value as a point.
            x_points = ((x - x.mean()) / x.std()).values.reshape(-1, 1)
            y_points = ((y - y.mean()) / y.std()).values.reshape(-1, 1)

            between = cdist(x_points, y_points).mean()
            within_x = pdist(x_points).mean() if len(x) > 1 else 0
            within_y = pdist(y_points).mean() if len(y) > 1 else 0

            energy_matrix.loc[left, right] = 2 * between - within_x - within_y

    return energy_matrix


In [8]:
# Compute the four pairwise metric matrices between the numeric columns of
# df1 (rows) and df2 (columns). run_ks_tests also returns the p-values.
ks_stat_matrix, p_value_matrix = run_ks_tests(df1, df2)
wasserstein_distance_matrix = run_wasserstein_test(df1, df2)
# PSI is computed with 4 percentile bins taken from the df1 column.
psi_matrix = run_psi_tests(df1, df2, bins=4)
energy_dist_matrix = run_energy_distance(df1, df2)

In [9]:
def normalize_matrix(matrix, higher_is_better=True, epsilon=1e-6):
    """Rescale a metric matrix; the input is not modified.

    Args:
        matrix: DataFrame of metric values.
        higher_is_better: When True, divide every value by the matrix
            maximum (skipped unless that maximum is greater than 0). When
            False, return ``1 / (value + epsilon)`` element-wise.
        epsilon: Added to the denominator in the inverted case.

    Returns:
        A new DataFrame with the rescaled values. Note that the inverted
        case is not bounded to 1: values near 0 map to roughly ``1 / epsilon``.
    """
    result = matrix.copy()

    if not higher_is_better:
        return 1 / (result + epsilon)

    peak = result.max().max()
    if peak > 0:
        result = result / peak
    return result

# Rescale each metric matrix. With higher_is_better=False, normalize_matrix
# returns 1 / (value + epsilon), so a smaller distance gives a larger score.
normalized_ks_stat_matrix = normalize_matrix(ks_stat_matrix, higher_is_better=False)
normalized_wasserstein_matrix = normalize_matrix(wasserstein_distance_matrix, higher_is_better=False)
# NOTE(review): PSI is the only metric passed with higher_is_better=True, so a
# larger PSI yields a larger score, while the other three are inverted.
# calculate_psi returns 0 for identical binned distributions, so confirm this
# direction is intended (it may be offset by the loaded 'psi' weight).
normalized_psi_matrix = normalize_matrix(psi_matrix, higher_is_better=True)
normalized_energy_matrix = normalize_matrix(energy_dist_matrix, higher_is_better=False)

In [10]:
def combined_matrix(w):
    """Combine the normalized metric matrices into one weighted score matrix.

    Each normalized matrix is raised element-wise to its weight and the
    results are multiplied together, along with ``compatibility_score``
    raised to its own weight. The product is capped at 1.0.

    Args:
        w: Indexable with five weights, in the order KS, Wasserstein, PSI,
            energy, compatibility.

    Returns:
        DataFrame of combined scores, same labels as the normalized matrices.

    Reads the notebook globals ``normalized_ks_stat_matrix``,
    ``normalized_wasserstein_matrix``, ``normalized_psi_matrix``,
    ``normalized_energy_matrix`` and ``compatibility_score``.
    """
    ks_weight, wasserstein_weight, psi_weight, energy_weight, compat_weight = (
        w[0], w[1], w[2], w[3], w[4]
    )

    score = normalized_ks_stat_matrix ** ks_weight
    score = score * normalized_wasserstein_matrix ** wasserstein_weight
    score = score * normalized_psi_matrix ** psi_weight
    score = score * normalized_energy_matrix ** energy_weight
    score = score * (compatibility_score ** compat_weight)
    return score.clip(upper=1.0)

In [11]:
# Load the stored metric weights.
# NOTE: pickle.load can execute arbitrary code; weights.pkl must be trusted.
with open('weights.pkl', 'rb') as f:
    weights = pickle.load(f)

# Weight vector in the order combined_matrix indexes it (w[0] .. w[4]).
w_opt = np.array([
    weights[metric_name]
    for metric_name in ('ks', 'wasserstein', 'psi', 'energy', 'compatibility')
])

# Combined score matrix under the loaded weights.
M_opt = combined_matrix(w_opt)

In [12]:
# Row/column position of the largest non-NaN score in M_opt.
i, j = np.unravel_index(np.nanargmax(M_opt.values), M_opt.shape)
best_col1, best_col2 = M_opt.index[i], M_opt.columns[j]

# Cast both join keys to float so they are compared on the same dtype.
df1_Full[best_col1] = df1_Full[best_col1].astype(float)
df2_Full[best_col2] = df2_Full[best_col2].astype(float)

# Inner join of the full frames on the best-scoring column pair.
# NOTE(review): pandas merge matches NaN keys with each other; confirm that
# rows with a missing key on both sides are meant to be joined.
combined = df1_Full.merge(
    df2_Full,
    how='inner',
    left_on=best_col1,
    right_on=best_col2,
)

combined.to_csv('combined.csv', index=False)
print(f"✅ Wrote {len(combined)} rows to combined.csv")

✅ Wrote 950 rows to combined.csv
