In [73]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from tabpfn import TabPFNClassifier
from tabpfn.utils import normalize_data, to_ranking_low_mem, remove_outliers
from tabpfn.utils import NOP, normalize_by_used_features_f
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, RobustScaler, StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.base import clone
import torch
from skrub import TableVectorizer, MinHashEncoder
from sklearn.metrics import accuracy_score, roc_auc_score



def run_on_encoded_data_with_cst(X_enc, X_rest, y, dim_reduction_name, dim_reduction, model_name, model, encoding,
                        cv, regression=False, no_interaction_between_enc_and_rest=False, **kwargs):
    """
    X_enc: np array of shape (n_samples, embedding_dim), the embedded texts
    X_rest: np array of shape (n_samples, n_features), additional tabular data
    y: np array of shape (n_samples,), the classifcation target
    dim_reduction_name: str, the name of the dim reduction method
    dim_reduction: sklearn transformer, the dim reduction method
    model_name: str, the name of the model
    model: sklearn model, the model
    encoding: str, the name of the encoding which was used to create X_enc
    cv: sklearn cross validator, the cross validator to use
    regression: bool, whether to use regression or classification, default False
    no_interaction_between_enc_and_rest: bool, whether to use the interaction between X_enc and X_rest, default False
    """
    assert model_name in ["TabPFNClassifier", "TabPFNClassifier_basic", "LogisticRegression", "GradientBoostingClassifier", "GradientBoostingRegressor", "LinearRegression", "HistGradientBoostingClassifier"]
    assert encoding.startswith("lm__") or encoding.startswith("skrub__") or encoding.startswith("bert_custom__") or encoding.startswith("openai__") or encoding.startswith("bert_custom_pooling__")
    if no_interaction_between_enc_and_rest:
        assert model_name == "HistGradientBoostingClassifier"
    #TODO: make this cleaner
    # we want to eliminate certain combinations
    # passthrough and lm__ means taking the full lm embedding, which is slow if the model is not LogisticRegression
    # for skrub encodings, we don't want to use passthrough
    if dim_reduction_name == "passthrough" and not (model_name in ["LogisticRegression", "LinearRegression"]) and not encoding.startswith("skrub"):
        print("Skipping {} with {} and {}".format(model_name, dim_reduction_name, encoding))
        return None
    if dim_reduction_name != "passthrough" and encoding.startswith("skrub"):
        print("Skipping {} with {} and {}".format(model_name, dim_reduction_name, encoding))
        return None
    print("Running {} with {} and {}".format(model_name, dim_reduction_name, encoding))
    if X_rest is not None:
        # encode X_rest with the TableVectorizer
        if model_name.startswith("TabPFNClassifier"):
            # ordinal encoding for low_cardinality columns
            low_card_cat_transformer = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
        else:
            low_card_cat_transformer = OneHotEncoder(handle_unknown="ignore")
        if model_name.startswith("LogisticRegression"):
            numerical_transformer = StandardScaler()
        else:
            numerical_transformer = "passthrough"
        
        rest_trans = TableVectorizer(high_card_cat_transformer = MinHashEncoder(n_components=10, analyzer='char'),
                                    low_card_cat_transformer = low_card_cat_transformer,
                                    numerical_transformer=numerical_transformer,
                                    cardinality_threshold=30)
    if X_rest is not None and X_enc is not None:
        
        # Assuming X_enc and X_rest are numpy arrays, you can get their shapes
        n_enc_columns = X_enc.shape[1]
        # names of the columns should be of format original_column_name__index
        assert all(["__" in col for col in X_enc.columns])
        original_column_names = np.unique([col.split("__")[0] for col in X_enc.columns])
        # get the indices of the columns for each original column
        encoded_columns_indices = []
        for col in original_column_names:
            indices = [i for i, c in enumerate(X_enc.columns) if c.split("__")[0] == col]
            encoded_columns_indices.append(indices)
        print(encoded_columns_indices)
        print(len(encoded_columns_indices))
        n_rest_columns = X_rest.shape[1]

        # Create column indices for X_enc and X_rest
        enc_indices = np.arange(n_enc_columns)
        rest_indices = np.arange(n_enc_columns, n_enc_columns + n_rest_columns)
        #check
        all_indices = np.concatenate(encoded_columns_indices + [rest_indices])
        # assert no duplicates
        assert len(all_indices) == len(np.unique(all_indices)), f"Duplicate indices: {[i for i in all_indices if list(all_indices).count(i) > 1]}"
        # assert we have all indices
        assert set(all_indices) == set(np.arange(n_enc_columns + n_rest_columns))

        # Create the ColumnTransformer
        #TODO: test this
        transformers = []
        for i in range(len(original_column_names)):
            transformers.append((f"dim_reduction_{i}", dim_reduction if isinstance(dim_reduction, str) else clone(dim_reduction), encoded_columns_indices[i]))
        #transformers.append(('rest_trans', rest_trans, rest_indices))
        print(transformers)
        text_transformer = ColumnTransformer(
            transformers=transformers,
        )
        

        full_X = np.concatenate([X_enc, X_rest], axis=1)
        print(X_enc.shape, X_rest.shape, full_X.shape)
        #print(complete_trans.fit_transform(full_X).shape)
    else:
        raise NameError


        
    #scores = cross_val_score(pipeline, full_X, y, scoring="accuracy", cv=cv)
    # report both accuracy and roc_auc
    if regression:
        raise NameError
    
    #scores = cross_validate(pipeline, full_X, y, scoring=["accuracy", "roc_auc_ovr"], cv=cv)
    accuracy_list = []
    roc_auc_list = []
    for i, (train, test) in enumerate(cv.split(full_X, y)):
        print(f"Fold {i}")
        train_X_enc = X_enc.iloc[train]
        train_X_rest = X_rest.iloc[train]
        train_y = y[train]
        test_X_enc = X_enc.iloc[test]
        test_X_rest = X_rest.iloc[test]
        test_y = y[test]
        # transform the data
        train_X_enc_trans = text_transformer.fit_transform(train_X_enc)
        test_X_enc_trans = text_transformer.transform(test_X_enc)
        train_X_rest_trans = rest_trans.fit_transform(train_X_rest)
        test_X_rest_trans = rest_trans.transform(test_X_rest)
        # merge
        train_X_trans = np.concatenate([train_X_enc_trans, train_X_rest_trans], axis=1)
        test_X_trans = np.concatenate([test_X_enc_trans, test_X_rest_trans], axis=1)
        # fit the model
        if no_interaction_between_enc_and_rest:
            enc_indices = list(np.arange(train_X_enc_trans.shape[1]))
            model.set_params(interaction_cst=[enc_indices])
        model.fit(train_X_trans, train_y)
        # predict
        y_pred = model.predict(test_X_trans)
        # score
        accuracy_list.append(accuracy_score(test_y, y_pred))
        roc_auc_list.append(roc_auc_score(test_y, y_pred))


        

    try:
        n_train = cv.n_train
        n_test = cv.n_test
    except:
        n_train = np.nan
        n_test = np.nan
    res =  {
        'encoding': encoding,
        'dim_reduction': dim_reduction_name,
        'model': model_name,
        #'accuracies': scores['test_accuracy'],
        #'roc_auc': scores['test_roc_auc_ovr'],
        'accuracies': accuracy_list,
        'roc_auc': roc_auc_list,
        'n_train': n_train,
        'n_test': n_test,
        "no_interaction_between_enc_and_rest": no_interaction_between_enc_and_rest,
        **kwargs
    }
    # add the scores
    # if regression:
    #     res['neg_mean_squared_error'] = scores['test_neg_mean_squared_error']
    #     res['r2'] = scores['test_r2']
    # else:
    #     res['accuracies'] = scores['test_accuracy']
    #     res['roc_auc'] = scores['test_roc_auc_ovr']
    
    return res


In [45]:
from src.encodings import encode_high_cardinality_features
from src.data_loading import load_data

dataset = "goodreads"
X, y = load_data(dataset, max_rows=10_000)
encoding = "skrub__minhash_30"
X_enc, X_rest = encode_high_cardinality_features(X, encoding, dataset_name=dataset, override_cache=False, cardinality_threshold=30, fail_if_not_cached=True)

Removed 2 columns with missing values on 12 columns
Removed 1652 rows with missing values on 3967 rows
Removed 1652 rows with missing values on 2315 rows
Removed 2 columns with missing values on 9 columns
New shape: (2315, 10)
Original task: regression for goodreads
Converting to binary classification
Classes (array([0, 1]), array([1167, 1148]))
X shape: (2315, 10), y shape: (2315,)
numeric ['PageCount', 'NumberofRatings']
low_card_cat ['Format', 'Language']
high_card_cat ['Title', 'Description', 'FirstAuthor', 'NumberofReviews', 'Publisher', 'PublishDate']
High cardinality columns ['Title', 'Description', 'FirstAuthor', 'NumberofReviews', 'Publisher', 'PublishDate']
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working d

In [46]:
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier, GradientBoostingRegressor, HistGradientBoostingRegressor
from src.utils import FixedSizeSplit
cv = FixedSizeSplit(n_train=1000, n_test=500, n_splits=7)
run_on_encoded_data_with_cst(X_enc, X_rest, y, "passthrough", "passthrough", "HistGradientBoostingClassifier", HistGradientBoostingClassifier(), encoding, cv=cv, regression=False, no_interaction_between_enc_and_rest=False)

Running HistGradientBoostingClassifier with passthrough and skrub__minhash_30
[[30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], [60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89], [90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119], [150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179], [120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]]
6
[('dim_reduction_0', 'passthrough', [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 

{'encoding': 'skrub__minhash_30',
 'dim_reduction': 'passthrough',
 'model': 'HistGradientBoostingClassifier',
 'accuracies': [0.62, 0.598, 0.59, 0.57, 0.62, 0.572, 0.634],
 'roc_auc': [0.62,
  0.5996794871794872,
  0.5886754701880752,
  0.5722754327702106,
  0.6190125512295082,
  0.5717486994797919,
  0.6339541432662923],
 'n_train': 1000,
 'n_test': 500}

In [74]:
def pipeline(dataset, encoding, dim_reduction_name, dim_reduction, no_interaction_between_enc_and_rest):
    X, y = load_data(dataset, max_rows=10_000)
    X_enc, X_rest = encode_high_cardinality_features(X, encoding, dataset_name=dataset, override_cache=False, cardinality_threshold=30, fail_if_not_cached=True)
    cv = FixedSizeSplit(n_train=1000, n_test=500, n_splits=7)
    return run_on_encoded_data_with_cst(X_enc, X_rest, y, dim_reduction_name, dim_reduction, "HistGradientBoostingClassifier", HistGradientBoostingClassifier(), encoding, cv=cv, regression=False, no_interaction_between_enc_and_rest=no_interaction_between_enc_and_rest, dataset=dataset)

In [75]:
from sklearn.decomposition import PCA
datasets = ['bikewale', 'clear_corpus', 'company_employees',
       'employee-remuneration-and-expenses-earning-over-75000',
       'employee_salary', 'goodreads', 'journal_jcr_cls', 'ramen_ratings',
       'spotify', 'us_accidents_counts', 'us_accidents_severity',
       'us_presidential', 'wine_review', 'zomato']

encodings = ["openai__", "lm__BAAI/bge-large-en-v1.5", "skrub__minhash_30"]
no_interaction_between_enc_and_rest_list = [False, True]
dim_reductions = {"PCA_30": PCA(n_components=30), "passthrough": "passthrough"}

import submitit
import time
# Generate all combinations of parameters
from itertools import product

jobs = []

executor = submitit.AutoExecutor(folder="logs")
executor.update_parameters(timeout_min=100, slurm_partition='parietal,normal', slurm_array_parallelism=200, cpus_per_task=2,
                            exclude="margpu009")
# change name of job
executor.update_parameters(name="switch_encoding")

with executor.batch():
    for dataset_name in datasets:
        for encoding in encodings:
            for dim_reduction_name, dim_reduction in dim_reductions.items():
                for no_interaction_between_enc_and_rest in no_interaction_between_enc_and_rest_list:
                    job = executor.submit(pipeline, dataset_name, encoding, dim_reduction_name, dim_reduction, no_interaction_between_enc_and_rest)
                    jobs.append(job)



In [76]:
jobs[0].result()

sacct: error: slurm_persist_conn_open_without_init: failed to open persistent connection to host:localhost:6819: Connection refused
sacct: error: Sending PersistInit msg: Connection refused
sacct: error: Problem talking to the database: Connection refused




sacct: error: slurm_persist_conn_open_without_init: failed to open persistent connection to host:localhost:6819: Connection refused
sacct: error: Sending PersistInit msg: Connection refused
sacct: error: Problem talking to the database: Connection refused




sacct: error: slurm_persist_conn_open_without_init: failed to open persistent connection to host:localhost:6819: Connection refused
sacct: error: Sending PersistInit msg: Connection refused
sacct: error: Problem talking to the database: Connection refused




sacct: error: slurm_persist_conn_open_without_init: failed to open persistent connection to host:localhost:6819: Connection refused
sacct: error: Sending PersistInit msg: Connection refused
sacct: error: Problem talking to the database: Connection refused




sacct: error: slurm_persist_conn_open_without_init: failed to open persistent connection to host:localhost:6819: Connection refused
sacct: error: Sending PersistInit msg: Connection refused
sacct: error: Problem talking to the database: Connection refused




sacct: error: slurm_persist_conn_open_without_init: failed to open persistent connection to host:localhost:6819: Connection refused
sacct: error: Sending PersistInit msg: Connection refused
sacct: error: Problem talking to the database: Connection refused




{'encoding': 'openai__',
 'dim_reduction': 'PCA_30',
 'model': 'HistGradientBoostingClassifier',
 'accuracies': [0.86, 0.86, 0.858, 0.858, 0.904, 0.848, 0.878],
 'roc_auc': [0.8600375415122492,
  0.8608717973400073,
  0.8560927067283,
  0.8571497213146262,
  0.9040821878350483,
  0.8460547504025764,
  0.8769638569748143],
 'n_train': 1000,
 'n_test': 500,
 'no_interaction_between_enc_and_rest': False,
 'dataset': 'bikewale'}

In [77]:
# create a dataframe with all the results
import pandas as pd
res_list = []
for job in jobs:
    res = job.result()
    if res is not None:
        print(res)
        res_list.append(res)
df = pd.DataFrame(res_list)
df = df.explode(["accuracies", "roc_auc"])
df.to_csv("../results/interaction_xp.csv")

{'encoding': 'openai__', 'dim_reduction': 'PCA_30', 'model': 'HistGradientBoostingClassifier', 'accuracies': [0.86, 0.86, 0.858, 0.858, 0.904, 0.848, 0.878], 'roc_auc': [0.8600375415122492, 0.8608717973400073, 0.8560927067283, 0.8571497213146262, 0.9040821878350483, 0.8460547504025764, 0.8769638569748143], 'n_train': 1000, 'n_test': 500, 'no_interaction_between_enc_and_rest': False, 'dataset': 'bikewale'}
{'encoding': 'openai__', 'dim_reduction': 'PCA_30', 'model': 'HistGradientBoostingClassifier', 'accuracies': [0.848, 0.882, 0.814, 0.818, 0.868, 0.874, 0.88], 'roc_auc': [0.8471388555422168, 0.8812463916864456, 0.8168276972624798, 0.8164993978321959, 0.8687619691337164, 0.8709831474316427, 0.8790489483563557], 'n_train': 1000, 'n_test': 500, 'no_interaction_between_enc_and_rest': True, 'dataset': 'bikewale'}
{'encoding': 'lm__BAAI/bge-large-en-v1.5', 'dim_reduction': 'PCA_30', 'model': 'HistGradientBoostingClassifier', 'accuracies': [0.888, 0.866, 0.85, 0.88, 0.89, 0.868, 0.832], 'roc

sacct: error: slurm_persist_conn_open_without_init: failed to open persistent connection to host:localhost:6819: Connection refused
sacct: error: Sending PersistInit msg: Connection refused
sacct: error: Problem talking to the database: Connection refused


{'encoding': 'openai__', 'dim_reduction': 'PCA_30', 'model': 'HistGradientBoostingClassifier', 'accuracies': [0.626, 0.616, 0.64, 0.63, 0.62, 0.672, 0.602], 'roc_auc': [0.6259540152642441, 0.6128498175563806, 0.6396054506733279, 0.6299308401639344, 0.6201853066841625, 0.6670111749886958, 0.6016265368852459], 'n_train': 1000, 'n_test': 500, 'no_interaction_between_enc_and_rest': False, 'dataset': 'goodreads'}
{'encoding': 'openai__', 'dim_reduction': 'PCA_30', 'model': 'HistGradientBoostingClassifier', 'accuracies': [0.6, 0.612, 0.582, 0.584, 0.584, 0.618, 0.636], 'roc_auc': [0.6000896229434736, 0.6114882172131147, 0.586810736230768, 0.5933294513457558, 0.5851940776310525, 0.620421362704918, 0.6392007184553211], 'n_train': 1000, 'n_test': 500, 'no_interaction_between_enc_and_rest': True, 'dataset': 'goodreads'}
{'encoding': 'lm__BAAI/bge-large-en-v1.5', 'dim_reduction': 'PCA_30', 'model': 'HistGradientBoostingClassifier', 'accuracies': [0.642, 0.644, 0.632, 0.6, 0.584, 0.606, 0.648], 'r

In [78]:
df

Unnamed: 0,encoding,dim_reduction,model,accuracies,roc_auc,n_train,n_test,no_interaction_between_enc_and_rest,dataset
0,openai__,PCA_30,HistGradientBoostingClassifier,0.86,0.860038,1000,500,False,bikewale
0,openai__,PCA_30,HistGradientBoostingClassifier,0.86,0.860872,1000,500,False,bikewale
0,openai__,PCA_30,HistGradientBoostingClassifier,0.858,0.856093,1000,500,False,bikewale
0,openai__,PCA_30,HistGradientBoostingClassifier,0.858,0.85715,1000,500,False,bikewale
0,openai__,PCA_30,HistGradientBoostingClassifier,0.904,0.904082,1000,500,False,bikewale
...,...,...,...,...,...,...,...,...,...
83,skrub__minhash_30,passthrough,HistGradientBoostingClassifier,0.84,0.812342,1000,500,True,zomato
83,skrub__minhash_30,passthrough,HistGradientBoostingClassifier,0.852,0.841829,1000,500,True,zomato
83,skrub__minhash_30,passthrough,HistGradientBoostingClassifier,0.784,0.76774,1000,500,True,zomato
83,skrub__minhash_30,passthrough,HistGradientBoostingClassifier,0.81,0.791534,1000,500,True,zomato


In [90]:
# filter for new_encoder_name openai__
results = df[df.encoding == "openai__"]

group_cols = ['dataset', 'model', "no_interaction_between_enc_and_rest"]
# show counts per group
print(results.groupby(group_cols)['accuracies'].count().reset_index())
melted_results = results.groupby(group_cols)['accuracies'].mean().reset_index()
print(melted_results.shape)

                                              dataset  \
0                                            bikewale   
1                                            bikewale   
2                                        clear_corpus   
3                                        clear_corpus   
4                                   company_employees   
5                                   company_employees   
6   employee-remuneration-and-expenses-earning-ove...   
7   employee-remuneration-and-expenses-earning-ove...   
8                                     employee_salary   
9                                     employee_salary   
10                                          goodreads   
11                                          goodreads   
12                                    journal_jcr_cls   
13                                    journal_jcr_cls   
14                                      ramen_ratings   
15                                      ramen_ratings   
16                             

In [91]:
import plotly.express as px
fig = px.strip(
    melted_results, x="dataset", y="accuracies", color="no_interaction_between_enc_and_rest"
)
fig.show()

In [89]:
melted_results

Unnamed: 0,dataset,model,no_interaction_between_enc_and_rest,roc_auc
0,bikewale,HistGradientBoostingClassifier,False,0.865893
1,bikewale,HistGradientBoostingClassifier,True,0.854358
2,clear_corpus,HistGradientBoostingClassifier,False,0.771338
3,clear_corpus,HistGradientBoostingClassifier,True,0.766028
4,company_employees,HistGradientBoostingClassifier,False,0.588066
5,company_employees,HistGradientBoostingClassifier,True,0.603068
6,employee-remuneration-and-expenses-earning-ove...,HistGradientBoostingClassifier,False,0.79545
7,employee-remuneration-and-expenses-earning-ove...,HistGradientBoostingClassifier,True,0.791277
8,employee_salary,HistGradientBoostingClassifier,False,0.917534
9,employee_salary,HistGradientBoostingClassifier,True,0.905808
