In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
from src.data_loading import load_data
from skrub import MinHashEncoder
from sklearn.decomposition import PCA
from src.utils import FeaturesExtractor, FixedSizeSplit
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from tabpfn import TabPFNClassifier
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
import time
from sentence_transformers import SentenceTransformer

In [3]:
from src.encodings import encode, encode_high_cardinality_features
from src.utils import run_on_encoded_data, FeaturesExtractor

In [4]:
from skrub import TableVectorizer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.random_projection import GaussianRandomProjection

In [5]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [6]:
import submitit
from functools import partial
encodings = ["skrub__minhash_30", "lm__all-distilroberta-v1", "lm__all-mpnet-base-v2", "openai__"]
#encodings = []
model_names = [
    "BAAI/bge-large-en-v1.5",
    "BAAI/bge-base-en-v1.5",
     "llmrails/ember-v1",
    # "thenlper/gte-large",
    # "thenlper/gte-base",
    # "intfloat/e5-large-v2",
    # "BAAI/bge-small-en-v1.5",
    # "hkunlp/instructor-xl",
    # "hkunlp/instructor-large",
    # "intfloat/e5-base-v2",
    # "intfloat/multilingual-e5-large",
    # "intfloat/e5-large",
    # "thenlper/gte-small",
    # "intfloat/e5-base",
    # "intfloat/e5-small-v2",
    # "hkunlp/instructor-base",
    # #"sentence-t5-xxl",
    # "intfloat/multilingual-e5-base",
    # #"XLM-3B5-embedding",
    # #"gtr-t5-xxl",
    # #"SGPT-5.8B-weightedmean-msmarco-specb-bitfit",
    # "intfloat/e5-small",
    # "TaylorAI/gte-tiny",
    # #"gtr-t5-xl",
    # "gtr-t5-large",
    # #"XLM-0B6-embedding",
    # "intfloat/multilingual-e5-small",
    # #"sentence-t5-xl",
    # "all-mpnet-base-v2",
    # #"sgpt-bloom-7b1-msmarco",
    # "jinaai/jina-embedding-l-en-v1",
    # #"SGPT-2.7B-weightedmean-msmarco-specb-bitfit",
    # "sentence-t5-large",
    # #"MegatronBert-1B3-embedding",
    # "TaylorAI/bge-micro-v2",
    # "all-MiniLM-L12-v2",
    # "all-MiniLM-L6-v2",
    # "jinaai/jina-embedding-b-en-v1",
    # #"SGPT-1.3B-weightedmean-msmarco-specb-bitfit",
    # "gtr-t5-base",
    # "nthakur/contriever-base-msmarco",
    # "TaylorAI/bge-micro",
    # "sentence-t5-base",
    # "paraphrase-multilingual-mpnet-base-v2",
    # "Hum-Works/lodestone-base-4096-v1",
    # #"SGPT-5.8B-weightedmean-nli-bitfit",
    # "paraphrase-multilingual-MiniLM-L12-v2",
    # "msmarco-bert-co-condensor",
    # "jinaai/jina-embedding-s-en-v1"
]

for model_name in model_names:
    #if "e5" in model_name:
    encodings.append("lm__" + model_name)



#encodings.extend(["skrub__minhash_30_word_none", "skrub__minhash_30_tokenizer_gpt2"])

#encodings = ["lm__all-distilroberta-v1", "lm__all-mpnet-base-v2"]

#datasets = ["journal_jcr_cls", "movies", "michelin", "spotify", "employee_salary", "museums", "fifa_footballplayers_22", "jp_anime", "clear_corpus", "company_employees", "us_presidential", "us_accidents_severity", "us_accidents_counts", "wine_review"]
#datasets.extend(["building_permits", "agora", "met_objects", "public", "kickstarter", "colleges", "drug_directory", "medical_charge", "traffic_violations"])
datasets = ["european_cities_pop_log"]
#datasets = ["drug_directory", "met_objects"]
print(len(datasets))

executor = submitit.AutoExecutor(folder="logs")
executor.update_parameters(timeout_min=400, slurm_partition='parietal,normal', slurm_array_parallelism=300, cpus_per_task=2,
                            exclude="margpu009")

def encoding_dataset(dataset, encoding):
    X, y = load_data(dataset, max_rows=10000)
    #X_enc = encode(X_text, encoding, dataset_name=dataset, override_cache=True)
    X_enc, X_rest = encode_high_cardinality_features(X, encoding, dataset_name=dataset, override_cache=False, cardinality_threshold=30)#, fail_if_not_cached=True)

    #return X_enc, X_rest, y
    

jobs = []

with executor.batch():
    for dataset in tqdm(datasets):
        for encoding in tqdm(encodings, leave=False):
            print(dataset, encoding)
            job = executor.submit(encoding_dataset, dataset, encoding)
            jobs.append(job)



2


  0%|          | 0/2 [00:00<?, ?it/s]

european_cities_pop_log skrub__minhash_30
european_cities_pop_log lm__all-distilroberta-v1
european_cities_pop_log lm__all-mpnet-base-v2
european_cities_pop_log openai__
european_cities_pop_log lm__BAAI/bge-large-en-v1.5
european_cities_pop_log lm__BAAI/bge-base-en-v1.5
european_cities_pop_log lm__llmrails/ember-v1


100%|██████████| 2/2 [00:00<00:00, 138.89it/s]

all_cities_pop_log skrub__minhash_30
all_cities_pop_log lm__all-distilroberta-v1
all_cities_pop_log lm__all-mpnet-base-v2
all_cities_pop_log openai__
all_cities_pop_log lm__BAAI/bge-large-en-v1.5
all_cities_pop_log lm__BAAI/bge-base-en-v1.5
all_cities_pop_log lm__llmrails/ember-v1





In [8]:
dataset = "european_cities_pop"
X, y = load_data(dataset, max_rows=10000, regression=True)
cv_classic = 5
cv_custom = GroupKFold(n_splits=5)
cv_custom = list(cv_custom.split(X, groups=X["Country Code"])) # list for pickle
cvs = {"classic": cv_classic, "custom": cv_custom}

# test expected behaviour
# iterate on splits and print size
from sklearn.model_selection import KFold
# classic
cross_val = KFold(n_splits=5)
for train_index, test_index in cross_val.split(X):
    print("TRAIN:", len(train_index), "TEST:", len(test_index))
# custom
for train_index, test_index in cv_custom:
    print("TRAIN:", len(train_index), "TEST:", len(test_index))


Removed 1 columns with missing values on 10 columns
Removed 24 rows with missing values on 10876 rows
Removed 24 rows with missing values on 10852 rows
Removed 1 columns with missing values on 8 columns
New shape: (10852, 9)
Original task: regression for european_cities_pop
X shape: (10000, 9), y shape: (10000,)
TRAIN: 8000 TEST: 2000
TRAIN: 8000 TEST: 2000
TRAIN: 8000 TEST: 2000
TRAIN: 8000 TEST: 2000
TRAIN: 8000 TEST: 2000
TRAIN: 8000 TEST: 2000
TRAIN: 7999 TEST: 2001
TRAIN: 8000 TEST: 2000
TRAIN: 8000 TEST: 2000
TRAIN: 8001 TEST: 1999


In [22]:
import submitit
from functools import partial
import time
from sklearn.model_selection import PredefinedSplit
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GroupKFold
from src.encodings import encode_high_cardinality_features
from sklearn.decomposition import PCA

encodings = ["skrub__minhash_100", "lm__all-mpnet-base-v2", "openai__"]


#encodings.extend(["skrub__minhash_30_word_none", "skrub__minhash_30_tokenizer_gpt2"])
#encodings.extend(["skrub__minhash_30_word_none"])

#encodings = ["lm__all-distilroberta-v1", "lm__all-mpnet-base-v2"]

#datasets = ["journal_jcr_cls", "movies", "michelin", "spotify", "employee_salary", "museums", "fifa_footballplayers_22", "jp_anime", "clear_corpus", "company_employees", "us_presidential", "us_accidents_severity", "us_accidents_counts", "wine_review"]
#datasets.extend(["building_permits", "agora", "met_objects", "public", "kickstarter", "colleges", "drug_directory", "medical_charge", "traffic_violations"])
#datasets = ["us_place"]
datasets = ["european_cities_pop_log"]
#datasets = ["drug_directory", "met_objects"]
print(len(datasets))
models = {"LinearRegression": LinearRegression(), "GradientBoostingRegressor": GradientBoostingRegressor()}
       # "TabPFNClassifier_basic": TabPFNClassifier(device="cpu", N_ensemble_configurations=1, no_preprocess_mode=True)}

dim_reductions = {"PCA_100": PCA(n_components=100),
                  #"PCA_10": PCA(n_components=10),
                   "passthrough": "passthrough"}

print("Number of iterations: ", len(datasets) * len(encodings) * len(dim_reductions) * len(models))
jobs = []

executor = submitit.AutoExecutor(folder="logs")
executor.update_parameters(timeout_min=100, slurm_partition='parietal,normal', slurm_array_parallelism=300, cpus_per_task=2,
                            exclude="margpu009")


#TODO: only keep the name


for dataset in tqdm(datasets):
    #print(f"Dataset: {dataset}, Encoding: {encoding}")
    #TODO: maybe possible to do better with sklearn k fold but we never want the other group in the test
    X, y = load_data(dataset, max_rows=10000, regression=True)
    cv_classic = 5
    cv_custom = GroupKFold(n_splits=5)
    cv_custom = list(cv_custom.split(X, groups=X["Country Code"])) # list for pickle
    cvs = {"classic": cv_classic, "custom": cv_custom}
    for cv_name, cv in cvs.items():
        for encoding in tqdm(encodings, leave=False):
            print("Dataset", dataset)
            print("Encoding", encoding)
            #if len(X_text) > n_train + n_test:
            try:
                #X_enc = encode(X_text, encoding, dataset_name=dataset)
                X_enc, X_rest = encode_high_cardinality_features(X, encoding, dataset_name=dataset, override_cache=False, cardinality_threshold=30)#, fail_if_not_cached=True)
            except:
                print(f"Encoding {encoding} failed for dataset {dataset}")
                continue
                #n_test = 2000
                #cv = FixedSizeSplit(n_splits=5, n_train=n_train, n_test=n_test, random_state=42)
                # create a predifined split

                # restrict train
            for dim_reduction_name, dim_reduction in dim_reductions.items():
                for model_name, model in models.items():
                    job_func = partial(run_on_encoded_data, X_enc, X_rest, y, dim_reduction_name, dim_reduction, model_name, model, encoding, cv,
                                    dataset = dataset, features = "all", regression=True, cv_name=cv_name)
                    job = executor.submit(job_func)
                    jobs.append(job)
                    print(f"Submitted job {job.job_id} to the cluster.")
            
            #just X_enc
            for dim_reduction_name, dim_reduction in dim_reductions.items():
                for model_name, model in models.items():
                    job_func = partial(run_on_encoded_data, X_enc, None, y, dim_reduction_name, dim_reduction, model_name, model, encoding, cv,
                                    dataset = dataset, features = "text_only", regression=True, cv_name=cv_name) #TODO use the features argument?
                    job = executor.submit(job_func)
                    jobs.append(job)
                    print(f"Submitted job {job.job_id} to the cluster.")
            
            #just X_rest
            for dim_reduction_name, dim_reduction in dim_reductions.items():
                for model_name, model in models.items():
                    job_func = partial(run_on_encoded_data, None, X_rest, y, dim_reduction_name, dim_reduction, model_name, model, encoding, cv,
                                    dataset = dataset, features = "rest_only", regression=True, cv_name=cv_name)
                    job = executor.submit(job_func)
                    jobs.append(job)
                    print(f"Submitted job {job.job_id} to the cluster.")



Setting 'exclude' is deprecated. Use 'slurm_exclude' instead.



1
Number of iterations:  12


  0%|          | 0/1 [00:00<?, ?it/s]

Removed 1 columns with missing values on 10 columns
Removed 24 rows with missing values on 10876 rows
Removed 24 rows with missing values on 10852 rows
Removed 1 columns with missing values on 8 columns
New shape: (10852, 9)
Original task: regression for european_cities_pop_log
X shape: (10000, 9), y shape: (10000,)




Dataset european_cities_pop_log
Encoding skrub__minhash_100
numeric ['DIgital Elevation Model', 'latitude', 'longitude']
low_card_cat ['Feature Class', 'Feature Code']
high_card_cat ['Name', 'Country Code', 'Country name EN', 'Timezone']
High cardinality columns ['Name', 'Country Code', 'Country name EN', 'Timezone']
working dir /scratch/lgrinszt/lm_tab/scripts
Cache not found, computing
Saving to cache
working dir /scratch/lgrinszt/lm_tab/scripts
Cache not found, computing
Saving to cache
working dir /scratch/lgrinszt/lm_tab/scripts
Cache not found, computing
Saving to cache
working dir /scratch/lgrinszt/lm_tab/scripts
Cache not found, computing
Saving to cache
Submitted job 1880651 to the cluster.
Submitted job 1880652 to the cluster.
Submitted job 1880653 to the cluster.
Submitted job 1880654 to the cluster.
Submitted job 1880655 to the cluster.
Submitted job 1880656 to the cluster.
Submitted job 1880657 to the cluster.
Submitted job 1880658 to the cluster.
Submitted job 1880659 to 



Submitted job 1880662 to the cluster.
Dataset european_cities_pop_log
Encoding lm__all-mpnet-base-v2
numeric ['DIgital Elevation Model', 'latitude', 'longitude']
low_card_cat ['Feature Class', 'Feature Code']
high_card_cat ['Name', 'Country Code', 'Country name EN', 'Timezone']
High cardinality columns ['Name', 'Country Code', 'Country name EN', 'Timezone']
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
Submitted job 1880663 to the cluster.
Submitted job 1880664 to the cluster.
Submitted job 1880665 to the cluster.
Submitted job 1880666 to the cluster.
Submitted job 1880667 to the cluster.
Submitted job 1880668 to the cluster.
Submitted job 1880669 to the cluster.
Submitted job 1880670 to the cluster.
Submitted job 1880671 to the cluster.
Submitted job 1880672 to the cluster.
Submitte



Submitted job 1880674 to the cluster.
Dataset european_cities_pop_log
Encoding openai__
numeric ['DIgital Elevation Model', 'latitude', 'longitude']
low_card_cat ['Feature Class', 'Feature Code']
high_card_cat ['Name', 'Country Code', 'Country name EN', 'Timezone']
High cardinality columns ['Name', 'Country Code', 'Country name EN', 'Timezone']
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
Submitted job 1880675 to the cluster.
Submitted job 1880676 to the cluster.
Submitted job 1880677 to the cluster.
Submitted job 1880678 to the cluster.
Submitted job 1880679 to the cluster.
Submitted job 1880680 to the cluster.
Submitted job 1880681 to the cluster.
Submitted job 1880682 to the cluster.
Submitted job 1880683 to the cluster.
Submitted job 1880684 to the cluster.
Submitted job 1880685



Submitted job 1880686 to the cluster.




Dataset european_cities_pop_log
Encoding skrub__minhash_100
numeric ['DIgital Elevation Model', 'latitude', 'longitude']
low_card_cat ['Feature Class', 'Feature Code']
high_card_cat ['Name', 'Country Code', 'Country name EN', 'Timezone']
High cardinality columns ['Name', 'Country Code', 'Country name EN', 'Timezone']
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
Submitted job 1880687 to the cluster.
Submitted job 1880688 to the cluster.
Submitted job 1880689 to the cluster.
Submitted job 1880690 to the cluster.
Submitted job 1880691 to the cluster.
Submitted job 1880692 to the cluster.
Submitted job 1880693 to the cluster.
Submitted job 1880694 to the cluster.
Submitted job 1880695 to the cluster.
Submitted job 1880696 to the cluster.
Submitted job 1880697 to the cluster.




Submitted job 1880698 to the cluster.
Dataset european_cities_pop_log
Encoding lm__all-mpnet-base-v2
numeric ['DIgital Elevation Model', 'latitude', 'longitude']
low_card_cat ['Feature Class', 'Feature Code']
high_card_cat ['Name', 'Country Code', 'Country name EN', 'Timezone']
High cardinality columns ['Name', 'Country Code', 'Country name EN', 'Timezone']
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
Submitted job 1880699 to the cluster.
Submitted job 1880700 to the cluster.
Submitted job 1880701 to the cluster.
Submitted job 1880702 to the cluster.
Submitted job 1880703 to the cluster.
Submitted job 1880704 to the cluster.
Submitted job 1880705 to the cluster.
Submitted job 1880706 to the cluster.
Submitted job 1880707 to the cluster.
Submitted job 1880708 to the cluster.
Submitte



Submitted job 1880710 to the cluster.
Dataset european_cities_pop_log
Encoding openai__
numeric ['DIgital Elevation Model', 'latitude', 'longitude']
low_card_cat ['Feature Class', 'Feature Code']
high_card_cat ['Name', 'Country Code', 'Country name EN', 'Timezone']
High cardinality columns ['Name', 'Country Code', 'Country name EN', 'Timezone']
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
working dir /scratch/lgrinszt/lm_tab/scripts
Loaded from cache
Submitted job 1880711 to the cluster.
Submitted job 1880712 to the cluster.
Submitted job 1880713 to the cluster.
Submitted job 1880714 to the cluster.
Submitted job 1880715 to the cluster.
Submitted job 1880716 to the cluster.
Submitted job 1880717 to the cluster.
Submitted job 1880718 to the cluster.
Submitted job 1880719 to the cluster.
Submitted job 1880720 to the cluster.
Submitted job 1880721

100%|██████████| 1/1 [00:49<00:00, 49.25s/it]

Submitted job 1880722 to the cluster.





In [23]:
results = []

for job in jobs:
    try:
        result = job.result()
        results.append(result)
    except Exception as e:
        print(f"Job {job.job_id} failed with exception: {e}")

# remove None
print(len(results))
results = [r for r in results if r is not None]
print(len(results))
df = pd.DataFrame(results)
melted_results = df.explode(['neg_mean_squared_error', "r2"])
#melted_results = melted_results.drop(columns=["scores"])
#melted_results.to_csv("../results/results_all_01_10.csv", index=False)
#melted_results.to_csv("../results/results_all_02_10_bert_pooling.csv", index=False)
# append to "../results/results_all_02_10_bert_pooling.csv"
#melted_results.to_csv("../results/results_all_04_10.csv", index=False)
#melted_results.to_csv("../results/results_movies_2023.csv", index=False)

sacct: error: slurm_persist_conn_open_without_init: failed to open persistent connection to host:localhost:6819: Connection refused
sacct: error: Sending PersistInit msg: Connection refused
sacct: error: Problem talking to the database: Connection refused




sacct: error: slurm_persist_conn_open_without_init: failed to open persistent connection to host:localhost:6819: Connection refused
sacct: error: Sending PersistInit msg: Connection refused
sacct: error: Problem talking to the database: Connection refused




sacct: error: slurm_persist_conn_open_without_init: failed to open persistent connection to host:localhost:6819: Connection refused
sacct: error: Sending PersistInit msg: Connection refused
sacct: error: Problem talking to the database: Connection refused




sacct: error: slurm_persist_conn_open_without_init: failed to open persistent connection to host:localhost:6819: Connection refused
sacct: error: Sending PersistInit msg: Connection refused
sacct: error: Problem talking to the database: Connection refused




sacct: error: slurm_persist_conn_open_without_init: failed to open persistent connection to host:localhost:6819: Connection refused
sacct: error: Sending PersistInit msg: Connection refused
sacct: error: Problem talking to the database: Connection refused




sacct: error: slurm_persist_conn_open_without_init: failed to open persistent connection to host:localhost:6819: Connection refused
sacct: error: Sending PersistInit msg: Connection refused
sacct: error: Problem talking to the database: Connection refused




sacct: error: slurm_persist_conn_open_without_init: failed to open persistent connection to host:localhost:6819: Connection refused
sacct: error: Sending PersistInit msg: Connection refused
sacct: error: Problem talking to the database: Connection refused




sacct: error: slurm_persist_conn_open_without_init: failed to open persistent connection to host:localhost:6819: Connection refused
sacct: error: Sending PersistInit msg: Connection refused
sacct: error: Problem talking to the database: Connection refused




sacct: error: slurm_persist_conn_open_without_init: failed to open persistent connection to host:localhost:6819: Connection refused
sacct: error: Sending PersistInit msg: Connection refused
sacct: error: Problem talking to the database: Connection refused


72
48


In [25]:
# save to csv
melted_results.to_csv("../results/results_all_cities_pop_log_100.csv", index=False)

In [1]:
import pandas as pd
melted_results = pd.read_csv("../results/results_european_cities_pop_log.csv")

In [24]:
import plotly.graph_objects as go

# Filter out the data
res_filtered = res[res["cv_name"] == "custom"]
res_filtered = res_filtered[res_filtered["encoding"].isin(["openai__", "skrub__minhash_30"])]

# Rename the labels
res_filtered['encoding'] = res_filtered['encoding'].replace('skrub__minhash_30', 'MinHashEncoder')
res_filtered['encoding'] = res_filtered['encoding'].replace('openai__', 'OpenAI')

# Create boxplot
fig = go.Figure()

# Add box traces
for encoding in res_filtered['encoding'].unique():
    fig.add_trace(go.Box(
        x=res_filtered[res_filtered['encoding'] == encoding]['r2'],
        name=encoding,
        boxpoints='outliers', # show only outliers
        jitter=0.5, # spread out data points for visibility
        whiskerwidth=0.2, # reduce whisker width
        #fillcolor='cls', # fill box with classification color
        marker=dict(
            size=2, # reduce marker size
            color="darkblue"
        ),
        line=dict(width=1), # reduce line width,
    ))

# Add vertical line at x=0
fig.add_shape(type="line",
    xref="x", yref="paper",
    x0=0, y0=0, x1=0, y1=1,
    line=dict(
        color="LightSeaGreen",
        width=1.5,
        dash="dash",
    )
)

# Set plot title and axis labels
fig.update_layout(
    title="R2 Score on European Cities' Population (log)",
    yaxis_title='',
    xaxis_title='R2 Score',
    autosize=False,
    width=800,
    height=600,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    paper_bgcolor="white",
    font=dict(
        family="Arial, monospace",
        size=23,
        color="black"
    ),
    showlegend=False,
    template="simple_white" # use the simple_white template

)

fig.show()

# save as pdf
fig.write_image("../figures/ood.pdf")