In [None]:
def encode_high_cardinality_features(X, encoder_name, dataset_name=None, use_cache=True, override_cache=False, cardinality_threshold=30, fail_if_not_cached=False):

    # encode the high cardinality columns
    res = []
    lengths = []
    for col in high_cardinality_columns:
        new_enc = encode(X, col, encoder_name, dataset_name=dataset_name, use_cache=use_cache, override_cache=override_cache, fail_if_not_cached=fail_if_not_cached)
        res.append(new_enc)
        lengths.append(new_enc.shape[1])
    # create a dataframe with name original_col_name__index
    df = pd.DataFrame(np.concatenate(res, axis=1))

    #df = pd.DataFrame(np.concatenate(res, axis=1))
    # for i in range(len(res)):
    #     for j in range(lengths[i]):
    #         df.rename(columns={i*lengths[i] + j: high_cardinality_columns[i] + "__" + str(j)}, inplace=True)
    new_column_names = []
    for i in range(len(res)):
        for j in range(lengths[i]):
            new_column_names.append(high_cardinality_columns[i] + "__" + str(j))
    df.columns = new_column_names
    return df, X.drop(high_cardinality_columns, axis=1)

In [17]:
from src.data_loading import load_data
from skrub import TableVectorizer, MinHashEncoder
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.base import clone
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score
def run_with_hv(dataset, analyzer, ngram_range, dim_reduction_name, dim_reduction, cv, model_name, model, cardinality_threshold=30):
    X, y = load_data(dataset, max_rows=10000)
    tb = TableVectorizer(cardinality_threshold=cardinality_threshold,
                        high_card_cat_transformer = "passthrough",
                        low_card_cat_transformer = "passthrough",
                        numerical_transformer = "passthrough",
                        datetime_transformer = "passthrough",
    ) #just to get the high cardinality columns
    tb.fit(X)
    # get high cardinality columns
    high_cardinality_columns = []
    for name, trans, cols in tb.transformers_:
        if "high" in name:
            high_cardinality_columns.extend(cols)
            break
    print("High cardinality columns", high_cardinality_columns)
    all_enc_cols = []
    for col in high_cardinality_columns:
        vectorizer = HashingVectorizer(analyzer=analyzer, ngram_range=ngram_range)
        res = vectorizer.fit_transform(X[col])
        all_enc_cols.append(res)

    X_rest = X.drop(high_cardinality_columns, axis=1)

    rest_trans = TableVectorizer(high_card_cat_transformer = MinHashEncoder(n_components=10, analyzer='char'),
                            cardinality_threshold=30)
    # cv by hand
    # split X
    accuracies = []
    roc_aucs = []
    for i, (train, test) in enumerate(cv.split(X, y)):
        print(f"Fold {i}")
        X_rest_train = rest_trans.fit_transform(X_rest.iloc[train])
        X_rest_test = rest_trans.transform(X_rest.iloc[test])
        X_high_train, X_high_test = [], []
        for j, col in enumerate(high_cardinality_columns):
            dim_rec = clone(dim_reduction)
            X_high_train.append(dim_rec.fit_transform(all_enc_cols[j][train]))
            X_high_test.append(dim_rec.transform(all_enc_cols[j][test]))
        X_high_train = np.concatenate(X_high_train, axis=1)
        X_high_test = np.concatenate(X_high_test, axis=1)
        X_train = np.concatenate([X_rest_train, X_high_train], axis=1)
        X_test = np.concatenate([X_rest_test, X_high_test], axis=1)
        print("X_train shape", X_train.shape)
        print("X_test shape", X_test.shape)
        model.fit(X_train, y[train])
        y_pred = model.predict(X_test)
        # accuracy and roc_auc
        accuracy = accuracy_score(y[test], y_pred)
        roc_auc = roc_auc_score(y[test], y_pred)
        accuracies.append(accuracy)
        roc_aucs.append(roc_auc)
    
    return {
        "dataset": dataset,
        "analyzer": analyzer,
        "ngram_range": ngram_range,
        "dim_reduction_name": dim_reduction_name,
        "dim_reduction": dim_reduction,
        "cv": cv,
        "model_name": model_name,
        "model": model,
        "accuracy": accuracies,
        "roc_auc": roc_aucs
    }


In [18]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import StratifiedKFold
run_with_hv("goodreads", "char", (1, 2), "PCA_30", TruncatedSVD(n_components=30), StratifiedKFold(n_splits=5), "GradientBoostingClassifier", GradientBoostingClassifier())

Removed 2 columns with missing values on 12 columns
Removed 1652 rows with missing values on 3967 rows
Removed 1652 rows with missing values on 2315 rows
Removed 2 columns with missing values on 9 columns
New shape: (2315, 10)
Original task: regression for goodreads
Converting to binary classification
Classes (array([0, 1]), array([1167, 1148]))
X shape: (2315, 10), y shape: (2315,)
High cardinality columns ['Title', 'Description', 'FirstAuthor', 'NumberofReviews', 'Publisher', 'PublishDate']
Fold 0




X_train shape (1852, 204)
X_test shape (463, 204)
Fold 1




X_train shape (1852, 207)
X_test shape (463, 207)
Fold 2




X_train shape (1852, 207)
X_test shape (463, 207)
Fold 3




X_train shape (1852, 204)
X_test shape (463, 204)
Fold 4




X_train shape (1852, 207)
X_test shape (463, 207)


{'dataset': 'goodreads',
 'analyzer': 'char',
 'ngram_range': (1, 2),
 'dim_reduction_name': 'PCA_30',
 'dim_reduction': TruncatedSVD(n_components=30),
 'cv': StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
 'model_name': 'GradientBoostingClassifier',
 'model': GradientBoostingClassifier(),
 'accuracy': [0.6479481641468683,
  0.6004319654427646,
  0.591792656587473,
  0.6220302375809935,
  0.6414686825053996],
 'roc_auc': [0.6485369312880229,
  0.6012018064419811,
  0.5921813771225976,
  0.6220563537973502,
  0.6417615226721404]}

In [1]:
from src.data_loading import load_data
from src.encodings import encode, encode_high_cardinality_features
encoding = 'hashing_vectorizer__char_(2,4)'
dataset = "goodreads"
X, y = load_data(dataset, max_rows=10000)
X_enc, X_rest = encode_high_cardinality_features(X, encoding, dataset_name=dataset, override_cache=True, cardinality_threshold=30)



Removed 2 columns with missing values on 12 columns
Removed 1652 rows with missing values on 3967 rows
Removed 1652 rows with missing values on 2315 rows
Removed 2 columns with missing values on 9 columns
New shape: (2315, 10)
Original task: regression for goodreads
Converting to binary classification
Classes (array([0, 1]), array([1167, 1148]))
X shape: (2315, 10), y shape: (2315,)
numeric ['PageCount', 'NumberofRatings']
low_card_cat ['Format', 'Language']
high_card_cat ['Title', 'Description', 'FirstAuthor', 'NumberofReviews', 'Publisher', 'PublishDate']
High cardinality columns ['Title', 'Description', 'FirstAuthor', 'NumberofReviews', 'Publisher', 'PublishDate']
working dir /scratch/lgrinszt/lm_tab/scripts
Encoder type hashing_vectorizer
Encoder params char_(2,4)
Hashing vectorizer shape (2315, 1048576)
Saving to cache
working dir /scratch/lgrinszt/lm_tab/scripts
Encoder type hashing_vectorizer
Encoder params char_(2,4)
Hashing vectorizer shape (2315, 1048576)
Saving to cache
work

ValueError: Length mismatch: Expected axis has 1 elements, new values have 6291456 elements

In [None]:
model_names = [
    "EleutherAI/pythia-70m",
    "EleutherAI/pythia-160m",
    "EleutherAI/pythia-410m",
    "EleutherAI/pythia-1b",
    "EleutherAI/pythia-1.4b",
    "EleutherAI/pythia-2.8b",
    "EleutherAI/pythia-6.9b",
]
for model_name in model_names:
    encodings.append(f"lm__{model_name}")


datasets = ['bikewale', 'clear_corpus', 'company_employees',
       'employee-remuneration-and-expenses-earning-over-75000',
       'employee_salary', 'goodreads', 'journal_jcr_cls', 'ramen_ratings',
       'spotify', 'us_accidents_counts', 'us_accidents_severity',
       'us_presidential', 'wine_review', 'zomato']



executor = submitit.AutoExecutor(folder="logs")
executor.update_parameters(timeout_min=300, slurm_partition='parietal,normal,gpu',
                           exclude="margpu001,margpu002,margpu003,margpu004",
                           gpus_per_node=1)

def encoding_dataset(dataset, encoding):
    X, y = load_data(dataset, max_rows=10000)
    #X_enc = encode(X_text, encoding, dataset_name=dataset, override_cache=False)
    X_enc, X_rest = encode_high_cardinality_features(X, encoding, dataset_name=dataset, override_cache=False, cardinality_threshold=30)
    #return X_enc, X_rest, y

In [None]:
!nvidia-smi

In [1]:
from src.data_loading import load_data
from src.encodings import encode, encode_high_cardinality_features
encoding = "hf__EleutherAI/pythia-70m"
dataset = "bikewale"
X, y = load_data(dataset, max_rows=10000)
X_enc, X_rest = encode_high_cardinality_features(X, encoding, dataset_name=dataset, override_cache=True, cardinality_threshold=30)

Removed 0 columns with missing values on 8 columns
Removed 2 rows with missing values on 9003 rows
Removed 2 rows with missing values on 9001 rows
Removed 0 columns with missing values on 7 columns
New shape: (9001, 8)
Original task: regression for bikewale
Converting to binary classification
Classes (array([0, 1]), array([4645, 4356]))
X shape: (9001, 8), y shape: (9001,)
numeric ['km_driven', 'model_year']
low_card_cat ['city_posted', 'fuel_type', 'owner_type']
high_card_cat ['bike_name', 'color', 'url']
High cardinality columns ['bike_name', 'color', 'url']
working dir /scratch/lgrinszt/lm_tab/scripts
Encoder type hf
Encoder params EleutherAI/pythia-70m
Encoding with HF


Using pad_token, but it is not set yet.


Setting padding token
Saving to cache
working dir /scratch/lgrinszt/lm_tab/scripts
Encoder type hf
Encoder params EleutherAI/pythia-70m
Encoding with HF


Using pad_token, but it is not set yet.


Setting padding token
Saving to cache
working dir /scratch/lgrinszt/lm_tab/scripts
Encoder type hf
Encoder params EleutherAI/pythia-70m
Encoding with HF


Using pad_token, but it is not set yet.


Setting padding token
Saving to cache


In [2]:
from transformers import AutoTokenizer, AutoModel
import torch


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask



#Sentences we want sentence embeddings for
sentences = ['This framework generates embeddings for each input sentence',
             'Sentences are passed as a list of string.',
             'The quick brown fox jumps over the lazy dog.']

#Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
model = AutoModel.from_pretrained("EleutherAI/pythia-70m")

# Set padding token if not set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

#Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')

# move to gpu if available
if torch.cuda.is_available():
    encoded_input.to('cuda')
    model.to('cuda')

#Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

#Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

Using pad_token, but it is not set yet.


In [3]:
batch_size = 16
from torch.utils.data import DataLoader
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
model = AutoModel.from_pretrained("EleutherAI/pythia-70m")

# Set padding token if not set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Make sure model and tokenizer are on the same device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# Create a DataLoader to handle batching of the sentences
sentences_loader = DataLoader(sentences, batch_size=batch_size, shuffle=False)

# List to store all embeddings
all_embeddings = []

for sentence_batch in sentences_loader:
    # Tokenize sentences
    encoded_input = tokenizer(sentence_batch, padding=True, truncation=True, max_length=128, return_tensors='pt')

    # Move tensors to the same device as the model
    encoded_input = encoded_input.to(device)

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling. In this case, mean pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Move embeddings to CPU, convert to numpy and store
    all_embeddings.extend(sentence_embeddings.cpu().numpy())

Using pad_token, but it is not set yet.


In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


In [None]:
model_output

In [None]:
model_output[1].shape

In [None]:
# compute sentence similarity
sentences = ['The cat sits outside',
             "The cat sits inside",
]

sentence_embeddings = model.encode(sentences)

for sent in sentences:
    print("\n" + sent)
    for i, sent_emb in enumerate(sentence_embeddings):
        print(f"{i}: {sent_emb}")

In [None]:
from autofj.datasets import load_data
#from skrub import fuzzy_join
from src.fuzzy_join_custom import fuzzy_join
from src.encodings import encode, get_batch_embeddings
from autofj import AutoFJ
def autofj_merge(left, right, target=0.9):
    """Merging using AutomaticFuzzyJoin"""
    autofj = AutoFJ(precision_target=target, verbose=True)
    autofj_joins = autofj.join(left, right, id_column="id")
    return autofj_joins

In [None]:
# get all directory names at https://github.com/chu-data-lab/AutomaticFuzzyJoin/tree/master/src/autofj/benchmark
from urllib.request import urlopen
import json
url = "https://api.github.com/repos/chu-data-lab/AutomaticFuzzyJoin/contents/src/autofj/benchmark"
response = urlopen(url)
data = json.loads(response.read())
print("Available datasets:")
dataset_list = []
for d in data:
    dataset_list.append(d["name"])

In [None]:
dataset_list

In [None]:
import submitit
executor = submitit.AutoExecutor(folder="logs")
executor.update_parameters(timeout_min=100, slurm_partition='parietal,normal', slurm_array_parallelism=100, cpus_per_task=2,
                            exclude="margpu009,marg00[1-9],marg0[11-12],marg0[14-15],marg0[16-20],marg0[25-32]")
# change name of job
executor.update_parameters(name="pipeline")

encodings = ["openai__", "lm__BAAI/bge-large-en-v1.5", "lm__llmrails/ember-v1"]

def encode_join(dataset, column, encoder_name):
    """Encode a dataset and save it to disk"""
    left_table, right_table, gt = load_data(dataset)
    # encode both tables with fasttext (column title)
    main_str_enc = encode(left_table, column, encoder_name=encoder_name, dataset_name="join_left_" + dataset)
    aux_str_enc = encode(right_table, column, encoder_name=encoder_name, dataset_name="join_right_" + dataset)
    return main_str_enc, aux_str_enc

with executor.batch():
    for encoding in encodings:
        for dataset in dataset_list:
            for column in ["title"]:
                executor.submit(encode_join, dataset, column, encoding)


In [None]:
import pandas as pd
dataset="PoliticalParty"
encoder_name="lm__BAAI/bge-large-en-v1.5"
match_score = 0.3
left_table, right_table, gt = load_data(dataset)
# encode both tables with fasttext (column title)
main_str_enc = encode(left_table, column, encoder_name=encoder_name, dataset_name="join_left_" + dataset)
aux_str_enc = encode(right_table, column, encoder_name=encoder_name, dataset_name="join_right_" + dataset)

joined_fj = fuzzy_join(left_table, right_table, left_on="title",
            match_score=match_score,
            right_on="title",
            return_score=True,
            suffixes=("_l", "_r"),
            main_str_enc_arg=main_str_enc,
            aux_str_enc_arg=aux_str_enc,
)

df_all = pd.merge(joined_fj, gt, on=["id_l"], suffixes=("", "_gt"))
df_all["correct"] = df_all["id_r"] == df_all["id_r_gt"]
# drop na in correct
df_all = df_all.dropna(subset=["correct"])

recall = df_all["correct"].sum() / len(gt)
precision = df_all["correct"].sum() / len(df_all)
f1 = 2 * (precision * recall) / (precision + recall)



In [None]:
recall, precision, f1

In [None]:
import pandas as pd
df = pd.DataFrame({"text": ["Mary, Queen of the World, Cathedral", "Mary, Queen of the World Cathedral", "Birmingham Orthodox Cathedral"]})
encode(df, "text", "openai__encoder")

In [None]:
import numpy as np
embed_1, embed_2, embed_3 = get_batch_embeddings(["Mary, Queen of the World, Cathedral", "Mary, Queen of the World Cathedral", "Birmingham Orthodox Cathedral"])#, model="text-embedding-ada-002")
# compute distances
dist_1_2 = np.linalg.norm(embed_1 - embed_2)
dist_1_3 = np.linalg.norm(embed_1 - embed_3)
dist_2_3 = np.linalg.norm(embed_2 - embed_3)
print("Distances between pairs of strings:")
print(f"1-2: {dist_1_2}")
print(f"1-3: {dist_1_3}")
print(f"2-3: {dist_2_3}")

In [None]:
dataset = "HistoricBuilding"
left_table, right_table, gt = load_data(dataset)

In [None]:
right_table

In [None]:
# encode both tables with fasttext (column title)
main_str_enc = encode(left_table, "title", encoder_name="openai__", dataset_name="join_left_" + dataset)
aux_str_enc = encode(right_table, "title", encoder_name="openai__", dataset_name="join_right_" + dataset)

In [None]:
left_table

In [None]:
right_table

In [None]:
joined_fj = fuzzy_join(left_table, right_table, left_on="title",
            match_score=0.7,
            right_on="title",
            return_score=True,
            suffixes=("_l", "_r"),
            main_str_enc_arg=main_str_enc,
            aux_str_enc_arg=aux_str_enc,
)

joined_fj_autofj = autofj_merge(
            left_table,
            right_table,
            target=0.5,
        )

In [None]:
joined_fj_autofj

In [None]:
df_all = pd.merge(joined_fj_autofj, gt, on=["id_l"], suffixes=("", "_gt"))
df_all["correct"] = df_all["id_r"] == df_all["id_r_gt"]
# replace nans by False in correct column
#df_all["correct"] = df_all["correct"].fillna(False)
# drop nans in correct column
#df_all = df_all.dropna(subset=["correct"])

In [None]:
df_all

In [None]:
df_all["correct"].sum()

In [None]:
df_all["correct"].mean()

In [None]:
gt

In [None]:
joined_fj[joined_fj["title_r"] == "Westminster Central Hall"]

In [None]:
joined_fj_autofj

In [None]:
from src.utils import evaluate

pr, re, f1 = evaluate(
            list(zip(joined_fj["title_l"], joined_fj["title_r"])),
            list(zip(gt["title_l"], gt["title_r"])),
        )

In [None]:
print(f"Precision: {pr:.3f}, Recall: {re:.3f}, F1: {f1:.3f}")

In [None]:
print(f"Precision: {pr:.3f}, Recall: {re:.3f}, F1: {f1:.3f}")

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score, cross_val_predict
from skrub import MinHashEncoder, TableVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import submitit
from src.data_loading import load_data
from src.utils import FixedSizeSplit
from itertools import product
import time
import os

def run_catboost(X, y, cv, features, **kwargs):
    """
    X_text: np array of shape (n_samples, 1), the text feature
    X_rest: np array of shape (n_samples, n_features), additional tabular data
    y: np array of shape (n_samples,), the classifcation target
    dim_reduction_name: str, the name of the dim reduction method
    dim_reduction: sklearn transformer, the dim reduction method
    model_name: str, the name of the model
    model: sklearn model, the model
    encoding: str, the name of the encoding which was used to create X_enc
    cv: sklearn cross validator, the cross validator to use
    """
    tb = TableVectorizer(cardinality_threshold=len(X) * 0.8, #TODO: don't hardcode
                        high_card_cat_transformer = "passthrough",
                        low_card_cat_transformer = "passthrough",
                        numerical_transformer = "passthrough",
                        datetime_transformer = "passthrough",
    ) #just to get the high cardinality columns
    tb.fit(X)
    # get high cardinality columns
    high_cardinality_columns = []
    low_cardinality_columns = []
    datetime_columns = []
    for name, trans, cols in tb.transformers_:
        print(name, cols)
        if "high" in name:
            high_cardinality_columns.extend(cols)
        elif "low" in name:
            low_cardinality_columns.extend(cols)
        elif "datetime" in name:
            datetime_columns.extend(cols)
    print("Low cardinality columns", low_cardinality_columns)
    print("High cardinality columns", high_cardinality_columns)
    print("Datetime columns", datetime_columns)

    # drop datetime columns
    X = X.drop(columns=datetime_columns)
    

    model = CatBoostClassifier()

    if features == "text_only":
        X = X.drop(columns=[col for col in X.columns if (col not in high_cardinality_columns)])
    else:
        assert features == "all"
    
    # do cv by hand
    accuracies = []
    roc_aucs = []
    #TODO, maybe I can specify in the model and use sklearn cross_val score
    for train_index, test_index in cv.split(X, y):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train, cat_features=low_cardinality_columns, text_features=high_cardinality_columns)

        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1]

        accuracy = accuracy_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_proba)

        accuracies.append(accuracy)
        roc_aucs.append(roc_auc)

    return {
        'encoding': "catboost",
        'dim_reduction': "none",
        'model': "catboost",
        'accuracies': accuracies,
        'roc_auc': roc_aucs,
        'n_train': cv.n_train,
        'n_test': cv.n_test,
        **kwargs
    }

def pipeline(config):#dataset, encoding, n_test, dim_reduction_name, model_name, n_train, features):
    print(config)
    dataset, n_test, n_train, features = config

    X, y = load_data(dataset, max_rows=10000)
    X = X.reset_index()
    cv = FixedSizeSplit(n_splits=7, n_train=n_train, n_test=n_test, random_state=42)
    return run_catboost(X, y, cv, dataset=dataset, features=features)


In [None]:
X = X.reset_index()

In [None]:
X

In [None]:
from catboost.datasets import rotten_tomatoes

In [None]:
df = rotten_tomatoes()[0]
# drop missing
df = df.dropna(axis=0).reset_index()
y = df["rating"]
X = df.drop(columns=["rating", "rating_10"])

In [None]:
dataset = "zomato"
n_train = 2000
n_test = 500
features = "all"
X, y = load_data(dataset, max_rows=10000)
X = X.reset_index()
cv = FixedSizeSplit(n_splits=7, n_train=n_train, n_test=n_test, random_state=42)
run_catboost(X, y, cv, dataset=dataset, features=features)

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from src.encodings import encode_high_cardinality_features
from src.data_loading import load_data

dataset = "goodreads"
X, y = load_data(dataset, max_rows=10_000)
encoding = "openai__"
X_enc, X_rest = encode_high_cardinality_features(X, encoding, dataset_name=dataset, override_cache=False, cardinality_threshold=30, fail_if_not_cached=True)
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier, GradientBoostingRegressor, HistGradientBoostingRegressor
from src.utils import FixedSizeSplit
cv = FixedSizeSplit(n_train=1000, n_test=500, n_splits=3)
#run_on_encoded_data_with_cst(X_enc, X_rest, y, "passthrough", "passthrough", "HistGradientBoostingClassifier", HistGradientBoostingClassifier(), encoding, cv=cv, regression=False, no_interaction_between_enc_and_rest=False)

In [None]:
from src.utils import run_on_encoded_data_ensemble
from sklearn.linear_model import LogisticRegression

run_on_encoded_data_ensemble(X_enc, X_rest, y, "passthrough", "passthrough", enc_model_name="LogisticRegression", enc_model=LogisticRegression(),
                             rest_model_name="GradientBoostingClassifier", rest_model=GradientBoostingClassifier(), encoding=encoding, cv=cv,
                             aggregation="stacking",
                                regression=False, no_interaction_between_enc_and_rest=False, n_jobs=1, verbose=1)

In [None]:
from sklearn.decomposition import PCA
from src.utils import run_on_encoded_data
run_on_encoded_data(X_enc, X_rest, y, "PCA_30", PCA(n_components=30),
                             model_name="GradientBoostingClassifier", model=GradientBoostingClassifier(), encoding=encoding, cv=cv,
                                regression=False, no_interaction_between_enc_and_rest=False, n_jobs=1, verbose=1)

In [None]:
from src.data_loading import load_data
from skrub import TableVectorizer

# populate the table with features on each column
datasets = ['bikewale', 'clear_corpus', 'company_employees',
       'employee-remuneration-and-expenses-earning-over-75000',
       'employee_salary', 'goodreads', 'journal_jcr_cls', 'ramen_ratings',
       'spotify', 'us_accidents_counts', 'us_accidents_severity',
       'us_presidential', 'wine_review', 'zomato']
for dataset in datasets:
    print(dataset)
    print("--------------")
    X, y = load_data(dataset, max_rows=10_000)
    tb = TableVectorizer(cardinality_threshold=30,
                        high_card_cat_transformer = "passthrough",
                        low_card_cat_transformer = "passthrough",
                        numerical_transformer = "passthrough",
                        datetime_transformer = "passthrough",
    ) #just to get the high cardinality columns
    tb.fit(X)
    # get high cardinality columns
    high_cardinality_columns = []
    for name, trans, cols in tb.transformers_:
        print(name, cols)
        if "high" in name:
            high_cardinality_columns.extend(cols)
            break
    print("High cardinality columns", high_cardinality_columns)
    # find all columns for this dataset
    # for each column, get the features
    for col in high_cardinality_columns:
        # get the features for this column
        print(col)
        print(X[col])

In [None]:
print(2)