The data labeled "greeks" in the competition is available only for the training set, not for the test set. In this notebook, so far without significant success, I am trying to create an ensemble classification model that can predict these missing labels.

In [1]:
!pip install -q /kaggle/input/tabpfn-019-whl/tabpfn-0.1.9-py3-none-any.whl
!mkdir /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/tabpfn-019-whl/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

[0m

In [2]:
# importing necessary libraries
import pandas as pd, numpy as np, catboost as ctb, pandas as pd, time, warnings

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import make_scorer, log_loss, balanced_accuracy_score, make_scorer, f1_score
from sklearn import svm, neighbors, naive_bayes, ensemble, neural_network, preprocessing, impute, gaussian_process, linear_model, compose, cluster
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline

from tabpfn import TabPFNClassifier
from joblib import dump

# Loading Data

In [3]:
# Read the training features and the "greeks" table, then join them on Id.
train = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/train.csv")
greeks = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/greeks.csv")
greeks = greeks.drop(columns=['Epsilon'])  # Epsilon is not used as a target
train = train.merge(greeks, on='Id', how='inner')

# Targets: the four greek columns, shown raw and then one-hot encoded.
classes = train[['Alpha', 'Beta', 'Gamma', 'Delta']]
display(classes)
classes = pd.get_dummies(classes, columns=['Alpha', 'Beta', 'Gamma', 'Delta'])
class_columns = classes.columns  # kept to label the prediction columns later
display(classes)
classes = classes.to_numpy()
print(classes.shape)

# Features: everything except the identifier, the binary Class and the greeks.
train = train.drop(columns=['Class', 'Id', 'Alpha', 'Beta', 'Gamma', 'Delta'])
features = train.columns

# Numeric columns: mean imputation followed by min-max scaling.
numeric_transformer = Pipeline([
    ('imputer', impute.SimpleImputer(missing_values=pd.NA, strategy='mean')),
    ('scaler', preprocessing.MinMaxScaler()),
])

# Categorical column: most-frequent imputation followed by one-hot encoding.
categoric_transformer = Pipeline([
    ('imputer', impute.SimpleImputer(missing_values=pd.NA, strategy='most_frequent')),
    ('onehot', preprocessing.OneHotEncoder(handle_unknown='ignore')),
])

# 'EJ' goes through the categorical branch; every other feature is numeric.
preprocessor = compose.ColumnTransformer([
    ("num", numeric_transformer, features.drop(['EJ'])),
    ("cat", categoric_transformer, ['EJ']),
])

Unnamed: 0,Alpha,Beta,Gamma,Delta
0,B,C,G,D
1,A,C,M,B
2,A,C,M,B
3,A,C,M,B
4,D,B,F,B
...,...,...,...,...
612,A,B,M,B
613,A,B,M,B
614,A,C,M,B
615,A,C,M,B


Unnamed: 0,Alpha_A,Alpha_B,Alpha_D,Alpha_G,Beta_A,Beta_B,Beta_C,Gamma_A,Gamma_B,Gamma_E,Gamma_F,Gamma_G,Gamma_H,Gamma_M,Gamma_N,Delta_A,Delta_B,Delta_C,Delta_D
0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1
1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
2,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
3,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
4,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0
613,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0
614,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
615,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0


(617, 19)


In [4]:
# Per-model weights; referenced only by the soft-voting ensemble variant
# that is currently commented out further down in the notebook.
svc_weight, lr_weight, gp_weight = 1 / 10, 1 / 26, 1 / 28
knn_weight, tab_weight, ctb_weight = 1 / 25, 1 / 30, 1 / 18

In [5]:
# Candidate base models. Each pipeline applies the shared `preprocessor`
# first and then a single classifier.

# Bagged Gaussian-process classifier.
gp_pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ('gp', ensemble.BaggingClassifier(
        estimator=gaussian_process.GaussianProcessClassifier(max_iter_predict=1000),
        n_estimators=100,  # should be 800
    )),
])

# Polynomial-kernel SVM with balanced class weights and probability output.
svc_pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ('svc', svm.SVC(
        kernel='poly',
        C=1,
        gamma=1,
        class_weight='balanced',
        probability=True,
        max_iter=10 ** 9,
    )),
])

# Bagged logistic regression (saga solver, balanced class weights).
lr_pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ('lr', ensemble.BaggingClassifier(
        estimator=linear_model.LogisticRegression(
            solver='saga',
            class_weight='balanced',
            max_iter=1_000_000,
        ),
        n_estimators=1000,
    )),
])

# Bagged distance-weighted k-nearest-neighbours; each bag draws 70% of samples.
kn_pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ('kn', ensemble.BaggingClassifier(
        estimator=neighbors.KNeighborsClassifier(
            n_neighbors=5,
            weights='distance',
            algorithm='kd_tree',
            metric='manhattan',
        ),
        n_estimators=1000,
        max_samples=0.7,
    )),
])

# Gradient boosting via CatBoost.
ctb_pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ('ctb', ctb.CatBoostClassifier(n_estimators=1000, learning_rate=0.1, verbose=False)),
])

# TabPFN classifier.
tab_pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ('tab', TabPFNClassifier()),
])

Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


In [6]:
#lr_pipe = Pipeline([("preprocessor", preprocessor),
#                     ('lr', MultiOutputClassifier(ensemble.BaggingClassifier(estimator=linear_model.LogisticRegression(max_iter = 1000_000, solver = 'saga', class_weight = 'balanced'),n_estimators=1000), n_jobs = -1))]) 
#lr_pipe.fit(train, classes)

In [7]:
#with warnings.catch_warnings():
#    warnings.simplefilter("ignore")
#    ensemble_model = MultiOutputClassifier(ensemble.VotingClassifier(estimators=[('svc', svc_pipe), ('lr', lr_pipe), ('gp', gp_pipe), ('knn', kn_pipe),('tab_pipe', tab_pipe), ('ctb_pipe', ctb_pipe)],voting='soft', weights =[svc_weight, lr_weight, gp_weight, knn_weight, tab_weight, ctb_weight]), n_jobs = -1) #0.207
#    #ensemble_model = MultiOutputClassifier(ensemble.VotingClassifier(estimators=[('svc', svc_pipe),('tab_pipe', tab_pipe), ('ctb_pipe', ctb_pipe)],voting='soft', weights =[svc_weight, tab_weight, ctb_weight]), n_jobs = -1) #0.207
#    ensemble_model.fit(train, classes)
# Multi-output variants: one binary classifier per one-hot target column.
svc_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ('svc', MultiOutputClassifier(
        svm.SVC(
            kernel='poly',
            C=1,
            gamma=1,
            class_weight='balanced',
            probability=True,
            max_iter=10 ** 9,
        )
    )),
])
tab_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ('tab', MultiOutputClassifier(TabPFNClassifier())),
])

# The TabPFN variant is the one that gets fitted and saved.
ensemble_model = tab_model
ensemble_model.fit(train, classes)

# predict_proba output is stacked into one array; index 1 on the last axis
# is taken as the positive-class probability, and the transpose puts
# samples on rows and target columns on columns.
predictions = np.asarray(ensemble_model.predict_proba(train))
predictions = pd.DataFrame(predictions[..., 1].T, columns=class_columns)

# One-hot column names, grouped by the greek they encode.
alphas = [
    'Alpha_A', 'Alpha_B', 'Alpha_D', 'Alpha_G',
]
betas = [
    'Beta_A', 'Beta_B', 'Beta_C',
]
gammas = [
    'Gamma_A', 'Gamma_B', 'Gamma_E', 'Gamma_F',
    'Gamma_G', 'Gamma_H', 'Gamma_M', 'Gamma_N',
]
deltas = [
    'Delta_A', 'Delta_B', 'Delta_C', 'Delta_D',
]

def singularize(dataframe, columns):
    """Turn per-column scores into a one-hot row-wise argmax.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding one score column per entry in ``columns``.
    columns : list of str
        Names of the columns that form one mutually exclusive group.

    Returns
    -------
    pandas.DataFrame
        Frame with the same ``columns`` and a fresh default index, dtype
        ``int32``; each row has a single 1 in the column with the highest
        score (the first such column on ties) and 0 elsewhere.
    """
    # Read from the argument: the previous version ignored `dataframe` and
    # silently used the global `predictions` frame instead.
    scores = dataframe[columns].to_numpy()
    winners = np.argmax(scores, axis=1)
    one_hot = np.zeros(scores.shape, dtype='int32')
    one_hot[np.arange(one_hot.shape[0]), winners] = 1
    return pd.DataFrame(one_hot, columns=columns)

# Replace each list of column names with its one-hot argmax frame.
alphas, betas, gammas, deltas = (
    singularize(predictions, group) for group in (alphas, betas, gammas, deltas)
)

# Put the four groups side by side again and count the entries that differ
# from the true one-hot labels (sum of squared differences).
new_classes = pd.concat([alphas, betas, gammas, deltas], axis=1)
print(np.square(new_classes.to_numpy() - classes).sum())

# Collapse the one-hot columns back to one letter per greek: for each prefix,
# take the name of the winning column and keep the part after the underscore.
categories = new_classes.columns.str.split('_').str[0].unique()
collapsed_df = pd.DataFrame({
    category: (
        new_classes[[col for col in new_classes.columns if category in col]]
        .idxmax(axis=1)
        .str.split('_')
        .str[1]
    )
    for category in categories
})
display(collapsed_df)

Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Tr

Unnamed: 0,Alpha,Beta,Gamma,Delta
0,B,C,G,D
1,A,C,M,B
2,A,C,M,B
3,A,C,M,B
4,D,B,F,B
...,...,...,...,...
612,A,B,M,B
613,A,B,M,B
614,A,C,M,B
615,A,C,M,B


In [8]:
# Persist the fitted pipeline to the working directory with joblib.
dump(value=ensemble_model, filename='greeks_augment_model.joblib')

['greeks_augment_model.joblib']