<a href="https://www.kaggle.com/code/johnycoder/icr-inference?scriptVersionId=160890073" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install -q /kaggle/input/tabpfn-019-whl/tabpfn-0.1.9-py3-none-any.whl
!mkdir /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/tabpfn-019-whl/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

In [None]:
import pandas as pd, numpy as np, catboost as ctb, pandas as pd, time, warnings
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import make_scorer, log_loss
from sklearn import svm, neighbors, naive_bayes, ensemble, neural_network, preprocessing, impute, gaussian_process, linear_model, compose, cluster
from sklearn.multioutput import MultiOutputClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier

from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

from tabpfn import TabPFNClassifier
from joblib import dump, load

In [None]:
# Load the training table and the "greeks" metadata table, then join them row-by-row on Id.
train = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/train.csv")
greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')
train = train.merge(greeks, on='Id', how='inner')

# Separate the target, then drop it (plus Id and Epsilon) from the feature frame.
classes = train['Class']
train = train.drop(columns=['Class', 'Id', 'Epsilon'])
display(train)
features = train.columns

# Columns routed through the categorical branch; every other column is treated as numeric.
categorical_columns = ['EJ', 'Alpha', 'Beta', 'Gamma', 'Delta']
numeric_columns = features.drop(categorical_columns)

# Numeric branch: mean-impute missing values, then rescale with MinMaxScaler.
numeric_transformer = Pipeline(steps=[
    ('imputer', impute.SimpleImputer(missing_values=pd.NA, strategy='mean')),
    ('scaler', preprocessing.MinMaxScaler()),
])

# Categorical branch: impute with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on categories unseen during fit.
categoric_transformer = Pipeline(steps=[
    ('imputer', impute.SimpleImputer(missing_values=pd.NA, strategy='most_frequent')),
    ('onehot', preprocessing.OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = compose.ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_columns),
    ("cat", categoric_transformer, categorical_columns),
])

In [None]:
def balanced_log_loss(y_true, y_pred):
    """Class-weighted binary log loss.

    Formula taken from
    https://www.kaggle.com/code/dan3dewey/icr-2023-balanced-log-loss

    Parameters
    ----------
    y_true : array of 0/1 labels.
    y_pred : array of predicted probabilities for label 1.

    Returns
    -------
    The per-class log-loss sums, each weighted by the inverse of its class
    count, normalised by the sum of the weights and by the total sample count.
    """
    eps = 1e-15

    neg_mask = 1 - y_true
    pos_mask = y_true
    n_neg = np.sum(neg_mask)
    n_pos = np.sum(pos_mask)
    inv_neg, inv_pos = 1 / n_neg, 1 / n_pos

    prob_pos = np.clip(y_pred, eps, 1 - eps)
    # The complement is clipped again; the original author reported numerical
    # misbehaviour without this second clip.
    prob_neg = np.clip(1 - prob_pos, eps, 1 - eps)

    loss_neg = -np.sum(neg_mask * np.log(prob_neg))
    loss_pos = -np.sum(pos_mask * np.log(prob_pos))

    weighted = 2*(inv_neg * loss_neg + inv_pos * loss_pos) / (inv_neg + inv_pos)
    return weighted/(n_neg+n_pos)
# Scorer wrapper so balanced_log_loss can be passed as `scoring=` to cross_val_score.
# NOTE(review): greater_is_better is left at its default, so reported scores are the raw
# (positive) loss where lower is better — confirm before using this scorer for model
# selection (e.g. GridSearchCV), which would otherwise prefer higher loss.
my_scorer = make_scorer(balanced_log_loss, needs_proba = True)

## Parallel

In [None]:
import concurrent.futures
import time 

def _smote_pipeline(step_name, estimator):
    """Build preprocessor -> SMOTE oversampler -> `estimator`, naming the last step `step_name`."""
    return Pipeline([("preprocessor", preprocessor),
                     ('oversampler', SMOTE()),
                     (step_name, estimator)])

# NOTE(review): no random_state is passed to SMOTE or the bagging ensembles below,
# so scores can differ from run to run.

# Bagged Gaussian-process classifiers.
gp_pipe = _smote_pipeline(
    'gp',
    ensemble.BaggingClassifier(
        estimator=gaussian_process.GaussianProcessClassifier(max_iter_predict=1000),
        n_estimators=800,
    ),
)

# RBF support-vector classifier; the only pipeline without SMOTE (it uses class_weight instead).
svc_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ('svc', svm.SVC(
        kernel='rbf',
        max_iter=10**9,
        C=6,
        gamma=0.7,
        probability=True,
        class_weight='balanced',
    )),
])

# Bagged elastic-net logistic regressions.
lr_pipe = _smote_pipeline(
    'lr',
    ensemble.BaggingClassifier(
        estimator=linear_model.LogisticRegression(
            l1_ratio=0.7,
            max_iter=1000_000,
            solver='saga',
            penalty='elasticnet',
        ),
        n_estimators=1000,
    ),
)

# Bagged distance-weighted k-nearest-neighbours, each fitted on 70% of the samples.
kn_pipe = _smote_pipeline(
    'kn',
    ensemble.BaggingClassifier(
        estimator=neighbors.KNeighborsClassifier(
            n_neighbors=20,
            metric='manhattan',
            weights='distance',
            algorithm='kd_tree',
        ),
        n_estimators=1000,
        max_samples=0.7,
    ),
)

# CatBoost gradient boosting.
ctb_pipe = _smote_pipeline(
    'ctb',
    ctb.CatBoostClassifier(n_estimators=1000, learning_rate=0.01, verbose=False),
)

# TabPFN with its default settings.
tab_pipe = _smote_pipeline('tab', TabPFNClassifier())

def evaluate(pipe, model, train=train, classes=classes, cv=3, scorer=my_scorer):
    """Cross-validate one pipeline, print a one-line summary and return the fold scores.

    Parameters
    ----------
    pipe : estimator passed to ``cross_val_score``.
    model : str label used in the printed messages.
    train, classes : feature frame and target; the defaults are the notebook-level
        objects as they exist when this ``def`` cell is executed.
    cv : number of folds (or any value ``cross_val_score`` accepts for ``cv``).
    scorer : scoring callable forwarded as ``scoring``.

    Returns
    -------
    The array of per-fold scores from ``cross_val_score``. Previously nothing was
    returned, so callers collecting results (e.g. via ``executor.map``) only got ``None``.
    """
    print(model+' model execution started')
    started = time.time()
    score = cross_val_score(pipe, train, classes, cv=cv, scoring=scorer)
    elapsed = time.time() - started
    print(model+' mean score: '+str(np.round(np.mean(score),2))
          +', score deviation: '+str(np.round(np.std(score),2))
          +', scoring time: '+str(np.round(elapsed, 2))+'s')
    return score

# Pipelines paired with their display labels. They are exposed as two parallel lists
# because the evaluation function receives them position-by-position.
labelled_pipes = [
    ('LR', lr_pipe),
    ('KNN', kn_pipe),
    ('CTB', ctb_pipe),
    ('TAB', tab_pipe),
    ('SVC', svc_pipe),
    ('GP', gp_pipe),
]
pipes = [pipe for _label, pipe in labelled_pipes]
names = [label for label, _pipe in labelled_pipes]

In [None]:
import concurrent.futures

# Cross-validate all pipelines concurrently on a default-sized thread pool.
# NOTE(review): the timing message below says "multiprocessing", but this uses threads.
t1 = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    # executor.map calls evaluate(pipes[i], names[i]) and yields results in input order;
    # the comprehension drains the iterator so all work finishes inside the `with` block.
    results = [outcome for outcome in executor.map(evaluate, pipes, names)]
t2 = time.time()
elapsed = np.round(t2-t1, 2)
print('Scoring time with multiprocessing: '+str(elapsed)+'s')

# Whatever evaluate returned for each pipeline, in the same order as `names`.
print(results)


In [None]:
# Soft-voting weights for the base pipelines, each the reciprocal of the denominator
# listed in the same position (a larger denominator gives that model less influence).
# NOTE(review): presumably hand-tuned from the cross-validation scores — confirm.
(svc_weight, lr_weight, gp_weight,
 knn_weight, tab_weight, ctb_weight) = (1 / denominator
                                        for denominator in (10, 26, 28, 25, 30, 18))

## Ensemble

In [None]:
# Weighted soft-voting ensemble over the six base pipelines.
# The original line carried the trailing note "#0.207" (presumably a recorded CV score — confirm).
weighted_members = [
    ('svc', svc_pipe, svc_weight),
    ('lr', lr_pipe, lr_weight),
    ('gp', gp_pipe, gp_weight),
    ('knn', kn_pipe, knn_weight),
    ('tab_pipe', tab_pipe, tab_weight),
    ('ctb_pipe', ctb_pipe, ctb_weight),
]
ensemble_model = ensemble.VotingClassifier(
    estimators=[(label, pipe) for label, pipe, _weight in weighted_members],
    voting='soft',
    weights=[weight for _label, _pipe, weight in weighted_members],
)

# 3-fold cross-validation of the whole ensemble with the balanced log-loss scorer.
ensemble_score = cross_val_score(ensemble_model, train, classes, cv=3, scoring=my_scorer)
print(np.mean(ensemble_score), np.std(ensemble_score))

In [None]:
# Fit the voting ensemble on the full training frame; the fitted model is used for the
# test-set predictions in the submission section.
ensemble_model.fit(train, classes)

In [None]:
#stacking_model = ensemble.StackingClassifier(estimators=[('svc', svc_pipe), ('lr', lr_pipe), ('gp', gp_pipe), ('knn', kn_pipe),('tab_pipe', tab_pipe), ('ctb_pipe', ctb_pipe)], final_estimator=linear_model.LogisticRegression())
#stacking_score = cross_val_score(stacking_model, train, classes, cv=3, scoring = my_scorer)
#print(np.mean(stacking_score), np.std(stacking_score))

In [None]:
#stacking_model.fit(train, classes)

## Submission

In [None]:
# Load the pre-trained model used to predict the greek one-hot columns for the test rows.
# NOTE(review): joblib.load unpickles arbitrary objects — only load artifacts from a trusted dataset.
greeks_ensemble_model = load('/kaggle/input/icr-data-augment/greeks_augment_model.joblib')
test_data = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
# The former `classes = train.Class` line was removed: 'Class' has already been dropped
# from `train`, so it raised AttributeError, and `classes` (set when the data was loaded)
# is not needed in this section.
test = test_data.drop(['Id'], axis=1)

# predict_proba output is indexed below as [output, sample, class]; keep the probability
# of class 1 for every output and lay it out as one row per sample, one column per output.
predictions = np.array(greeks_ensemble_model.predict_proba(test))
class_columns = ['Alpha_A','Alpha_B','Alpha_D','Alpha_G','Beta_A','Beta_B','Beta_C','Gamma_A','Gamma_B','Gamma_E','Gamma_F','Gamma_G','Gamma_H','Gamma_M','Gamma_N','Delta_A','Delta_B','Delta_C','Delta_D']
predictions = pd.DataFrame(predictions[:, :, 1].T, columns= class_columns)

# Column groups, one per greek letter; exactly one column per group is selected later.
alphas = ['Alpha_A','Alpha_B','Alpha_D','Alpha_G']
betas = ['Beta_A','Beta_B','Beta_C']
gammas = ['Gamma_A','Gamma_B','Gamma_E','Gamma_F','Gamma_G','Gamma_H','Gamma_M','Gamma_N']
deltas = ['Delta_A','Delta_B','Delta_C','Delta_D']

def singularize(dataframe, columns):
    """One-hot encode the row-wise maximum of ``dataframe[columns]``.

    Parameters
    ----------
    dataframe : DataFrame holding one score column per candidate label.
    columns : list of column names in ``dataframe`` that form one group.

    Returns
    -------
    DataFrame with the same ``columns`` (default integer index, dtype int32) in which
    each row has a single 1 at the position of its largest score and 0 elsewhere.
    Ties go to the first column, following ``np.argmax``.

    The data is now read from the ``dataframe`` argument; previously the argument was
    ignored and the notebook-level ``predictions`` frame was used instead.
    """
    scores = dataframe[columns].to_numpy()
    winners = np.argmax(scores, axis=1)
    onehot = np.zeros(scores.shape, dtype='int32')
    onehot[np.arange(scores.shape[0]), winners] = 1
    return pd.DataFrame(onehot, columns=columns)

# Collapse each greek group to a single winning column (one-hot per row).
alphas = singularize(predictions, alphas)
betas = singularize(predictions, betas)
gammas = singularize(predictions, gammas)
deltas = singularize(predictions, deltas)

# Side-by-side one-hot frame covering all four groups.
new_classes = alphas.join([betas, gammas, deltas])
# Group names are the part of each column name before the underscore.
categories = new_classes.columns.str.split('_').str[0].unique()


def _winning_label(onehot, category):
    """Return, per row, the suffix (text after '_') of the hot column belonging to `category`."""
    member_columns = [name for name in onehot.columns if category in name]
    hot_column = onehot[member_columns].idxmax(axis=1)
    return hot_column.str.split('_').str[1]


# One column per group holding the predicted label letter for every test row.
greeks = pd.DataFrame({category: _winning_label(new_classes, category) for category in categories})
display(greeks)

In [None]:
# The ensemble's preprocessor selects the Alpha/Beta/Gamma/Delta columns by name, but
# `test` was built from test.csv with only Id dropped. Append the predicted greek labels
# (from `greeks`) for any of those columns `test` does not already contain; otherwise
# predict_proba fails on missing columns.
missing_greeks = [col for col in greeks.columns if col not in test.columns]
test_augmented = test.join(greeks[missing_greeks])

ensemble_proba = pd.DataFrame(ensemble_model.predict_proba(test_augmented), columns = ['class_0', 'class_1'])

# Submission layout: Id followed by the two class probabilities.
submission = pd.DataFrame({
    'Id': test_data['Id'],
    'class_0': ensemble_proba['class_0'],
    'class_1': ensemble_proba['class_1'],
})
display(submission)

submission.to_csv('submission.csv', index=False)