In [None]:
# Install tabpfn without contacting PyPI (--no-index), resolving packages only
# from the local directory given by --find-links.
!pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages
# Create the models_diff directory inside the installed tabpfn package and copy
# the checkpoint file into it.
# NOTE(review): presumably this is where TabPFNClassifier looks for its
# pretrained weights; the path is tied to python3.10 -- confirm on image upgrades.
!mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder,normalize
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import xgboost
import inspect
from collections import defaultdict
from tabpfn import TabPFNClassifier
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the three competition CSV files (train, test, greeks) in one pass.
train, test, greeks = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/icr-identify-age-related-conditions/train.csv',
        '/kaggle/input/icr-identify-age-related-conditions/test.csv',
        '/kaggle/input/icr-identify-age-related-conditions/greeks.csv',
    )
)

In [None]:
# Binary-encode the EJ column: 1 where the value equals the first distinct
# value found in train, 0 otherwise. The same reference value is applied to
# test so both frames share one encoding.
first_category = train['EJ'].unique()[0]
train['EJ'] = (train['EJ'] == first_category).astype('int')
test['EJ'] = (test['EJ'] == first_category).astype('int')

In [None]:
# Keep the test Ids for the submission file; the Id column is dropped from
# `test` in a later cell.
ID_for_sub = test.loc[:, 'Id']

In [None]:
# Feature matrix and binary target taken from the raw training frame.
x = train.drop(columns=['Id', 'Class'])
y = train['Class']

# The test frame keeps only feature columns from here on.
test = test.drop(columns=['Id'])

In [None]:
def balanced_log_loss(y_true, y_pred):
    """Return the class-balanced binary log loss.

    The negative log-likelihood is averaged separately over the samples of
    each class, and the per-class means are then averaged, so both classes
    contribute equally regardless of how many samples each one has.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth labels, 0 or 1.
    y_pred : array-like of shape (n_samples,)
        Predicted probability of class 1. Values are clipped to
        [1e-15, 1 - 1e-15] before taking logarithms.

    Returns
    -------
    float
        The balanced log loss. If only one class is present in ``y_true``
        the loss of that class alone is returned; ``nan`` is returned for
        empty input.
    """
    # Work on plain float arrays so pandas index alignment cannot interfere.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)

    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1

    # Mean negative log-likelihood per class; a class with no samples is
    # skipped instead of producing 0 * inf = nan.
    class_means = []
    if N_0 > 0:
        class_means.append(-np.sum((1 - y_true) * np.log(p_0)) / N_0)
    if N_1 > 0:
        class_means.append(-np.sum(y_true * np.log(p_1)) / N_1)

    if not class_means:
        return float('nan')

    return float(np.mean(class_means))

In [None]:
class Ensemble():
    """Soft-voting ensemble of XGBoost and TabPFN classifiers.

    Missing values are median-imputed, every member classifier is fitted on
    the same data, and their predicted probabilities are averaged. The
    averaged probabilities are then re-weighted by the inverse of the
    estimated class frequencies (column 0 versus all other columns) and
    re-normalised so each row sums to 1.
    """

    def __init__(self):
        self.imputer = SimpleImputer(missing_values=np.nan, strategy='median')

        self.classifiers =[xgboost.XGBClassifier(n_estimators=100,max_depth=3,learning_rate=0.2,subsample=0.9,colsample_bytree=0.85),
                           xgboost.XGBClassifier(),
                           TabPFNClassifier(N_ensemble_configurations=24),
                          TabPFNClassifier(N_ensemble_configurations=64)]

    def fit(self,X,y):
        """Fit the imputer and every member classifier on (X, y).

        Classifiers whose ``fit`` accepts an ``overwrite_warning`` argument
        receive ``overwrite_warning=True``; all others get a plain
        ``fit(X, y)``. Dispatching on the signature rather than on list
        positions keeps this working when the classifier list is changed.

        Returns
        -------
        Ensemble
            ``self``, to allow call chaining.
        """
        X = self.imputer.fit_transform(X)

        for classifier in self.classifiers:
            fit_parameters = inspect.signature(classifier.fit).parameters
            if 'overwrite_warning' in fit_parameters:
                classifier.fit(X, y, overwrite_warning=True)
            else:
                classifier.fit(X, y)

        return self

    def predict_proba(self, x):
        """Return re-weighted, row-normalised averaged class probabilities.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
            Samples to score; imputed with the already fitted imputer.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_classes)
            Each row sums to 1.
        """
        x = self.imputer.transform(x)

        probabilities = np.stack([classifier.predict_proba(x) for classifier in self.classifiers])
        averaged_probabilities = np.mean(probabilities, axis=0)

        # Expected number of class-0 samples and of "any other class" samples
        # according to the averaged predictions.
        class_0_est_instances = averaged_probabilities[:, 0].sum()
        others_est_instances = len(averaged_probabilities) - class_0_est_instances

        # One divisor per column: column 0 uses the class-0 estimate, every
        # other column shares the "others" estimate.
        est_instances = np.full(averaged_probabilities.shape[1], others_est_instances, dtype=float)
        est_instances[0] = class_0_est_instances
        # A zero estimate means the matching columns are all zero already;
        # divide by 1 there to avoid 0 * inf = nan.
        est_instances[est_instances <= 0] = 1.0

        new_probabilities = averaged_probabilities / est_instances
        return new_probabilities / np.sum(new_probabilities, axis=1, keepdims=True)

In [None]:
from tqdm.notebook import tqdm

In [None]:
from sklearn.model_selection import KFold as KF, GridSearchCV

In [None]:
def training(model, x,y,y_meta):
    """Cross-validate ``model`` with 5 shuffled folds and return the best fit.

    For each fold the model is fitted on ``y_meta`` (the training targets)
    and scored against ``y`` (the binary validation labels) using
    ``balanced_log_loss``. A snapshot of the fitted model with the lowest
    validation loss is returned.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``; column 0
        of ``predict_proba`` is treated as the probability of class 0.
    x : pandas.DataFrame
        Feature rows, indexed positionally via ``.iloc``.
    y : pandas.Series
        Binary labels used only for validation scoring.
    y_meta : array-like
        Targets used for fitting, aligned positionally with ``x``.

    Returns
    -------
    object
        A deep copy of the model as fitted on the best-scoring fold, or
        ``model`` itself if no fold produced a comparable (non-nan) loss.
    """
    # Local import: the file's import cell does not provide `copy`.
    import copy

    outer_results = list()

    best_model = None
    best_loss = np.inf

    splits = 5

    # Positional indexing below must not depend on a pandas index.
    y_meta = np.asarray(y_meta)

    cv_inner = KF(n_splits = splits, shuffle=True, random_state=42)

    for split, (train_idx, val_idx) in enumerate(tqdm(cv_inner.split(x), total = splits), start=1):

        x_train, x_val = x.iloc[train_idx],x.iloc[val_idx]
        y_train, y_val = y_meta[train_idx], y.iloc[val_idx]

        model.fit(x_train, y_train)

        y_pred = model.predict_proba(x_val)

        # Hard 0/1 class prediction: 1 when the class-0 probability is below
        # 0.5. The loss is therefore computed on labels, not probabilities.
        p0 = np.where(y_pred[:,0] >= 0.5, 0, 1)

        loss = balanced_log_loss(y_val,p0)

        if loss<best_loss:
            # `model` is refitted in place on the next fold, so keep a
            # snapshot; storing the reference would always end up returning
            # the last fold's fit. Note this duplicates the fitted state in
            # memory.
            best_model = copy.deepcopy(model)
            best_loss = loss
            print('best_model_saved')
        outer_results.append(loss)
        print('>val_loss=%.5f, split = %.1f' % (loss,split))
    print('LOSS: %.5f' % (np.mean(outer_results)))

    # Fall back to the last fitted model when no fold improved on +inf
    # (e.g. every loss was nan) instead of failing on an unbound name.
    return best_model if best_model is not None else model
    

In [None]:
from datetime import datetime

# Rows whose Epsilon holds a date string rather than the 'Unknown' marker.
has_date = greeks.Epsilon != 'Unknown'

# Date strings ('%m/%d/%Y') become proleptic Gregorian ordinals; rows marked
# 'Unknown' become NaN.
times = greeks.Epsilon.copy()
times[has_date] = greeks.Epsilon[has_date].map(
    lambda date_text: datetime.strptime(date_text, '%m/%d/%Y').toordinal()
)
times[~has_date] = np.nan

In [None]:
# Add the date ordinal (NaN where unknown) as a training feature.
train['Epsilon']=times
# Test rows get one day past the latest known training date. Series.max()
# skips NaN, whereas the builtin max() is order-dependent with NaN and
# returns NaN whenever the first element is NaN.
test['Epsilon'] = times.astype(float).max() + 1

In [None]:
# Randomly oversample the training rows so the greeks.Alpha categories are
# balanced; the resampled Alpha labels are then converted to integer codes
# (position of each label among the sorted unique labels).
# NOTE(review): resampling happens before any cross-validation split, so
# duplicated rows can land in both the train and validation folds -- confirm
# this is intended.
ros = RandomOverSampler(random_state=42)
train_ros, y_ros = ros.fit_resample(train, greeks.Alpha)
y_ros = np.unique(y_ros, return_inverse=True)[1]

In [None]:
# Features and binary Class labels of the oversampled training frame.
x_ros = train_ros.drop(columns=['Class', 'Id'])
y_ = train_ros['Class']

In [None]:
# Build the (still unfitted) ensemble; it is fitted inside training().
yt = Ensemble()

In [None]:
# Cross-validate: fit on the integer-coded Alpha labels (y_ros) and score
# each fold against the binary Class labels (y_).
m = training(model=yt, x=x_ros, y=y_, y_meta=y_ros)

In [None]:
# Score the test rows; column 0 holds the class-0 probability.
y_pred = m.predict_proba(test)
p0 = y_pred[:, 0]

# Snap confident predictions to hard 0/1 and leave the band in between
# untouched. Both masks come from the unmodified probabilities; the bands are
# disjoint, so the order of the two in-place writes does not matter.
sure_class_0 = p0 > 0.62
sure_not_class_0 = p0 < 0.26
p0[sure_class_0] = 1
p0[sure_not_class_0] = 0

In [None]:
# Assemble the submission: one row per test Id with complementary class
# probabilities.
submission = pd.DataFrame(ID_for_sub, columns=["Id"]).assign(
    class_0=p0,
    class_1=1 - p0,
)

In [None]:
# Write the submission to the working directory without the DataFrame index.
submission.to_csv('submission.csv', index=False)

In [None]:
# Read the written file back and display it as a check of what was saved.
submission_df = pd.read_csv('submission.csv')
submission_df