# Library import

In [148]:
#Import de librerias basicas tablas y matrices
import numpy as np 
import pandas as pd 

#Gradient Boosting
import lightgbm as lgb

#Funciones auxiliares sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold #Split y cross Validation
from sklearn.metrics import cohen_kappa_score, accuracy_score, balanced_accuracy_score #Metricas
from sklearn.utils import shuffle 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn import set_config



#Visualizacióon
from plotly import express as px

#Plot de matriz de confusion normalizada en actuals
import sys
sys.path.append('../Scripts')
from utils import plot_confusion_matrix

import os

#Optimizacion de hiperparametros
import optuna
from optuna.artifacts import FileSystemArtifactStore, upload_artifact

#Guardado de objetos en archivos joblib
from joblib import load, dump

# Env

In [149]:
# Paths para acceso archivos
#Este notebook asume la siguiente estructura de carpetas a partir de la ubicacion de base_dir 
#(dos niveles arriba de la carpeta donde se ejecuta el notebook). 
# /UA_MDM_LDI_II/
# /UA_MDM_LDI_II/input
# /UA_MDM_LDI_II/input/petfinder-adoption-prediction/            <- Aca deben ir todos los archivos de datos de la competencia 
# /UA_MDM_LDI_II/tutoriales/                       <- Aca deben poner los notebooks y scripts que les compartimos
# /UA_MDM_LDI_II/work/                             <- Resultados de notebooks iran dentro de esta carpeta en subcarpetas
# /UA_MDM_LDI_II/work/models/                     <- Modelos entrenados en archivos joblibs
# /UA_MDM_LDI_II/work/optuna_temp_artifacts/      <- Archivos que queremos dejar como artefacto de un trial de optuna (optuna los copiara a la carpeta de abajo)
# /UA_MDM_LDI_II/work/optuna_artifacts/           <- Archivos con artefactos que sibimos a optuna

#Subimos dos niveles para quedar en la carpeta que contiene input y UA_MDM_LDI_II
BASE_DIR = '../'

#Datos de entrenamiento 
PATH_TO_TRAIN = os.path.join(BASE_DIR, "input/petfinder-adoption-prediction/train/train.csv")
PATH_TO_TEST = os.path.join(BASE_DIR, "input/petfinder-adoption-prediction/test/test.csv")

#Salida de modelos entrenados
PATH_TO_MODELS = os.path.join(BASE_DIR, "work/models")

#Artefactos a subir a optuna
PATH_TO_TEMP_FILES = os.path.join(BASE_DIR, "work/optuna_temp_artifacts")

#Artefactos que optuna gestiona
PATH_TO_OPTUNA_ARTIFACTS = os.path.join(BASE_DIR, "work/optuna_artifacts")


SEED = 42 #Semilla de procesos aleatorios (para poder replicar exactamente al volver a correr un modelo)
TEST_SIZE = 0.2 #Facción para train/test= split

# Config for pandas output from pipelines
set_config(transform_output = "pandas")

In [150]:
# Datos Tabulares
df_train = pd.read_csv(PATH_TO_TRAIN)
df_train.shape

(14993, 24)

In [151]:
# Datos Tabulares
df_test = pd.read_csv(PATH_TO_TEST)
df_test.shape

(3972, 23)

In [152]:
# Otros archivos

# States diccionary
df_state_labels = pd.read_csv(os.path.join(BASE_DIR, "input/petfinder-adoption-prediction/StateLabels.csv"))
df_state_labels['State_Pop'] = [
    3794,
    2194,
    1929,
    1746,
    100,
    937,
    1129,
    1685,
    2509,
    255,
    1774,
    3833,
    2822,
    6555,
    1275
]

### Categorical variables


| Variable       | Type         | Description                                                                           |
|----------------|--------------|---------------------------------------------------------------------------------------|
| `PetID`        | Categorical  | ID (Should be dropped)                                                                |
| `AdoptionSpeed`| Categorical  | Target variable                                                                       |
| `Type`         | Categorical  | 1 = Cat, 2 = Dog                                                                      |
| `Name`         | Categorical  | Name of pet                                                                           |
| `Breed1`       | Categorical  | See BreedLabels dictionary                                                            |
| `Breed2`       | Categorical  | See BreedLabels dictionary                                                            |
| `Gender`       | Categorical  | 1 = Male, 2 = Female, 3 = Mixed (used for groups)                                     |
| `Color1`       | Categorical  | See ColorLabels dictionary                                                            |
| `Color2`       | Categorical  | See ColorLabels dictionary                                                            |
| `Color3`       | Categorical  | See ColorLabels dictionary                                                            |
| `MaturitySize` | Categorical  | 1 = Small, 2 = Medium, 3 = Large, 4 = Extra Large, 0 = Not Specified                  |
| `FurLength`    | Categorical  | 1 = Short, 2 = Medium, 3 = Long, 0 = Not Specified                                    |
| `Vaccinated`   | Categorical  | 1 = Yes, 2 = No, 3 = Not Sure                                                         |
| `Dewormed`     | Categorical  | 1 = Yes, 2 = No, 3 = Not Sure                                                         |
| `Sterilized`   | Categorical  | 1 = Yes, 2 = No, 3 = Not Sure                                                         |
| `Health`       | Categorical  | 1 = Healthy, 2 = Minor Injury, 3 = Serious Injury, 0 = Not Specified                  |
| `State`        | Categorical  | See StateLabels dictionary                                                            |
| `RescuerID`    | Categorical  | ID                                                                                    |
| `Description`  | Text         | Profile write-up for this pet. The primary language used is English, with some in Malay or Chinese. |

### Quantitative Variables

| Variable   | Type             |Description                                           |
|------------|------------------|------------------------------------------------------|
| `Age`      |  Numerical       |Age of pet when listed, in months                     |
| `Quantity` |  Numerical       |Number of pets represented in profile                 |
| `Fee`      |  Numerical       |Adoption fee (0 = Free)                               |
| `VideoAmt` |  Numerical       |Total uploaded videos for this pet                    |
| `PhotoAmt` |  Numerical       |Total uploaded photos for this pet                    |

In [153]:
!which python

/home/ge/MCD/LaboII/.venv/bin/python


# FE

In [154]:

def apply_fe(dataset):
    # Name
    # Feature to know if the pet has a name
    unknown_names = 'name|puppies|kitten|puppy|unknown'
    dataset['Name'] = np.where(dataset['Name'].str.lower().str.contains(unknown_names), np.nan, dataset['Name'])
    dataset['Name'] = np.where(dataset['Name'].str.len() < 3, np.nan, dataset['Name'])
    dataset['HasName'] = dataset['Name'].apply(lambda x: 0 if pd.isnull(x) else 1)

    
    # Breed
    # Unify values in Breed1
    dataset['Breed1'] = np.where((dataset['Breed1']==0) & (dataset['Breed2']!=0), dataset['Breed2'], dataset['Breed1'])
    dataset['Breed2'] = np.where((dataset['Breed1']==dataset['Breed2']), 0, dataset['Breed2'])
    
    # Merge Breed1 and Breed2
    dataset['FullBreed'] = dataset['Breed1'].astype(str) + '_' + dataset['Breed2'].astype(str) 
    # Pure breed
    dataset['PureBreed'] = np.where((dataset['Breed2'] == 0) & (~dataset['Breed1'].isin([307, 266, 265, 264])), 1, 0)
    
    # Color
    # Merge Color1, Color2 and Color3
    dataset['Color'] = dataset['Color1'].astype(str) + '_' + dataset['Color2'].astype(str) + '_' + dataset['Color3'].astype(str)
    # Monochromatic
    dataset['Monochromatic'] = np.where((dataset['Color2'] == 0) & (dataset['Color3'] == 0), 1, 0)
    
    # Health
    # Merge Vaccinated, Dewormed and Sterilized to know if the pet is up to date with routine stuff
    dataset['Va_De_St'] = dataset['Vaccinated'].astype(str) + '_' + dataset['Dewormed'].astype(str) + '_' + dataset['Sterilized'].astype(str)

    # Code to add a min age for sterilization.... But EDA shows that they do it anyway
    #df['CanBeSter'] = np.where((df['Age'] > 9) & (df['Type'] == 1) | (df['Age'] > 6) & (df['Type'] == 2), 1, 0)
    
    # Fee
    # Create fee bins using log transformation
    dataset['Fee_bins'] = pd.cut(np.log1p(dataset.Fee), 5, labels=['Fee_{}'.format(e) for e in range(5)])

    #RescuerID
    rescuer_count = dataset.groupby(['RescuerID'])['PetID'].count().reset_index()
    rescuer_count.columns = ['RescuerID', 'Rescuer_count']
    dataset = dataset.merge(rescuer_count, how='left', on='RescuerID')

    # Age
    dataset['RelAge'] = np.where(dataset['Type'] == 1, dataset['Age']/144, dataset['Age']/180) # Vida media de un gato 12 años, de un perro 15

    # Multimedia
    dataset['Total_photo_video'] = dataset['PhotoAmt'] + dataset['VideoAmt']

    # Si tiene todo lo del veterinatrio en orden
    dataset['VetInOrder'] = np.where((dataset['Vaccinated'] == 1) & (dataset['Dewormed'] == 1) & (dataset['Sterilized'] == 1) & (dataset['Health'] == 1), 1, 0)

    # Si le falta info sobre lo relacionado a veterinario
    dataset['NoVet'] = np.where((dataset['Vaccinated'] == 3) | (dataset['Dewormed'] == 3) | (dataset['Sterilized'] == 3) | (dataset['Health'] == 3), 1, 0)

    # Longitud de la descripcion
    dataset['LenDesc'] = dataset['Description'].apply(lambda x: len(str(x)))

    # variable "age_categ" para categorizar age (identica para perros y gatos, sin problemas según fuentes)
    age_categ_cuts = [
        (dataset['Age'].le(1*12)),
        (dataset['Age'].gt(1*12) & dataset['Age'].le(3*12)),
        (dataset['Age'].gt(3*12) & dataset['Age'].le(9*12)),
        (dataset['Age'].gt(9*12))   
    ]
    age_categ_values = ["baby","young","adult","old"]
    dataset['Age_categ'] = np.select(age_categ_cuts, age_categ_values)

    # variable "individual" para saber si es un animal solo o un grupo
    dataset['Individual'] = np.where((dataset['Quantity'].gt(1)),1,0)

    #variable "free" para saber si hay que pagar o no por adoptar
    dataset['Free'] = np.where((dataset['Fee'].gt(0)),1,0)

    # agrego state label y state population
    dataset = pd.merge(dataset, df_state_labels, how = "left", left_on='State',right_on='StateID')

    # Return the augmented dataset
    return dataset

In [155]:
df_train = apply_fe(df_train.copy())
df_test = apply_fe(df_test.copy())

In [156]:
y = df_train['AdoptionSpeed']
X = df_train.drop(columns=['AdoptionSpeed'])

In [157]:
X_train, X_val, y_train, y_val = train_test_split(X, y, 
                                                    test_size=TEST_SIZE, 
                                                    random_state=SEED, 
                                                    stratify=y)

In [158]:
# Calculate the mean adoption speed per state and add that value to train and test datasets
state_mean = pd.concat([X_train['State'], y_train], axis=1).groupby('State')['AdoptionSpeed'].mean().reset_index()
state_mean.columns = ['State', 'AdoptionSpeed_mean']
X_train = X_train.merge(state_mean, how='left', on='State')
X_val = X_val.merge(state_mean, how='left', on='State')

In [159]:
X_train.columns

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'HasName', 'FullBreed',
       'PureBreed', 'Color', 'Monochromatic', 'Va_De_St', 'Fee_bins',
       'Rescuer_count', 'RelAge', 'Total_photo_video', 'VetInOrder', 'NoVet',
       'LenDesc', 'Age_categ', 'Individual', 'Free', 'StateID', 'StateName',
       'State_Pop', 'AdoptionSpeed_mean'],
      dtype='object')

In [160]:
#Armo listas con features de texto y numericas
char_feats = ['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized','Health', 'State', 'RescuerID',
       'Description', 'HasName', 'FullBreed',
       'PureBreed', 'Color', 'Monochromatic', 'Va_De_St', 'Fee_bins', 'VetInOrder', 'NoVet','VideoAmt','PhotoAmt','Total_photo_video',
       ]

numeric_feats = ['Age', 'Quantity', 'Fee', 'RelAge',
       'Rescuer_count', 'LenDesc', 'AdoptionSpeed_mean', ]

fe_drop = ['Name','PetID',]

In [161]:
# Para revisar que tenemos todas las columnas en las variables
X_train.shape[1]-len(fe_drop)-len(char_feats)-len(numeric_feats)

6

In [162]:
val_idaaa = pd.read_csv(os.path.join(BASE_DIR, "input/val_id.csv"))

In [163]:
new_train = df_train[~df_train['PetID'].isin(val_idaaa['PetID'])]
new_val = df_train[df_train['PetID'].isin(val_idaaa['PetID'])]

In [169]:
new_val['PetID'].shape

(2999,)

In [123]:
X_train_id = X_train['PetID']
X_train = X_train[char_feats + numeric_feats]

X_val_id = X_val['PetID']

X_val_id.to_csv(os.path.join(PATH_TO_TEMP_FILES, 'X_val_id.csv'), index=False)
X_val = X_val[char_feats + numeric_feats]

In [65]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OrdinalEncoder(min_frequency= 30, handle_unknown= 'use_encoded_value', unknown_value= -1), char_feats)],
        remainder= 'passthrough')

In [66]:
X_train = preprocessor.fit_transform(X_train)

In [67]:
#Entreno un modelo inicial sin modificar hiperparametros. Solamente especifico el numero de clases y el tipo de modelo como clasificacoión
lgb_params = params = {
                        'objective': 'multiclassova',
                        'num_class': 5
                        }


#genero el objeto Dataset que debo pasarle a lightgbm para que entrene
lgb_train_dataset = lgb.Dataset(data=X_train,
                                label=y_train)

#entreno el modelo con los parametros por defecto
lgb_model = lgb.train(lgb_params,
                      lgb_train_dataset)

In [68]:
X_val = preprocessor.transform(X_val)

y_pred = lgb_model.predict(X_val).argmax(axis=1)

#Calculo el Kappa
cohen_kappa_score(y_val,y_pred, weights = 'quadratic')

0.3327498782523731

In [69]:
feature_importance = pd.DataFrame({'feature': preprocessor.get_feature_names_out(), 'weight': lgb_model.feature_importance(importance_type='gain')})
feature_importance.sort_values('weight', ascending=False).head(10)

Unnamed: 0,feature,weight
33,remainder__LenDesc,12887.932647
32,remainder__Rescuer_count,12075.436853
31,remainder__RelAge,9339.059376
1,cat__Breed1,5067.221262
26,cat__PhotoAmt,4940.499122
19,cat__Color,4545.877137
28,remainder__Age,4260.673049
34,remainder__AdoptionSpeed_mean,3117.196303
17,cat__FullBreed,3069.864194
21,cat__Va_De_St,2957.935708


In [70]:
#Genero una metrica para que lightGBM haga la evaluación y pueda hacer early_stopping en el cross validation
def lgb_custom_metric_kappa(dy_pred, dy_true):
    metric_name = 'kappa'
    value = cohen_kappa_score(dy_true.get_label(),dy_pred.argmax(axis=1),weights = 'quadratic')
    is_higher_better = True
    return(metric_name, value, is_higher_better)

#Funcion objetivo a optimizar. En este caso vamos a hacer 5fold cv sobre el conjunto de train. 
# El score de CV es el objetivo a optimizar. Ademas vamos a usar los 5 modelos del CV para estimar el conjunto de test,
# registraremos en optuna las predicciones, matriz de confusion y el score en test.
# CV Score -> Se usa para determinar el rendimiento de los hiperparametros con precision 
# Test Score -> Nos permite testear que esta todo OK, no use (ni debo usar) esos datos para nada en el entrenamiento 
# o la optimizacion de hiperparametros

def cv_es_lgb_objective(trial):

    #PArametros para LightGBM
    lgb_params = {      
                        #PArametros fijos
                        'objective': 'multiclassova',
                        'verbosity':-1,
                        'num_class': len(y_train.unique()),
                        #Hiperparametros a optimizar utilizando suggest_float o suggest_int segun el tipo de dato
                        #Se indica el nombre del parametro, valor minimo, valor maximo 
                        #en elgunos casos el parametro log=True para parametros que requieren buscar en esa escala
                        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
                        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
                        'num_leaves': trial.suggest_int('num_leaves', 10, 256),
                        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 200),
                        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
                        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
                        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
                        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
                        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0.0, 10),
                        } 

    #Voy a generar estimaciones de los 5 modelos del CV sobre los datos test y los acumulo en la matriz scores_ensemble
    scores_ensemble = np.zeros((len(y_val),len(y_train.unique())))

    #Score del 5 fold CV inicializado en 0
    score_folds = 0

    #Numero de splits del CV
    n_splits = 5

    #Objeto para hacer el split estratificado de CV
    skf = StratifiedKFold(n_splits=n_splits)

    for i, (if_index, oof_index) in enumerate(skf.split(X_train, y_train)):
        
        #Dataset in fold (donde entreno) 
        lgb_if_dataset = lgb.Dataset(data=X_train.iloc[if_index],
                                        label=y_train.iloc[if_index],
                                        free_raw_data=False)
        
        #Dataset Out of fold (donde mido la performance del CV)
        lgb_oof_dataset = lgb.Dataset(data=X_train.iloc[oof_index],
                                        label=y_train.iloc[oof_index],
                                        free_raw_data=False)

        #Entreno el modelo
        lgb_model = lgb.train(lgb_params,
                                lgb_if_dataset,
                                valid_sets=lgb_oof_dataset,
                                callbacks=[lgb.early_stopping(10, verbose=False)],
                                feval = lgb_custom_metric_kappa
                                )
        
        #Acumulo los scores (probabilidades) de cada clase para cada uno de los modelos que determino en los folds
        #Se predice el 20% de los datos que separe para tes y no uso para entrenar en ningun fold
        scores_ensemble = scores_ensemble + lgb_model.predict(X_val)
        
        #Score del fold (registros de dataset train que en este fold quedan out of fold)
        score_folds = score_folds + cohen_kappa_score(y_train.iloc[oof_index], 
                                                            lgb_model.predict(X_train.iloc[oof_index]).argmax(axis=1),weights = 'quadratic')/n_splits


    #Guardo prediccion del trial sobre el conjunto de test
    # Genero nombre de archivo
    predicted_filename = os.path.join(PATH_TO_TEMP_FILES,f'test_{trial.study.study_name}_{trial.number}.joblib')
    # Copia del dataset para guardar la prediccion
    predicted_df = pd.DataFrame(X_val,y_val).copy()
    # Genero columna pred con predicciones sumadas de los 5 folds
    predicted_df['pred'] = [scores_ensemble[p,:] for p in range(scores_ensemble.shape[0])]
    # Grabo dataframe en temp_artifacts
    dump(predicted_df, predicted_filename)
    # Indico a optuna que asocie el archivo generado al trial
    upload_artifact(trial, predicted_filename, artifact_store)    

    #Grabo natriz de confusion
    #Nombre de archivo
    cm_filename = os.path.join(PATH_TO_TEMP_FILES,f'cm_{trial.study.study_name}_{trial.number}.jpg')
    #Grabo archivo
    plot_confusion_matrix(y_val,scores_ensemble.argmax(axis=1)).write_image(cm_filename)
    #Asocio al trial
    upload_artifact(trial, cm_filename, artifact_store)

    #Determino score en conjunto de test y asocio como metrica adicional en optuna
    test_score = cohen_kappa_score(y_val,scores_ensemble.argmax(axis=1),weights = 'quadratic')
    trial.set_user_attr("test_score", test_score)

    #Devuelvo score del 5fold cv a optuna para que optimice en base a eso
    return(score_folds)

In [71]:
#Inicio el store de artefactos (archivos) de optuna
artifact_store = FileSystemArtifactStore(base_path=PATH_TO_OPTUNA_ARTIFACTS)

#Genero estudio
study = optuna.create_study(direction='maximize',
                            storage="sqlite:///../work/db.sqlite3",  # Specify the storage URL here.
                            study_name="20240823 - Ge - LGB Multiclass CV",
                            load_if_exists = True)
#Corro la optimizacion
run_optuna = False

if run_optuna:
    study.optimize(cv_es_lgb_objective, n_trials=200)

[I 2024-09-27 11:15:02,510] Using an existing study with name '20240823 - Ge - LGB Multiclass CV' instead of creating a new one.


In [72]:
lgbm_params = {'lambda_l1': 0.07958720818703245, 
'lambda_l2': 0.23359973422583905, 
'num_leaves': 179, 
'min_data_in_leaf': 28, 
'feature_fraction': 0.48667927053386195, 
'bagging_fraction': 0.9896417154375905, 
'bagging_freq': 5, 
'min_child_samples': 58, 
'min_gain_to_split': 0.5653596125981446,
'seed': SEED}

In [73]:
#!optuna-dashboard sqlite:///../work/db1.sqlite3 --artifact-dir ../work/optuna_artifacts --port 8081

In [100]:
#Vamos a replicar el resultado de la optimizacion reentrenando el modelo con el mejor conjunto de hiperparametros
#Generamos parametros incluyendo los fijos y la mejor solución que encontro optuna
lgb_params =  {      **lgbm_params,
                        'objective': 'multiclassova',
                        'verbosity':-1,
                        'num_class': len(y_train.unique())}

lgb_train_dataset = lgb.Dataset(data=X_train,
                                label=y_train)


#Entreno
lgb_model = lgb.train(lgb_params,
                    lgb_train_dataset)

lgb_proba = lgb_model.predict(X_val)
lgb_pred = lgb_model.predict(X_val).argmax(axis=1)

#Muestro matriz de confusion y kappa
display(plot_confusion_matrix(y_val, lgb_pred))

cohen_kappa_score(y_val,lgb_pred,
                  weights = 'quadratic')


0.3361472866256764

In [75]:
lgb_pred.shape

(2999,)

In [76]:
X_val.shape

(2999, 35)

In [77]:
feature_importance = pd.DataFrame({'feature': preprocessor.get_feature_names_out(), 'weight': lgb_model.feature_importance(importance_type='gain')})
feature_importance.sort_values('weight', ascending=False).head(20)

Unnamed: 0,feature,weight
33,remainder__LenDesc,21082.87434
32,remainder__Rescuer_count,12171.763374
31,remainder__RelAge,9845.802496
19,cat__Color,7909.107428
28,remainder__Age,6542.366668
1,cat__Breed1,5800.642553
17,cat__FullBreed,4840.669627
26,cat__PhotoAmt,4689.344049
27,cat__Total_photo_video,4085.247985
21,cat__Va_De_St,4050.56216


## Blend

In [78]:
nn_proba = pd.read_csv(os.path.join(BASE_DIR, "resultados_nn_bert/ge-ResNet-20240921.csv"))
nn_proba = nn_proba[['PetID', 'pred']]

bert_proba = pd.read_csv(os.path.join(BASE_DIR, "resultados_nn_bert/test_06 Bert_1.0_7.csv"))
bert_proba = bert_proba[['PetID', 'pred']]

In [177]:
bert_proba.isna().sum()

PetID    0
pred     0
dtype: int64

In [174]:
lgb_proba_df = pd.DataFrame({'PetID': X_val_id, 
                            'lgbm_proba': lgb_proba.tolist()})

all_proba = lgb_proba_df.merge(bert_proba[['PetID', 'pred']].rename({'pred':'bert_pred_score'},axis=1),
                  on='PetID', how='left').merge(nn_proba[['PetID', 'pred']].rename({'pred':'nn_pred_score'},axis=1), on='PetID', how='left')

In [179]:
all_proba.sample(20)

Unnamed: 0,PetID,lgbm_proba,bert_pred_score,nn_pred_score
1313,c8cc85a15,"[0.10606390053600541, 0.2630563853371444, 0.21...",,[-2.5583043 -0.23384659 0.7930771 0.582074...
1924,8445b97aa,"[0.05311494819225966, 0.13587746452287816, 0.2...",,[-0.9309309 0.43255812 0.58860093 0.068445...
2673,9b344389b,"[0.01173449569350693, 0.21268734041300288, 0.3...",[2.1254060e-04 4.0527950e-03 7.9056925e-01 3.9...,[-1.7550509 0.4940071 0.6531632 0.472945...
2372,75c84313d,"[0.0036561157897600305, 0.15058636212217133, 0...",,[-1.8786646 0.09811264 0.64786506 0.338344...
1998,125522846,"[0.003222417558553996, 0.09959089860978582, 0....",,[-2.7824907 1.2435187 1.4269471 0.608873...
839,6ae4fc251,"[0.006920323249896333, 0.04838844077072137, 0....",[1.0886696e-04 1.0975566e-02 2.5250441e-01 4.4...,[-2.2456908 0.09022752 0.9485485 0.273897...
715,0636650db,"[0.015318122666526053, 0.3440343319641459, 0.2...",,[-0.7520431 0.47467205 0.3359408 0.295494...
528,1a983eced,"[0.0034417683051170293, 0.08836005613126553, 0...",,[-2.2891505 0.5950197 0.74151635 0.663614...
563,db03ddc32,"[0.008247881511331111, 0.008975440426764838, 0...",,[-2.3731718 0.34580326 0.29350746 0.390920...
1348,ad5250ecf,"[0.0019428246864814605, 0.05664992713098344, 0...",,[-1.711295 0.19684519 0.82712495 0.615364...


In [182]:
X_val_id.head()

0    8f20e24ef
1    2d72ef0c4
2    44cd12263
3    210c4a637
4    21493e6ea
Name: PetID, dtype: object