In [1]:
import pandas as pd
from feature_engine.imputation import AddMissingIndicator, CategoricalImputer, MeanMedianImputer
from feature_engine.encoding import OneHotEncoder,RareLabelEncoder,CountFrequencyEncoder
from feature_engine.selection import  DropFeatures
from sklearn.preprocessing import MultiLabelBinarizer,LabelEncoder
from sklearn.preprocessing import FunctionTransformer
import json,re
from sklearn.model_selection import train_test_split
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from joblib import Parallel, delayed
from xgboost import XGBClassifier


In [2]:
df = pd.read_csv('metadata.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2298 entries, 0 to 2297
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   patient_id           2298 non-null   object 
 1   lesion_id            2298 non-null   int64  
 2   smoke                1494 non-null   object 
 3   drink                1494 non-null   object 
 4   background_father    1480 non-null   object 
 5   background_mother    1476 non-null   object 
 6   age                  2298 non-null   int64  
 7   pesticide            1494 non-null   object 
 8   gender               1494 non-null   object 
 9   skin_cancer_history  1494 non-null   object 
 10  cancer_history       1494 non-null   object 
 11  has_piped_water      1494 non-null   object 
 12  has_sewage_system    1494 non-null   object 
 13  fitspatrick          1494 non-null   float64
 14  region               2298 non-null   object 
 15  diameter_1           1494 non-null   f

In [4]:
X = df.sample(frac=0.8, random_state=200)
X_test = df.drop(X.index)
y = X.diagnostic
y_test = X_test.diagnostic

X = X.drop('diagnostic',axis=1)
X_test = X_test.drop('diagnostic',axis=1)

del df

In [16]:
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y_test = encoder.transform(y_test)
preprocess = Pipeline([
    ("dropear", FunctionTransformer(lambda x: x.drop(["img_id","patient_id","lesion_id"], axis=1, errors='ignore'))),
    ("indicador_NAs",AddMissingIndicator(missing_only= True,variables = ["has_sewage_system","fitspatrick","diameter_1","diameter_2","skin_cancer_history","cancer_history","has_piped_water","background_father","background_mother","pesticide","smoke","drink"])),
    ("missing_categoricas",CategoricalImputer(variables = ["has_sewage_system","region","gender"])),
    ("encoding_onehot",OneHotEncoder(variables = ["region","gender"])),
    ("frequency_encoder", CountFrequencyEncoder(encoding_method='frequency',missing_values='ignore', variables=["background_father","background_mother"])),
    ("NaN_father",FunctionTransformer(lambda x: x.assign(background_father = x['background_father'].fillna(0)))),
    ("NaN_mother",FunctionTransformer(lambda x: x.assign(background_mother = x['background_mother'].fillna(0)))),
    ("tf_itch", FunctionTransformer(lambda x: x.assign(itch = x['itch'].apply(lambda x: 1 if x=="TRUE" else 2 if x=="UNK" else 0)))),
    ("tf_grew", FunctionTransformer(lambda x: x.assign(grew = x['grew'].apply(lambda x: 1 if x=="TRUE" else 2 if x=="UNK" else 0)))),
    ("tf_hurt", FunctionTransformer(lambda x: x.assign(hurt = x['hurt'].apply(lambda x: 1 if x=="TRUE" else 2 if x=="UNK" else 0)))),
    ("tf_changed", FunctionTransformer(lambda x: x.assign(changed = x['changed'].apply(lambda x: 1 if x=="TRUE" else 2 if x=="UNK" else 0)))),
    ("tf_bleed", FunctionTransformer(lambda x: x.assign(bleed = x['bleed'].apply(lambda x: 1 if x=="TRUE" else 2 if x=="UNK" else 0)))),
    ("tf_elevation", FunctionTransformer(lambda x: x.assign(elevation = x['elevation'].apply(lambda x: 1 if x=="TRUE" else 2 if x=="UNK" else 0)))),
    ("tf_biopsed",FunctionTransformer(lambda x: x.assign(biopsed = x['biopsed'].apply(lambda val: 1 if val  else 0)))),
    ("tf_sewage", FunctionTransformer(lambda x: x.assign(has_sewage_system = x['has_sewage_system'].replace("Missing", False).fillna(False).astype(int).apply(lambda val: 1 if val else 0)))),
    ("tf_skin_cancer_history", FunctionTransformer(lambda x: x.assign(skin_cancer_history = x['skin_cancer_history'].replace("Missing", False).fillna(False).astype(int).apply(lambda val: 1 if val else 0)))),
    ("tf_cancer_history", FunctionTransformer(lambda x: x.assign(cancer_history = x['cancer_history'].replace("Missing", False).fillna(False).astype(int).apply(lambda val: 1 if val else 0)))),
    ("tf_pipedwater", FunctionTransformer(lambda x: x.assign(has_piped_water = x['has_piped_water'].replace("Missing", False).fillna(False).astype(int).apply(lambda val: 1 if val else 0)))),
    ("tf_pesticide", FunctionTransformer(lambda x: x.assign(pesticide = x['pesticide'].replace("Missing", False).fillna(False).astype(int).apply(lambda val: 1 if val else 0)))),
    ("tf_smoke", FunctionTransformer(lambda x: x.assign(smoke = x['smoke'].replace("Missing", False).fillna(False).astype(int).apply(lambda val: 1 if val else 0)))),
    ("tf_drink", FunctionTransformer(lambda x: x.assign(drink = x['drink'].replace("Missing", False).fillna(False).astype(int).apply(lambda val: 1 if val else 0)))),
    ("imputar_numericas",MeanMedianImputer( imputation_method = "median", variables= ["fitspatrick","diameter_1","diameter_2","age"])),
])
pipe = Pipeline([
    ('preprocess',preprocess),
    ('model',XGBClassifier())
])
""" x2 = preprocess.fit_transform(X)

#Print x2 categorical variables
for col in x2.columns:
    if x2[col].dtype == 'object':
        print(col)
        print(x2[col].value_counts())
        print()
        print()
 """

pipe.fit(X,y)
pipe.score(X_test,y_test)



0.841304347826087