## Imports

In [1]:
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from feature_engine.imputation import AddMissingIndicator, CategoricalImputer, MeanMedianImputer
from feature_engine.encoding import OneHotEncoder,CountFrequencyEncoder,OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer,LabelEncoder,label_binarize

from sklearn.metrics import (accuracy_score, roc_auc_score,explained_variance_score,mean_squared_error,r2_score,average_precision_score, classification_report, roc_curve, auc, confusion_matrix, 
                     precision_recall_fscore_support,make_scorer, f1_score,precision_score,recall_score, ConfusionMatrixDisplay)
from sklearn.model_selection import GridSearchCV,KFold, cross_val_score
from scikitplot.metrics import plot_roc, plot_confusion_matrix, plot_precision_recall,plot_roc_curve

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
#from catboost import CatBoostClassifier
#from lightgbm import LGBMClassifier

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('data.csv')


## Pipelines
Se crean variables para unificar y entrenar el modelo a ver si es cancer o no.

In [3]:
df["is_cancer"] = df["diagnostic"].apply(lambda x: 1 if x in ["BCC","MEL","SCC"] else 0)

In [4]:
df.columns

Index(['patient_id', 'lesion_id', 'smoke', 'drink', 'background_father',
       'background_mother', 'age', 'pesticide', 'gender',
       'skin_cancer_history', 'cancer_history', 'has_piped_water',
       'has_sewage_system', 'fitspatrick', 'region', 'diameter_1',
       'diameter_2', 'diagnostic', 'itch', 'grew', 'hurt', 'changed', 'bleed',
       'elevation', 'img_id', 'biopsed', 'is_cancer'],
      dtype='object')

In [5]:
# la persona estuvo aca antes? Tuvo en esta parte del cuerpo antes? Que tuvo antes? -> me parece que va a overfitear con eso
x1 = df.drop(columns=["is_cancer", "diagnostic"]) 
y1 = df["is_cancer"]

def had_here_before(df):
    patient_counts = df["patient_id"].value_counts()
    df["had_here_before"] = df["patient_id"].apply(lambda x: patient_counts[x] -1  )
    return df

def had_this_part_before(df):
    patient_region_counts = df.groupby(["patient_id", "region"]).size().unstack(fill_value=0).T.to_dict('dict')
    df["had_this_part_before"] = df.apply(lambda row: 1 if  patient_region_counts[row["patient_id"]][row["region"]]>=2 else 0, axis=1)
    return df



preguntas = Pipeline([
    ( "had_here_before", FunctionTransformer(had_here_before)),
    ( "had_this_part_before", FunctionTransformer(had_this_part_before)),

])

limpieza = Pipeline([
    ("dropear", FunctionTransformer(lambda x: x.drop(["img_id","lesion_id", "patient_id"], axis=1,))),
                     ("frequency", CountFrequencyEncoder(encoding_method='frequency',variables=["region"])),
                     ("ordinal", OrdinalEncoder(variables=["smoke","drink","background_father",
                                                            "background_mother","pesticide","skin_cancer_history",
                                                            "cancer_history", "has_piped_water","has_sewage_system",
                                                            "itch","grew","hurt","changed","bleed","elevation","biopsed"]))
                    ])
imputing = Pipeline([("imputar_numericas",MeanMedianImputer( imputation_method = "median", variables= ["fitspatrick","diameter_1","diameter_2","age"]))])

res = preguntas.fit_transform(x1)

pipe = Pipeline([("preguntas",preguntas), ("limpieza",limpieza), ("imputing",imputing)])


In [29]:
x1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1470 entries, 1735 to 1339
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   patient_id            1470 non-null   object 
 1   lesion_id             1470 non-null   int64  
 2   smoke                 942 non-null    object 
 3   drink                 942 non-null    object 
 4   background_father     933 non-null    object 
 5   background_mother     929 non-null    object 
 6   age                   1470 non-null   int64  
 7   pesticide             942 non-null    object 
 8   gender                942 non-null    object 
 9   skin_cancer_history   942 non-null    object 
 10  cancer_history        942 non-null    object 
 11  has_piped_water       942 non-null    object 
 12  has_sewage_system     942 non-null    object 
 13  fitspatrick           942 non-null    float64
 14  region                1470 non-null   object 
 15  diameter_1            9

In [7]:


x1,x_test1,y1,y1_test = train_test_split(x1,y1, test_size=0.2, random_state=42)
pipe.fit(x1,y1)
x1 = pipe.transform(x1)
x_test1 = pipe.transform(x_test1)

ValueError: Some of the variables in the dataset contain NaN. Check and remove those before using this transformer or set the parameter `missing_values='ignore'` when initialising this transformer.

In [None]:
label_encoder = LabelEncoder()
y1 = label_encoder.fit_transform(y1)
y1_test= label_encoder.transform(y1_test)

## Modelos