In [None]:
import pandas as pd
import numpy as np
import scipy
pd.set_option('display.max_columns',None)
import matplotlib.pyplot as plt
import seaborn as sns
from numpy.random import randint
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.impute import *
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split,cross_val_score, StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,f1_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler

from  sklearn.base import clone ## Clonar un pipeline
import xgboost as xgb

from bayes_opt import BayesianOptimization

### CARGUE DE LOS DATOS

In [None]:
# Load the training set from a parquet file.
# NOTE(review): this is a machine-specific absolute path, so the notebook only runs where this
# exact file exists — consider a configurable data directory.
full_data = pd.read_parquet("C:/Users/jf95n/OneDrive/Desktop/KaggleCompetition/df_train.parquet")

### ANÁLISIS EXPLORATORIO DE DATOS

In [None]:
# Show the dtype pandas assigned to every column of the training frame.
full_data.dtypes

In [None]:
# Horizontal bar chart with the number of missing values, restricted to the columns that
# have at least one missing value.
ax = (
    full_data.isna().sum()
    .loc[lambda na_counts: na_counts > 0]
    .plot(kind = 'barh')
)
ax.bar_label(ax.containers[0])
ax.set_xticks([])
ax.set_title('Datos faltantes por variable')
plt.show()

**Por conocimiento del negocio, un valor None en la variable MULTI_CANCER indica que no se evidenció que el paciente tuviera otros tipos de cáncer, por lo que esos valores faltantes se imputan con 0.**

In [None]:
# Business rule: a missing MULTI_CANCER means no other cancer was recorded, so impute 0 and
# cast the column to a numeric dtype.
# The result is assigned back to the frame instead of calling fillna(..., inplace=True) on the
# selected column: that form is chained assignment, which pandas warns about and which no
# longer updates the parent frame once Copy-on-Write is active.
full_data['MULTI_CANCER'] = pd.to_numeric(full_data['MULTI_CANCER'].fillna(0))

In [None]:
# Report how many rows share each number of missing values (i = missing values in the row,
# j = number of rows with that many missing values).
for i, j in full_data.isna().sum(axis = 1).value_counts().sort_index().items():
    if i == 0:
        print(f'En el dataset hay {j} observaciones sin datos nulos.')
    else:
        print(f'En el dataset hay {j} observaciones con datos nulos o faltantes en {i} de sus variables.')

In [None]:
# Share of each Target class. Both the bars and their labels come from the same
# index-sorted series: value_counts() alone orders by frequency, so taking the labels from a
# separately sorted series could attach a percentage to the wrong bar.
target_share = full_data['Target'].value_counts(normalize = True).sort_index()
labels = (target_share * 100).round(1).astype('str') + '%'
ax = target_share.plot(kind = 'bar')
ax.tick_params(axis = 'x', rotation = 0)
ax.set_yticks([])
for container in ax.containers:
    ax.bar_label(container, labels = labels)
# Only one title is set; the earlier, immediately overwritten set_title call was removed.
ax.set_title('Distribución de la variable respuesta')
plt.show()

In [None]:
# Number of rows per ESTADO_CIVIL category, split by Target class.
# size() counts rows per group directly; counting the non-null values of an unrelated column
# (GENERO) would under-report groups where that column is missing.
ax = full_data.groupby(['ESTADO_CIVIL','Target']).size().unstack().plot(kind = 'bar')
ax.tick_params(axis = 'x', rotation = 0)
# Single title describing this chart (the previous pair was copied from the Target plot and
# the first one was overwritten immediately).
ax.set_title('Distribución de la variable respuesta por estado civil')
plt.show()

In [None]:
## Helper that returns the names of the numerical, datetime and categorical columns plus the target
def SepararNumCate(df : pd.DataFrame, target_variable : str):
    '''Split the column names of ``df`` by dtype, leaving the target out of the feature lists.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are classified.
    target_variable : str
        Name of the target column; it must exist in ``df``.

    Returns
    -------
    tuple
        ``(numerical, date_time, categorical, target)`` where the first three items are lists
        of column names selected with the dtypes ``'number'``, ``'datetime'`` and ``'object'``
        and ``target`` is ``target_variable`` itself.

    Raises
    ------
    ValueError
        If ``target_variable`` is not a column of ``df``.
    '''
    if target_variable not in df.columns:
        raise ValueError(f'{target_variable!r} is not a column of the DataFrame')

    def _feature_names(dtype_selector):
        # The target is filtered out of every list, so a non-numeric target works as well.
        selected = df.select_dtypes(include = dtype_selector).columns.to_list()
        return [column for column in selected if column != target_variable]

    numerical = _feature_names('number')
    date_time = _feature_names('datetime')
    categorical = _feature_names('object')
    return numerical, date_time, categorical, target_variable


# Column-name lists (numerical, datetime, categorical) and the target name, computed on
# full_data; the preprocessing pipelines and cross-validation calls in this notebook use them.
num_idx, date_time_idx, categ_idx, target_idx = SepararNumCate(full_data,'Target')

In [None]:
# Display the columns stored with object dtype (the ones treated as categorical).
full_data.select_dtypes(include = 'object')

In [None]:
# Summary statistics of the numerical features, one feature per row.
full_data[num_idx].describe().transpose()

#### OUTLIERS UNIVARIADOS

#### OUTLIERS MULTIVARIADOS

In [None]:
# Box plot of the row-wise sum of the six monthly columns (mes_1 .. mes_6) for each Target class.
sns.boxplot(
    x = full_data['Target'],
    y = full_data[['mes_6', 'mes_5', 'mes_4', 'mes_3', 'mes_2', 'mes_1']].sum(axis = 1),
)
plt.show()

In [None]:
# Row counts per GENERO, split by Target class. The figure now has a title and the cell ends
# with plt.show(), so the Axes repr is no longer printed next to the chart.
ax = full_data.loc[:,['GENERO','Target']].value_counts().unstack().plot(kind = 'bar')
ax.tick_params(axis = 'x', rotation = 0)
ax.set_title('Distribución de la variable respuesta por género')
plt.show()

In [None]:
# Relative frequency of each Target class among the rows whose GENERO is 'M'.
full_data.loc[full_data['GENERO'].eq('M'), ['GENERO', 'Target']].value_counts(normalize = True)

### SEPARAR 15% COMO CONJUNTO DE PRUEBA

#### IMPUTATION

In [None]:
### BAYESIAN OPTIMIZATION FOR IMPUTATION
def target_imput(numerical, naDrop, nNeighborsIterator,) -> float :
    '''Objective function for the Bayesian search over imputation strategies.

    The optimizer proposes continuous values, so each argument is mapped to a discrete choice.

    Parameters
    ----------
    numerical : float
        Rounded to an index into ``['simple', 'knn']``: median ``SimpleImputer`` or
        ``KNNImputer`` for the numerical columns.
    naDrop : float
        Rounded to an integer. When the result is 0, 2 or 4, the rows whose number of missing
        values equals it are removed before cross-validation; any other value keeps every row.
    nNeighborsIterator : float
        Truncated to int and used as ``n_neighbors`` of the KNN imputer.

    Returns
    -------
    float
        Median F1 score of a random forest over a 5-fold stratified cross-validation.
    '''
    numericalOptions = ['simple','knn']
    # Round instead of truncating: int() over the bounds (0, 1) and (0, 4) would select the
    # last option only when the optimizer proposes exactly the upper bound.
    numerical = min(max(int(round(numerical)), 0), len(numericalOptions) - 1)
    naDrop = int(round(naDrop))
    nNeighborsIterator = int(nNeighborsIterator)

    rf_model = RandomForestClassifier(random_state = 123)

    pl_dict_imputer = {
        'simple' : SimpleImputer(strategy = 'median'),
        'knn' :  KNNImputer(n_neighbors = nNeighborsIterator)
                    }

    num_imputer = Pipeline(steps = [
            ('num_imputer',pl_dict_imputer[numericalOptions[numerical]]),
            ('scaler',StandardScaler())
    ])

    cat_imputer = Pipeline(steps = [
        ('cat_imputer',SimpleImputer(strategy = 'most_frequent')),
        ('encoder',OneHotEncoder(handle_unknown = 'ignore',drop = 'if_binary'))
    ])

    columnImputer = ColumnTransformer(transformers = [
        ('Numerical',num_imputer,num_idx),
        ('Categorical', cat_imputer, categ_idx),
                                    ]
    )

    finalPipeline = Pipeline(
        steps = [
            ('imputer',columnImputer),
            ('clf',rf_model)
                ])

    # NOTE(review): naDrop == 0 removes the rows WITHOUT missing values, which is what the
    # original code did; confirm that this option is intended.
    na_per_row = full_data.isna().sum(axis = 1)
    if naDrop in (0, 2, 4):
        dataTrain = full_data.loc[na_per_row != naDrop].copy()
    else:
        dataTrain = full_data.copy()

    skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 123)

    # cross_val_score clones the estimator for every fold, so no explicit clone is needed.
    fold_scores = cross_val_score(finalPipeline, dataTrain[num_idx + categ_idx], dataTrain[target_idx],
                                  cv = skf, scoring = 'f1', n_jobs = -4)
    return float(np.median(fold_scores))

In [None]:
# Search space handed to the optimizer: one (lower, upper) pair per argument of target_imput.
pBounds_imput = {
    'numerical': (0, 1),
    'naDrop': (0, 4),
    'nNeighborsIterator': (2, 50),
}

In [None]:
# Bayesian optimizer over the imputation choices: it evaluates target_imput inside the bounds
# of pBounds_imput, with a fixed random_state so the search can be repeated.
optimizer = BayesianOptimization(
    f=target_imput,
    pbounds=pBounds_imput,
    random_state=123,
    verbose=2,
)

In [None]:
# Run the search with n_iter = 50; the number of initial random points is left at the
# library default. Each evaluation is a full 5-fold cross-validation, so this cell is slow.
optimizer.maximize(n_iter = 50)

In [None]:
# Display the best result registered by the imputation search.
optimizer.max

### PIPELINES DE FEATURE ENGINEERING

In [None]:
### ALL THE NEWLY ENGINEERED FEATURES ARE CREATED HERE
class FeatureTransformer(BaseEstimator, TransformerMixin):
    '''Stateless transformer that adds engineered features to a copy of ``X``.

    ``X`` must be a DataFrame with the columns MULTI_CANCER, PESO, TALLA, mes_1 .. mes_6,
    Fecha_cero and FECHA_NACIMIENTO; the two date columns must be datetime-like because their
    difference is read through the ``.dt`` accessor.
    '''

    def fit(self, X, y=None):
        '''Nothing is learned from the data; returns ``self``.'''
        return self

    def transform(self, X, y=None):
        '''Return a copy of ``X`` with the new features and without ``Fecha_cero``.

        New or updated columns:
        - ``MULTI_CANCER``: missing values replaced by 0.
        - ``IMC``: ``PESO / TALLA``.
        - ``SumCosto``: row-wise sum of ``mes_1`` .. ``mes_6``.
        - ``EdadComplicacion``: whole years between ``FECHA_NACIMIENTO`` and ``Fecha_cero``;
          float dtype, NaN when either date is missing.
        '''
        X_transf = X.copy()
        # Fill the existing column; the former 'MULT_CANCER' key was a typo that created a
        # duplicate column and left MULTI_CANCER with its missing values.
        X_transf['MULTI_CANCER'] = X_transf['MULTI_CANCER'].fillna(0)
        # NOTE(review): a body-mass index is normally weight / height**2; the original ratio is
        # kept because the units of TALLA are not visible here. TODO: build a better version of
        # this feature that also takes age into account.
        X_transf['IMC'] = X_transf['PESO']/X_transf['TALLA']
        # DataFrame.sum replaces agg(sum, ...): passing the builtin to agg is deprecated.
        X_transf['SumCosto'] = X_transf[['mes_6', 'mes_5', 'mes_4', 'mes_3', 'mes_2', 'mes_1']].sum(axis = 1)
        elapsed_days = (X_transf['Fecha_cero'] - X_transf['FECHA_NACIMIENTO']).dt.days
        # floor() keeps missing dates as NaN for a later imputer, whereas astype(int) raised on
        # them; 365.25 accounts for leap years.
        X_transf['EdadComplicacion'] = np.floor(elapsed_days / 365.25)
        # Fecha_cero must not reach the model, so it is dropped once the age has been derived.
        return X_transf.drop(columns = 'Fecha_cero')

### IMPUTATION AND STANDARDIZATION PIPELINES

In [None]:
def SVC(kernel, gamma, C, degree, coef0, tol):
    '''Objective function for a Bayesian search over support-vector classifier settings.

    Parameters
    ----------
    kernel : float
        Truncated to an index into ``['linear', 'poly', 'rbf', 'sigmoid']`` (expected range 0-3).
    gamma, C, coef0, tol : float
        Forwarded unchanged to the classifier.
    degree : float
        Truncated to int; only the polynomial kernel uses it.

    Returns
    -------
    float
        Median F1 over a 5-fold cross-validation on ``x_train`` / ``y_train``. The score is
        returned as is (not negated) because BayesianOptimization maximizes its objective,
        which matches the other objective functions of this notebook.

    NOTE(review): this function shadows the ``SVC`` estimator imported at the top of the
    notebook — consider renaming it. ``x_train`` and ``y_train`` are not defined in any visible
    cell and must exist before the function is called.
    '''
    # Imported under an alias because this function's own name hides sklearn's SVC; calling
    # SVC(...) in here would recurse into this function.
    from sklearn.svm import SVC as SupportVectorClassifier

    kernels = ['linear','poly','rbf','sigmoid']  # 'polynomial' is not a valid kernel name
    kernel = min(max(int(kernel), 0), len(kernels) - 1) ## (0,3)
    n_jobs = -3 ## to use all but 2 cores.
    classificator = SupportVectorClassifier(
        C = C,
        kernel = kernels[kernel],
        gamma = gamma,  # must be non-negative; kernels that do not use it ignore it
        degree = int(degree),
        coef0 = coef0,
        tol = tol,
        # NOTE(review): the original left this argument without a value (a syntax error);
        # 'balanced' is an assumption — confirm the intended class weighting.
        class_weight = 'balanced',
    )
    return np.median(cross_val_score(classificator, x_train, y_train, n_jobs = n_jobs, cv = 5, scoring = 'f1'))

In [None]:
# Placeholder for the search space of the SVC objective; no bounds have been defined yet.
pbounds = dict()

### FEATURE SELECTION WITH LASSO

In [None]:
class FeatureCreation_Cleaning(BaseEstimator, TransformerMixin):
    '''Stateless cleaning step: returns the input frame without the ``Fecha_cero`` column.'''

    def fit(self, X, y=None):
        # Nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        # Fecha_cero must not be left in the feature set; drop() builds a new frame, so the
        # caller's DataFrame is not modified.
        return X.drop(columns = ['Fecha_cero'])

In [None]:
#### Rows with exactly 4 missing values are removed here, on the full frame, because a
#### transformer inside the pipeline could only drop rows from X and not from y.
rows_with_four_missing = full_data.index[full_data.isna().sum(axis = 1) == 4]
full_data_2 = full_data.drop(index = rows_with_four_missing).copy()

In [None]:
from sklearn.base import TransformerMixin

class LassoFeatureSelectorCV(BaseEstimator, TransformerMixin):
    '''Keep only the columns that receive a non-zero coefficient from a ``LassoCV`` fit.

    Inheriting from ``BaseEstimator`` supplies ``get_params`` / ``set_params``, which
    ``sklearn.base.clone`` requires of every step of a pipeline.

    Parameters
    ----------
    n_jobs : int, default=-4
        Forwarded to ``LassoCV``.
    cv : cross-validation splitter or None, default=None
        Forwarded to ``LassoCV``. ``None`` means a 5-fold shuffled ``StratifiedKFold`` with
        ``random_state=123``, the splitter this notebook uses everywhere else. The former
        default referenced the global ``skf`` in the signature, which raised ``NameError``
        whenever the class was defined before ``skf``.
    '''

    def __init__(self, n_jobs = -4, cv = None):
        # Only the parameters are stored here; the model is created in fit() so that cloning
        # and set_params behave as scikit-learn expects.
        self.n_jobs = n_jobs
        self.cv = cv

    def fit(self, X, y):
        '''Fit ``LassoCV`` on ``(X, y)`` and record which columns have non-zero coefficients.'''
        cv = self.cv
        if cv is None:
            cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 123)
        self.model = LassoCV(cv = cv, n_jobs = self.n_jobs)
        self.model.fit(X, y)
        self.important_features_ = self.model.coef_ != 0
        return self

    def transform(self, X):
        '''Return the selected columns of ``X``, which must support NumPy-style 2-D indexing.'''
        return X[:, self.important_features_]


In [None]:
#### PCA
# Building blocks of the modelling pipeline.
canc_transf = FeatureCreation_Cleaning()  # NOTE(review): created but not used by any step in this cell
imputer_knn =   KNNImputer(n_neighbors = 35)
scaler_pca = StandardScaler()
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 123)
princ_comp = PCA()
# The original call read `XGBClassifier(scale_pos_weight = )`, a syntax error. The argument is
# omitted here because scale_pos_weight is assigned later through
# set_params(clf__scale_pos_weight=...).
xgb_clf = XGBClassifier()

# Numerical branch: KNN imputation -> standardization -> PCA -> Lasso-based selection of components.
numer_prepro = Pipeline( steps = [
        ('imputer',imputer_knn),
        ('scaler',scaler_pca),
        ('dim_reduc',princ_comp),
        ('feat_selec', LassoFeatureSelectorCV(cv = skf))
])

# Categorical branch: most-frequent imputation -> one-hot encoding.
categ_prepro = Pipeline( steps = [
        ('cat_imputer',SimpleImputer(strategy = 'most_frequent')),
        ('encoder',OneHotEncoder(handle_unknown = 'ignore',drop = 'if_binary'))
])

column_prepro = ColumnTransformer( transformers = [
        ('num',numer_prepro, num_idx),
        ('categ',categ_prepro, categ_idx)
])

# Template pipeline; it is cloned before each fit.
final_pipeline = Pipeline( steps = [
        ('Preprocessing',column_prepro),
        ('clf',xgb_clf)
])

In [None]:
def XGBClf_bayes_opt(learning_rate, n_estimators, max_depth, subsample, colsample,
                    reg_alpha, reg_lambda,scale_pw):
    '''Objective function for the Bayesian search over the XGBoost step of ``final_pipeline``.

    Every argument is a float proposed by the optimizer; ``n_estimators`` and ``max_depth``
    are truncated to int and ``scale_pw`` is passed as ``scale_pos_weight``.

    Returns
    -------
    float
        Median F1 over a 5-fold stratified cross-validation on ``full_data_2``.

    NOTE(review): ``colsample`` is accepted but never forwarded to the classifier, so it has
    no effect on the score.
    '''
    skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 123)
    n_jobs = -4
    xgb_grid_params = {'clf__learning_rate': learning_rate,
                       'clf__subsample'    : subsample,
                       'clf__n_estimators' :  int(n_estimators),
                       'clf__reg_alpha': reg_alpha,
                       'clf__reg_lambda': reg_lambda ,
                       'clf__scale_pos_weight':scale_pw,
                       'clf__max_depth'    : int(max_depth) }
    # Clone first and configure the clone: calling set_params on final_pipeline itself would
    # overwrite the shared template on every evaluation.
    clf = clone(final_pipeline).set_params(**xgb_grid_params)
    # The frame is only read here, so no defensive copy is made.
    features = full_data_2.loc[:,num_idx + categ_idx]
    labels = full_data_2.loc[:,target_idx]
    return np.median(cross_val_score(clf, features, labels, cv = skf, scoring = 'f1', n_jobs = n_jobs))

In [None]:
# Search space for XGBClf_bayes_opt: one (lower, upper) pair per argument.
pbounds_XGB = dict(
    learning_rate = (0.001, 1.0),
    n_estimators = (100, 1000),
    scale_pw = (0, 12),
    max_depth = (3, 20),
    subsample = (0.2, 1.0),   # change for big datasets
    colsample = (0.2, 1.0),   # change for datasets with lots of features
    reg_alpha = (0.1, 10),
    reg_lambda = (0.1, 10),
)

# Bayesian optimizer for the XGBoost hyperparameters: it evaluates XGBClf_bayes_opt inside the
# bounds of pbounds_XGB, with a fixed random_state so the search can be repeated.
optimizerXGB = BayesianOptimization(
    f=XGBClf_bayes_opt,
    pbounds=pbounds_XGB,
    random_state=123,
)

In [None]:
# Run the search with init_points = 5 and n_iter = 50. Every evaluation is a full 5-fold
# cross-validation of the pipeline, so this cell is slow.
optimizerXGB.maximize(n_iter = 50,
                      init_points = 5)

In [None]:
# Best result of the XGBoost search; its 'params' entry is read when building xgb_grid_params.
bestRun = optimizerXGB.max

In [None]:
### PIPELINE PARAMETERS TAKEN FROM THE BEST OPTIMIZER RUN
# The `clf__` prefix addresses the classifier step of the pipeline; n_estimators and max_depth
# are truncated to int.
xgb_grid_params = dict(
    clf__learning_rate = bestRun['params']['learning_rate'],
    clf__subsample = bestRun['params']['subsample'],
    clf__n_estimators = int(bestRun['params']['n_estimators']),
    clf__scale_pos_weight = bestRun['params']['scale_pw'],
    clf__reg_alpha = bestRun['params']['reg_alpha'],
    clf__reg_lambda = bestRun['params']['reg_lambda'],
    clf__max_depth = int(bestRun['params']['max_depth']),
)


In [None]:
# PCApipe is not defined in any visible cell, so it is built here: a fresh copy of the
# numerical preprocessing without its last step (imputer -> scaler -> PCA), fitted on the
# numerical columns of full_data_2.
PCApipe = clone(numer_prepro[:-1]).fit(full_data_2.loc[:, num_idx])
cumulative_variance = np.cumsum(PCApipe['dim_reduc'].explained_variance_ratio_)
plt.bar(range(len(cumulative_variance)), cumulative_variance)
plt.title('Varianza explicada acumulada')
plt.show()

### SUBMISSION

In [None]:
# Load the frame that will be scored for the submission.
# NOTE(review): machine-specific absolute path — consider a configurable data directory.
df_sub = pd.read_parquet("C:/Users/jf95n/OneDrive/Desktop/KaggleCompetition/df_test.parquet")

In [None]:
# Unfitted copy of the template pipeline, used only for the submission model.
pipe_submission2 = clone(final_pipeline)

In [None]:
# Apply the tuned hyperparameters to the classifier step of the submission pipeline.
pipe_submission2.set_params(**xgb_grid_params)

In [None]:
# Fit on all of full_data_2 (numerical + categorical features against the target).
pipe_submission2.fit(full_data_2.loc[:,num_idx + categ_idx],full_data_2.loc[:,target_idx])

In [None]:
# Move the index into a regular column (presumably 'ID', which the export cell selects).
# NOTE(review): this mutates df_sub in place and is not idempotent — running the cell a second
# time adds the new default index as yet another column.
df_sub.reset_index(inplace = True)

In [None]:
# Give the submission frame the same MULTI_CANCER treatment the training frame received
# (missing -> 0, numeric dtype); otherwise the column reaches the numerical branch of the
# pipeline in a different form than the one it was fitted on.
df_sub['MULTI_CANCER'] = pd.to_numeric(df_sub['MULTI_CANCER'].fillna(0))
# Predict from exactly the feature columns used for fitting, not the whole frame.
df_sub['Target'] = pipe_submission2.predict(df_sub.loc[:, num_idx + categ_idx])

In [None]:
# Write the submission file with only the identifier and the predicted class, without the index.
df_sub[['ID', 'Target']].to_csv('xgbClfBalanced_250324.csv', index = False)