In [None]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from numpy.random import randint
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.impute import *
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.svm import SVC

pd.set_option('display.max_columns',None)

### CARGUE DE LOS DATOS

In [None]:
# Absolute local path of the training parquet file.
# NOTE(review): not portable to other machines; consider a configurable data directory.
TRAIN_DATA_PATH = "C:/Users/jf95n/OneDrive/Desktop/KaggleCompetition/df_train.parquet"
full_data = pd.read_parquet(TRAIN_DATA_PATH)

### ANÁLISIS EXPLORATORIO DE DATOS

In [None]:
# Show the dtype of every column of the loaded data set.
full_data.dtypes

In [None]:
# Horizontal bar chart with the number of missing values of every column
# that has at least one missing value.
missing_per_column = full_data.isna().sum()
ax = missing_per_column[missing_per_column > 0].plot(kind = 'barh')
ax.bar_label(ax.containers[0])  # print the count next to each bar
ax.set_title('Datos faltantes por variable')
plt.show()

In [None]:
# Count the missing values of every row, then report how many rows share each count.
missing_per_row = full_data.isna().sum(axis = 1)
for n_missing, n_rows in missing_per_row.value_counts().items():
    if n_missing == 0:
        print(f'En el dataset hay {n_rows} observaciones sin datos nulos.')
    else:
        print(f'En el dataset hay {n_rows} observaciones con datos nulos o faltantes en {n_missing} de sus variables.')

In [None]:
# Relative frequency of every category of the response variable, as a bar chart.
target_share = full_data['Target'].value_counts(normalize = True)
ax = target_share.plot(kind = 'bar')
# Build the labels from the very Series that was plotted: value_counts() orders by
# frequency, so labels ordered by sort_index() could end up on the wrong bar.
labels = (target_share * 100).round(1).astype('str') + '%'
ax.tick_params(axis = 'x', rotation = 0)
ax.set_yticks([])  # the percentages are printed on the bars, so the y axis is hidden
for container in ax.containers:
    ax.bar_label(container, labels = labels)
ax.set_title('Distribución de la variable respuesta')
plt.show()

In [None]:
# Bar chart with the number of non-null GENERO values in every (ESTADO_CIVIL, Target) group.
# The Axes returned by .plot() is captured here; otherwise `ax` still points at the
# axes of an earlier figure and the formatting below would be applied to the wrong plot.
ax = full_data.groupby(['ESTADO_CIVIL','Target'])['GENERO'].count().plot(kind = 'bar')
ax.tick_params(axis = 'x', rotation = 0)
ax.set_title('Distribución de la variable respuesta')
plt.show()

In [None]:
## Helper that splits the column names of a DataFrame by dtype and extracts the target variable
def SepararNumCate(df : pd.DataFrame, target_variable : str):
    '''Split the columns of ``df`` by dtype and pull out the target column.

    Parameters
    ----------
    df : pd.DataFrame
        Data set that contains the features and the target column.
    target_variable : str
        Name of the target column. It is excluded from every returned name list.

    Returns
    -------
    tuple
        ``(numerical, date_time, categorical, target)`` where the first three
        items are lists of column names (dtypes ``number``, ``datetime`` and
        ``object`` respectively) and ``target`` is a copy of the target column.

    Raises
    ------
    ValueError
        If ``target_variable`` is not a column of ``df``.
    '''
    if target_variable not in df.columns:
        raise ValueError(f'{target_variable!r} is not a column of the DataFrame')

    numerical = df.select_dtypes(include = 'number').columns.to_list()
    date_time = df.select_dtypes(include = 'datetime').columns.to_list()
    categorical = df.select_dtypes(include = 'object').columns.to_list()

    # The target may have any dtype, so drop it from whichever list picked it up
    # (a non-numeric target must neither crash nor leak into the feature lists).
    for names in (numerical, date_time, categorical):
        if target_variable in names:
            names.remove(target_variable)

    target = df[target_variable].copy()
    return numerical, date_time, categorical, target


# Column-name lists by dtype plus the target column. Note that target_idx holds
# a copy of the 'Target' column itself (a Series), not a list of column names.
num_idx, date_time_idx, categ_idx, target_idx = SepararNumCate(full_data,'Target')

In [None]:
# Display the datetime columns of the data set.
full_data.select_dtypes(include = 'datetime')

In [None]:
# Display the names of the object-dtype (categorical) columns.
categ_idx

In [None]:
# Summary statistics of the numerical columns, one row per variable.
full_data[num_idx].describe().transpose()

#### OUTLIERS UNIVARIADOS

In [None]:
# Rows that have exactly one missing value.
full_data.loc[full_data.isna().sum(axis = 1).eq(1)]

In [None]:
# One horizontal boxplot per numerical column to look for univariate outliers.
# The data comes from `full_data`: no `train_set` exists at this point of the
# notebook, so the original reference failed on a fresh kernel.
for col in num_idx:
    plt.figure()
    plt.title(col)
    # Drop missing values first; the box statistics cannot be computed with NaNs.
    plt.boxplot(full_data[col].dropna(), vert = False)
plt.show()

#### OUTLIERS MULTIVARIADOS

In [None]:
# Distribution of the row-wise sum of the six monthly columns, split by Target.
sns.boxplot(
    x = full_data['Target'],
    y = full_data[['mes_6', 'mes_5', 'mes_4', 'mes_3', 'mes_2', 'mes_1']].sum(axis = 1),
)
plt.show()

In [None]:
# Number of observations for every (GENERO, Target) combination.
full_data[['GENERO', 'Target']].value_counts()

In [None]:
# Share of each Target value among the rows whose GENERO is 'M'.
full_data.loc[full_data['GENERO'].eq('M'), ['GENERO', 'Target']].value_counts(normalize = True)

### PIPELINES DE FEATURE ENGINEERING

In [None]:
### All newly engineered variables are created here
class FeatureTransformer(BaseEstimator, TransformerMixin):
    '''Stateless transformer that adds the engineered features.

    ``transform`` adds the columns ``IMC``, ``SumCosto`` and ``EdadComplicacion``
    and removes ``Fecha_cero``. It expects a DataFrame with the columns ``PESO``,
    ``TALLA``, ``mes_1`` ... ``mes_6``, ``Fecha_cero`` and ``FECHA_NACIMIENTO``.
    '''
    def fit(self, X, y=None):
        '''Nothing is learned from the data; returns ``self``.'''
        return self

    def transform(self, X, y=None):
        '''Return a copy of ``X`` with the engineered columns added.

        The input frame is not modified, so the transformer can be applied
        repeatedly to the same data.
        '''
        # Work on a copy: mutating the caller's frame made a second call fail with
        # a KeyError because 'Fecha_cero' had already been dropped from it.
        X = X.copy()

        # TODO: build this variable in a better way that takes age into account.
        # NOTE(review): BMI is normally weight / height**2; confirm the units of
        # PESO and TALLA before changing the formula.
        X['IMC'] = X['PESO']/X['TALLA']

        # DataFrame.sum skips NaNs, matching what agg(sum, ...) resolved to, without
        # relying on the deprecated handling of the builtin `sum`.
        X['SumCosto'] = X[['mes_6', 'mes_5', 'mes_4', 'mes_3', 'mes_2', 'mes_1']].sum(axis = 1)

        # Age in whole years at 'Fecha_cero' (365-day years, truncated).
        elapsed_days = (X['Fecha_cero'] - X['FECHA_NACIMIENTO']).dt.days
        age_years = np.trunc(elapsed_days / 365)
        # Integer dtype when every age is known; otherwise keep the NaNs (float)
        # instead of crashing in astype(int), so a later imputer can fill them.
        X['EdadComplicacion'] = age_years.astype(int) if age_years.notna().all() else age_years

        # Important: drop 'Fecha_cero' so that it is not left in the data as a feature.
        return X.drop(columns = 'Fecha_cero')


### TRAIN TEST SPLIT

In [None]:
# Exploratory check: sample 90% of the rows inside every (GENERO, Target) group
# with a fixed seed and display the result (it is not assigned to a variable).
full_data.groupby(['GENERO','Target']).apply(lambda x : x.sample(frac = 0.9, random_state = 123))

In [None]:
# Hold out 10% of the rows as test set, stratifying jointly on Target and GENERO.
# IMPORTANT: review which variables the split should be stratified by.
train, test = train_test_split(
    full_data,
    test_size = 0.1,
    random_state = 123,
    shuffle = True,
    stratify = full_data[['Target','GENERO']],
)

### IMPUTATION AND STANDARDIZATION PIPELINES

In [None]:
### Pipelines for standardization

# Numerical columns: standardize with StandardScaler.
num_stand = Pipeline(steps = [
    ('numScaler',StandardScaler())
])

# Categorical columns: one-hot encode. The keyword is `handle_unknown`; the
# misspelled `handle_unkown` raised a TypeError. With 'ignore', categories not
# seen during fit do not raise an error at transform time.
categ_stand = Pipeline(steps = [
    ('categStand',OneHotEncoder(handle_unknown = 'ignore'))
])

# Apply each pipeline to its own group of columns; every other column is dropped.
columnScaler = ColumnTransformer(transformers = 
    [('numScaler',num_stand,num_idx),
     ('categStand', categ_stand, categ_idx)],
     remainder = 'drop'
    )

### Pipelines for imputation

# Numerical columns: fill the gaps with the column median.
num_imp = Pipeline([('numerical_imputer', SimpleImputer(strategy = 'median'))])

# Categorical columns: fill the gaps with the most frequent value.
cat_imp = Pipeline([('categorical_imputer', SimpleImputer(strategy = 'most_frequent'))])

# Columns that belong to neither group are passed through unchanged.
columnImputer = ColumnTransformer(
    [
        ('NumericalImputer', num_imp, num_idx),
        ('CategoricalImputer', cat_imp, categ_idx),
    ],
    remainder = 'passthrough',
)

In [None]:
def SVC(kernel, gamma, C, degree, coef0, tol):
    '''Objective function for hyper-parameter search of a support vector classifier.

    Parameters
    ----------
    kernel : float
        Truncated to an integer index into ['linear', 'poly', 'rbf', 'sigmoid'];
        values outside 0-3 are clamped to the nearest valid index.
    gamma : float
        Kernel coefficient (must be non-negative).
    C : float
        Regularization parameter.
    degree : float
        Polynomial degree; truncated to an integer.
    coef0 : float
        Independent term of the kernel function.
    tol : float
        Tolerance of the stopping criterion.

    Returns
    -------
    float
        Negative median of the 5-fold cross-validated F1 score, evaluated on the
        notebook-level ``x_train`` / ``y_train`` (they must be defined before the
        function is called).
    '''
    # This function is itself named SVC, so the bare name would call this
    # function recursively. Import the scikit-learn estimator under an alias.
    from sklearn.svm import SVC as SupportVectorClassifier

    n_jobs = -3 ## to use all but 2 cores.
    # scikit-learn names the polynomial kernel 'poly' ('polynomial' is rejected).
    kernels = ['linear','poly','rbf','sigmoid']
    kernel_pos = min(max(int(kernel), 0), len(kernels) - 1)

    # All hyper-parameters are forwarded; scikit-learn documents `degree` as
    # ignored by non-'poly' kernels and `gamma` / `coef0` as applying only to
    # the kernels that use them.
    # NOTE(review): class_weight was left blank in the original (a syntax error).
    # None is the scikit-learn default; consider 'balanced' if Target is imbalanced.
    classificator = SupportVectorClassifier(
        C = C,
        kernel = kernels[kernel_pos],
        gamma = gamma,
        degree = int(degree),
        coef0 = coef0,
        tol = tol,
        class_weight = None,
    )
    # NOTE(review): the score is negated, which suits a minimizer; drop the sign
    # if the optimizer that consumes this function maximizes instead.
    return -np.median(cross_val_score(classificator, x_train, y_train, n_jobs = n_jobs, cv = 5, scoring = 'f1'))

In [None]:
# Empty placeholder. Presumably meant to hold the search bounds of the
# hyper-parameters taken by the SVC objective above. TODO: confirm and fill in.
pbounds = {
    
}