In [54]:
seed = 42
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
import category_encoders as ce




# IMPLEMENTACIÓN DE UN PIPELINE EVALUADO CON GRID SEARCH 

In [36]:
ruta = '../data/raw/patient_churn_dataset.csv'
data = pd.read_csv(ruta)
data.head()

Unnamed: 0,PatientID,Age,Gender,State,Tenure_Months,Specialty,Insurance_Type,Visits_Last_Year,Missed_Appointments,Days_Since_Last_Visit,...,Overall_Satisfaction,Wait_Time_Satisfaction,Staff_Satisfaction,Provider_Rating,Avg_Out_Of_Pocket_Cost,Billing_Issues,Portal_Usage,Referrals_Made,Distance_To_Facility_Miles,Churned
0,C20000,41,Female,PA,62,Pediatrics,Medicaid,1,0,564,...,3.5,4.9,3.8,4.2,306,0,0,3,21.4,1
1,C20001,43,Female,GA,44,Internal Medicine,Self-Pay,7,4,254,...,2.6,3.1,4.7,4.3,1851,0,0,0,47.6,1
2,C20002,21,Male,MI,120,Internal Medicine,Medicaid,15,5,89,...,1.6,4.4,2.1,4.7,391,0,0,2,7.1,0
3,C20003,65,Male,FL,118,General Practice,Private,10,3,135,...,2.6,4.3,4.3,4.9,808,0,0,0,11.6,1
4,C20004,18,Female,CA,70,Cardiology,Medicaid,5,4,696,...,2.2,4.0,4.1,4.4,866,0,0,0,10.3,1


In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   PatientID                   2000 non-null   object 
 1   Age                         2000 non-null   int64  
 2   Gender                      2000 non-null   object 
 3   State                       2000 non-null   object 
 4   Tenure_Months               2000 non-null   int64  
 5   Specialty                   2000 non-null   object 
 6   Insurance_Type              2000 non-null   object 
 7   Visits_Last_Year            2000 non-null   int64  
 8   Missed_Appointments         2000 non-null   int64  
 9   Days_Since_Last_Visit       2000 non-null   int64  
 10  Last_Interaction_Date       2000 non-null   object 
 11  Overall_Satisfaction        2000 non-null   float64
 12  Wait_Time_Satisfaction      2000 non-null   float64
 13  Staff_Satisfaction          2000 

# IDENTIFICAR VARIABLE OBJETIVO Y SEPARAR CARACTERÍSTICAS DE LA ETIQUETA

In [38]:
target_variable = 'Churned'
X = data.drop(columns = [target_variable])
y = data[target_variable]

## VALIDACIÓN

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, stratify=y)

## IDENTIFICACION DE LAS COLUMNAS NUMÉRICAS Y CATEGÓRICAS (ORDINALES / NOMINALES)

In [45]:
real_ordinal_cols = ['Referrals_Made', 'Staff_Satisfaction']
real_nominal_cols = ['Billing_Issues', 'Portal_Usage'] 

# Numericas
numerical_cols = data.select_dtypes(include = ['int64', 'float64']).columns.tolist()
numerical_cols = [col for col in numerical_cols if col not in real_nominal_cols and col != 'PatientID']  

# Categoricas Ordinales 
ordinal_cols = real_ordinal_cols

# Categoricas Nominales
nominal_cols = [col for col in data.columns if col not in numerical_cols and col not in ordinal_cols and col != 'PatientID' and col != 'Last_Interaction_Date']


print(f'Columnas categóricas ordinales:')
print(ordinal_cols)
print(f'\n Columnas categóricas nominales:')
print(nominal_cols)
print(f'\n Columnas numéricas:')
numerical_cols


Columnas categóricas ordinales:
['Referrals_Made', 'Staff_Satisfaction']

 Columnas categóricas nominales:
['Gender', 'State', 'Specialty', 'Insurance_Type', 'Billing_Issues', 'Portal_Usage']

 Columnas numéricas:


['Age',
 'Tenure_Months',
 'Visits_Last_Year',
 'Missed_Appointments',
 'Days_Since_Last_Visit',
 'Overall_Satisfaction',
 'Wait_Time_Satisfaction',
 'Staff_Satisfaction',
 'Provider_Rating',
 'Avg_Out_Of_Pocket_Cost',
 'Referrals_Made',
 'Distance_To_Facility_Miles',
 'Churned']

## COLUMN TRANSFORMER PARA LA INGENIERÍA DE CARACTERÍSTICAS

In [46]:
class FeatureEngineering(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y = None):
        return self
    
    def transform(self, X):
        X = X.copy()
        
        X['frecuencia_visitas'] = X['Visits_Last_Year'] / 12
        X['ratio_citas'] = X['Missed_Appointments'] / X['Visits_Last_Year'] 
        X['costoxvisita'] = X['Avg_Out_Of_Pocket_Cost'] / X['Visits_Last_Year']

        return X

## PIPELINE PARA CADA TIPO DE DATO

In [None]:
## Pipeline para datos ordinales

ord_pipeline = Pipeline([
    ('encoder', OrdinalEncoder(categories=ordinal_cols))
])

## Pipeline para datos nominales
nom_pipeline = Pipeline([
    ('ohe_dropfirst', OneHotEncoder(drop= True), ['Gender', 'Billing_Issues', 'Portal_Usage']),
    ('ohe', OneHotEncoder(), ['Insurance_Type']),
    ('frecuency_encoder', ce.CountEncoder(normalize=True), ['State', 'Speciality'])
])

## Pipeline para datos numéricos
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler(), numerical_cols)
])
