##### Maestría Inteligencia de Negocios y Análisis de Datos
##### Curso: Statistical Learning I
##### Alumno: 22008375- Leonel Eduardo Contreras
##### Alumno: 22008067- Luis Pedro Perez
##### Sección L

## Proyecto Final-PipeLine

#### 1. Importar librerías

In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler


import myPreprocessors as mypp # Nuestra librería de transformaciones.


from feature_engine.imputation import MeanMedianImputer
from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.transformation import LogTransformer
from feature_engine.selection import DropFeatures

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

import joblib


#### 2. Cargar Dataset

In [113]:
# Load the airline satisfaction dataset from a CSV file given by relative path.
df=pd.read_csv('airline_satisfaction.csv')
# Bare expression so the notebook renders the DataFrame as the cell output.
df

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,1,Male,disloyal Customer,48,Business travel,Business,821,3,3,3,...,5,3,2,5,4,5,5,2,5.0,neutral or dissatisfied
1,2,Female,Loyal Customer,35,Business travel,Business,821,2,2,2,...,5,5,5,5,3,5,5,26,39.0,satisfied
2,3,Male,Loyal Customer,41,Business travel,Business,853,4,4,4,...,3,3,3,3,4,3,5,0,0.0,satisfied
3,4,Male,Loyal Customer,50,Business travel,Business,1905,2,2,2,...,5,5,5,5,3,5,4,0,0.0,satisfied
4,5,Female,Loyal Customer,49,Business travel,Business,3470,3,3,3,...,3,3,4,3,3,3,5,0,1.0,satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,129876,Male,Loyal Customer,28,Personal Travel,Eco Plus,447,4,4,4,...,4,5,4,4,4,5,4,2,3.0,neutral or dissatisfied
129876,129877,Male,Loyal Customer,41,Personal Travel,Eco Plus,308,3,5,3,...,2,5,5,5,5,4,2,0,0.0,neutral or dissatisfied
129877,129878,Male,Loyal Customer,42,Personal Travel,Eco Plus,337,2,5,2,...,3,3,4,5,4,4,3,6,14.0,neutral or dissatisfied
129878,129879,Male,Loyal Customer,50,Personal Travel,Eco Plus,337,5,4,4,...,3,4,5,5,3,4,3,31,22.0,satisfied


In [114]:
# Show the column labels of the loaded DataFrame.
df.columns

Index(['id', 'Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class',
       'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')

#### 3. Configuración del Pipeline

In [115]:
# ---------------------------------------------------------------------------
# Column groups consumed by the preprocessing pipeline.
# ---------------------------------------------------------------------------

# Numeric columns that receive imputation.
NUMERICAL_VARS_WITH_NA = ["Arrival Delay in Minutes"]

# Categorical columns encoded by frequency/count.
CATEGORICAL_VARS_FREQ = ["Class"]

# Two-category columns. The names say ONE_HOT, but each one is paired with a
# category -> integer mapping defined below.
ONE_HOT_GENDER = ["Gender"]
ONE_HOT_CUSTOMER = ["Customer Type"]
ONE_HOT_TRAVEL = ["Type of Travel"]

# Columns for ordinal encoding (the target column).
CATEGORICAL_VARS_ORD = ["satisfaction"]

# Numeric columns that receive a logarithmic transformation.
NUMERICAL_LOG_VARS = ["Flight Distance"]

# ---------------------------------------------------------------------------
# Category -> integer code mappings for the categorical columns.
# ---------------------------------------------------------------------------
MAPPING_GENDER = {
    "Female": 0,
    "Male": 1,
}
MAPPING_CUSTOMER = {
    "Loyal Customer": 0,
    "disloyal Customer": 1,
}
MAPPING_TRAVEL = {
    "Business travel": 0,
    "Personal Travel": 1,
}
MAPPING_SATISFACTION = {
    "neutral or dissatisfied": 0,
    "satisfied": 1,
}

# ---------------------------------------------------------------------------
# Columns used as model inputs for training (order matters downstream).
# ---------------------------------------------------------------------------
FEATURES = [
    "Gender",
    "Customer Type",
    "Age",
    "Type of Travel",
    "Class",
    "Flight Distance",
    "Inflight wifi service",
    "Departure/Arrival time convenient",
    "Ease of Online booking",
    "Gate location",
    "Food and drink",
    "Online boarding",
    "Seat comfort",
    "Inflight entertainment",
    "On-board service",
    "Leg room service",
    "Baggage handling",
    "Checkin service",
    "Inflight service",
    "Cleanliness",
    "Departure Delay in Minutes",
    "Arrival Delay in Minutes",
]

#### 4. Construcción del Pipeline

In [116]:
# Preprocessing pipeline for the airline satisfaction data. Steps run in the
# order listed: imputation -> categorical encoding -> log transform -> scaling.
pipeline_airline = Pipeline(
    steps=[
        # ---- Imputation -------------------------------------------------
        # Mean imputation for the columns in NUMERICAL_VARS_WITH_NA.
        (
            'mean_imputation',
            MeanMedianImputer(
                imputation_method='mean',
                variables=NUMERICAL_VARS_WITH_NA,
            ),
        ),
        # ---- Categorical encoding ----------------------------------------
        # Count encoding for the columns in CATEGORICAL_VARS_FREQ.
        (
            'categorical_freq_encoder',
            CountFrequencyEncoder(
                encoding_method='count',
                variables=CATEGORICAL_VARS_FREQ,
            ),
        ),
        # The next three steps are named "one_hot_*" but each uses the
        # project's own mypp.Mapper with a category -> integer dict.
        # NOTE(review): presumably Mapper replaces each category with its
        # integer code — confirm against myPreprocessors.
        (
            'one_hot_gender',
            mypp.Mapper(
                variables=ONE_HOT_GENDER,
                mappings=MAPPING_GENDER,
            ),
        ),
        (
            'one_hot_customer',
            mypp.Mapper(
                variables=ONE_HOT_CUSTOMER,
                mappings=MAPPING_CUSTOMER,
            ),
        ),
        (
            'one_hot_travel',
            mypp.Mapper(
                variables=ONE_HOT_TRAVEL,
                mappings=MAPPING_TRAVEL,
            ),
        ),
        # ---- Variable transformation --------------------------------------
        # Log transform for the columns in NUMERICAL_LOG_VARS.
        (
            'log_transform',
            LogTransformer(variables=NUMERICAL_LOG_VARS),
        ),
        # ---- Scaling ------------------------------------------------------
        # MinMaxScaler with its default settings, as the final step.
        (
            'scaler',
            MinMaxScaler(),
        ),
    ]
)

In [117]:
# Save the pipeline object to disk for use in the project.
# NOTE(review): no fit() call is visible before this dump, so the saved
# pipeline appears to be unfitted — confirm that fitting happens after loading.
joblib.dump(pipeline_airline, 'pipeline_airline.pkl')

['pipeline_airline.pkl']