# Entrenamiento y Evaluación de Modelos

## Trabajo Práctico Nro. 2 - Grupo 3

#### Integrantes:
* Ignacio Busso
* Lucas Copes
* Jesica Heit

#### Dataset: https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction
* Detalle: contiene datos de la satisfacción de los pasajeros de diferentes vuelos tomando en cuenta múltiples aspectos (calidad del servicio, comodidad, limpieza, etc.)
* Target: columna 'satisfaction', para determinar la satisfacción de un pasajero respecto a un vuelo.
* Dimensiones: 25 columnas x 129.880 filas.

In [None]:
%matplotlib inline

import warnings
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

# Pandas display: 0 asks pandas to auto-detect how many columns to show.
# NOTE(review): pandas documents that auto-detection for terminals — confirm
# the effect inside a notebook.
pd.options.display.max_columns = 0

# Global matplotlib look-and-feel, applied on top of the built-in 'fast' style.
plt.style.use('fast')
plt.rcParams.update({
    # Typography.
    # NOTE(review): "font.sans-serif" is only consulted when the active family
    # is sans-serif, so Roboto is presumably unused while the family is serif —
    # confirm which one is intended.
    "font.family": ["serif"],
    "font.sans-serif": ["Roboto"],
    "font.size": 9,
    "axes.titlesize": 13,
    # Axis labels, tick labels and legend all share the same size.
    **{
        size_key: 11
        for size_key in (
            "axes.labelsize",
            "xtick.labelsize",
            "ytick.labelsize",
            "legend.fontsize",
        )
    },
    # Figure geometry and grid.
    'figure.figsize': (11.0, 5.0),
    'axes.grid': True,
    # Draw all four borders of the axes box.
    **{
        f"axes.spines.{side}": True
        for side in ("left", "right", "top", "bottom")
    },
})

# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings.
warnings.filterwarnings('ignore')

In [None]:
# Mapping from the original CSV headers to the snake_case names used in the
# rest of the notebook. Columns not listed here keep their original name.
new_column_names = {
    # Passenger / trip description
    'Gender': 'gender',
    'Customer Type': 'customer_type',
    'Age': 'age',
    'Type of Travel': 'business_travel',
    'Class': 'ticket_class',
    'Flight Distance': 'flight_distance',
    # Service ratings
    'Inflight wifi service': 'wifi_service',
    'Departure/Arrival time convenient': 'departure_arrival_time_convenient',
    'Ease of Online booking': 'online_booking',
    'Gate location': 'gate_location',
    'Food and drink': 'food_and_drink',
    'Online boarding': 'online_boarding',
    'Seat comfort': 'seat_comfort',
    'Inflight entertainment': 'inflight_entertainment',
    'On-board service': 'onboard_service',
    'Leg room service': 'leg_room',
    'Baggage handling': 'baggage_handling',
    'Checkin service': 'checkin',
    'Inflight service': 'inflight_service',
    'Cleanliness': 'cleanliness',
    # Delays
    'Departure Delay in Minutes': 'departure_delay',
    'Arrival Delay in Minutes': 'arrival_delay',
}

# Read both CSV files, using the first column of each file as the index.
train, test = (
    pd.read_csv(csv_path, index_col=[0])
    for csv_path in ('data/train.csv', 'data/test.csv')
)

# Stack the two frames, apply the new column names and index the result by 'id'.
full = (
    pd.concat([train, test], sort=False)
    .rename(columns=new_column_names)
    .set_index('id')
)

In [None]:
# Encode the two-valued categorical columns: the first label listed becomes
# '0' and the second becomes '1'.
# NOTE(review): the codes are strings, not bool/int values, so these columns
# presumably keep an object dtype — confirm that later cells expect strings.
for encoded_column, labels_in_code_order in (
    ('gender', ['Male', 'Female']),
    ('customer_type', ['disloyal Customer', 'Loyal Customer']),
    ('business_travel', ['Personal Travel', 'Business travel']),
    ('satisfaction', ['neutral or dissatisfied', 'satisfied']),
):
    full[encoded_column] = full[encoded_column].replace(
        labels_in_code_order, ['0', '1']
    )

In [None]:
# Display the (rows, columns) shape of the combined frame at this point.
full.shape

In [None]:
# Drop every row in which at least one of these service ratings equals 0.
# Per the original note, these are the service features with few (< 500) null
# values, 0 being the null marker; that count is not verifiable from this cell.
#
# Only the listed columns are compared against 0, and only once, instead of
# comparing the whole frame (string columns included) for every condition.
# NaN entries are kept, exactly as with the original `~(full == 0).column` mask.
service_columns = [
    'gate_location',
    'food_and_drink',
    'seat_comfort',
    'inflight_entertainment',
    'onboard_service',
    'checkin',
    'inflight_service',
    'cleanliness',
]
full = full[(full[service_columns] != 0).all(axis=1)]

# Display the shape after the filter to see how many rows were removed.
full.shape

In [None]:
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Per-column preprocessing.
# - Every rating column and departure_delay is scaled with MinMaxScaler.
# - Columns flagged True first run IterativeImputer(missing_values=0), i.e.
#   a 0 is treated as a missing value and imputed before scaling.
# - arrival_delay is imputed with the default IterativeImputer settings and
#   then scaled.
# - ticket_class is one-hot encoded.
# NOTE(review): each IterativeImputer here receives a single column, so it
# presumably cannot use the other features to estimate the missing value —
# confirm this is the intended behaviour.
mapper = DataFrameMapper(
    [
        (
            [rating_column],
            (
                [IterativeImputer(missing_values=0), MinMaxScaler()]
                if zero_is_missing
                else [MinMaxScaler()]
            ),
        )
        for rating_column, zero_is_missing in (
            ('wifi_service', True),
            ('departure_arrival_time_convenient', True),
            ('online_booking', True),
            ('gate_location', False),
            ('food_and_drink', False),
            ('online_boarding', True),
            ('seat_comfort', False),
            ('inflight_entertainment', False),
            ('onboard_service', False),
            ('leg_room', True),
            ('baggage_handling', False),
            ('checkin', False),
            ('inflight_service', False),
            ('cleanliness', False),
            ('departure_delay', False),
        )
    ]
    + [
        (['arrival_delay'], [IterativeImputer(), MinMaxScaler()]),
        (['ticket_class'], [OneHotEncoder()]),
    ],
    df_out=True,
)

# Fit all transformers on the complete frame.
mapper.fit(full)

In [None]:
# Draw 20 random rows to illustrate the transformation.
# No random_state is given, so a different sample is drawn on every run.
sample = full.sample(n=20)
# Display the sample as it is before transforming it.
sample

In [None]:
# Display the same sample after applying the fitted mapper.
mapper.transform(sample)

In [None]:
# Names of the features produced by the mapper.
mapper.transformed_names_


In [None]:
from sklearn.pipeline import Pipeline

# Two-step pipeline: the column mapper, followed by an IterativeImputer with
# default settings applied to the mapper's output.
pipe = Pipeline(
    steps=[
        ('mapper', mapper),
        ('imputer', IterativeImputer()),
    ]
)

# Fit the pipeline on `full`.
# NOTE(review): the original comment said it is trained with train, but the
# frame passed here is `full` — confirm which one is intended.
pipe.fit(full)

In [None]:
# Display the result of running the fitted pipeline over `full`.
pipe.transform(full)

In [None]:
# Toy IterativeImputer example.
# The imputer is configured with missing_values=0, so the 0 in the second row
# is the entry that gets imputed.
entradas = np.array([
    [20, 1, 200],
    [10, 1, 0],
    [30, 3, 1],
    [20, 2, 1],
    [10, 2, 23],
])

imputador = IterativeImputer(missing_values=0)

# Fit on the toy matrix and display the imputed result.
imputador.fit_transform(entradas)