In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

Se utilizará un Dataset que expresa la satifacción de los pasajeros de diferentes vuelos según las caracteristicas del mismo.

Variables | Descripción
-----------|----------
Gender| Gender of the passengers (Female, Male)
Customer Type| The customer type (Loyal customer, disloyal customer)
Age| The actual age of the passengers
Type of Travel| Purpose of the flight of the passengers (Personal Travel, Business Travel)
Class| Travel class in the plane of the passengers (Business, Eco, Eco Plus)
Flight distance| The flight distance of this journey
Inflight wifi service| Satisfaction level of the inflight wifi service (0:Not Applicable;1-5)
Departure/Arrival time convenient| Satisfaction level of Departure/Arrival time convenient
Ease of Online booking| Satisfaction level of online booking
Gate location| Satisfaction level of Gate location
Food and drink|Satisfaction level of Food and drink
Online boarding|Satisfaction level of online boarding
Seat comfort| Satisfaction level of Seat comfort
Inflight entertainment| Satisfaction level of inflight entertainment
On-board service| Satisfaction level of On-board service
Leg room service| Satisfaction level of Leg room service
Baggage handling| Satisfaction level of baggage handling
Check-in service| Satisfaction level of Check-in service
Inflight service| Satisfaction level of inflight service
Cleanliness| Satisfaction level of Cleanliness
Departure Delay in Minutes| Minutes delayed when departure
Arrival Delay in Minutes| Minutes delayed when Arrival
Satisfaction| Airline satisfaction level(Satisfaction, neutral or dissatisfaction)

Corroboramos que no existan valores nulos.

In [28]:
test.isnull().sum()

Unnamed: 0                            0
id                                    0
Gender                                0
Customer Type                         0
Age                                   0
Type of Travel                        0
Class                                 0
Flight Distance                       0
Inflight wifi service                 0
Departure/Arrival time convenient     0
Ease of Online booking                0
Gate location                         0
Food and drink                        0
Online boarding                       0
Seat comfort                          0
Inflight entertainment                0
On-board service                      0
Leg room service                      0
Baggage handling                      0
Checkin service                       0
Inflight service                      0
Cleanliness                           0
Departure Delay in Minutes            0
Arrival Delay in Minutes             83
satisfaction                          0


In [29]:
test = test.dropna();
train = train.dropna();

test.drop(['Unnamed: 0'], axis=1, inplace=True);
train.drop(['Unnamed: 0'], axis=1, inplace=True);

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.drop(['Unnamed: 0'], axis=1, inplace=True);
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop(['Unnamed: 0'], axis=1, inplace=True);


In [30]:
test.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,3,...,5,5,5,5,2,5,5,50,44.0,satisfied
1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,3,...,4,4,4,4,3,4,5,0,0.0,satisfied
2,12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,2,...,2,4,1,3,2,2,2,0,0.0,neutral or dissatisfied
3,77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,0,...,1,1,1,1,3,1,4,0,6.0,satisfied
4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,4,...,2,2,2,2,4,2,4,0,20.0,satisfied


#Feature Engineering

In [31]:
full= pd.concat([train,test], axis=0)
full.tail()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
25971,78463,Male,disloyal Customer,34,Business travel,Business,526,3,3,3,...,4,3,2,4,4,5,4,0,0.0,neutral or dissatisfied
25972,71167,Male,Loyal Customer,23,Business travel,Business,646,4,4,4,...,4,4,5,5,5,5,4,0,0.0,satisfied
25973,37675,Female,Loyal Customer,17,Personal Travel,Eco,828,2,5,1,...,2,4,3,4,5,4,2,0,0.0,neutral or dissatisfied
25974,90086,Male,Loyal Customer,14,Business travel,Business,1127,3,3,3,...,4,3,2,5,4,5,4,0,0.0,satisfied
25975,34799,Female,Loyal Customer,42,Personal Travel,Eco,264,2,5,2,...,1,1,2,1,1,1,1,0,0.0,neutral or dissatisfied


#Vuelo de larga distancia
Partiendo de que se considera un vuelo de larga distancia a aquellos que duran mas de 7 horas, se tiene en cuenta que la distancia aproximada es de unos 5000km. Por lo tanto se define una nueva variable que plantea si el vuelo es o no de larga distancia.

In [32]:
full['distancia_vuelo'] = full['Flight Distance'].apply(lambda x: 0 if x < 5000 else 1)



---



# Encoding


Se aplica ONE HOT ENCODING a las variables categoricas relevantes.

In [33]:
# Lista de columnas categóricas a codificar
columnas_categoricas = ['Gender','Customer Type','Type of Travel','Class','satisfaction', 'distancia_vuelo']

# Se aplica one-hot encoding a las columnas categóricas pertinentes
dummies = pd.get_dummies(full[columnas_categoricas])

full_v2 = pd.concat([full, dummies], axis=1)

full_v2.drop(columnas_categoricas, axis=1, inplace=True)

full_v2

Unnamed: 0,id,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,...,Gender_Male,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus,satisfaction_neutral or dissatisfied,satisfaction_satisfied
0,70172,13,460,3,4,3,1,5,3,5,...,1,1,0,0,1,0,0,1,1,0
1,5047,25,235,3,2,3,3,1,3,1,...,1,0,1,1,0,1,0,0,1,0
2,110028,26,1142,2,2,2,2,5,5,5,...,0,1,0,1,0,1,0,0,0,1
3,24026,25,562,2,5,5,5,2,2,2,...,0,1,0,1,0,1,0,0,1,0
4,119299,61,214,3,3,3,3,4,5,5,...,1,1,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25971,78463,34,526,3,3,3,1,4,3,4,...,1,0,1,1,0,1,0,0,1,0
25972,71167,23,646,4,4,4,4,4,4,4,...,1,1,0,1,0,1,0,0,0,1
25973,37675,17,828,2,5,1,5,2,1,2,...,0,1,0,0,1,0,1,0,1,0
25974,90086,14,1127,3,3,3,3,4,4,4,...,1,1,0,1,0,1,0,0,0,1


A continuación se procede a escalar los datos. Para esto, se prescindirá de los datos del ID

In [34]:
full_v2.drop(['id'], axis=1, inplace=True)

scaler = StandardScaler()

data_scaled = scaler.fit_transform(full_v2)

---

#Predicción

Se definen las variables X e Y para entrenar posteriormente. Nuestra feature objetivo será la variable "satisfaction_satisfied".

In [35]:
X = data_scaled # feature matrix
Y = full_v2['satisfaction_satisfied'] # feature objetivo

Se crea la siguiente función que se utiliza para dividir un conjunto de datos en dos subconjuntos aleatorios llamados conjuntos de entrenamiento y prueba. Se utiliza para entrenar y evaluar modelos de aprendizaje automático.

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.4, random_state=42)

Se aplica los modelos de predicción de Regresión Logistica, Arbol de decisiones y Random Forest.

In [37]:
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)

# Decision Tree
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
y_pred_dtree = dtree.predict(X_test)

# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Evaluate the performance of each algorithm
def evaluate(y_test, y_pred):
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred)
}

logreg_metrics = evaluate(y_test, y_pred_logreg)    
dtree_metrics = evaluate(y_test, y_pred_dtree)
rf_metrics = evaluate(y_test, y_pred_rf)

# Compare the performance of each algorithm
print("Logistic Regression Metrics: ", logreg_metrics)
print("Decision Tree Metrics: ", dtree_metrics)
print("Random Forest Metrics: ", rf_metrics)

Logistic Regression Metrics:  {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0}
Decision Tree Metrics:  {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0}
Random Forest Metrics:  {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0}
