In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.preprocessing import PolynomialFeatures, Binarizer, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier

In [2]:
# Vamos a cargar los datos originales para guardarlos sin cambios
train_data_original = pd.read_csv('Data/train.csv')
test_data_original = pd.read_csv('Data/test.csv')

train_data_original = train_data_original.dropna()
train_data_original.head()
data_target = train_data_original['Transported']

train_data_original.isnull().sum()


PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
dtype: int64

In [3]:
"""
Ahora trabajaremos los datos categóricos. Me interean  'HomePlanet', 'Cabin', 'Destination'

"""
numerical_keys = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
categorical_keys = ['HomePlanet', 'Cabin', 'Destination','CryoSleep', 'VIP']

data_categorica = train_data_original[categorical_keys].copy()              

# Primero manejamos la columna cabina que viene en un string Piso/numero/lado

data_cabin = data_categorica['Cabin']                       
# Separamos los valores del string
data_cabin_separada = data_cabin.str.split('/', expand=True)
# Los agregamos devuelta a los datos categoricos
data_categorica[['Piso', 'Numero','Lado']] = data_cabin_separada

# hay 3 valores unicos y valores nan
print( data_categorica['Destination'].unique() )
print(' ')
# Con esta funcion, transformo valores categoricos a 1 y 0. la ausencia de un 1 en todas las columnas indica la ultma opcion, es decir los con valor null
data_categorica = pd.get_dummies(data_categorica, columns=['Destination'])

# Ahora hago lo mismo con los datos de HomePlanet, Piso y Lado
data_categorica = pd.get_dummies(data_categorica, columns=['CryoSleep'])
data_categorica = pd.get_dummies(data_categorica, columns=['VIP'])
data_categorica = pd.get_dummies(data_categorica, columns=['HomePlanet'])
data_categorica = pd.get_dummies(data_categorica, columns=['Piso'])
data_categorica = pd.get_dummies(data_categorica, columns=['Lado'])


# Quitamos el Cabin original ya que no lo usaremos
data_categorica_final = data_categorica.drop( ['Cabin','Numero'], axis=1 )

# Se puede ver que agregamos 3 columnas con cada destino, si no corresponde a ninguna todas son 0. Lo mismo para el piso y el lado
print('Datos categoricos limpios')
print(data_categorica_final.info())

# Con esto habremos limpiado de valores nulos todos los datos categóricos
# Solo la columna Numero tiene valores nulos pero los manejaremos junto a las otra variables numéricas

['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e']
 
Datos categoricos limpios
<class 'pandas.core.frame.DataFrame'>
Index: 6606 entries, 0 to 8692
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   Destination_55 Cancri e    6606 non-null   bool 
 1   Destination_PSO J318.5-22  6606 non-null   bool 
 2   Destination_TRAPPIST-1e    6606 non-null   bool 
 3   CryoSleep_False            6606 non-null   bool 
 4   CryoSleep_True             6606 non-null   bool 
 5   VIP_False                  6606 non-null   bool 
 6   VIP_True                   6606 non-null   bool 
 7   HomePlanet_Earth           6606 non-null   bool 
 8   HomePlanet_Europa          6606 non-null   bool 
 9   HomePlanet_Mars            6606 non-null   bool 
 10  Piso_A                     6606 non-null   bool 
 11  Piso_B                     6606 non-null   bool 
 12  Piso_C                     6606 non-null   bool 
 13  Piso_D   

In [4]:
# Ahora trabajaremos los datos numéricos

data_numerica = train_data_original[numerical_keys].copy()
data_numerica['Numero Habitacion'] = data_categorica['Numero']

#data_numerica['TotalGastos'] = data_numerica['RoomService'] + data_numerica['FoodCourt'] + data_numerica['ShoppingMall'] + data_numerica['Spa']+ data_numerica['VRDeck']

# Revisaremos los datos para examinarlos
data_numerica.head()
#  Ahora solo usaremos el total de gastos como indicador de dinero gastado ya que se encuentran muy relacionados entre si



Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Numero Habitacion
0,39.0,0.0,0.0,0.0,0.0,0.0,0
1,24.0,109.0,9.0,25.0,549.0,44.0,0
2,58.0,43.0,3576.0,0.0,6715.0,49.0,0
3,33.0,0.0,1283.0,371.0,3329.0,193.0,0
4,16.0,303.0,70.0,151.0,565.0,2.0,1


In [5]:

# Ahora utilizamos un imputer para transformar los valores no validos al promedio
# Tambien estandarizaremos los datos para que tengan desviacion estandar 1 y promedio 0

imputer = SimpleImputer(strategy='median')
estandarizador = StandardScaler()

data_numerica = pd.DataFrame( imputer.fit_transform(data_numerica, data_target), columns=data_numerica.columns )

data_numerica_final = pd.DataFrame( estandarizador.fit_transform(data_numerica), columns=data_numerica.columns)

data_numerica_final.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Numero Habitacion
count,6606.0,6606.0,6606.0,6606.0,6606.0,6606.0,6606.0
mean,9.035057000000001e-17,-1.613403e-18,3.3343660000000003e-17,-1.936084e-17,2.904126e-17,1.936084e-17,6.883853000000001e-17
std,1.000076,1.000076,1.000076,1.000076,1.000076,1.000076,1.000076
min,-1.988259,-0.3457562,-0.285355,-0.3094937,-0.2737594,-0.2695339,-1.167051
25%,-0.6808293,-0.3457562,-0.285355,-0.3094937,-0.2737594,-0.2695339,-0.8437233
50%,-0.1303326,-0.3457562,-0.285355,-0.3094937,-0.2737594,-0.2695339,-0.3412026
75%,0.6266005,-0.26978,-0.2360541,-0.2574361,-0.2169377,-0.223396,0.7690175
max,3.447896,15.03554,17.4767,20.95256,19.31486,17.77392,2.521997


In [6]:
"""
Ahora voy a juntar los valores numericos iniciales y los valores categoricos luego 
de haber pasado por el one hot encoder para terminar de preparar el data set.

Vamos a estandarizar, es decir, todos los datos tienen desviacion estandar igual a 1 y promedio 0
"""
# Unimos los datos categoricos y los numericos
X = data_numerica_final
X[data_categorica_final.columns] = data_categorica_final


X_train, X_test, y_train, y_test = train_test_split(X,data_target, train_size=.75, shuffle=True)

X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4954 entries, 1140 to 772
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        4954 non-null   float64
 1   RoomService                4954 non-null   float64
 2   FoodCourt                  4954 non-null   float64
 3   ShoppingMall               4954 non-null   float64
 4   Spa                        4954 non-null   float64
 5   VRDeck                     4954 non-null   float64
 6   Numero Habitacion          4954 non-null   float64
 7   Destination_55 Cancri e    3773 non-null   object 
 8   Destination_PSO J318.5-22  3773 non-null   object 
 9   Destination_TRAPPIST-1e    3773 non-null   object 
 10  CryoSleep_False            3773 non-null   object 
 11  CryoSleep_True             3773 non-null   object 
 12  VIP_False                  3773 non-null   object 
 13  VIP_True                   3773 non-null   object 


In [7]:
"""
Primera prueba con algunos modelos usando todos los datos
"""
from sklearn.ensemble import RandomForestClassifier


modelo_random_forest_clf = RandomForestClassifier( n_estimators=200, criterion='gini', random_state=42)

modelo_random_forest_clf.fit(X_train,y_train)
modelo_random_forest_clf.score(X_test,y_test)


0.7681598062953995

In [8]:
mlp = MLPClassifier(max_iter=1000, 
                    random_state=42, 
                    activation='tanh', 
                    learning_rate_init=0.001, 
                    hidden_layer_sizes=(16,8,4,2), 
                     )
mlp.fit(X_train, y_train)
mlp.score(X_test,y_test)

ValueError: Input X contains NaN.
MLPClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
test_data_original