# Proyecto

***

### Librerías

Se importan las librerías necesarias:

In [156]:
import tensorflow as tf, numpy as np, matplotlib as plt, pandas as pd, sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import svm

***
### Análisis exploratorio de datos

In [157]:
data_set = pd.read_csv("data_titanic_proyecto.csv")
data_set.head(5)

Unnamed: 0,PassengerId,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,passenger_class,passenger_sex,passenger_survived
0,1,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S,Lower,M,N
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,Upper,F,Y
2,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S,Lower,F,Y
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,Upper,F,Y
4,5,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S,Lower,M,N


In [158]:
data_shape = data_set.shape
print(data_shape)

(891, 12)


In [159]:
col_name = data_set.columns
print(col_name)

Index(['PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked', 'passenger_class', 'passenger_sex',
       'passenger_survived'],
      dtype='object')


***

### NaN

Buscando valores NaN en los features

In [160]:
data_set.isnull().sum()

PassengerId             0
Name                    0
Age                   177
SibSp                   0
Parch                   0
Ticket                  0
Fare                    0
Cabin                 687
Embarked                2
passenger_class         0
passenger_sex           0
passenger_survived      0
dtype: int64

Para las features utilizadas se dejará de tomar en cuenta la variable Cabin (687 NaN) porque tiene más del 70% de la data perdida.

In [161]:
data_set = data_set.drop('Cabin', axis = 1)
col_name = col_name.drop('Cabin')
print(x_name)
data_set.head(5)

Index(['PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Embarked', 'passenger_class', 'passenger_sex'],
      dtype='object')


Unnamed: 0,PassengerId,Name,Age,SibSp,Parch,Ticket,Fare,Embarked,passenger_class,passenger_sex,passenger_survived
0,1,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,S,Lower,M,N
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C,Upper,F,Y
2,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,S,Lower,F,Y
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,S,Upper,F,Y
4,5,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,S,Lower,M,N


Embarked tiene dos valores desconocidos y se buscará reemplazar con el valor que más veces se repite en esta columna.

In [162]:
data_set.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [163]:
data_set["Embarked"] = data_set["Embarked"].fillna("S")

En el caso Age (108 NaN) y  y se colocará la mediana de la edad.

In [164]:
median_edad = data_set["Age"].median()
data_set["Age"] = data_set["Age"].fillna(median_edad)

Nos aseguramos que en ninguna de las features tengamos valores NaN

In [165]:
data_set.isnull().sum()

PassengerId           0
Name                  0
Age                   0
SibSp                 0
Parch                 0
Ticket                0
Fare                  0
Embarked              0
passenger_class       0
passenger_sex         0
passenger_survived    0
dtype: int64

***
### Datos categóricos

Se realiza one hot encoding para las siguientes variables categoricas:
* passenger_sex
* passenger_survived (target)
* Embarked

***

Para la siguiente variable categorica es del tipo ordinal:
* passenger_class

Lo que quiere decir que tiene un orden entonces se asignará de la siguiente manera:
- Lower = 1
- Middle = 2
- Upper = 3

In [178]:
data_x = data_set.iloc[:,:-1]
data_x.head(2)

Unnamed: 0,PassengerId,Name,Age,SibSp,Parch,Ticket,Fare,Embarked,passenger_class,passenger_sex
0,1,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,S,Lower,M
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C,Upper,F


In [167]:
data_y = data_set.iloc[:,-1]

In [168]:
labelencoder = LabelEncoder()
#Aplicando one hot encoding para y
categorias = labelencoder.fit_transform(data_y)
one_hot = np.eye(len(set(data_y)))[categorias]
data_y = one_hot
print(data_y[:2])

[[1. 0.]
 [0. 1.]]


In [169]:
labels_one_hot = list(("Embarked", "passenger_sex"))
data_encoded = pd.get_dummies(data_x[labels_one_hot])
data_encoded.head(5)

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S,passenger_sex_F,passenger_sex_M
0,0,0,1,0,1
1,1,0,0,1,0
2,0,0,1,1,0
3,0,0,1,1,0
4,0,0,1,0,1


Agregando la data con one hot encoding en nuestra matriz de features

In [179]:
data_x = data_x.join(data_encoded)

In [181]:
data_x.head(2)

Unnamed: 0,PassengerId,Name,Age,SibSp,Parch,Ticket,Fare,Embarked,passenger_class,passenger_sex,Embarked_C,Embarked_Q,Embarked_S,passenger_sex_F,passenger_sex_M
0,1,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,S,Lower,M,0,0,1,0,1
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C,Upper,F,1,0,0,1,0


***
Separando el data set en:
* Training
* Validation
* Test

In [95]:

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size = 0.2, random_state = 0)

In [96]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state = 0)

In [7]:
print("------------------")
print("x_train "+str(x_train.shape))
print("y_train "+str(y_train.shape))
print("------------------")
print("x_val "+str(x_val.shape))
print("y_val "+str(y_val.shape))
print("------------------")
print("x_test "+str(x_test.shape))
print("y_test "+str(y_test.shape))
print("------------------")

------------------
x_train (569, 11)
y_train (569,)
------------------
x_val (143, 11)
y_val (143,)
------------------
x_test (179, 11)
y_test (179,)
------------------


PassengerId          0
Name                 0
Age                108
SibSp                0
Parch                0
Ticket               0
Fare                 0
Cabin              435
Embarked             1
passenger_class      0
passenger_sex        0
dtype: int64

Index(['PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Embarked', 'passenger_class', 'passenger_sex'],
      dtype='object')


S    418
C    100
Q     51
Name: Embarked, dtype: int64


Nos aseguramos no tener NaN en ninguna de las columnas.

PassengerId        0
Name               0
Age                0
SibSp              0
Parch              0
Ticket             0
Fare               0
Embarked           0
passenger_class    0
passenger_sex      0
dtype: int64

In [39]:
label_embarked = x_train['Embarked'].unique()
label_sex = x_train['passenger_sex'].unique()
label_survived = y_train.unique()
label_class = x_train['passenger_class'].unique()

print(label_sex)
print(label_embarked)
print(label_survived)
print(label_class)

['F' 'M']
['S' 'Q' 'C']
['Y' 'N']
['Upper' 'Lower' 'Middle']


In [79]:
x_onehot = x_train.copy()


Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S,passenger_sex_F,passenger_sex_M
140,1,0,0,1,0
439,0,0,1,0,1
817,1,0,0,0,1
378,1,0,0,0,1
491,0,0,1,0,1


In [54]:
def train_SVM(x, y):
    class_svm = svm.SVC(kernel='linear') # Linear Kernel
    #Train the model using the training sets
    clf.fit(X_train, y_train)
    #Predict the response for test dataset
    y_pred = clf.predict(X_test)
    return
    