## Tratamiento de variables

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("data/Titanic-Dataset.csv", sep=",")
data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Columna Sex

Se cambian las etiquetas de la columna "Sex" por los valores numéricos **0** y **1**

In [4]:
data['Sex'] = np.where(data['Sex'] == 'male',0,1)

# Columna Embarked

Completamos las dos filas con datos nulos en "Embarket" por la S que es el puerto deonde subieron mayor cantidad de personas

In [5]:
data.loc[data["Embarked"].isnull(),"Embarked"] = "S"

Se establece cada etiqueta de *Embatked* como **1**, **2** y **3** en funcion a la cantidad de pasajeros que abordo el barco en cada Puerto. Mayor cantidad de pasajeros como **3** y menor cantidad como **1**

In [6]:
data['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [7]:
data['Embarked'].value_counts()

S    646
C    168
Q     77
Name: Embarked, dtype: int64

In [8]:
data['Embarked'] = data["Embarked"].replace({ "S": 3, "C":2, "Q":1})

# Columnas SibSp y Parch

In [9]:
data['SibSp'].unique()

array([1, 0, 3, 4, 2, 5, 8], dtype=int64)

In [10]:
data['Parch'].unique()

array([0, 1, 2, 5, 3, 4, 6], dtype=int64)

Como ambas columnas indican el tipo de vinculos entre pasajeros, se suman para generar una columna nueva que almacene la misma información

In [11]:
data['Flia'] = data['SibSp'] + data['Parch']

In [12]:
np.unique(data['Flia'])

array([ 0,  1,  2,  3,  4,  5,  6,  7, 10], dtype=int64)

# Columna Age

Como la columna se presenta como útil, se busca la mejor manera de completar los datos faltantes. En un paso posterior podríamos establecer rango de edades para buscar similitudes entre menos cantidad de datos

A los NaN de la variable "Edad" los reemplazamos por la media de edad en funcion a las variables clase y sexo

In [13]:
for s in range(0, 2):
    for c in range(1, 4):
        media = (data.Age[(data.Survived == s) & (data.Pclass == c) & (data.Sex == s)].mean())
        data.loc[(data.Age.isnull()) & (data.Pclass == c) & (data.Sex == s), "Age"] = media 

In [14]:
#bins = [0, 18, 24, 30, 36, 48, 100]
#names = [1, 2, 3, 4, 5, 6]
#data['Rango_Edad'] = pd.cut(data['Age'], bins, labels = names)

In [15]:
#data["Rango_Edad"] = pd.to_numeric(data["Rango_Edad"])

# Columna Cabin

In [16]:
data.Cabin.fillna("Null", inplace = True)
data['Camarote'] = data.Cabin.str.slice(0, 1)

In [17]:
data['Camarote'].unique()

array(['N', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [18]:
bins = ['N', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T']
data['Camarote'].replace(bins,[0,1,2,3,4,5,6,7,8],inplace=True)

# Columna Fare

Cambio los valores nulos de la columna Fare con la media de dicho campo agrupandolos por su Pclass y Fare

In [19]:
data["Fare"].fillna(data.groupby("Pclass")["Fare"].transform("median"), inplace=True)

Se establecen rangos de valores de los Tickets segun 10 percentiles:

In [20]:
#bins = [-1, 8, 13, 24, 38, 7225]
#names = [1, 2, 3, 4, 5]
#data['Rango_Precio'] = pd.cut(data['Fare'], bins, labels = names)

In [21]:
#data["Rango_Precio"] = pd.to_numeric(data["Rango_Precio"])

In [22]:
#data["Rango_Precio"].unique()

In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int32  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    object 
 11  Embarked     891 non-null    int64  
 12  Flia         891 non-null    int64  
 13  Camarote     891 non-null    int64  
dtypes: float64(2), int32(1), int64(8), object(3)
memory usage: 94.1+ KB


# Columna Ticket

In [24]:
data['Ticket'] = list(map(str, data['Ticket'])) #Convierte la lista con elementos enteros en elementos string
data['Ticket'] = [i[-4:] for i in data['Ticket']] #Recorre los elementos de la lista, y les quita los ultimos 4 digitos

In [25]:
data['Ticket'] = data['Ticket'].replace({ "LINE": 1010, "P. 3":2112})

In [26]:
data['Ticket'].unique()

array(['1171', '7599', '1282', '3803', '3450', '0877', '7463', '9909',
       '7742', '7736', '9549', '3783', '2151', '7082', '0406', '8706',
       '2652', '4373', '5763', '2649', '9865', '8698', '0923', '3788',
       '7077', '2631', '9950', '0959', '9216', '7601', '7569', '5677',
       '4579', '7604', '3789', '2677', '2152', '5764', '2651', '7546',
       '1668', '9253', '2123', '0958', '3567', '0371', '4311', '2662',
       '9237', '1295', '9886', '7572', '2926', '3509', '9947', '1026',
       '2697', '4651', '2144', '2669', '3572', '6973', '7088', '7605',
       '2661', '9395', '3464', '1281', '5151', '3111', '4879', '2680',
       '1601', '8123', '9208', '4746', '8738', '4516', '5767', '5779',
       '0932', '3059', '4885', '1278', '6608', '2086', '3275', '3276',
       '7466', '5734', '2315', '4500', '4910', '7754', '7759', '1919',
       '4367', '9245', '9215', '5281', '7540', '1276', '9207', '3120',
       '2991', '9249', '1110', '0465', '2665', '4669', '4136', '2627',
      

In [27]:
data['Ticket'] = list(map(int, data['Ticket'])) #Devuelve los elementos de la lista a su estado de entero

In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int32  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    int64  
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    object 
 11  Embarked     891 non-null    int64  
 12  Flia         891 non-null    int64  
 13  Camarote     891 non-null    int64  
dtypes: float64(2), int32(1), int64(9), object(2)
memory usage: 94.1+ KB


# Columnas Name, Cabin, Age, Parch, Fare

Se eliminan las columnas *Name* por tener valores unicos y *Cabin* por tener demasiados datos faltantes

In [29]:
data = data.drop(['Name','SibSp','Parch','Cabin','Camarote','PassengerId'], axis = 1)

In [30]:
data

Unnamed: 0,Survived,Pclass,Sex,Age,Ticket,Fare,Embarked,Flia
0,0,3,0,22.000000,1171,7.2500,3,1
1,1,1,1,38.000000,7599,71.2833,2,1
2,1,3,1,26.000000,1282,7.9250,3,0
3,1,1,1,35.000000,3803,53.1000,3,1
4,0,3,0,35.000000,3450,8.0500,3,0
...,...,...,...,...,...,...,...,...
886,0,2,0,27.000000,1536,13.0000,3,0
887,1,1,1,19.000000,2053,30.0000,3,0
888,0,3,1,19.329787,6607,23.4500,3,3
889,1,1,0,26.000000,1369,30.0000,2,0


Se normaliza la columna Fare para evitar desvalance en el peso de los valores de las columnas

In [31]:
#data['Camarote'] = data['Camarote'] /data['Camarote'].abs().max()
data['Embarked'] = data['Embarked'] /data['Embarked'].abs().max()
data['Ticket'] = data['Ticket'] /data['Ticket'].abs().max()
data['Pclass'] = data['Pclass'] /data['Pclass'].abs().max()
#data['SibSp'] = data['SibSp'] /data['SibSp'].abs().max()
#data['Parch'] = data['Parch'] /data['Parch'].abs().max()
data['Fare'] = data['Fare'] /data['Fare'].abs().max()
data['Flia'] = data['Flia'] /data['Flia'].abs().max()
data['Age'] = data['Age'] /data['Age'].abs().max()


In [32]:
data

Unnamed: 0,Survived,Pclass,Sex,Age,Ticket,Fare,Embarked,Flia
0,0,1.000000,0,0.275000,0.117147,0.014151,1.000000,0.1
1,1,0.333333,1,0.475000,0.760204,0.139136,0.666667,0.1
2,1,1.000000,1,0.325000,0.128251,0.015469,1.000000,0.0
3,1,0.333333,1,0.437500,0.380452,0.103644,1.000000,0.1
4,0,1.000000,0,0.437500,0.345138,0.015713,1.000000,0.0
...,...,...,...,...,...,...,...,...
886,0,0.666667,0,0.337500,0.153661,0.025374,1.000000,0.0
887,1,0.333333,1,0.237500,0.205382,0.058556,1.000000,0.0
888,0,1.000000,1,0.241622,0.660964,0.045771,1.000000,0.3
889,1,0.333333,0,0.325000,0.136955,0.058556,0.666667,0.0


In [33]:
data.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,Ticket,Fare,Embarked,Flia
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,0.769547,0.352413,0.367371,0.471071,0.062858,0.879536,0.09046
std,0.486592,0.27869,0.47799,0.167848,0.29434,0.096995,0.211891,0.161346
min,0.0,0.333333,0.0,0.00525,0.002501,0.0,0.333333,0.0
25%,0.0,0.666667,0.0,0.2625,0.214486,0.01544,0.666667,0.0
50%,0.0,1.0,0.0,0.340698,0.416166,0.028213,1.0,0.0
75%,1.0,1.0,1.0,0.45,0.747199,0.060508,1.0,0.1
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Preparacion de datos para incluirlos en los modelos

In [34]:
X = data.drop('Survived', axis = 1)
y = data['Survived']

In [35]:
X

Unnamed: 0,Pclass,Sex,Age,Ticket,Fare,Embarked,Flia
0,1.000000,0,0.275000,0.117147,0.014151,1.000000,0.1
1,0.333333,1,0.475000,0.760204,0.139136,0.666667,0.1
2,1.000000,1,0.325000,0.128251,0.015469,1.000000,0.0
3,0.333333,1,0.437500,0.380452,0.103644,1.000000,0.1
4,1.000000,0,0.437500,0.345138,0.015713,1.000000,0.0
...,...,...,...,...,...,...,...
886,0.666667,0,0.337500,0.153661,0.025374,1.000000,0.0
887,0.333333,1,0.237500,0.205382,0.058556,1.000000,0.0
888,1.000000,1,0.241622,0.660964,0.045771,1.000000,0.3
889,0.333333,0,0.325000,0.136955,0.058556,0.666667,0.0


In [36]:
from sklearn.model_selection import train_test_split

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
X_val_train, X_val_test, y_val_train, y_val_test = train_test_split(X_train, y_train, test_size=0.2, random_state=123)

In [38]:
X_train.shape, y_train.shape, X_val_train.shape, y_val_train.shape, X_val_test.shape, y_val_test.shape

((712, 7), (712,), (569, 7), (569,), (143, 7), (143,))

In [39]:
from sklearn import linear_model
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

In [40]:
max_iter = np.arange(2,20)

In [41]:
df_rLog = pd.DataFrame(columns=['Accuracy','Max_Iter'])
for i in max_iter:
    rLog = linear_model.LogisticRegression(max_iter = i, fit_intercept = True, class_weight='balanced', n_jobs = -1, random_state = 123)
    rLog.fit(X_val_train, y_val_train)
    y_pred = rLog.predict(X_val_test)
    acc_rLog = accuracy_score(y_val_test, y_pred)
    df_rLog = df_rLog.append({'Accuracy': acc_rLog,'Max_Iter': i}, ignore_index=True)

In [42]:
df_rLog.nlargest(5,['Accuracy'])

Unnamed: 0,Accuracy,Max_Iter
0,0.804196,2.0
1,0.804196,3.0
2,0.804196,4.0
3,0.804196,5.0
8,0.797203,10.0


In [43]:
from sklearn.neighbors import KNeighborsClassifier

In [44]:
prof = np.arange(5,30)
weights = ['uniform','distance']

In [45]:
df_knn = pd.DataFrame(columns=['Accuracy','Cant_Vecinos','Weights'])
for i in prof:
    for j in weights:
        knn = KNeighborsClassifier(n_neighbors = i, weights = j)
        knn.fit(X_val_train, y_val_train)
        y_pred_knn = knn.predict(X_val_test)
        acc_knn = accuracy_score(y_val_test, y_pred_knn)
        df_knn = df_knn.append({'Accuracy': acc_knn,'Cant_Vecinos': i, 'Weights':j}, ignore_index=True)

In [46]:
df_knn.nlargest(5,['Accuracy'])

Unnamed: 0,Accuracy,Cant_Vecinos,Weights
26,0.839161,18,uniform
16,0.832168,13,uniform
18,0.832168,14,uniform
19,0.832168,14,distance
24,0.832168,17,uniform


In [47]:
from sklearn.tree import DecisionTreeClassifier

In [48]:
prof = np.arange(1,10)
crit = ['gini','entropy']
spli = ['best','random']

In [49]:
df_tree = pd.DataFrame(columns=['Accuracy','Max_Prof','Criterion','Splitter'])
for i in prof:
    for j in crit:
        for k in spli:
            tree = DecisionTreeClassifier(max_depth = i, criterion = j, splitter = k, random_state = 123)
            tree.fit(X_val_train, y_val_train)
            y_pred_tree = tree.predict(X_val_test)
            acc_tree = accuracy_score(y_val_test, y_pred_tree)
            df_tree = df_tree.append({'Accuracy': acc_tree, 'Max_Prof': i, 'Criterion':j, 'Splitter':k}, ignore_index=True)

In [50]:
df_tree.nlargest(5,['Accuracy'])

Unnamed: 0,Accuracy,Max_Prof,Criterion,Splitter
27,0.853147,7,entropy,random
18,0.839161,5,entropy,best
14,0.832168,4,entropy,best
26,0.832168,7,entropy,best
35,0.832168,9,entropy,random


# Ensambles

### Random Forest

In [51]:
from sklearn.ensemble import RandomForestClassifier

In [52]:
prof = np.arange(2,10)
crit = ['gini','entropy']
esti = np.arange(2,102,5)

In [53]:
df_rf = pd.DataFrame(columns=['Accuracy','Max_Prof','Criterion','n_Estimators'])
for i in prof:
    for j in crit:
        for k in esti:
            rf = RandomForestClassifier(max_depth=i, criterion = j, n_estimators= k, random_state=123)
            rf.fit(X_val_train, y_val_train)
            y_pred_rf = rf.predict(X_val_test)
            acc_rf = accuracy_score(y_val_test, y_pred_rf)
            df_rf = df_rf.append({'Accuracy': acc_rf,'Max_Prof': i,'Criterion': j,'n_Estimators': k}, ignore_index=True)

In [54]:
df_rf.nlargest(5,['Accuracy'])

Unnamed: 0,Accuracy,Max_Prof,Criterion,n_Estimators
272,0.874126,8,entropy,62
167,0.867133,6,gini,37
182,0.867133,6,entropy,12
184,0.867133,6,entropy,22
187,0.867133,6,entropy,37


### Gradient Boosting

In [55]:
from sklearn.ensemble import GradientBoostingClassifier

In [56]:
prof = np.arange(2,10)
esti = np.arange(2,52,5)

In [57]:
df_gb = pd.DataFrame(columns=['Accuracy','Max_Prof','n_Estimators'])
for i in prof:
    for k in esti:
            gb = GradientBoostingClassifier(max_depth=i, n_estimators= k, random_state = 123)
            gb.fit(X_val_train, y_val_train)
            y_pred_gb = gb.predict(X_val_test)
            acc_gb = accuracy_score(y_val_test, y_pred_gb)
            df_gb = df_gb.append({'Accuracy': acc_gb,'Max_Prof': i,'n_Estimators': k}, ignore_index=True)

In [58]:
df_gb.nlargest(5,['Accuracy'])

Unnamed: 0,Accuracy,Max_Prof,n_Estimators
18,0.874126,3.0,42.0
13,0.867133,3.0,17.0
14,0.867133,3.0,22.0
19,0.867133,3.0,47.0
16,0.86014,3.0,32.0


### XGBoost

In [59]:
from xgboost import XGBClassifier

In [60]:
learning_rate = [0.07,0.08,0.09,0.1,0.15,0.2,0.3]
max_depth = np.arange(2,52,10)
n_estimators = np.arange(15,155,5)
subsample = [0.1,0.25,0.5,0.75]
colsample_bytree = [0.1,0.25,0.5,0.75]

In [61]:
df_xgbo = pd.DataFrame(columns=['Accuracy','Max_Prof','n_Estimators','Learning_Rate','Subsample','Colsample_bytree'])
for i in max_depth:
    for j in n_estimators:
        for k in learning_rate:
            for l in subsample:
                for m in colsample_bytree:
                    xgbo = XGBClassifier(max_depth = i, n_estimators = j, learning_rate = k, subsample = l, colsample_bytree = m, nthread = -1, 
                    objective = "binary:logistic", eval_metric = "logloss", verbosity = None, use_label_encoder = False)
                    xgbo.fit(X_val_train, y_val_train)#, eval_set = [(X_val_train, y_val_train), (X_val_test, y_val_test)], early_stopping_rounds = 10, eval_metric = 'logloss')
                    y_pred_xgbo = xgbo.predict(X_val_test)
                    acc_xgbo = accuracy_score(y_val_test, y_pred_xgbo)
            df_xgbo = df_xgbo.append({'Accuracy':acc_xgbo,'Max_Prof':i,'n_Estimators':j,'Learning_Rate':k, 'Subsample':l,'Colsample_bytree':m}, ignore_index=True)

In [62]:
df_xgbo.nlargest(5,['Accuracy'])

Unnamed: 0,Accuracy,Max_Prof,n_Estimators,Learning_Rate,Subsample,Colsample_bytree
196,0.874126,12.0,15.0,0.07,0.75,0.75
199,0.874126,12.0,15.0,0.1,0.75,0.75
200,0.874126,12.0,15.0,0.15,0.75,0.75
207,0.874126,12.0,20.0,0.15,0.75,0.75
220,0.874126,12.0,30.0,0.1,0.75,0.75
