Titanic - Machine Learning from Disaster

# Crear Ambiente de Trabajo (Opcional)

Esto es una buena práctica en caso de trabajar de forma local. Si se va a seguir el diplomado en un notebook de Google Colab, se puede saltar esta sección ([Referencia](https://docs.python.org/3/library/venv.html)).

1. Crear `requirements.txt` con las siguientes librerías:
  - pandas
  - numpy
  - scikit-learn
  - plotly
  - tqdm
2. Inicializar ambiente virtual (en terminal) `python -m venv /path/to/new/virtual/environment`
3. Activar el ambiente virtual (en terminal) con
  - **Windows**: `<venv>\Scripts\activate.ps1`
  - **Mac & Linux**: `source <venv>/bin/activate`
4. Instalar las librerías (en terminal) `pip install -r requirements.txt`

## Librerías

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression
from sklearn.neural_network import MLPRegressor
from tqdm import tqdm
from copy import deepcopy

# Cargar Datos

Los datos vienen de la siguiente [fuente](https://www.kaggle.com/competitions/titanic/data). Es importante leer la descripción de las variables antes de comenzar con la exploración.


In [2]:
# Link que te sale al compartir archivo
data_url = 'https://drive.google.com/file/d/18N9Sl1T0dORwXlQ8gfy6dgAtPp3l5sDK/view?usp=drive_link'

# Extraer el id de archivo
file_id = data_url.split('/')[-2]

# Crear link de descarga
url = 'https://drive.google.com/uc?id=' + file_id

# Leer en Data Frame
data_orig = pd.read_csv(url)
data_orig

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Explorar Datos

Usaremos la librería [plotly](https://plotly.com/python/) para visualizaciones.

In [3]:
fig = px.histogram(x=data_orig['Survived'], histnorm='probability')
fig.show()

## Valores Nulos Preliminares

In [4]:
missing = data_orig.isna().sum() / len(data_orig)

fig = px.bar(x=missing.index, y=missing)

fig.show()

## Exploración de Variables

In [5]:
def plot_histogram(data: pd.DataFrame, variable: str, v_sort: str = None):
  if not v_sort:
    v_sort = variable

  # Histogram
  fig = px.histogram(
      data_frame=data.sort_values(v_sort),  # Data Frame que tiene los datos
      x=variable,                           # Variable a graficar
      color='Survived',                     # Variable que divide por colores
      # opacity=0.5,                        # Opacidad (1 = opaco, 0 = transparente)
      histnorm='probability',               # Presentación de y. Conteo, probabilidad, etc.
      barmode='overlay'                     # Apilar o mostrar por separado los histogramas
    )
  fig.update_layout(title=f'Histograma de "{variable}"')

  return fig


def plot_2d_histogram(
    data: pd.DataFrame,
    x_name: str,
    y_name: str,
    x_sort: str = None,
    y_sort: str = None
  ):

  if not x_sort:
    x_sort = x_name
  if not y_sort:
    y_sort = y_name


  # Ordenar dataframe para variables categóricas
  sorted = data.sort_values(y_sort).copy()
  sorted = sorted.sort_values(x_sort)
  survived_bool = sorted['Survived'] == 1
  x = sorted[x_name]
  y = sorted[y_name]

  # Histograma 2D
  fig = make_subplots(
    rows=1, cols=2,
    subplot_titles = ['Sobrevivieron', 'No Sobrevivieron']
  )

  # Para los que sobrevivieron
  fig.add_trace(
    go.Histogram2d(
      x=x[survived_bool],
      y=y[survived_bool],
      histnorm='probability',
      coloraxis='coloraxis'
    ),
    row=1,
    col=1
  )

  # Para los que no sobrevivieron
  fig.add_trace(
    go.Histogram2d(
      x=x[~survived_bool],
      y=y[~survived_bool],
      histnorm='probability',
      coloraxis='coloraxis'
    ),
    row=1,
    col=2
  )

  # Cambiar títulos
  fig.update_layout(title=f'Histograma 2D de "{x_name}" y "{y_name}"')
  fig.update_xaxes(title_text=x_name, row=1, col=1)
  fig.update_xaxes(title_text=x_name, row=1, col=2)
  fig.update_yaxes(title_text=y_name, row=1, col=1)
  fig.update_yaxes(title_text=y_name, row=1, col=2)

  return fig

In [6]:
fig = plot_histogram(data_orig, 'Sex')
fig.show()

In [7]:
fig = plot_2d_histogram(data_orig, 'Age', 'Sex')
fig.show()

## Transformación de Variables

In [8]:
def dummify(df: pd.DataFrame, target: str, target_set: set = None):
  if not target_set:
    target_set = df[target].dropna().unique()

  df_copy = df.copy()

  for item in target_set:
    df_copy[target + '_' + item] = df_copy[target].str.contains(item, regex=False).astype(float)

  return df_copy

In [9]:
data = data_orig.copy()

# Cambiar Sexo a entero
data['Sex_int'] = (data['Sex'] == 'female').astype(int)

# Crear dummies para 'Embarked'
data = dummify(data, 'Embarked')

# Eliminar columnas originales
data.drop(columns=['Sex', 'Embarked', 'Cabin'], inplace=True)

data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Sex_int,Embarked_S,Embarked_C,Embarked_Q
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,0,1.0,0.0,0.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,1,0.0,1.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,1,1.0,0.0,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,1,1.0,0.0,0.0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,0,1.0,0.0,0.0
887,888,1,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,1,1.0,0.0,0.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",,1,2,W./C. 6607,23.4500,1,1.0,0.0,0.0
889,890,1,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,0,0.0,1.0,0.0


In [10]:
# Número de ticket
filter = data['Ticket'] != 'LINE'
data.loc[filter, 'log_ticket_number'] = data.loc[filter, 'Ticket'].apply(lambda x: x.split(' ')[-1])
data['log_ticket_number'] = data['log_ticket_number'].astype(float)
data['log_ticket_number'] = np.log(data['log_ticket_number'])

plot_histogram(data, 'log_ticket_number')

In [11]:
# Prefijo de ticket
filter = data['Ticket'].apply(lambda x: len(x.split(' '))) > 1
data.loc[filter, 'ticket_prefix'] = data.loc[filter, 'Ticket'].apply(lambda x: x.split(' ')[0])
data['ticket_prefix'].fillna('NONE', inplace=True)

# Mayúsculas y quitar puntos
data['ticket_prefix'] = data['ticket_prefix'].str.upper().str.replace('.', '', regex=False)

# Reemplazar 'A/#' por 'A#'
repl = lambda m: 'A' + m.group(1)
data['ticket_prefix'] = data['ticket_prefix'].str.replace('A/(\d{1})', repl, regex=True)

# Crear variables dummy
prefix_set = set(['LINE'])
for prefix in data['ticket_prefix'].dropna().unique():
  prefix_set = prefix_set.union(set(prefix.split('/')))

data = dummify(data, 'ticket_prefix', prefix_set)

# Eliminar columnas extras
data.drop(columns=['Ticket', 'ticket_prefix'], inplace=True)

data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Fare,Sex_int,Embarked_S,...,ticket_prefix_O,ticket_prefix_FA,ticket_prefix_FC,ticket_prefix_A4,ticket_prefix_SOTON,ticket_prefix_SW,ticket_prefix_SCO,ticket_prefix_STON,ticket_prefix_WE,ticket_prefix_OQ
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,7.2500,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,71.2833,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,7.9250,1,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,53.1000,1,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,8.0500,0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",27.0,0,0,13.0000,0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
887,888,1,1,"Graham, Miss. Margaret Edith",19.0,0,0,30.0000,1,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",,1,2,23.4500,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
889,890,1,1,"Behr, Mr. Karl Howell",26.0,0,0,30.0000,0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Extraer honorífico
data['Honorific'] = data['Name'].str.extract('( \w+\.)')[0].str.replace('.', '', regex=False)

# Crear variable dummy
data = dummify(data, 'Honorific')

# Eliminar columnas extras
data.drop(columns=['Name', 'Honorific'], inplace=True)

data

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_int,Embarked_S,Embarked_C,...,Honorific_ Mme,Honorific_ Ms,Honorific_ Major,Honorific_ Lady,Honorific_ Sir,Honorific_ Mlle,Honorific_ Col,Honorific_ Capt,Honorific_ Countess,Honorific_ Jonkheer
0,1,0,3,22.0,1,0,7.2500,0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,1,38.0,1,0,71.2833,1,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1,3,26.0,0,0,7.9250,1,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,1,1,35.0,1,0,53.1000,1,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0,3,35.0,0,0,8.0500,0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.0,0,0,13.0000,0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
887,888,1,1,19.0,0,0,30.0000,1,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
888,889,0,3,,1,2,23.4500,1,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
889,890,1,1,26.0,0,0,30.0000,0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Manejar Valores Nulos

In [13]:
missing = data.isna().sum() / len(data)

fig = px.bar(x=missing.index, y=missing)

fig.show()

In [14]:
# log_ticket_number
data['log_ticket_number'].fillna(0, inplace=True)

In [15]:
# Embarked
data[f'Embarked_Q'].fillna(0, inplace=True)
data[f'Embarked_C'].fillna(0, inplace=True)
data[f'Embarked_S'].fillna(1, inplace=True)

In [16]:
missing = data.isna().sum() / len(data)

fig = px.bar(x=missing.index, y=missing)

fig.show()

### Modelo para llenado de Edad

In [17]:
def get_cross_validation_batch(
    X_orig: pd.DataFrame|np.ndarray,
    y_orig: pd.DataFrame|np.ndarray,
    test_size: int,
    n_batch: int
  ):

  # Copiar valores
  X = X_orig.copy()
  y = y_orig.copy()

  # Transformar a numpy
  try:
    X = X.to_numpy()
  except AttributeError:
    pass
  try:
    y = y.to_numpy()
  except AttributeError:
    pass


  # Calcular inicio y final del segmento
  start = test_size * n_batch
  stop = min(test_size * (n_batch + 1), len(y))

  # Dividir entre train y test
  X_train = np.concatenate([X[:start], X[stop:]])
  y_train = np.concatenate([y[:start], y[stop:]])

  X_test = X[start: stop]
  y_test = y[start: stop]

  return X_train, y_train, X_test, y_test

In [18]:
age_filter = data['Age'].notna()

X = data[age_filter].drop(columns=['Survived', 'Age'])
y = data.loc[age_filter, 'Age']

test_size = 20
iters = int(np.ceil(len(y) / test_size))
diff = np.array([])
for n_batch in tqdm(range(iters)):

  # Dividir entre train y test
  X_train, y_train, X_test, y_test = get_cross_validation_batch(X, y, test_size, n_batch)

  # Declarar y entrenar modelo
  reg = LinearRegression() ## Modelo a cambiar
  reg.fit(X_train, y_train)

  # Calcular differencia
  y_pred = reg.predict(X_test)
  # y_pred = y_train.mean()
  y_pred = np.clip(y_pred, 0, 100)
  diff = np.concatenate([diff, np.abs(y_test - y_pred)])

print(f'\nµ: {diff.mean():.2f}')
print(f'σ: {diff.std():.2f}')

# Histogram
fig = px.histogram(
    x=diff,
    histnorm='probability',
  )
fig.update_layout(title=f'Errores en la predicción de "Age"')

fig.show()

100%|██████████| 36/36 [00:00<00:00, 49.33it/s]



µ: 9.08
σ: 7.38


In [19]:
reg = LinearRegression().fit(X, y)

data.loc[~age_filter, 'Age'] = reg.predict(data[~age_filter].drop(columns=['Survived', 'Age']))
data['Age_is_predicted'] = 0
data.loc[~age_filter, 'Age_is_predicted'] = 1

missing = data.isna().sum() / len(data)

fig = px.bar(x=missing.index, y=missing)

fig.show()

# Creando el modelo

In [20]:
def evaluate_model(X: pd.DataFrame, y: pd.DataFrame, model):
  test_size = 20
  iters = int(np.ceil(len(y) / test_size))
  y_pred = np.array([])
  for n_batch in tqdm(range(iters)):

    # Dividir entre train y test
    X_train, y_train, X_test, y_test = get_cross_validation_batch(X, y, test_size, n_batch)

    # Declarar y entrenar modelo
    reg = deepcopy(model)
    reg.fit(X_train, y_train)

    # Calcular differencia
    y_pred = np.concatenate([y_pred, reg.predict(X_test)])

  fig = px.density_heatmap(x=y, y=y_pred, histnorm='probability', range_color=[0,1])
  fig.update_layout(
      title='Matriz de Confusión',
      xaxis_title='Original',
      yaxis_title='Predicción'
      )

  accuracy = sum(y_pred == y) / len(y) * 100
  return fig, accuracy

In [21]:
X = data.drop(columns=['Survived'])
y = data['Survived']
model = LogisticRegression(max_iter=10000)

fig, accuracy = evaluate_model(X, y, model)

print(f'\n\nAccuracy: {accuracy:.2f}%')
fig.show()

100%|██████████| 45/45 [00:52<00:00,  1.18s/it]



Accuracy: 82.72%



