In [273]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [274]:
# Load both CSV files from the local data/ directory.
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

## Train/Test Split

In [298]:
# Titles kept as their own category; every other title is bucketed as 'Rare'.
name_prefixes = ['Mr.', 'Miss.', 'Mrs.']

# Extract the first "Word." token (two or more word characters followed by a dot)
# from each name. The vectorised .str.extract needs no `re` import (none is
# visible in this notebook) and yields NaN instead of raising when a name
# contains no such token; NaN then falls into the 'Rare' bucket below.
train_data['Name_prefix'] = train_data.Name.str.extract(r'(\w{2,}\.)', expand=False)
train_data.loc[~train_data.Name_prefix.isin(name_prefixes), 'Name_prefix'] = 'Rare'

from sklearn.model_selection import train_test_split

# Hold out 25% of the labelled rows for evaluation; fixed seed for reproducibility.
train, test = train_test_split(train_data, test_size=0.25, random_state=0)

## Data Analysis

In [276]:
# Heatmap of the boolean missing-value mask of the training split.
missing_mask = train.isna()
sns.heatmap(missing_mask)

In [277]:
# Frequency of each SibSp value in the training split.
train['SibSp'].value_counts()

## Interesting Themes

- Wealthy survive?
- Age vs. ticket price?
- Young and wealthy variable?
- Total spent?

In [278]:
# Numeric columns to correlate with the target. Defined here so the cell runs on
# a fresh kernel instead of relying on a leftover `num_cols` variable.
num_cols = ['Age', 'Fare', 'SibSp', 'Parch']

# Pairwise correlations between the numeric columns and Survived, annotated with values.
sns.heatmap(train.loc[:, num_cols + ['Survived']].corr(), annot=True)

In [279]:
# Row count and mean fare for each passenger class.
train.groupby('Pclass')['Fare'].agg(['count', 'mean'])

In [280]:
# Fare plotted against Age for the training split.
sns.scatterplot(data=train, x='Age', y='Fare')

## Notes for Feature Engineering

- Pclass and Fare are somewhat correlated; should we exclude one of them?
- Parch and SibSp are somewhat correlated; should we exclude one of them?
- Age has a lot of null values. How to impute?
- Cabin info is also missing. What does null mean in this scenario?
    - Reduce cabin by first letter
- Extract prefix from name
- Normalize Age + Fare

In [281]:
# Reduce cabin to first letter ('_' stands in for a missing cabin)
cabin_filled = train['Cabin'].fillna('_')
train['Cabin_letter'] = cabin_filled.str.get(0)

In [282]:
# Get all name prefixes
# Exploration only: show how often each raw title occurs without writing it back
# to `train`. Overwriting train['Name_prefix'] here would leave `train` with raw
# titles while `test` keeps the 'Rare'-bucketed ones, so the two splits would be
# one-hot encoded into different columns. `.str.extract` also avoids the `re`
# module, which is not imported in the visible cells.
train.Name.str.extract(r'(\w{2,}\.)', expand=False).value_counts()

## Feature Engineering Function

In [300]:
def feature_engineer(data):
    """Build a feature matrix and target vector from a passenger DataFrame.

    The input is left unmodified. Steps, in order:

    1. Drop the 'Ticket', 'PassengerId' and 'Name' columns.
    2. Replace 'Cabin' with 'Cabin_letter', the first character of the cabin
       string, using '_' for a missing cabin.
    3. Fill missing 'Embarked' values with the most frequent value.
    4. Fill missing 'Age' values with the median age.
    5. One-hot encode the categorical columns, dropping the first level of each.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Ticket', 'PassengerId', 'Name', 'Cabin',
        'Embarked', 'Age', 'Pclass', 'Sex', 'Name_prefix' and 'Survived'.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The engineered features (without 'Survived') and the 'Survived' column.

    Notes
    -----
    Imputation values and dummy columns are derived from `data` itself, so two
    frames encoded separately may end up with different columns.
    """
    cat_cols = ['Pclass', 'Sex', 'Cabin_letter', 'Embarked', 'Name_prefix']

    # Work on a copy so the caller's frame is never mutated.
    data_copy = data.copy()

    # Drop columns that are not used as features.
    data_copy = data_copy.drop(columns=['Ticket', 'PassengerId', 'Name'])

    # Cabin -> Cabin_letter
    data_copy['Cabin_letter'] = data_copy['Cabin'].fillna('_').str[0]
    data_copy = data_copy.drop(columns='Cabin')

    # Embarked missing values. mode() returns a Series; passing that Series to
    # fillna would align on index and only fill the row labelled 0, so take the
    # scalar instead. An all-missing column has an empty mode and is left as is.
    embarked_mode = data_copy['Embarked'].mode()
    if not embarked_mode.empty:
        data_copy['Embarked'] = data_copy['Embarked'].fillna(embarked_mode.iloc[0])

    # Age: assign the result back rather than using a chained in-place fillna,
    # which has no effect under pandas copy-on-write.
    data_copy['Age'] = data_copy['Age'].fillna(data_copy['Age'].median())

    # One-hot encoding; each listed column is replaced by '<column>_<level>'
    # indicator columns appended after the remaining columns.
    data_copy = pd.get_dummies(data_copy, columns=cat_cols, drop_first=True)

    return data_copy.drop(columns='Survived'), data_copy['Survived']

In [305]:
X_train, y_train = feature_engineer(train)
X_test, y_test = feature_engineer(test)

# The two splits are one-hot encoded independently, so a category present in only
# one of them produces a column the other lacks. Align the test matrix to the
# training columns: columns missing from the test split are added as all-zero,
# columns unseen in training are dropped, and the column order matches X_train
# (a column appended by hand would land at the end, out of order with X_train).
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

## Model Building

In [285]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [306]:
# Logistic regression on every engineered feature; fit() returns the estimator itself.
lr = LogisticRegression(max_iter=2000).fit(X_train, y_train)
y_pred = lr.predict(X_test)


In [307]:
# Fraction of held-out rows whose prediction matches the true label.
(y_pred == y_test).mean()

In [309]:
# Every column whose name contains 'Cabin_' (the one-hot cabin-letter columns).
drop_cols = X_train.filter(like='Cabin_').columns

# Refit on the feature matrices with those columns removed.
X_train_no_cabin = X_train.drop(columns=drop_cols)
X_test_no_cabin = X_test.drop(columns=drop_cols)

lr = LogisticRegression(max_iter=2000).fit(X_train_no_cabin, y_train)
y_pred = lr.predict(X_test_no_cabin)

In [310]:
# Fraction of held-out rows predicted correctly by the model fit without cabin columns.
(y_pred == y_test).mean()

In [336]:
# Columns used for this reduced model; the same list selects train and test features.
feature_subset = ['Fare', 'Sex_male', 'Age', 'Pclass_2', 'Pclass_3', 'SibSp', 'Parch']

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train[feature_subset], y_train)
y_pred = lr.predict(X_test[feature_subset])

# Fraction of held-out rows predicted correctly.
(y_test == y_pred).mean()

In [337]:
# Single-feature model: only the Sex_male indicator is used.
sex_only = ['Sex_male']

lr = LogisticRegression(max_iter=1000).fit(X_train[sex_only], y_train)
y_pred = lr.predict(X_test[sex_only])