## References

* [A journey through Titanic](https://www.kaggle.com/omarelgabry/a-journey-through-titanic) (Kaggle)

In [None]:
import math
import matplotlib.pyplot as pp
import numpy as np
import pandas as pd
import seaborn as sb

# Seed NumPy's global random generator so that the np.random.randint draws
# made in fix_missing are repeatable from run to run.
np.random.seed(0)

In [None]:
def load_dataset(path):
    """Read the CSV found at ``path`` and hand it back as a DataFrame."""
    frame = pd.read_csv(path)
    return frame

# `data` supplies the 'Survived' labels the model is fitted on further down;
# `data_score` is the frame the fitted model later produces predictions for.
data = load_dataset('data/development.csv')
data_score = load_dataset('data/submission.csv')

In [None]:
# Print the column summary of the labelled frame, then preview its first rows.
data.info()
data.head()

In [None]:
# Same overview for the frame that will be scored.
data_score.info()
data_score.head()

In [None]:
def collect_statistics(data):
    """Summarise every column of ``data`` as one row of a new DataFrame.

    The result has four columns: 'Column' (the column name), 'Unique' (number
    of distinct values as reported by ``Series.unique``), 'Missing' (number of
    null entries) and 'Zero' (number of entries equal to 0).  The three
    counters are stored as ``uint32``.
    """
    names = data.columns
    series = [data[name] for name in names]

    def as_counts(values):
        # Keep the counters in the same unsigned 32-bit dtype for every column.
        return np.array(values, dtype=np.uint32)

    unique = as_counts([s.unique().size for s in series])
    missing = as_counts([s.isnull().sum() for s in series])
    zero = as_counts([(s == 0).astype(int).sum() for s in series])

    return pd.DataFrame({
        'Column': names,
        'Unique': unique,
        'Missing': missing,
        'Zero': zero,
    })

In [None]:
# Per-column unique / missing / zero counts for the labelled frame.
collect_statistics(data)

In [None]:
# Per-column unique / missing / zero counts for the frame to be scored.
collect_statistics(data_score)

In [None]:
def round(value, precision=10000):
    """Round ``value`` to the nearest multiple of ``1 / precision``.

    NOTE: the name is kept for callers in this file; it shadows the builtin
    ``round`` for everything defined after it.

    Args:
        value: Number to round.
        precision: Reciprocal of the rounding step.  The default of 10000
            keeps four decimal places.

    Returns:
        The rounded value as a float.  NaN and infinities are returned
        unchanged.
    """
    if math.isnan(value) or math.isinf(value):
        # math.floor/math.ceil raise on non-finite input, e.g. the NaN that
        # Series.corr produces for a constant column.
        return value
    # Round half up.  math.ceil would instead push every value towards +inf,
    # so 0.12341 would come out as 0.1235.
    return math.floor(value * precision + 0.5) / precision

def inspect_correlation(data, target, limit=10):
    """Print, for each column of ``data``, its correlation with ``target``.

    A numeric column yields one rounded coefficient.  Any other column with
    fewer than ``limit`` unique values is one-hot encoded and yields a list
    holding one rounded coefficient per indicator column.  For every remaining
    column ``None`` is printed.  The ``target`` column itself is skipped.
    """
    reference = data[target]
    for column in data.columns:
        if column == target:
            continue
        values = data[column]
        if pd.api.types.is_numeric_dtype(values):
            correlation = round(values.corr(reference))
        elif values.unique().size < limit:
            indicators = pd.get_dummies(values)
            correlation = [
                round(indicators[name].corr(reference))
                for name in indicators.columns
            ]
        else:
            correlation = None
        print('{}: {} correlation'.format(column, correlation))

# Print the correlation of every column with the 'Survived' label.
inspect_correlation(data, 'Survived')

In [None]:
def compose_person(data):
    """Classify one passenger from an ``(age, sex)`` pair.

    Returns 'Child' when the age is below 16; otherwise the sex string with
    its first letter upper-cased and the rest lower-cased.
    """
    age, sex = data
    if age < 16:
        return 'Child'
    return sex.capitalize()

def compose_class(data):
    """Translate a passenger-class code into a readable label.

    Args:
        data: Sequence or pandas row whose first element is the class code.

    Returns:
        'Upper' for 1, 'Middle' for 2, 'Lower' for 3.

    Raises:
        ValueError: If the code is none of 1, 2 or 3.
    """
    # Take the first element by position.  A pandas row is indexed by label,
    # so it needs .iloc; plain sequences are indexed directly.
    code = data.iloc[0] if hasattr(data, 'iloc') else data[0]
    if code == 1:
        return 'Upper'
    if code == 2:
        return 'Middle'
    if code == 3:
        return 'Lower'
    # Raising a bare string is itself a TypeError in Python 3; raise a real
    # exception so the failure names the actual problem.
    raise ValueError('Unknown class')

def fix_missing(data, column):
    mean = data[column].mean()
    deviation = data[column].std()
    count = data[column].isnull().sum()
    rand = np.random.randint(mean - deviation, mean + deviation, size=count)
    data.loc[np.isnan(data[column]), column] = rand
    return count

def prepare(data, columns=('PassengerId', 'Survived', 'Age', 'Person', 'Class')):
    """Clean ``data`` in place and derive the 'Person' and 'Class' columns.

    Steps, in order: fill missing 'Age' values via ``fix_missing``, cast 'Age'
    to int, build 'Person' from 'Age' and 'Sex' via ``compose_person``, build
    'Class' from 'Pclass' via ``compose_class``, then drop every column whose
    name is not listed in ``columns``.

    Args:
        data: DataFrame with 'Age', 'Sex' and 'Pclass' columns; modified in
            place.
        columns: Names of the columns to keep.  Names that do not occur in
            ``data`` are simply ignored.  The default is an immutable tuple so
            it cannot be altered between calls.

    Returns:
        None.
    """
    fix_missing(data, 'Age')
    data['Age'] = data['Age'].astype(int)
    data['Person'] = data[['Age', 'Sex']].apply(compose_person, axis=1)
    data['Class'] = data[['Pclass']].apply(compose_class, axis=1)
    keep = set(columns)
    unwanted = [name for name in data.columns if name not in keep]
    data.drop(unwanted, axis=1, inplace=True)

# Both frames are modified in place.  Each one fills its missing ages from
# its own 'Age' mean and standard deviation.
prepare(data)
prepare(data_score)

data.head()

In [None]:
def plot_histogram(data, column):
    """Draw a 70-bin histogram of ``data[column]`` on a new 18x4 figure."""
    pp.figure(figsize=(18, 4))
    values = data[column]
    values.hist(bins=70)
    pp.ylabel('Number of passengers')
    pp.xlabel(column)

def plot_survived(data, column, order):
    """Plot passenger counts and mean survival per category of ``column``.

    The left axis shows how many rows fall into each category; the right axis
    shows the mean of 'Survived' within each category.  Both axes list the
    categories in the sequence given by ``order``.
    """
    _, (count_axis, rate_axis) = pp.subplots(1, 2, figsize=(18, 4))

    sb.countplot(x=column, data=data, ax=count_axis, order=order)
    count_axis.set_ylabel('Number of passengers')

    pairs = data[[column, 'Survived']]
    rates = pairs.groupby([column], as_index=False).mean()
    sb.barplot(x=column, y='Survived', data=rates, ax=rate_axis, order=order)
    rate_axis.set_ylabel('Fraction of survived')

# Age distribution, then counts and survival fraction per derived category.
plot_histogram(data, 'Age')
plot_survived(data, 'Person', ['Male', 'Female', 'Child'])
plot_survived(data, 'Class', ['Upper', 'Middle', 'Lower'])

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
def make_dummies(data, mapping):
    """Replace categorical columns with one indicator column per category.

    Args:
        data: Source DataFrame; it is not modified.
        mapping: Dict from a column name to the list of category values to
            encode.  Every listed value becomes an output column of the same
            name, in the listed order.  Values that occur in the data but are
            not listed get no indicator column.

    Returns:
        A new DataFrame with the mapped columns removed and the indicator
        columns appended.
    """
    for column, categories in mapping.items():
        dummies = pd.get_dummies(data[column])
        # get_dummies orders its columns by sorted category value, so the
        # indicators are selected by name; relabelling them positionally would
        # attach the wrong label (e.g. the 'Child' indicator named 'Male').
        # fill_value gives a category absent from the data an all-zero column.
        dummies = dummies.reindex(columns=list(categories), fill_value=0)
        data = data.join(dummies)
    return data.drop(list(mapping), axis=1)

# Categorical columns to expand into indicator columns, with the names the
# indicator columns should carry.
mapping = {
    'Person': ['Male', 'Female', 'Child'],
    'Class': ['Upper', 'Middle', 'Lower'],
}

# make_dummies returns new frames, so both names are rebound here.
data = make_dummies(data, mapping)
data_score = make_dummies(data_score, mapping)

data.head()

In [None]:
# Features are every remaining column except the identifier and the label.
x = data.drop(['PassengerId', 'Survived'], axis=1)
y = data['Survived']

# No hold-out split is made: the full labelled frame is used for fitting.
x_train, y_train = x, y

# The frame to be scored carries no 'Survived' column to drop here.
x_score = data_score.drop('PassengerId', axis=1)

In [None]:
# Fit a 100-tree random forest on the full labelled frame.
model = RandomForestClassifier(n_estimators=100)
model.fit(x_train, y_train)

# NOTE(review): the score is computed on the same rows the model was fitted
# on, so it is training accuracy, not an estimate of held-out performance.
print('Score: {:.4}'.format(model.score(x_train, y_train)))

In [None]:
# Pair each PassengerId with its predicted 'Survived' value and write the
# result to solution.csv without the index column.  `data_score` is rebound
# to this two-column frame.
data_score = pd.DataFrame({
    'PassengerId': data_score['PassengerId'],
    'Survived': model.predict(x_score),
})
data_score.to_csv('solution.csv', index=False)