In [57]:
import pandas as pd
from numpy import zeros, nan

df = pd.read_csv('data/train.csv', header=0)

print("information schema description:")
print(df.info())
print(df.describe())

print("Summary of attributes with values missing:")
for i, s in enumerate(df):
    if df[s].count() < len(df):
        print(s, df[s].count())

print("Enumerating Sex:")
print(df['Sex'].unique())
df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)
df = df.drop(['Sex'], axis=1)

print("Enumerating Embarked:")
print(df['Embarked'].unique())
df['EmbarkedEnum'] = df['Embarked'].dropna().map({'C': 1, 'S': 2, 'Q': 3}).astype(int)
df = df.drop(['Embarked'], axis=1)

print("Filling missing EmbarkedEnum values with zeroeth-order approximation -- the most probable value (mode):")
df['EmbarkedEnum'].fillna(df['EmbarkedEnum'].dropna().mode().values[0], inplace=True)

print("Filling Age:")
nGenders = len(df['Gender'].dropna().unique())
nPclasses = len(df['Pclass'].dropna().unique())
median_ages = zeros((nGenders, nPclasses))
for i in range(nGenders):
    for j in range(nPclasses):
        median_ages[i, j] = df[(df['Gender'] == i) & (df['Pclass'] == j + 1)]['Age'].dropna().median()
df['AgeFill'] = df['Age']
for i in range(nGenders):
    for j in range(nPclasses):
        df.loc[(df.Age.isnull()) & (df.Gender == i) & (df.Pclass == j + 1), 'AgeFill'] = median_ages[i, j]
df = df.drop(['Age'], axis=1)

print("Separating Survived, which is not part of the parameters (\"X\")")
y = df['Survived']
df = df.drop(['Survived'], axis=1)

print("don't even bother enumerating these fields, just drop them:")
df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

print("Feature engineering:")
df['FamilySize'] = df['SibSp'] + df['Parch']
df['Age*Class'] = df.AgeFill * df.Pclass

print("Cleansed training data:")
print(df.info())
print(df.describe())
assert len(
    df.count().unique()) == 1, "not all attributes have the same number of observations! check for missing values"
X = df.values

print(X.shape, y.shape)

information schema description:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB
None
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.0000


don't even bother enumerating these fields, just drop them:
Feature engineering:
Cleansed training data:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 10 columns):
PassengerId     891 non-null int64
Pclass          891 non-null int64
SibSp           891 non-null int64
Parch           891 non-null int64
Fare            891 non-null float64
Gender          891 non-null int32
EmbarkedEnum    891 non-null float64
AgeFill         891 non-null float64
FamilySize      891 non-null int64
Age*Class       891 non-null float64
dtypes: float64(4), int32(1), int64(5)
memory usage: 73.1 KB
None
       PassengerId      Pclass       SibSp       Parch        Fare  \
count   891.000000  891.000000  891.000000  891.000000  891.000000   
mean    446.000000    2.308642    0.523008    0.381594   32.204208   
std     257.353842    0.836071    1.102743    0.806057   49.693429   
min       1.000000    1.000000    0.000000    0.000000    0.000000   
25%     223.500

[https://www.kaggle.com/c/titanic/details/getting-started-with-random-forests]

    """
    http://nbconvert.readthedocs.org/en/latest/execute_api.html#a-quick-example
    """
    import nbformat
    from nbconvert.preprocessors import ExecutePreprocessor

    with open("python_ii_pandas/DataMunging.ipynb") as f:
        nb = nbformat.read(f, as_version=4)
    
    ep = ExecutePreprocessor()
    nb_execution_path = 'notebooks/'
    resources = {'metadata': {'path': nb_execution_path}}
    ep.preprocess(nb, resources)

In [54]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

model = RandomForestClassifier()

model = model.fit(X, y)
prediction = model.predict(X)
print(prediction.shape)
print(np.unique(prediction))
print(np.bincount(prediction, minlength=2))
score = model.score(X, y)
print(score)

(891,)
[0 1]
[557 334]
