[https://www.kaggle.com/c/titanic/details/getting-started-with-random-forests]

In [3]:
"""
http://nbconvert.readthedocs.org/en/latest/execute_api.html#a-quick-example
"""
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
with open("python_ii_pandas/DataMunging.ipynb") as f:
    nb = nbformat.read(f, as_version=4)

ep = ExecutePreprocessor()
nb_execution_path = 'notebooks/'
resources = {'metadata': {'path': nb_execution_path}}
ep.preprocess(nb, resources)

NotADirectoryError: [WinError 267] The directory name is invalid

In [7]:
import pandas as pd
from numpy import zeros


def process(data_file_path):
    df = pd.read_csv(data_file_path, header=0)
    
    print("information schema description:")
    print(df.info())
    print(df.describe())
    
    print("Summary of attributes with values missing:")
    for i, s in enumerate(df):
        if df[s].count() < len(df):
            print(s, df[s].count())
    
    print("Enumerating str/object data:")
    
    print("Enumerating Sex:")
    print(df['Sex'].unique())
    df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)
    df = df.drop(['Sex'], axis=1)
    
    print("Enumerating Embarked:")
    print(df['Embarked'].unique())
    df['EmbarkedEnum'] = df['Embarked'].dropna().map({'C': 1, 'S': 2, 'Q': 3}).astype(int)
    df = df.drop(['Embarked'], axis=1)
    
    print("Filling missing data:")
    
    print("Filling other missing numeric values with zeroeth-order approximation -- the most probable value (mode):")
    for field in ('EmbarkedEnum', 'SibSp', 'Parch', 'Fare'):
        df[field].fillna(df[field].dropna().mode().values[0], inplace=True)
    
    print("Filling Age:")
    nGenders = len(df['Gender'].dropna().unique())
    nPclasses = len(df['Pclass'].dropna().unique())
    median_ages = zeros((nGenders, nPclasses))
    for i in range(nGenders):
        for j in range(nPclasses):
            median_ages[i, j] = df[(df['Gender'] == i) & (df['Pclass'] == j + 1)]['Age'].dropna().median()
    df['AgeFill'] = df['Age']
    for i in range(nGenders):
        for j in range(nPclasses):
            df.loc[(df.Age.isnull()) & (df.Gender == i) & (df.Pclass == j + 1), 'AgeFill'] = median_ages[i, j]
    df = df.drop(['Age'], axis=1)
    
    print("Separating attributes from labels and observation IDs")
    
    y = None
    if 'Survived' in df:
        print("Separating Survived, which is not part of the parameters (\"X\")")
        y = df['Survived']
        df = df.drop(['Survived'], axis=1)
    
    print("don't even bother enumerating these fields, just drop them:")
    passenger_ids = df['PassengerId'].values
    df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
    
    print("Feature engineering:")
    df['FamilySize'] = df['SibSp'] + df['Parch']
    df['Age*Class'] = df.AgeFill * df.Pclass
       
    print("Cleansed %s:" % data_file_path)
    print(df.info())
    print(df.describe())
    assert len(
        df.count().unique()) == 1, "not all attributes have the same number of observations! check for missing values"
    X = df.values
    
    print(passenger_ids.shape, X.shape, None if y is None else y.shape)
    return passenger_ids, X, y

In [8]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

model = RandomForestClassifier(n_estimators=100)

passenger_ids, X, y = process('data/train.csv')
model = model.fit(X, y)

prediction = model.predict(X)
print(prediction.shape)
print(np.unique(prediction))
print(np.bincount(prediction, minlength=2))

score = model.score(X, y)
print(score)

information schema description:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.0000

(891,)
[0 1]
[556 335]
0.980920314254


In [9]:
# Test, save public test as csv: 'PassengerId', 'Survived'
import csv

passenger_ids, X, y = process('data/test.csv')

prediction = model.predict(X)
print(prediction.shape)
print(np.unique(prediction))
print(np.bincount(prediction, minlength=2))

with(open('survival.csv', 'w')) as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["PassengerId","Survived"])
    csv_writer.writerows(zip(passenger_ids, prediction))

information schema description:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
None
       PassengerId      Pclass         Age       SibSp       Parch        Fare
count   418.000000  418.000000  332.000000  418.000000  418.000000  417.000000
mean   1100.500000    2.265550   30.272590    0.447368    0.392344   35.627188
std     120.810458    0.841838   14.181209    0.896760    0.981429   55.907576
min     892.000000    1.000000    0.170000    0.000000    0.000000    0.000000
25%     996.250000