A Kaggle submission based on code from the Dataquest tutorial, adding one extra feature (family size) and swapping in a different model (a random forest classifier).

In [3]:
import pandas as pd

# Read the labelled training rows and the unlabelled test rows
# from CSV files in the working directory.
train = pd.read_csv(filepath_or_buffer="train.csv")
test = pd.read_csv(filepath_or_buffer="test.csv")

# Preview the first five training rows.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
def process_age(df, cut_points, label_names):
    """Fill missing ages and add a binned ``Age_categories`` column.

    Args:
        df: DataFrame with an ``Age`` column. It is modified in place.
        cut_points: Bin edges handed to ``pd.cut``.
        label_names: One label per bin, in the same order as the bins.

    Returns:
        The same ``df`` object, with ``Age`` NaNs replaced by -0.5 and a new
        categorical ``Age_categories`` column.
    """
    # -0.5 is a sentinel for "unknown age"; it is written back to the frame
    # so the binned column and the raw column stay consistent.
    age_filled = df["Age"].fillna(-0.5)
    df["Age"] = age_filled
    df["Age_categories"] = pd.cut(age_filled, bins=cut_points, labels=label_names)
    return df

# Bin edges and their labels. The -0.5 fill value that process_age uses for
# missing ages lies between the first two edges, so those rows get "Missing".
cut_points = [-1, 0, 5, 12, 18, 35, 60, 100]
label_names = [
    "Missing",
    "Infant",
    "Child",
    "Teenager",
    "Young Adult",
    "Adult",
    "Senior",
]

# Apply the identical binning to both frames (train first, then test).
train, test = (
    process_age(frame, cut_points, label_names) for frame in (train, test)
)

# Preview the first ten binned ages.
train["Age_categories"].head(10)

0    Young Adult
1          Adult
2    Young Adult
3    Young Adult
4    Young Adult
5        Missing
6          Adult
7         Infant
8    Young Adult
9       Teenager
Name: Age_categories, dtype: category
Categories (7, object): [Missing < Infant < Child < Teenager < Young Adult < Adult < Senior]

In [5]:
def process_family(df):
    """Add a family-size count and a three-level family category.

    Args:
        df: DataFrame with numeric ``SibSp`` and ``Parch`` columns. It is
            modified in place.

    Returns:
        The same ``df`` object with two new columns: ``Family``
        (``SibSp + Parch``) and categorical ``Family_categories``
        ("Alone", "Pair" or "Group").
    """
    relatives = df["SibSp"] + df["Parch"]
    df["Family"] = relatives
    # Half-integer edges separate the counts 0, 1 and 2+ unambiguously.
    df["Family_categories"] = pd.cut(
        relatives,
        bins=[-1, 0.5, 1.5, 100],
        labels=["Alone", "Pair", "Group"],
    )
    return df

# Derive the family-size features on both frames (train first, then test).
train, test = (process_family(frame) for frame in (train, test))

# Preview the raw count next to its category.
train[["Family", "Family_categories"]].head(10)

Unnamed: 0,Family,Family_categories
0,1,Pair
1,1,Pair
2,0,Alone
3,1,Pair
4,0,Alone
5,0,Alone
6,0,Alone
7,4,Group
8,2,Group
9,1,Pair


In [6]:
def create_dummies(df, column_name, dummy_list):
    """One-hot encode ``column_name`` and append the indicator columns to ``df``.

    Args:
        df: DataFrame containing ``column_name``.
        column_name: Column to encode; also used as the prefix of the new
            indicator columns.
        dummy_list: Running list of indicator column names. It is extended in
            place with any names not already present, preserving order.

    Returns:
        A ``(df, dummy_list)`` tuple: a new DataFrame with the indicator
        columns appended on the right, and the same ``dummy_list`` object
        that was passed in.
    """
    dummies = pd.get_dummies(df[column_name], prefix=column_name)
    # Drop indicator columns left over from an earlier call on the same frame
    # (e.g. the cell was re-run) so they are replaced rather than duplicated;
    # duplicate column names would make df[dummy_list] return doubled columns.
    df = df.drop(columns=dummies.columns, errors="ignore")
    df = pd.concat([df, dummies], axis=1)
    for dummy in dummies.columns:
        if dummy not in dummy_list:
            dummy_list.append(dummy)
    return df, dummy_list

# Columns to one-hot encode, and the accumulated names of the resulting
# indicator columns (filled in by create_dummies).
experimental_cols = ["Pclass", "Sex", "Age_categories", "Family_categories"]
dummy_list = []

# Encode each column on train and then on test, sharing one dummy_list so the
# model sees the same feature names for both frames.
for col in experimental_cols:
    train, dummy_list = create_dummies(df=train, column_name=col, dummy_list=dummy_list)
    test, dummy_list = create_dummies(df=test, column_name=col, dummy_list=dummy_list)

# Preview the encoded feature matrix.
train[dummy_list].head(10)

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Age_categories_Missing,Age_categories_Infant,Age_categories_Child,Age_categories_Teenager,Age_categories_Young Adult,Age_categories_Adult,Age_categories_Senior,Family_categories_Alone,Family_categories_Pair,Family_categories_Group
0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0
1,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0
2,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0
3,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0
4,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0
5,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0
6,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0
7,0,0,1,0,1,0,1,0,0,0,0,0,0,0,1
8,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1
9,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0


In [7]:
from sklearn.model_selection import train_test_split

# Feature matrix (indicator columns only) and the target column.
all_y = train["Survived"]
all_X = train[dummy_list]

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    all_X,
    all_y,
    test_size=0.2,
    random_state=0,
)
train_X.head(10)

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Age_categories_Missing,Age_categories_Infant,Age_categories_Child,Age_categories_Teenager,Age_categories_Young Adult,Age_categories_Adult,Age_categories_Senior,Family_categories_Alone,Family_categories_Pair,Family_categories_Group
140,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1
439,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0
817,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1
378,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0
491,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0
331,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0
588,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0
358,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0
674,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0
162,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Hold-out evaluation: fit on the training split, score on the held-out split.
clf = RandomForestClassifier(random_state=0, max_depth=2).fit(train_X, train_y)
predictions = clf.predict(test_X)
accuracy = accuracy_score(y_true=test_y, y_pred=predictions)
accuracy

0.7821229050279329

In [9]:
from sklearn.model_selection import cross_val_score
import numpy as np
# 10-fold cross-validation over all labelled rows, using a fresh (unfitted)
# classifier with the same hyperparameters as the hold-out run.
clf = RandomForestClassifier(random_state=0, max_depth=2)
scores = cross_val_score(estimator=clf, X=all_X, y=all_y, cv=10)
scores

array([0.81111111, 0.78888889, 0.75280899, 0.83146067, 0.78651685,
       0.82022472, 0.76404494, 0.71910112, 0.79775281, 0.77272727])

In [10]:
# Average the per-fold scores into a single cross-validated accuracy.
accuracy = scores.mean()
accuracy

0.7844637385086823

In [11]:
# Refit on every labelled row, then predict for the rows loaded from test.csv
# using the same indicator columns the model was trained on.
clf = RandomForestClassifier(random_state=0, max_depth=2).fit(all_X, all_y)
test_predictions = clf.predict(test[dummy_list])
test_predictions

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,