In [2]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

In [3]:
df = pd.read_csv('data/train.csv', sep=",")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Exploring Embarked, Pclass and Fare

In [4]:
for city in df["Embarked"].unique():
    v = df[df["Embarked"] == city][["Pclass", "Fare"]]
    for pclass in v["Pclass"].unique():
        p = v[v["Pclass"] == pclass]
        fmedian = p["Fare"].median()
        print(city, pclass, fmedian)

S 3 8.05
S 1 52.0
S 2 13.5
C 1 78.2667
C 2 24.0
C 3 7.8958
Q 3 7.75
Q 1 90.0
Q 2 12.35


In [6]:
# import libraries and load in dataset
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import make_moons
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


Xm, ym = make_moons(n_samples=10000, noise=0.4, random_state=42)

Xm

array([[ 0.9402914 ,  0.12230559],
       [ 0.12454026, -0.42477546],
       [ 0.26198823,  0.50841438],
       ...,
       [-0.24177973,  0.20957199],
       [ 0.90679645,  0.54958215],
       [ 2.08837082, -0.05050728]])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(Xm, ym, test_size=0.2, shuffle=True)

# parameters to be tested - Iterate several times to find better ranges
parameters = {
    'min_samples_split': [20, 25, 30],
    'min_samples_leaf': [5, 8, 10],
    'max_leaf_nodes': [18, 19],
    'max_features': [2]
}

# instantiate model
model = DecisionTreeClassifier(random_state=42)

# grid search using cv
clf = GridSearchCV(model, parameters)
clf.fit(X_train, y_train)

# Output [17]: GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
             # param_grid={'max_features': [2], 'max_leaf_nodes': [18, 19],
                         # 'min_samples_leaf': [5, 8, 10],
                         # 'min_samples_split': [20, 25, 30]})
            
clf.best_estimator_
# Output [24]: DecisionTreeClassifier(max_features=2, max_leaf_nodes=18, min_samples_leaf=8, 
                                    # min_samples_split=20, random_state=42)

DecisionTreeClassifier(max_features=2, max_leaf_nodes=18, min_samples_leaf=5,
                       min_samples_split=20, random_state=42)

In [11]:
rs = ShuffleSplit(n_splits=1000, train_size=0.0125, random_state=42)
rs.get_n_splits(X_train)
print(rs)

X_subsets = dict()
y_subsets = dict()
for i, indexes in enumerate(rs.split(X_train)):
    X_subsets[i] = X_train[indexes[0], :]
    y_subsets[i] = y_train[indexes[0]]
    
# some checks
assert X_subsets[0].shape == (100, 2), ("Dimensional Error.")
assert y_subsets[0].shape == (100,), ("Dimensional Error.")
assert y_subsets[100].shape[0] == X_subsets[100].shape[0], ("Dimensional mismatch")
assert len(X_subsets.keys()) == 1000, ("The number of subsets is not 1000.")
assert len(y_subsets.keys()) == 1000, ("The number of subsets is not 1000.")

ShuffleSplit(n_splits=1000, random_state=42, test_size=None,
       train_size=0.0125)


In [14]:
clf = DecisionTreeClassifier(max_features=2, max_leaf_nodes=19, min_samples_leaf=5,
                             min_samples_split=20, random_state=42)
scores = []
for subset in range(0, 1000):
    #fit model
    clf.fit(X_subsets[subset], y_subsets[subset])
    # make predictions
    y_pred = clf.predict(X_test)
    # evaluate accuracy and store
    scores.append(accuracy_score(y_test, y_pred))


print("Max accuracy: {:.2f}%.".format(max(scores) * 100))
print("Min accuracy: {:.2f}%.".format(min(scores) * 100))
print("Average accuracy: {:.2f}%.".format(np.mean(scores) * 100))
print("Std of accuracy: {:.2f}.".format(np.std(scores) * 100))

Max accuracy: 85.30%.
Min accuracy: 67.55%.
Average accuracy: 79.97%.
Std of accuracy: 3.09.


In [15]:
for tree in scores:
    print(clf.score(X_test, y_test))

0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.788
0.78

In [10]:
y_pred = []
for instance in range(0, X_test.shape[0]):
    predictions = []
    # loop through all the subsets
    for subset in range(0, 1000):
        # build a decision tree
        clf.fit(X_subsets[subset], y_subsets[subset])
        # make prediction for the instance in the test set and append to list
        predictions.append(clf.predict(X_test[instance].reshape(-1, 2)))
    assert len(predictions) == 1000, "The number of predictions is not 1,000."
    assert len(predictions[0]) == 1, "Dimensional Error."
    # choose most voted class
    y_pred.append(stats.mode(np.array(predictions), axis=None)[0][0])

assert len(y_pred) == X_test.shape[0], "Dimensional mismatch."

# evaluate performance using accuracy
print("Accuracy on test set: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))


KeyboardInterrupt: 