In [1]:
import numpy as np
import pandas as pd

# Load the Titanic passenger sheet. The 'body' and 'name' columns probably
# don't have much of an impact on the KMeans clustering algorithm, so they
# are thrown away straight after loading.
df = (
    pd.read_excel('../datasets/titanic.xls')
    .drop(columns=['body', 'name'])
)
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,home.dest
0,1,1,female,29.0,0,0,24160,211.3375,B5,S,2.0,"St Louis, MO"
1,1,1,male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,"Montreal, PQ / Chesterville, ON"
2,1,0,female,2.0,1,2,113781,151.55,C22 C26,S,,"Montreal, PQ / Chesterville, ON"
3,1,0,male,30.0,1,2,113781,151.55,C22 C26,S,,"Montreal, PQ / Chesterville, ON"
4,1,0,female,25.0,1,2,113781,151.55,C22 C26,S,,"Montreal, PQ / Chesterville, ON"


In [2]:
# Replace every missing value, in every column, with 0. The categorical
# encoder defined further down maps the value 0 to the code 0.
df = df.fillna(value=0)
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,home.dest
0,1,1,female,29.0,0,0,24160,211.3375,B5,S,2,"St Louis, MO"
1,1,1,male,0.9167,1,2,113781,151.55,C22 C26,S,11,"Montreal, PQ / Chesterville, ON"
2,1,0,female,2.0,1,2,113781,151.55,C22 C26,S,0,"Montreal, PQ / Chesterville, ON"
3,1,0,male,30.0,1,2,113781,151.55,C22 C26,S,0,"Montreal, PQ / Chesterville, ON"
4,1,0,female,25.0,1,2,113781,151.55,C22 C26,S,0,"Montreal, PQ / Chesterville, ON"


In [3]:
# encoding categorical data into numerical
def to_numeric_column(colname, frame=None):
    """Encode one column as integer codes assigned in order of first appearance.

    The value 0 always maps to the code 0; every other distinct value gets
    the next unused positive integer (1, 2, 3, ...) the first time it is seen.

    Parameters
    ----------
    colname : str
        Name of the column to encode.
    frame : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df`` is used, which keeps existing ``to_numeric_column(name)``
        calls working unchanged.

    Returns
    -------
    pandas.Series
        The integer codes, aligned with the index of the source column.
        The source frame is not modified.
    """
    if frame is None:
        frame = df  # fall back to the notebook's global DataFrame

    column = frame[colname]

    # 0 is pre-seeded, so any entry equal to 0 is already "seen" and keeps
    # code 0 without needing a separate `!= 0` check.
    codes = {0: 0}
    for entry in column:
        if entry not in codes:
            # len(codes) is 1 for the first new value, 2 for the next, ...
            codes[entry] = len(codes)

    return column.map(codes)

# Note: plain integer codes are not the best way to encode categorical data;
# a better approach would be something like one-hot encoding.
for categorical_col in ('cabin', 'sex', 'embarked', 'home.dest', 'ticket', 'boat'):
    df[categorical_col] = to_numeric_column(categorical_col)
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,home.dest
0,1,1,1,29.0,0,0,1,211.3375,1,1,1,1
1,1,1,2,0.9167,1,2,2,151.55,2,1,2,2
2,1,0,1,2.0,1,2,2,151.55,2,1,0,2
3,1,0,2,30.0,1,2,2,151.55,2,1,0,2
4,1,0,1,25.0,1,2,2,151.55,2,1,0,2


In [19]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Features are every column except the label, cast to float for scikit-learn.
X = df.drop('survived', axis=1).astype(float).values
y = df['survived'].values

# Standardise each feature so that no single column dominates the Euclidean
# distances KMeans relies on.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Fix the seed and the number of initialisations so that a fresh
# "Restart & Run All" reproduces the same clustering and the same score.
classifier = KMeans(n_clusters=2, n_init=10, random_state=0)
classifier.fit(X)

# Assign every row to a cluster in one vectorised call instead of predicting
# row by row.
predictions = classifier.predict(X)
correct = int(np.sum(predictions == y))
agreement = correct / len(X)

# KMeans is unsupervised: which cluster is called 0 and which is called 1 is
# arbitrary. Cluster 1 may just as well be the "did not survive" group, in
# which case the raw agreement is (1 - accuracy). Report the better of the
# two possible cluster-to-label mappings.
print(max(agreement, 1 - agreement))

0.6906035141329259
