In [0]:
# import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [0]:
def handle_non_numerical_data(data):
    # handling non-numerical data: must convert.
    columns = data.columns.values
    text_digit_val = {}

    def convert_to_int(val):
        return text_digit_val[val]
    for column in columns:
        # print(column, data[column].dtype)
        if data[column].dtype != np.int64 and data[column].dtype != np.float64:
            column_contents = data[column].values.tolist()
            # finding just the uniques
            unique_elements = set(column_contents)
            # great, found them.
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_val:
                    # creating dict that contains new id per unique string
                    text_digit_val[unique] = x
                    x += 1
            # now we map the new "id" value to replace the string.
            data[column] = list(map(convert_to_int, data[column]))
    return data

In [0]:
df = pd.read_excel('titanic.xls')
df.drop(['body', 'name', 'boat', 'home.dest', 'embarked', 'ticket'], 1, inplace=True)
df.fillna(0, inplace=True)

df = handle_non_numerical_data(df)
df.drop(['age'], 1, inplace=True)

X = np.array(df.drop(['survived'], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

In [0]:
class KMeans:
    def __init__(self, k=2, tol=0.001, max_iter=300):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter
        self.centroids = {}
        self.classifications = {}

    def fit(self, data):
        for k in range(self.k):
            self.centroids[k] = data[k]
        for iteration in range(self.max_iter):
            for j in range(self.k):
                self.classifications[j] = []
                
                
            for FeatureSet in data:
                distances = [np.linalg.norm(FeatureSet - self.centroids[centroid]) for centroid in self.centroids]
                classification = distances.index(min(distances))
                self.classifications[classification].append(FeatureSet)
            prev_centroids = dict(self.centroids)
            for classification in self.classifications:
                self.centroids[classification] = np.average(self.classifications[classification], axis=0)
            optimized = True
            for c in self.centroids:
                original_centroid = prev_centroids[c]
                current_centroid = self.centroids[c]
                if np.sum((current_centroid- original_centroid)/original_centroid * 100.0) > self.tol:
                  if i == self.max_iter -1:
                    print("percent change: ", np.sum((current_centroid- original_centroid)/original_centroid * 100.0))
                    optimized = False
            if optimized:
                break

    def predict(self, data):
        distances = [np.linalg.norm(data - self.centroids[centroid]) for centroid in self.centroids]
        classification = distances.index(min(distances))
        return classification


In [0]:
centroids = {}
for k in range(2):
  centroids[k] = X[k]

In [6]:
print(centroids)

{0: array([-1.54609786,  1.34499549, -0.47908676, -0.4449995 ,  3.44242751,
        1.93413985]), 1: array([-1.54609786, -0.74349692,  0.48128777,  1.86652569,  2.28660634,
        2.62560497])}


In [0]:
classifications = {}
for iteration in range(300):
  for j in range(2):
    classifications[j] = []

In [8]:
classifications

{0: [], 1: []}

In [0]:
for FeatureSet in X:
  distances = [np.linalg.norm(FeatureSet - centroids[centroid]) for centroid in centroids]
  classification = distances.index(min(distances))
  classifications[classification].append(FeatureSet)

In [10]:
distances

[5.589425405901829, 5.400949152331193]

In [11]:
classification

1

In [12]:
classifications

{0: [array([-1.54609786,  1.34499549, -0.47908676, -0.4449995 ,  3.44242751,
          1.93413985]),
  array([-1.54609786,  1.34499549,  0.48128777, -0.4449995 ,  0.86392026,
          0.50930263]),
  array([-1.54609786,  1.34499549,  1.4416623 , -0.4449995 ,  0.35202222,
          2.39511659]),
  array([-1.54609786, -0.74349692,  0.48128777, -0.4449995 ,  3.75536675,
          1.6617445 ]),
  array([-1.54609786,  1.34499549,  0.48128777, -0.4449995 ,  3.75536675,
          1.6617445 ]),
  array([-1.54609786,  1.34499549, -0.47908676, -0.4449995 ,  0.69653667,
          1.72460496]),
  array([-1.54609786,  1.34499549, -0.47908676, -0.4449995 ,  0.88115874,
         -0.45455784]),
  array([-1.54609786, -0.74349692, -0.47908676,  0.71076309,  4.14192864,
          1.36839566]),
  array([-1.54609786,  1.34499549, -0.47908676,  0.71076309,  4.14192864,
          1.36839566]),
  array([-1.54609786,  1.34499549, -0.47908676, -0.4449995 ,  0.83170129,
          2.83513985]),
  array([-1.54609

In [13]:
prev_centroids = dict(centroids)
prev_centroids

{0: array([-1.54609786,  1.34499549, -0.47908676, -0.4449995 ,  3.44242751,
         1.93413985]),
 1: array([-1.54609786, -0.74349692,  0.48128777,  1.86652569,  2.28660634,
         2.62560497])}

In [0]:
for classification in classifications:
  centroids[classification] = np.average(classifications[classification], axis=0)

In [15]:
centroids

{0: array([-0.28504754,  1.19980618, -0.11958827, -0.25958304,  0.38389618,
         0.14015147]),
 1: array([ 0.11401902, -0.47992247,  0.04783531,  0.10383322, -0.15355847,
        -0.05606059])}

In [16]:
optimized = True
for c in centroids:
    original_centroid = prev_centroids[c]
    current_centroid = centroids[c]
    if abs(np.sum((current_centroid - original_centroid) / original_centroid * 100.0)) > 0.001:
        print(np.sum((current_centroid - original_centroid) / original_centroid * 100.0))
        optimized = False

-390.66505984327546
-536.1740483032012


In [17]:
optimized

False