Internet Resources:

[Python Programming.net - machine learning episodes 39-42](https://pythonprogramming.net/hierarchical-clustering-mean-shift-machine-learning-tutorial/)

In [1]:
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
from sklearn.cluster import MeanShift
from sklearn import preprocessing
import pandas as pd

# Load the Titanic passenger table.
# data from https://pythonprogramming.net/static/downloads/machine-learning-data/titanic.xls
df = pd.read_excel('data/titanic.xls')

# Data dictionary for the spreadsheet columns. Note that the code below reads
# the survival flag from a column named 'survived'.
'''
Pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
survival Survival (0 = No; 1 = Yes)
name Name
sex Sex
age Age
sibsp Number of Siblings/Spouses Aboard
parch Number of Parents/Children Aboard
ticket Ticket Number
fare Passenger Fare (British pound)
cabin Cabin
embarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
boat Lifeboat
body Body Identification Number
home.dest Home/Destination
'''

# Keep an untouched copy so the clusters can later be described in terms of the
# original, human-readable values.
original_df = df.copy()

# Drop the columns that are not used as features. `columns=` is used instead of
# a positional axis argument, which pandas 2.0 no longer accepts.
df = df.drop(columns=['body', 'name'])

# The former `df.apply(pd.to_numeric, errors="ignore")` call was removed: its
# result was never assigned, so it had no effect on `df`, and `errors="ignore"`
# is deprecated in recent pandas releases.

# Replace missing values with 0 so every cell holds a concrete value.
df = df.fillna(0)

def handle_non_numerical_data(df):
    """Replace every non-numeric column of ``df`` with integer codes.

    Each distinct value of a non-numeric column is mapped to an integer,
    numbered from 0 in order of first appearance in the column. Numbering by
    first appearance (rather than by iterating a ``set``) makes the encoding
    identical from run to run. Columns that already have a numeric dtype
    (any integer or float width, not only 64-bit) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with non-numeric columns replaced by integers.

    Raises
    ------
    TypeError
        If a non-numeric column contains unhashable values (they are used as
        dictionary keys).
    """
    for column in df.columns.values:
        if pd.api.types.is_numeric_dtype(df[column]):
            continue

        values = df[column].tolist()

        # Build value -> code, assigning the next free integer the first time
        # each value is seen.
        text_digit_vals = {}
        for value in values:
            text_digit_vals.setdefault(value, len(text_digit_vals))

        df[column] = [text_digit_vals[value] for value in values]

    return df

# Encode the remaining non-numeric columns as integers.
df = handle_non_numerical_data(df)

# Feature matrix: every column except the 'survived' label, cast to float.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts.
X = np.array(df.drop(columns=['survived']).astype(float))
X = preprocessing.scale(X)

# Labels are kept for later comparison; they are not passed to the fit below.
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)

MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
          n_jobs=None, seeds=None)

In [2]:
labels = clf.labels_
cluster_centers = clf.cluster_centers_

# Attach each row's cluster label to the untouched copy of the data in a single
# vectorised assignment (X has one row per row of original_df). The labels are
# stored as floats, matching the dtype the column had when it was initialised
# with NaN and filled row by row.
original_df['cluster_group'] = np.asarray(labels, dtype=float)

n_clusters_ = len(np.unique(labels))

# Fraction of passengers with survived == 1 in each cluster, keyed by cluster id.
survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[original_df['cluster_group'] == float(i)]  # rows of cluster i only
    if len(temp_df) == 0:
        # No rows carry this label; skip instead of dividing by zero.
        continue

    survival_cluster = temp_df[temp_df['survived'] == 1]  # survivors within the cluster

    survival_rate = len(survival_cluster) / len(temp_df)
    print(i,": ",survival_rate)
    survival_rates[i] = survival_rate

0 :  0.37936384794414274
1 :  0.9090909090909091
2 :  0.1111111111111111


In [3]:
# Print summary statistics of the original (un-encoded) data for each cluster.
for i in range(n_clusters_):
    in_cluster = original_df['cluster_group'] == float(i)
    print(f"Cluster {i}:\n")
    print(original_df[in_cluster].describe())
    print("\n\n\n")

Cluster 0:

            pclass     survived          age        sibsp        parch  \
count  1289.000000  1289.000000  1028.000000  1289.000000  1289.000000   
mean      2.301009     0.379364    29.700065     0.491078     0.334368   
std       0.833689     0.485417    14.405586     1.041137     0.695420   
min       1.000000     0.000000     0.166700     0.000000     0.000000   
25%       2.000000     0.000000    21.000000     0.000000     0.000000   
50%       3.000000     0.000000    28.000000     0.000000     0.000000   
75%       3.000000     1.000000    38.000000     1.000000     0.000000   
max       3.000000     1.000000    80.000000     8.000000     5.000000   

              fare        body  cluster_group  
count  1288.000000  120.000000         1289.0  
mean     30.484575  160.966667            0.0  
std      41.361693   98.091274            0.0  
min       0.000000    1.000000            0.0  
25%       7.895800   71.500000            0.0  
50%      14.254150  155.500000   