Internet Resources:

[Python Programming.net - machine learning episodes 39-42](https://pythonprogramming.net/hierarchical-clustering-mean-shift-machine-learning-tutorial/)

In [1]:
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
from sklearn.cluster import MeanShift
from sklearn import preprocessing
import pandas as pd

# data from https://pythonprogramming.net/static/downloads/machine-learning-data/titanic.xls
df = pd.read_excel('data/titanic.xls') 
'''
Pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
survival Survival (0 = No; 1 = Yes)
name Name
sex Sex
age Age
sibsp Number of Siblings/Spouses Aboard
parch Number of Parents/Children Aboard
ticket Ticket Number
fare Passenger Fare (British pound)
cabin Cabin
embarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
boat Lifeboat
body Body Identification Number
home.dest Home/Destination
'''

original_df = pd.DataFrame.copy(df)

df.drop(['body','name'], 1, inplace=True)
df.apply(pd.to_numeric, errors="ignore")
df.fillna(0, inplace=True)

# handling non linear data:
# every unique feature value gets a different integer number
# e.g. female becomes 0, male becomes 1
def handle_non_numerical_data(df):
    columns = df.columns.values
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]

        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x+=1

            df[column] = list(map(convert_to_int, df[column]))

    return df

df = handle_non_numerical_data(df)
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,home.dest
0,1,1,0,29.0,0,0,761,211.3375,83,3,1,259
1,1,1,1,0.9167,1,2,511,151.55,20,3,4,277
2,1,0,0,2.0,1,2,511,151.55,20,3,0,277
3,1,0,1,30.0,1,2,511,151.55,20,3,0,277
4,1,0,0,25.0,1,2,511,151.55,20,3,0,277


In [2]:
# train
X = np.array(df.drop(['survived'], 1).astype(float))
X = preprocessing.scale(X) # https://scikit-learn.org/stable/modules/preprocessing.html
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)

# lets take a look at the different clusters mean shift has found
labels = clf.labels_
cluster_centers = clf.cluster_centers_
original_df['cluster_group']=np.nan

for i in range(len(X)):
    original_df.at[i, 'cluster_group'] = labels[i] 

n_clusters_ = len(np.unique(labels))

survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[ (original_df['cluster_group']==float(i)) ] # temp only has data with cluster_group i
    #print(temp_df.head())
    
    survival_cluster = temp_df[  (temp_df['survived'] == 1) ] # temp df just with only suvivors
    
    survival_rate = len(survival_cluster) / len(temp_df)
    print("Cluster ", i,": ",survival_rate)
    survival_rates[i] = survival_rate

Cluster  0 :  0.3783151326053042
Cluster  1 :  0.8235294117647058
Cluster  2 :  0.1


In [3]:
# detailed look at clusters
for i in range(n_clusters_):
    print("Cluster "+str(i)+ ":\n")
    print(original_df[(original_df['cluster_group'] == float(i))].describe())
    print("\n\n\n")

Cluster 0:

            pclass     survived          age        sibsp        parch  \
count  1282.000000  1282.000000  1021.000000  1282.000000  1282.000000   
mean      2.306552     0.378315    29.689194     0.485959     0.322933   
std       0.830961     0.485156    14.383434     1.038767     0.670736   
min       1.000000     0.000000     0.166700     0.000000     0.000000   
25%       2.000000     0.000000    21.000000     0.000000     0.000000   
50%       3.000000     0.000000    28.000000     0.000000     0.000000   
75%       3.000000     1.000000    38.000000     1.000000     0.000000   
max       3.000000     1.000000    80.000000     8.000000     4.000000   

              fare        body  cluster_group  
count  1281.000000  119.000000         1282.0  
mean     29.398523  159.571429            0.0  
std      38.294234   97.302914            0.0  
min       0.000000    1.000000            0.0  
25%       7.895800   71.000000            0.0  
50%      13.900000  155.000000   