In [1]:
import numpy as np
from sklearn.cluster import MeanShift
from sklearn.datasets.samples_generator import make_blobs
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import style
style.use("ggplot")

centers = [[1,1,1],[5,5,5],[3,10,10]] #initialization of centers, we have to initialize with some points

X, _ = make_blobs(n_samples = 100, centers = centers, cluster_std = 1.5)

ms = MeanShift()
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

print(cluster_centers)
n_clusters_ = len(np.unique(labels))
print("Number of estimated clusters:", n_clusters_)

colors = 10*['r','g','b','c','k','y','m']
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

for i in range(len(X)):
    ax.scatter(X[i][0], X[i][1], X[i][2], c=colors[labels[i]], marker='o')

ax.scatter(cluster_centers[:,0],cluster_centers[:,1],cluster_centers[:,2],
            marker="x",color='k', s=150, linewidths = 5, zorder=10)

plt.show()

[[ 3.1458301   3.29711826  2.98541184]
 [ 2.5013923  10.16252742 10.15865821]]
Number of estimated clusters: 2


<matplotlib.figure.Figure at 0x8b7a240>

In [2]:
# mean shift on titanic dataset
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from sklearn.cluster import MeanShift
from sklearn import preprocessing
import pandas as pd

df = pd.read_excel("C:/Users/Dell/Desktop/Python/titanic.xls")
original_df = pd.DataFrame.copy(df)
df.drop(['body','name'],1, inplace= True)
df.convert_objects(convert_numeric=True)
df.fillna(0, inplace=True)
#df.head()

def handle_non_numeric_data(df):
    columns = df.columns.values
    for column in columns:
        text_digit_vals = { }
        def convert_to_int(val):
            return text_digit_vals[val]
        
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_content = df[column].values.tolist()
            unique_elements = set(column_content)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1
        
            df[column] = list(map(convert_to_int, df[column]))
    return df


df = handle_non_numeric_data(df)
#df[:50]

X = np.array(df.drop(['survived'],1).astype(float))
X = preprocessing.scale(X) # without this accu is 50%
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)



For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  del sys.path[0]


MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

In [3]:
labels = clf.labels_
cluster_centers = clf.cluster_centers_
original_df['cluster_group'] = np.nan

for i in range(len(X)):
    original_df['cluster_group'].iloc[i] = labels[i]

n_clusters = len(np.unique(labels))
survival_rates = {}

for i in range(n_clusters):
    temp_df = original_df[ (original_df['cluster_group'] == float(i)) ]
    survival_cluster = temp_df[(temp_df['survived']==1)]
    survival_rate = len(survival_cluster)/ len(temp_df)
    survival_rates[i] = survival_rate
print(survival_rates)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


{0: 0.38351822503961963, 1: 0.045454545454545456, 2: 0.9333333333333333, 3: 0.1}


In [4]:
original_df.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,cluster_group
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0,1309.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479,160.809917,0.062643
std,0.837836,0.486055,14.4135,1.041658,0.86556,51.758668,97.696922,0.357171
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0,0.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0,0.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0,0.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0,0.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0,3.0


In [5]:
original_df[(original_df['cluster_group']==0)]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,cluster_group
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO",0.0
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON",0.0
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0.0
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",0.0
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0.0
5,1,1,"Anderson, Mr. Harry",male,48.0000,0,0,19952,26.5500,E12,S,3,,"New York, NY",0.0
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0000,1,0,13502,77.9583,D7,S,10,,"Hudson, NY",0.0
7,1,0,"Andrews, Mr. Thomas Jr",male,39.0000,0,0,112050,0.0000,A36,S,,,"Belfast, NI",0.0
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0000,2,0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY",0.0
9,1,0,"Artagaveytia, Mr. Ramon",male,71.0000,0,0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay",0.0


In [6]:
original_df[(original_df['cluster_group']==0)].describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,cluster_group
count,1262.0,1262.0,1009.0,1262.0,1262.0,1261.0,118.0,1262.0
mean,2.292393,0.383518,29.968451,0.395404,0.29794,29.399871,160.355932,0.0
std,0.834005,0.486435,14.270147,0.725481,0.64584,39.603488,97.339175,0.0
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0,0.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.75,0.0
50%,3.0,0.0,28.0,0.0,0.0,13.775,155.5,0.0
75%,3.0,1.0,38.5,1.0,0.0,29.0,255.75,0.0
max,3.0,1.0,80.0,4.0,4.0,263.0,328.0,0.0


In [7]:
original_df[(original_df['cluster_group']==1)]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,cluster_group
622,3,0,"Andersson, Master. Sigvard Harald Elias",male,4.0,4,2,347082,31.275,,S,,,"Sweden Winnipeg, MN",1.0
623,3,0,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.275,,S,,,"Sweden Winnipeg, MN",1.0
624,3,0,"Andersson, Miss. Ellis Anna Maria",female,2.0,4,2,347082,31.275,,S,,,"Sweden Winnipeg, MN",1.0
627,3,0,"Andersson, Miss. Ingeborg Constanzia",female,9.0,4,2,347082,31.275,,S,,,"Sweden Winnipeg, MN",1.0
628,3,0,"Andersson, Miss. Sigrid Elisabeth",female,11.0,4,2,347082,31.275,,S,,,"Sweden Winnipeg, MN",1.0
639,3,0,"Asplund, Master. Carl Edgar",male,5.0,4,2,347077,31.3875,,S,,,"Sweden Worcester, MA",1.0
643,3,1,"Asplund, Miss. Lillian Gertrud",female,5.0,4,2,347077,31.3875,,S,15.0,,"Sweden Worcester, MA",1.0
825,3,0,"Goodwin, Master. Harold Victor",male,9.0,5,2,CA 2144,46.9,,S,,,"Wiltshire, England Niagara Falls, NY",1.0
826,3,0,"Goodwin, Master. Sidney Leonard",male,1.0,5,2,CA 2144,46.9,,S,,,"Wiltshire, England Niagara Falls, NY",1.0
827,3,0,"Goodwin, Master. William Frederick",male,11.0,5,2,CA 2144,46.9,,S,,,"Wiltshire, England Niagara Falls, NY",1.0


In [8]:
original_df[(original_df['cluster_group']==1)].describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,cluster_group
count,22.0,22.0,14.0,22.0,22.0,22.0,1.0,22.0
mean,3.0,0.045455,8.392857,5.909091,2.0,51.204545,67.0,1.0
std,0.0,0.213201,4.699339,1.823369,0.0,16.778019,,0.0
min,3.0,0.0,1.0,4.0,2.0,31.275,67.0,1.0
25%,3.0,0.0,5.0,4.0,2.0,31.3875,67.0,1.0
50%,3.0,0.0,9.0,5.0,2.0,46.9,67.0,1.0
75%,3.0,0.0,11.0,8.0,2.0,69.55,67.0,1.0
max,3.0,1.0,16.0,8.0,2.0,69.55,67.0,1.0


In [9]:
original_df[(original_df['cluster_group']==2)]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,cluster_group
17,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50.0,0,1,PC 17558,247.5208,B58 B60,C,6.0,,"Montreal, PQ",2.0
35,1,1,"Bowen, Miss. Grace Scott",female,45.0,0,0,PC 17608,262.375,,C,4.0,,"Cooperstown, NY",2.0
49,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36.0,0,1,PC 17755,512.3292,B51 B53 B55,C,3.0,,"Austria-Hungary / Germantown, Philadelphia, PA",2.0
50,1,1,"Cardeza, Mrs. James Warburton Martinez (Charlo...",female,58.0,0,1,PC 17755,512.3292,B51 B53 B55,C,3.0,,"Germantown, Philadelphia, PA",2.0
66,1,1,"Chaudanson, Miss. Victorine",female,36.0,0,0,PC 17608,262.375,B61,C,4.0,,,2.0
111,1,1,"Fortune, Miss. Alice Elizabeth",female,24.0,3,2,19950,263.0,C23 C25 C27,S,10.0,,"Winnipeg, MB",2.0
112,1,1,"Fortune, Miss. Ethel Flora",female,28.0,3,2,19950,263.0,C23 C25 C27,S,10.0,,"Winnipeg, MB",2.0
113,1,1,"Fortune, Miss. Mabel Helen",female,23.0,3,2,19950,263.0,C23 C25 C27,S,10.0,,"Winnipeg, MB",2.0
115,1,0,"Fortune, Mr. Mark",male,64.0,1,4,19950,263.0,C23 C25 C27,S,,,"Winnipeg, MB",2.0
116,1,1,"Fortune, Mrs. Mark (Mary McDougald)",female,60.0,1,4,19950,263.0,C23 C25 C27,S,10.0,,"Winnipeg, MB",2.0


In [10]:
original_df[(original_df['cluster_group']==2)].describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,cluster_group
count,15.0,15.0,15.0,15.0,15.0,15.0,0.0,15.0
mean,1.0,0.933333,38.733333,1.066667,1.6,328.247507,,2.0
std,0.0,0.258199,14.834644,1.222799,1.352247,114.966828,,0.0
min,1.0,0.0,18.0,0.0,0.0,247.5208,,2.0
25%,1.0,1.0,26.0,0.0,0.5,262.375,,2.0
50%,1.0,1.0,36.0,1.0,2.0,263.0,,2.0
75%,1.0,1.0,49.0,2.0,2.0,387.6646,,2.0
max,1.0,1.0,64.0,3.0,4.0,512.3292,,2.0
