In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('./datasets/titanic.csv')

In [3]:
df.shape

(891, 12)

In [5]:
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [6]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [7]:
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

In [9]:
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])

In [11]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,S
1,1,1,0,38.0,1,0,71.2833,C
2,1,3,0,26.0,0,0,7.925,S
3,1,1,0,35.0,1,0,53.1,S
4,0,3,1,35.0,0,0,8.05,S


In [12]:
df = pd.get_dummies(df, columns=['Embarked'])
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.25,False,False,True
1,1,1,0,38.0,1,0,71.2833,True,False,False
2,1,3,0,26.0,0,0,7.925,False,False,True
3,1,1,0,35.0,1,0,53.1,False,False,True
4,0,3,1,35.0,0,0,8.05,False,False,True


In [14]:
df.shape

(891, 10)

In [15]:
df[df.isnull().any(axis=1)]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
5,0,3,1,,0,0,8.4583,False,True,False
17,1,2,1,,0,0,13.0000,False,False,True
19,1,3,0,,0,0,7.2250,True,False,False
26,0,3,1,,0,0,7.2250,True,False,False
28,1,3,0,,0,0,7.8792,False,True,False
...,...,...,...,...,...,...,...,...,...,...
859,0,3,1,,0,0,7.2292,True,False,False
863,0,3,0,,8,2,69.5500,False,False,True
868,0,3,1,,0,0,9.5000,False,False,True
878,0,3,1,,0,0,7.8958,False,False,True


In [16]:
df = df.dropna()

In [17]:
df.shape

(714, 10)

In [18]:
df_feature = df.drop(columns=['Survived'])
df_labels = df['Survived']

In [20]:
bandwidth = estimate_bandwidth(df_feature)
bandwidth

np.float64(30.43381713199512)

In [21]:
model = MeanShift(bandwidth=30, n_jobs=-1)
model.fit(df_feature)

0,1,2
,"bandwidth  bandwidth: float, default=None Bandwidth used in the flat kernel. If not given, the bandwidth is estimated using sklearn.cluster.estimate_bandwidth; see the documentation for that function for hints on scalability (see also the Notes, below).",30
,"seeds  seeds: array-like of shape (n_samples, n_features), default=None Seeds used to initialize kernels. If not set, the seeds are calculated by clustering.get_bin_seeds with bandwidth as the grid size and default values for other parameters.",
,"bin_seeding  bin_seeding: bool, default=False If true, initial kernel locations are not locations of all points, but rather the location of the discretized version of points, where points are binned onto a grid whose coarseness corresponds to the bandwidth. Setting this option to True will speed up the algorithm because fewer seeds will be initialized. The default value is False. Ignored if seeds argument is not None.",False
,"min_bin_freq  min_bin_freq: int, default=1 To speed up the algorithm, accept only those bins with at least min_bin_freq points as seeds.",1
,"cluster_all  cluster_all: bool, default=True If true, then all points are clustered, even those orphans that are not within any kernel. Orphans are assigned to the nearest kernel. If false, then orphans are given cluster label -1.",True
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. The following tasks benefit from the parallelization: - The search of nearest neighbors for bandwidth estimation and label  assignments. See the details in the docstring of the  ``NearestNeighbors`` class. - Hill-climbing optimization for all seeds. See :term:`Glossary ` for more details. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",-1
,"max_iter  max_iter: int, default=300 Maximum number of iterations, per seed point before the clustering operation terminates (for that seed point), if has not converged yet. .. versionadded:: 0.22",300


In [22]:
labels = model.labels_
np.unique(labels)

array([0, 1, 2, 3, 4])

In [23]:
analyser = MeanShift(bandwidth=bandwidth, n_jobs=-1)
analyser.fit(df)

0,1,2
,"bandwidth  bandwidth: float, default=None Bandwidth used in the flat kernel. If not given, the bandwidth is estimated using sklearn.cluster.estimate_bandwidth; see the documentation for that function for hints on scalability (see also the Notes, below).",np.float64(30.43381713199512)
,"seeds  seeds: array-like of shape (n_samples, n_features), default=None Seeds used to initialize kernels. If not set, the seeds are calculated by clustering.get_bin_seeds with bandwidth as the grid size and default values for other parameters.",
,"bin_seeding  bin_seeding: bool, default=False If true, initial kernel locations are not locations of all points, but rather the location of the discretized version of points, where points are binned onto a grid whose coarseness corresponds to the bandwidth. Setting this option to True will speed up the algorithm because fewer seeds will be initialized. The default value is False. Ignored if seeds argument is not None.",False
,"min_bin_freq  min_bin_freq: int, default=1 To speed up the algorithm, accept only those bins with at least min_bin_freq points as seeds.",1
,"cluster_all  cluster_all: bool, default=True If true, then all points are clustered, even those orphans that are not within any kernel. Orphans are assigned to the nearest kernel. If false, then orphans are given cluster label -1.",True
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. The following tasks benefit from the parallelization: - The search of nearest neighbors for bandwidth estimation and label  assignments. See the details in the docstring of the  ``NearestNeighbors`` class. - Hill-climbing optimization for all seeds. See :term:`Glossary ` for more details. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",-1
,"max_iter  max_iter: int, default=300 Maximum number of iterations, per seed point before the clustering operation terminates (for that seed point), if has not converged yet. .. versionadded:: 0.22",300


In [24]:
labels = model.labels_
np.unique(labels)

array([0, 1, 2, 3, 4])

In [27]:
df['cluster_group'] = labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cluster_group'] = labels


In [28]:
df.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,cluster_group
count,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0
mean,0.406162,2.236695,0.634454,29.699118,0.512605,0.431373,34.694514,0.317927
std,0.49146,0.83825,0.481921,14.526497,0.929783,0.853289,52.91893,0.691392
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,20.125,0.0,0.0,8.05,0.0
50%,0.0,2.0,1.0,28.0,0.0,0.0,15.7417,0.0
75%,1.0,3.0,1.0,38.0,1.0,1.0,33.375,0.0
max,1.0,3.0,1.0,80.0,5.0,6.0,512.3292,4.0


In [29]:
df_cluster_data = df.groupby(['cluster_group']).mean()

In [30]:
df_cluster_data['Counts'] = df.groupby(['cluster_group']).size()

In [31]:
df_cluster_data

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Counts
cluster_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0.334532,2.528777,0.679856,28.218076,0.438849,0.368705,15.351026,0.118705,0.046763,0.834532,556
1,0.618182,1.3,0.527273,36.2,0.809091,0.509091,65.130268,0.345455,0.018182,0.618182,110
2,0.733333,1.0,0.366667,32.430667,0.6,0.866667,131.183883,0.5,0.0,0.5,30
3,0.733333,1.0,0.266667,30.333333,1.0,1.333333,239.99194,0.533333,0.0,0.466667,15
4,1.0,1.0,0.666667,35.333333,0.0,0.333333,512.3292,1.0,0.0,0.0,3


In [32]:
df.loc[df['cluster_group'] == 0].describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,cluster_group
count,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0
mean,0.334532,2.528777,0.679856,28.218076,0.438849,0.368705,15.351026,0.0
std,0.472252,0.655592,0.466952,14.077084,0.89132,0.797988,9.064108,0.0
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,19.0,0.0,0.0,7.8958,0.0
50%,0.0,3.0,1.0,27.0,0.0,0.0,12.4125,0.0
75%,1.0,3.0,1.0,36.0,1.0,0.0,24.0,0.0
max,1.0,3.0,1.0,80.0,4.0,5.0,41.5792,0.0


In [33]:
df.loc[df['cluster_group'] == 0]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,cluster_group
0,0,3,1,22.0,1,0,7.2500,False,False,True,0
2,1,3,0,26.0,0,0,7.9250,False,False,True,0
4,0,3,1,35.0,0,0,8.0500,False,False,True,0
7,0,3,1,2.0,3,1,21.0750,False,False,True,0
8,1,3,0,27.0,0,2,11.1333,False,False,True,0
...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,0,39.0,0,5,29.1250,False,True,False,0
886,0,2,1,27.0,0,0,13.0000,False,False,True,0
887,1,1,0,19.0,0,0,30.0000,False,False,True,0
889,1,1,1,26.0,0,0,30.0000,True,False,False,0


In [34]:
n_clusters = len(np.unique(model.labels_))
n_clusters

5

In [35]:
model.cluster_centers_

array([[2.56250000e+00, 6.68560606e-01, 2.70279356e+01, 4.28030303e-01,
        3.86363636e-01, 1.50454616e+01, 1.23106061e-01, 4.35606061e-02,
        8.33333333e-01],
       [1.34000000e+00, 5.30000000e-01, 3.34800000e+01, 8.30000000e-01,
        5.00000000e-01, 6.28027530e+01, 3.10000000e-01, 2.00000000e-02,
        6.60000000e-01],
       [1.00000000e+00, 4.16666667e-01, 3.24166667e+01, 6.25000000e-01,
        7.50000000e-01, 1.25112492e+02, 5.83333333e-01, 0.00000000e+00,
        4.16666667e-01],
       [1.00000000e+00, 2.00000000e-01, 2.77000000e+01, 1.40000000e+00,
        1.20000000e+00, 2.49136660e+02, 7.00000000e-01, 0.00000000e+00,
        3.00000000e-01],
       [1.00000000e+00, 6.66666667e-01, 3.53333333e+01, 0.00000000e+00,
        3.33333333e-01, 5.12329200e+02, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00]])