In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import datetime as dt

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

In [3]:
ind_df = pd.read_csv("Cricket.csv",sep=",",encoding="ISO-8859-1", header=0)
ind_df.head()

Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0
0,SR Tendulkar (INDIA),1989-2012,463,452,41,18426,200*,44.83,21367,86.23,49,96,20
1,KC Sangakkara (Asia/ICC/SL),2000-2015,404,380,41,14234,169,41.98,18048,78.86,25,93,15
2,RT Ponting (AUS/ICC),1995-2012,375,365,39,13704,164,42.03,17046,80.39,30,82,20
3,ST Jayasuriya (Asia/SL),1989-2011,445,433,18,13430,189,32.36,14725,91.2,28,68,34
4,DPMD Jayawardene (Asia/SL),1998-2015,448,418,39,12650,144,33.37,16020,78.96,19,77,28


In [4]:
ind_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  79 non-null     object 
 1   Span    79 non-null     object 
 2   Mat     79 non-null     int64  
 3   Inns    79 non-null     int64  
 4   NO      79 non-null     int64  
 5   Runs    79 non-null     int64  
 6   HS      79 non-null     object 
 7   Ave     79 non-null     float64
 8   BF      79 non-null     int64  
 9   SR      79 non-null     float64
 10  100     79 non-null     int64  
 11  50      79 non-null     int64  
 12  0       79 non-null     int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 8.1+ KB


In [16]:
ind_df['HS'] = ind_df['HS'].apply(lambda x: int(x.replace('*','')))

In [18]:
ind_df.dtypes

Player     object
Span       object
Mat         int64
Inns        int64
NO          int64
Runs        int64
HS          int64
Ave       float64
BF          int64
SR        float64
100         int64
50          int64
0           int64
dtype: object

In [20]:
a = ind_df.columns
a

Index(['Player', 'Span', 'Mat', 'Inns', 'NO', 'Runs', 'HS', 'Ave', 'BF', 'SR',
       '100', '50', '0'],
      dtype='object')

In [21]:
ind_df_sc = ind_df[['Mat', 'Inns', 'NO', 'Runs', 'HS', 'Ave', 'BF', 'SR','100', '50', '0']]
ind_df_sc

Unnamed: 0,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0
0,463,452,41,18426,200,44.83,21367,86.23,49,96,20
1,404,380,41,14234,169,41.98,18048,78.86,25,93,15
2,375,365,39,13704,164,42.03,17046,80.39,30,82,20
3,445,433,18,13430,189,32.36,14725,91.20,28,68,34
4,448,418,39,12650,144,33.37,16020,78.96,19,77,28
...,...,...,...,...,...,...,...,...,...,...,...
74,128,127,13,5134,133,45.03,7908,64.92,11,31,3
75,162,149,31,5122,96,43.40,6945,73.75,0,42,6
76,197,181,37,5092,120,35.36,6614,76.98,5,26,7
77,198,161,33,5088,156,39.75,5504,92.44,6,30,15


In [31]:
s = StandardScaler()
ind_df_sc = s.fit_transform(ind_df_sc)
ind_df_sc.shape

(79, 11)

In [26]:
ind_df_pd = pd.DataFrame(ind_df_sc)
ind_df_pd.columns = ['Mat', 'Inns', 'NO', 'Runs', 'HS', 'Ave', 'BF', 'SR','100', '50', '0']
ind_df_pd.head()

Unnamed: 0,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0
0,2.955282,3.169333,0.764963,4.262328,1.632443,1.072294,3.681214,0.703152,4.656726,3.050057,1.145837
1,2.155179,2.138915,0.764963,2.609117,0.635224,0.587725,2.635385,-0.044139,1.671888,2.865418,0.296671
2,1.761908,1.924245,0.625397,2.400099,0.474382,0.596226,2.319651,0.110997,2.293729,2.188406,1.145837
3,2.711183,2.897417,-0.840046,2.292041,1.278591,-1.047909,1.588295,1.207091,2.044992,1.326755,3.523501
4,2.751866,2.682747,0.625397,1.98443,-0.168986,-0.876185,1.996354,-0.034,0.925678,1.880674,2.504502


In [32]:
ind_df_n = ind_df[['Player', 'Span']]
ind_df_n

Unnamed: 0,Player,Span
0,SR Tendulkar (INDIA),1989-2012
1,KC Sangakkara (Asia/ICC/SL),2000-2015
2,RT Ponting (AUS/ICC),1995-2012
3,ST Jayasuriya (Asia/SL),1989-2011
4,DPMD Jayawardene (Asia/SL),1998-2015
...,...,...
74,CG Greenidge (WI),1975-1991
75,Misbah-ul-Haq (PAK),2002-2015
76,PD Collingwood (ENG),2001-2011
77,A Symonds (AUS),1998-2009


In [28]:
# k-means with some arbitrary k
kmeans = KMeans(n_clusters=4, max_iter=50,random_state=100)
kmeans.fit(ind_df_pd)

KMeans(max_iter=50, n_clusters=4, random_state=100)

In [29]:
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 0, 2, 2, 2, 0, 3, 3, 3, 2, 0, 3, 3, 3, 0, 0, 3, 3, 0, 3,
       3, 3, 0, 3, 3, 0, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 0, 0, 0,
       3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3])

In [41]:
ind_df_n['cluster_id'] = kmeans.labels_
ind_df_n

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ind_df_n['cluster_id'] = kmeans.labels_


Unnamed: 0,Player,Span,cluster_id
0,SR Tendulkar (INDIA),1989-2012,1
1,KC Sangakkara (Asia/ICC/SL),2000-2015,1
2,RT Ponting (AUS/ICC),1995-2012,1
3,ST Jayasuriya (Asia/SL),1989-2011,1
4,DPMD Jayawardene (Asia/SL),1998-2015,1
...,...,...,...
74,CG Greenidge (WI),1975-1991,0
75,Misbah-ul-Haq (PAK),2002-2015,3
76,PD Collingwood (ENG),2001-2011,3
77,A Symonds (AUS),1998-2009,3


In [42]:
ind_df_pd['cluster_id']= kmeans.labels_
ind_df_pd

Unnamed: 0,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,cluster_id
0,2.955282,3.169333,0.764963,4.262328,1.632443,1.072294,3.681214,0.703152,4.656726,3.050057,1.145837,1
1,2.155179,2.138915,0.764963,2.609117,0.635224,0.587725,2.635385,-0.044139,1.671888,2.865418,0.296671,1
2,1.761908,1.924245,0.625397,2.400099,0.474382,0.596226,2.319651,0.110997,2.293729,2.188406,1.145837,1
3,2.711183,2.897417,-0.840046,2.292041,1.278591,-1.047909,1.588295,1.207091,2.044992,1.326755,3.523501,1
4,2.751866,2.682747,0.625397,1.984430,-0.168986,-0.876185,1.996354,-0.034000,0.925678,1.880674,2.504502,1
...,...,...,...,...,...,...,...,...,...,...,...,...
74,-1.587674,-1.481860,-1.188961,-0.979677,-0.522838,1.106299,-0.559768,-1.457604,-0.069268,-0.950465,-1.741328,0
75,-1.126598,-1.167010,0.067133,-0.984410,-1.713068,0.829159,-0.863213,-0.562275,-1.437319,-0.273453,-1.231828,3
76,-0.651961,-0.709047,0.485831,-0.996241,-0.941027,-0.537836,-0.967512,-0.234765,-0.815478,-1.258197,-1.061995,3
77,-0.638400,-0.995274,0.206699,-0.997819,0.217035,0.208570,-1.317278,1.332823,-0.691110,-1.012011,0.296671,3


In [46]:
ind_m = pd.merge(ind_df_n,ind_df_pd,how='inner',on='cluster_id')
ind_m

Unnamed: 0,Player,Span,cluster_id,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0
0,SR Tendulkar (INDIA),1989-2012,1,2.955282,3.169333,0.764963,4.262328,1.632443,1.072294,3.681214,0.703152,4.656726,3.050057,1.145837
1,SR Tendulkar (INDIA),1989-2012,1,2.155179,2.138915,0.764963,2.609117,0.635224,0.587725,2.635385,-0.044139,1.671888,2.865418,0.296671
2,SR Tendulkar (INDIA),1989-2012,1,1.761908,1.924245,0.625397,2.400099,0.474382,0.596226,2.319651,0.110997,2.293729,2.188406,1.145837
3,SR Tendulkar (INDIA),1989-2012,1,2.711183,2.897417,-0.840046,2.292041,1.278591,-1.047909,1.588295,1.207091,2.044992,1.326755,3.523501
4,SR Tendulkar (INDIA),1989-2012,1,2.751866,2.682747,0.625397,1.984430,-0.168986,-0.876185,1.996354,-0.034000,0.925678,1.880674,2.504502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1952,Abdul Razzaq (Asia/PAK),1996-2011,3,-0.434984,-0.465753,-0.491131,-0.968635,-0.973195,-1.535879,-0.366609,-1.898679,-0.939846,-0.704279,0.296671
1953,Abdul Razzaq (Asia/PAK),1996-2011,3,-1.126598,-1.167010,0.067133,-0.984410,-1.713068,0.829159,-0.863213,-0.562275,-1.437319,-0.273453,-1.231828
1954,Abdul Razzaq (Asia/PAK),1996-2011,3,-0.651961,-0.709047,0.485831,-0.996241,-0.941027,-0.537836,-0.967512,-0.234765,-0.815478,-1.258197,-1.061995
1955,Abdul Razzaq (Asia/PAK),1996-2011,3,-0.638400,-0.995274,0.206699,-0.997819,0.217035,0.208570,-1.317278,1.332823,-0.691110,-1.012011,0.296671
