# Unsupervised algorithms

In [1]:
#importing libraries
import pandas as pd
import numpy as np

In [2]:
filename = 'abalone.data'

In [3]:
df = pd.DataFrame()

In [4]:
#creating columns
names = [ 'Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']

In [5]:
#importing dataset
df =  pd.read_csv(filename ,names = names, delimiter=',')

In [6]:
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [7]:
#replacing alphabetical values with numerical values
replace_list = {"Sex" : {"M": 0, "F" : 1, "I": 2}}
df.replace(replace_list,inplace=True)

In [8]:
#calculating age
#df['age'] = df['Rings'] + 1.5

In [9]:
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,2,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [10]:
#assigning input array
x_df = df.iloc[:,:9].values

In [11]:
#normalizing values before applying clustering method
from sklearn.preprocessing import Normalizer
scaler = Normalizer()
scaler.fit(x_df)
x_scaled = scaler.transform(x_df)

# 1.Clustering method : agglomerative or hierarchical clustering

In [12]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

In [13]:
#  Run some clustering method (except k-means) with different number of clusters.
# using silhouette_score to measure accuracy.
n_clusters = (2,3,4,5,7,9,11,13)
for i in n_clusters:
    clustering = AgglomerativeClustering(n_clusters=i, linkage = 'ward')
    predicted = clustering.fit_predict(x_scaled)
    print("\nClusters present: {}".format(np.unique(predicted)))
    print("Cluster sizes: {}".format(np.bincount(predicted)))
    print("silhouette_score for ", i , ' clusters is :%.3f' %(silhouette_score(x_scaled,predicted)))


Clusters present: [0 1]
Cluster sizes: [2845 1332]
silhouette_score for  2  clusters is :0.595

Clusters present: [0 1 2]
Cluster sizes: [1332 1528 1317]
silhouette_score for  3  clusters is :0.454

Clusters present: [0 1 2 3]
Cluster sizes: [1528 1167 1317  165]
silhouette_score for  4  clusters is :0.455

Clusters present: [0 1 2 3 4]
Cluster sizes: [1167 1012 1317  165  516]
silhouette_score for  5  clusters is :0.426

Clusters present: [0 1 2 3 4 5 6]
Cluster sizes: [1012  165  684  733  516  483  584]
silhouette_score for  7  clusters is :0.413

Clusters present: [0 1 2 3 4 5 6 7 8]
Cluster sizes: [684 733  65 686 516 483 584 326 100]
silhouette_score for  9  clusters is :0.387

Clusters present: [ 0  1  2  3  4  5  6  7  8  9 10]
Cluster sizes: [733 516 541 686  64 483 584 326 100 143   1]
silhouette_score for  11  clusters is :0.374

Clusters present: [ 0  1  2  3  4  5  6  7  8  9 10 11 12]
Cluster sizes: [584 516 483 686  64 284 449 326 100 143   1 358 183]
silhouette_score f

# 2. KernalPCA for dimension reductionality and Kmeans clustering method for unsupervised training

In [14]:
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import StandardScaler

In [15]:
# standardising the input features before applying clustering method
scaler = StandardScaler().fit(x_df)
X_tr = scaler.transform(x_df)

In [16]:
pca = KernelPCA(n_components=3, kernel="cosine", fit_inverse_transform= True)
X_pca = pca.fit_transform(X_tr)

In [17]:
# the shape of the original and reduced dimension 
print("Original shape: {}".format(str(x_df.shape)))
print("Reduced shape: {}".format(str(X_pca.shape)))

Original shape: (4177, 9)
Reduced shape: (4177, 3)


In [18]:
# fitting by kmeans clustering method
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10)
kmeans.fit(X_pca)


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [19]:
#inverse transforming the features
#X_inv = scaler.inverse_transform(X_tr)

In [20]:
#printing cluster labels
print("Cluster labels:", (kmeans.labels_))

Cluster labels: [6 4 1 ... 8 7 8]


In [21]:
pred_k = kmeans.predict(X_pca)

In [22]:
#finding the cluster centroids
print("coordinates of cluster centre:\n" , kmeans.cluster_centers_)

coordinates of cluster centre:
 [[ 0.35891726  0.64357809 -0.37111515]
 [-0.94866077 -0.11369428 -0.05420928]
 [ 0.69340829 -0.11367745  0.50851312]
 [ 0.14733908 -0.23789646 -0.60207329]
 [-0.63498303  0.66778696 -0.08651183]
 [ 0.30589517  0.61595544  0.4975331 ]
 [-0.35247195  0.04404811  0.72243619]
 [ 0.84091756 -0.30813466 -0.11736684]
 [ 0.88892184  0.17567227 -0.14869296]
 [-0.34488204 -0.78309701  0.01581133]]


In [23]:
centers = kmeans.cluster_centers_
centers.shape

(10, 3)

In [24]:
inverse1 = pca.inverse_transform(centers)
inverse1.shape

(10, 9)

In [25]:
inverse2 = scaler.inverse_transform(inverse1)
inverse2.shape

(10, 9)

In [26]:
X_centers = pd.DataFrame(data=inverse2, columns=df.columns)
X_centers

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,-0.42375,0.548219,0.427637,0.143421,0.951345,0.438374,0.210881,0.252723,8.412129
1,1.689762,0.402771,0.306475,0.100538,0.318317,0.142518,0.068784,0.094988,7.444543
2,0.743286,0.630704,0.499508,0.179947,1.267789,0.508921,0.273584,0.387509,15.151916
3,1.19935,0.566677,0.437303,0.140669,1.009119,0.506964,0.224595,0.2454,4.730017
4,0.357028,0.408018,0.312932,0.103806,0.358517,0.155122,0.078847,0.105657,8.192
5,-0.1724,0.540702,0.427041,0.153875,0.908453,0.344913,0.195829,0.289571,14.554925
6,1.251875,0.465992,0.364845,0.132695,0.577779,0.184484,0.120781,0.211849,14.504957
7,0.861676,0.655949,0.515906,0.178386,1.376277,0.610058,0.300975,0.383427,10.977938
8,0.107398,0.63941,0.503438,0.174125,1.318518,0.582468,0.289141,0.366789,11.067973
9,2.445152,0.509309,0.393274,0.132878,0.74756,0.330942,0.161546,0.215862,8.838415


In [27]:
#Age of cluster centroids is Rings+1.5
X_centers['Age'] = X_centers.Rings + 1.5

In [28]:
age_cluster_Centroids = X_centers.Age
age_cluster_Centroids

0     9.912129
1     8.944543
2    16.651916
3     6.230017
4     9.692000
5    16.054925
6    16.004957
7    12.477938
8    12.567973
9    10.338415
Name: Age, dtype: float64

In [30]:
from sklearn.metrics import pairwise_distances_argmin_min

In [32]:
#finding the data point closest to the centroids of the clusters
closest,_= pairwise_distances_argmin_min(kmeans.cluster_centers_, X_pca)
closest

array([3725, 3717,  502, 2654, 2571,  201,  279, 1030, 1502, 1871],
      dtype=int64)

In [33]:
# x[3725] is the data point closest to the centroid of cluster 0
#the age of the data point closest to the centroid 0 is 2.5
x_df[3725]

array([0.    , 0.495 , 0.375 , 0.155 , 0.976 , 0.45  , 0.2285, 0.2475,
       9.    ])

In [34]:
# x[3717] is the data point closest to the centroid of cluster 1
#the age of the data point closest to the centroid 0 is 10.5
x_df[3717]

array([2.    , 0.35  , 0.25  , 0.1   , 0.4015, 0.1725, 0.063 , 0.1255,
       7.    ])

In [35]:
#cross checking with the df at the index location mentioned by 'closest' array
df.loc[3717]

Sex               2.0000
Length            0.3500
Diameter          0.2500
Height            0.1000
Whole weight      0.4015
Shucked weight    0.1725
Viscera weight    0.0630
Shell weight      0.1255
Rings             7.0000
Name: 3717, dtype: float64

# 3.Gaussian Mixture Model

### cluster Abalone dataset without age (rings) feature using Gaussian Mixture Model

In [73]:
#defining input array 
#excluding rings column
X_df = np.array(df.drop(['Rings'],1))

In [74]:
from sklearn import cluster, datasets, mixture

In [75]:
#Normalizing values before applying GMM
scaler = Normalizer()
scaler.fit(X_df)
x_scaled = scaler.transform(X_df)

In [76]:
gmm = mixture.GaussianMixture(n_components=12)

In [77]:
gmm.fit(x_scaled)

GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
        means_init=None, n_components=12, n_init=1, precisions_init=None,
        random_state=None, reg_covar=1e-06, tol=0.001, verbose=0,
        verbose_interval=10, warm_start=False, weights_init=None)

In [78]:
pred_gmm = gmm.predict(x_scaled)
print("\nClusters present: {}".format(np.unique(pred_gmm)))
print("Cluster sizes: {}".format(np.bincount(pred_gmm)))


Clusters present: [ 0  1  2  3  4  5  6  7  8  9 10 11]
Cluster sizes: [645 339 347 248 250 517 132 174 391 392 234 508]


### Take randomly 20 samples with age from the original Abalone dataset, predict their probabilities with GaussianMM 

In [79]:
#random sampling of 20 datapoints
df_small = pd.read_csv(filename ,names = names, delimiter=',').sample(20)
del df_small['Rings']
rings = df['Rings'].sample(20)

In [80]:
#replacing alphabetical values with numerical values
replace_list = {"Sex" : {"M": 0, "F" : 1, "I": 2}}
df_small.replace(replace_list,inplace=True)

In [81]:
df_small.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
3220,0,0.5,0.4,0.145,0.6025,0.216,0.138,0.21
2484,0,0.46,0.36,0.135,0.6105,0.1955,0.107,0.235
490,0,0.59,0.455,0.145,1.073,0.475,0.19,0.285
3808,0,0.515,0.385,0.13,0.623,0.2855,0.1285,0.175
2033,0,0.61,0.49,0.16,1.112,0.465,0.228,0.341


In [82]:
#including Rings column
x_small = np.array(df_small)

In [83]:
#Normalizing the values
scaler = Normalizer()
scaler.fit(x_small)
x_scaled = scaler.transform(x_small)

In [84]:
gmm1 = mixture.GaussianMixture(n_components=12)

In [85]:
gmm1.fit(x_scaled)

GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
        means_init=None, n_components=12, n_init=1, precisions_init=None,
        random_state=None, reg_covar=1e-06, tol=0.001, verbose=0,
        verbose_interval=10, warm_start=False, weights_init=None)

In [86]:
#inverse transforming the features
#_inv1 = scaler.inverse_transform(x_scaled)

In [87]:
cluster = gmm1.predict_proba(x_scaled)

In [88]:
cluster[:5]

array([[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]])

In [89]:
#Transform 'Rings' to multiple class
rings=pd.get_dummies(rings)
ring_values= np.array(rings.columns)

In [90]:
#Get cluster x class matrix
cluster_age = cluster.T @ rings
# normalize so probabilities sum up to 1
cluster_age = cluster_age / cluster_age.sum(1)[:,None]
#cluster_age

In [91]:
#Get samples(without rings) x class matrix
predicted_classes= gmm1.predict_proba(X_df) @ cluster_age
#predicted_classes[:3]

In [92]:
#Predict rings by multiplying 'Rings' values of classes with class probabilities

predicted_age= predicted_classes @ ring_values

In [93]:
predicted_age.shape

(4177,)

In [94]:
predicted_age.round(3)

array([10. ,  8. ,  8. , ...,  7.5,  8. ,  8. ])