In [1]:
import pandas as pd

In [175]:
def _print_group_shares(header, members, line_template):
    '''Print `header`, then one percentage line per distinct value in `members`.

    Values are listed from most to least frequent; ties keep first-seen order,
    as returned by ``Counter.most_common``. When `members` is empty only the
    header and the trailing blank lines are printed, so no division by zero
    can occur.

    `line_template` is a ``str.format`` template with the fields ``share``
    (percentage as a float) and ``value`` (the counted item).
    '''
    # Local import: the helper must not depend on another notebook cell
    # having been executed first.
    from collections import Counter

    print (header)
    total = len(members)
    for value, count in Counter(members).most_common():
        print(line_template.format(share=count / total * 100.0, value=value))
    print ('\n')


def External_indices_validation(predicted, labels, n_clusters, info_for_clusters=False):
    '''Clustering validation for unsupervised learning using external indices.

    Prints how the predicted cluster ids and the ground-truth labels overlap.

    Input:
        predicted: sequence of integer cluster ids, one per sample
            (list, NumPy array or pandas Series).
        labels: sequence of ground-truth labels (ratings), aligned with
            `predicted` by position. The index of a pandas Series is ignored.
        n_clusters: number of clusters. Only used when `info_for_clusters`
            is True: every id in ``range(n_clusters)`` gets its own section,
            even if no sample was assigned to it.
        info_for_clusters:
            True: for each cluster, prints the percentage of each label in it
            False: for each label, prints the percentage assigned to each cluster

    Output:
        None. The statistics are printed to stdout.

    Raises:
        ValueError: if `predicted` and `labels` differ in length.
        IndexError: if `info_for_clusters` is True and a cluster id lies
            outside ``range(n_clusters)`` (negative ids included, which would
            otherwise silently be counted in the last cluster).
    '''
    from collections import defaultdict

    # Materialise both inputs as plain lists so that every lookup below is
    # positional, regardless of any pandas index carried by the arguments.
    cluster_ids = list(predicted)
    ratings = list(labels)
    if len(cluster_ids) != len(ratings):
        raise ValueError(
            f'predicted and labels must have the same length, '
            f'got {len(cluster_ids)} and {len(ratings)}'
        )

    if not info_for_clusters:
        # Single pass: collect, for every rating, the cluster ids of its samples.
        clusters_by_rating = defaultdict(list)
        for rating, cluster_id in zip(ratings, cluster_ids):
            clusters_by_rating[rating].append(cluster_id)

        # print statistics for each rating, in ascending rating order
        for rating in sorted(clusters_by_rating):
            _print_group_shares(
                f'Info for rating={rating} \n',
                clusters_by_rating[rating],
                '{share:.2f} % in cluster {value}',
            )
    else:
        # Bucket i holds the labels of all samples assigned to cluster #i.
        ratings_by_cluster = [[] for _ in range(n_clusters)]
        for cluster_id, rating in zip(cluster_ids, ratings):
            if not 0 <= cluster_id < n_clusters:
                raise IndexError(
                    f'cluster id {cluster_id} is outside range({n_clusters})'
                )
            ratings_by_cluster[cluster_id].append(rating)

        # print statistics for each cluster
        for cluster_id, members in enumerate(ratings_by_cluster):
            _print_group_shares(
                f'Info for cluster #{cluster_id} \n',
                members,
                '{share:.2f} % of rating {value}',
            )

In [12]:
# Load the cached tab-separated file; its first column becomes the row index.
red = pd.read_csv(
    './csv_cache/red_clean.csv',
    sep='\t',
    index_col=0,
)

In [14]:
# Preview the first three rows of `red`.
red.head(3)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5


In [16]:
# Target: the 'quality' column. Features: every other column of `red`.
red_y = red['quality']
red_X = red.drop(columns=['quality'])

In [18]:
# Preview the first rows of the target Series (the recorded output shows five values).
red_y.head()

0    5
1    5
2    5
3    6
4    5
Name: quality, dtype: int64

In [187]:
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(red_X, red_y, test_size=0.2, random_state=0)

# Seed KMeans as well: its centroid initialisation is random, so without a
# random_state the cluster ids and the percentages printed below change on
# every run.
kmeans = KMeans(n_clusters=12, random_state=0)
kmeans.fit(X_train)
predicted = kmeans.predict(X_test)

# Read the cluster count back from the fitted estimator so the report can
# never drift from the value the model was built with.
External_indices_validation(predicted, y_test, kmeans.n_clusters, info_for_clusters=True)


Info for cluster #0 

50.00 % of rating 5
27.78 % of rating 6
11.11 % of rating 7
11.11 % of rating 4


Info for cluster #1 

40.00 % of rating 6
37.14 % of rating 5
14.29 % of rating 7
5.71 % of rating 4
2.86 % of rating 8


Info for cluster #2 

55.56 % of rating 5
38.89 % of rating 6
5.56 % of rating 7


Info for cluster #3 

52.63 % of rating 6
26.32 % of rating 5
21.05 % of rating 7


Info for cluster #4 

47.37 % of rating 6
42.11 % of rating 5
5.26 % of rating 8
5.26 % of rating 7


Info for cluster #5 

57.69 % of rating 5
34.62 % of rating 6
3.85 % of rating 8
3.85 % of rating 7


Info for cluster #6 

100.00 % of rating 5


Info for cluster #7 

60.00 % of rating 6
20.00 % of rating 7
20.00 % of rating 5


Info for cluster #8 

53.33 % of rating 5
30.00 % of rating 6
16.67 % of rating 7


Info for cluster #9 

85.71 % of rating 5
14.29 % of rating 6


Info for cluster #10 

61.11 % of rating 6
22.22 % of rating 5
16.67 % of rating 7


Info for cluster #11 

70.00 % of rating 

In [182]:
# Display the full array of predicted cluster ids for the test rows.
# NOTE(review): the recorded output below contains only the labels 0 and 1,
# while the cell that defines `predicted` fits 12 clusters, and this cell's
# execution count (182) precedes that cell's (187) — the output looks stale;
# re-run after a kernel restart to refresh it.
predicted

array([1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1],
      dtype=int32)

In [183]:
# Display the held-out labels as a plain array (positional order, without the pandas index).
y_test.values

array([5, 5, 6, 5, 6, 5, 6, 5, 5, 6, 5, 7, 5, 6, 5, 7, 6, 6, 6, 6, 7, 5,
       6, 6, 5, 6, 7, 5, 6, 6, 6, 6, 5, 6, 6, 7, 6, 5, 5, 5, 5, 6, 6, 5,
       6, 6, 5, 4, 6, 5, 5, 7, 6, 5, 5, 5, 7, 5, 7, 7, 6, 5, 5, 7, 5, 6,
       5, 5, 5, 6, 6, 8, 7, 6, 5, 5, 6, 8, 6, 6, 5, 5, 5, 7, 6, 7, 5, 5,
       6, 6, 5, 5, 6, 5, 7, 5, 5, 5, 5, 5, 5, 5, 7, 5, 6, 6, 6, 5, 5, 4,
       7, 6, 5, 6, 7, 5, 6, 5, 6, 5, 7, 6, 5, 6, 5, 6, 6, 7, 6, 7, 7, 6,
       5, 5, 7, 6, 6, 6, 5, 5, 5, 6, 6, 6, 5, 5, 5, 6, 6, 5, 6, 5, 5, 6,
       5, 6, 6, 5, 6, 5, 4, 4, 5, 5, 6, 6, 5, 6, 5, 5, 6, 8, 5, 6, 5, 6,
       5, 5, 6, 6, 6, 6, 5, 5, 5, 6, 5, 6, 6, 5, 5, 5, 6, 6, 5, 6, 5, 7,
       5, 5, 6, 4, 6, 6, 7, 5, 5, 7, 6, 6, 7, 5, 6, 6, 7, 5, 5, 5, 5, 7,
       5, 6, 7, 6, 5, 6, 6, 6, 5, 5, 6, 6, 5, 6, 5, 5, 5, 6, 6])