# Hierarchical Agglomerative Clustering is best suited for non-globular datasets

## Here we consider only the input of the Amazon Mobile dataset

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
import string
from nltk.stem.snowball import SnowballStemmer

In [2]:
# Load the uncleaned Amazon mobile reviews into a DataFrame and report its
# (rows, columns) shape.
RAW_REVIEWS_CSV = '35.1AmazonMobileDataUncleaned.csv'
data = pd.read_csv(RAW_REVIEWS_CSV)
print(data.shape)

(71922, 2)


In [3]:
# Contraction rewrites, applied in this order to lower-cased text.
# The irregular forms come first because the generic "n't" rule would
# otherwise turn "won't" into "wo not" and "can't" into "ca not"; the generic
# "n't" rule in turn precedes the bare "'t" rule so that "isn't" becomes
# "is not" rather than "isn not".
_CONTRACTION_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"n't", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
        (r"'ll", " will"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'t", " not"),
    )
)

# Translation table that deletes every ASCII punctuation character and digit.
_STRIP_TABLE = str.maketrans("", "", string.punctuation + string.digits)


def datapreprocess(sen):
    """Normalise one review sentence into lower-case, letters-only words.

    Steps, in order:
      1. lower-case the text, so capitalised contractions ("Don't") match;
      2. expand English contractions ("wasn't" -> "was not");
      3. split on whitespace and, per word, drop non-ASCII characters,
         ASCII punctuation and digits;
      4. discard words that end up empty and join the rest with one space.

    Args:
        sen: the raw text as a ``str``.

    Returns:
        The cleaned text. It is the empty string when nothing but
        punctuation, digits or non-ASCII characters was present.
    """
    sen = sen.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        sen = pattern.sub(replacement, sen)

    # The ASCII filter runs per word (after splitting) so that non-ASCII
    # whitespace still separates words instead of gluing them together.
    cleaned = (
        word.encode("ascii", "ignore").decode("ascii").translate(_STRIP_TABLE)
        for word in sen.split()
    )
    # Skipping empty words keeps the result free of stray spaces, so callers
    # can detect "no text left" with a simple emptiness check.
    return " ".join(word for word in cleaned if word)
    
    

# English stop words to strip from the reviews. The full set is printed
# first; the negations are then taken out of the stop list so that they
# survive cleaning.
stop = set(stopwords.words('english'))
print(stop)
stop -= {'no', 'not', 'nor'}

# Clean every review. None marks a row with no usable text: either the review
# is missing, or nothing is left once punctuation, digits and stop words are
# removed. The emptiness check is done AFTER stop-word removal so that no
# empty string reaches the CSV (it would be read back as NaN).
preprocessed_essays = []
for sentance in data['uncleanedreview'].values:
    if pd.isna(sentance):
        preprocessed_essays.append(None)
        continue
    kept = [e for e in datapreprocess(str(sentance)).split() if e not in stop]
    preprocessed_essays.append(' '.join(kept) if kept else None)

data['cleanedtext'] = preprocessed_essays
# Delete the rows that have no text, then renumber the index from 0.
data = data.dropna(subset=['cleanedtext']).reset_index(drop=True)

# Encode the sentiment label numerically: positive -> 1, negative -> 0.
data['decision'] = data['decision'].replace({'positive': 1, 'negative': 0})

# Stem every word of the cleaned text and keep the result in column 'j'.
s = SnowballStemmer("english")
p = [
    ' '.join(s.stem(word) for word in text.split())
    for text in data['cleanedtext'].values
]
data['j'] = p

data.to_csv(r'AmazonMobileDataCleaned.csv')

{'o', "you'll", 'needn', "should've", 'didn', 'own', 'their', 'him', 'some', 're', "you're", 'whom', 'up', 've', 'we', 'in', 'any', 'does', 'ma', 'was', 'over', 't', 'our', 'your', 'off', 'under', 'by', 'haven', "mustn't", "didn't", 'do', "shouldn't", 'won', 'being', 'and', 'those', 'of', 'are', 'each', 'should', 'its', 'an', 'me', 'you', 'his', 'why', 'them', 'how', 'ain', 'be', 'i', 'weren', 'or', 'just', 'most', 'have', 'themselves', 'further', "you'd", 'where', 'too', 'yourself', 'after', 'down', 'below', 'ourselves', 'other', 'hadn', 'between', 'so', 'yourselves', 'having', 'to', 'from', 'nor', 'during', "she's", 'will', 'wouldn', 'as', 'were', 'there', 'these', 'into', "haven't", 'ours', 'few', 'with', 'don', 'd', 'for', 'out', 'this', 'been', "couldn't", 'himself', 'shouldn', 'she', 'through', 'hers', 's', 'that', 'then', 'here', 'had', 'couldn', 'did', 'they', 'yours', "that'll", 'has', "hasn't", 'am', 'he', 'aren', 'theirs', 'doing', 'who', 'now', 'on', 'not', "hadn't", 'becau

In [4]:
from sklearn.model_selection import train_test_split

# Work on the first 5000 rows of the cleaned file only.
data = pd.read_csv('AmazonMobileDataCleaned.csv', nrows=5000)
print(data.shape)

# Separate the target column from the remaining columns.
label = data['decision'].values
feedback = data.drop(columns=['decision'])
for part in (label, feedback):
    print(part.shape)
print("------------------------------")

# 80/20 split, stratified on the label. random_state is left at its default
# (None), so the split differs between runs; pass an integer to
# train_test_split to reproduce the same train/test split every time.
inputtrain, inputtest, outputtrain, outputtest = train_test_split(
    feedback, label, test_size=0.20, stratify=label
)
for part in (inputtrain, inputtest, outputtrain, outputtest):
    print(part.shape)

(5000, 5)
(5000,)
(5000, 4)
------------------------------
(4000, 4)
(1000, 4)
(4000,)
(1000,)


In [5]:
# Convert the stemmed text (column 'j') into a Bag-of-Words count matrix.
from sklearn.feature_extraction.text import CountVectorizer

# Keep 1- to 4-grams that occur in at least 10 documents.
vec = CountVectorizer(min_df=10, ngram_range=(1, 4))

# The vocabulary is learned from the training data only; the test data is
# merely transformed with that vocabulary.
train_cleanedtext = vec.fit_transform(inputtrain['j'].values)
test_cleanedtext = vec.transform(inputtest['j'].values)

print("After vectorizations")
print(train_cleanedtext.shape)
print(test_cleanedtext.shape)
print(outputtrain.shape)
print(outputtest.shape)
print("...............................")

# Feature (n-gram) names as a plain list of strings.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vec, "get_feature_names_out"):
    featur = vec.get_feature_names_out().tolist()
else:
    featur = list(vec.get_feature_names())

After vectorizations
(4000, 2743)
(1000, 2743)
(4000,)
(1000,)
...............................


### STEPS INVOLVED IN FINDING THE BEST NUMBER OF CLUSTERS (K)

In [7]:

'''
for K=1,
 1. Use K Means ++             --->  initial positioning of Centroids(C1)
 2. AgglomerativeClustering    --->  Identify Cluster SC1
 3. Calculate SilhouetteMean for K=1
 
for K=2,
 1. Use K Means ++             --->  initial positioning of Centroids(C1,C2)
 2. AgglomerativeClustering    --->  Identify Clusters SC1,SC2
 3. Calculate SilhouetteMean for K=2
 
for K=3,
 1. Use K Means ++             --->  initial positioning of Centroids(C1,C2,C3)
 2. AgglomerativeClustering    --->  Identify Clusters SC1,SC2,SC3
 3. Calculate SilhouetteMean for K=3
 
for K=4,5,6..........
 
Choose Value Of K Which Has Maximum SilhouetteMean
'''

'\nfor K=1,\n 1. Use K Means ++             --->  initial positioning of Centroids(C1)\n 2. AgglomerativeClustering    --->  Identify Cluster SC1\n 3. Calculate SilhouetteMean for K=1\n \nfor K=2,\n 1. Use K Means ++             --->  initial positioning of Centroids(C1,C2)\n 2. AgglomerativeClustering    --->  Identify Clusters SC1,SC2\n 3. Calculate SilhouetteMean for K=2\n \nfor K=3,\n 1. Use K Means ++             --->  initial positioning of Centroids(C1,C2,C3)\n 2. AgglomerativeClustering    --->  Identify Clusters SC1,SC2,SC3\n 3. Calculate SilhouetteMean for K=3\n \nfor K=4,5,6..........\n \nChoose Value Of K Which Has Maximum SilhouetteMean\n'

In [6]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score



# Dense copy of the training Bag-of-Words matrix, used for both the
# clustering and the silhouette computation below.
L = train_cleanedtext.toarray()
range_n_clusters = list(range(2, 9))

for n_clusters in range_n_clusters:
    # Cluster the training data into n_clusters groups.
    clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    clusterer.fit(L)
    cluster_labels = clusterer.labels_

    # silhouette_score is the mean silhouette value over all samples; it
    # summarises how dense and how well separated the clusters are.
    silhouette_avg = silhouette_score(L, cluster_labels)
    print(f"For n_clusters = {n_clusters} "
          f"The average silhouette_score is : {silhouette_avg}")


For n_clusters = 2 The average silhouette_score is : 0.6113932780214736
For n_clusters = 3 The average silhouette_score is : 0.5919850470490015
For n_clusters = 4 The average silhouette_score is : 0.212772542569957
For n_clusters = 5 The average silhouette_score is : 0.2144183598467373
For n_clusters = 6 The average silhouette_score is : 0.2146546164440003
For n_clusters = 7 The average silhouette_score is : 0.2149444023367481
For n_clusters = 8 The average silhouette_score is : 0.1751789508341463


### Model trained only on input

In [11]:
# AgglomerativeClustering parameters used here:
#   n_clusters --> number of clusters to form
#   linkage    --> one of 'ward', 'complete', 'average', 'single'
#                  (left at its default, 'ward')

# The sparse count matrix is converted to a dense array before fitting.
Agglo = AgglomerativeClustering(n_clusters=2)
Agglo.fit(train_cleanedtext.toarray())

# Cluster label assigned to each training sample.
Predicted_trained = Agglo.labels_
print(Predicted_trained)

[1 1 1 ... 1 1 0]


In [16]:
# AgglomerativeClustering has no predict(): fit_predict() always fits from
# scratch on the data it is given. Calling it on `Agglo` would therefore
# overwrite the model fitted on the training set, so the test set is
# clustered with a fresh estimator that carries the same parameters.
# NOTE: these labels come from an independent clustering of the test set;
# cluster ids 0/1 need not correspond to the ids of the training clustering.
test_clusterer = AgglomerativeClustering(**Agglo.get_params())
clusters = test_clusterer.fit_predict(test_cleanedtext.toarray())
print(len(clusters))
print(clusters)  # cluster label of each test sample

1000
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 1 1 0 1 0 1 0 1 1 0 0 1 1 1 0
 0 1 0 1 0 1 1 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1
 0 1 1 0 1 1 0 1 1 1 0 1 1 1 0 1 1 0 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 0 1 0 1
 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 0 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0
 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 0 1 1 0 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1
 1 0 1 0 0 1 1 0 1 1 0 0 0 0 1 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 0 0 1 1 1 1 1
 0 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 1 0 1 1 1 1
 1 1 0 0 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 1 0 1 1 1 0 1 0 1 0 1 0 1
 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 1 1
 0 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 0 1 1 1 1 0 0 1 1
 1 1 0 0 1 1 1 1 0 0 0 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1
 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 1 0 0 1 1 1 1 0 0
 1 0 1 1 1 0 1 1 0 1