K MEANS CLUSTERING

In [1]:

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from nltk.corpus import stopwords
from nltk.cluster import KMeansClusterer,cosine_distance
from sklearn.cluster import KMeans
from sklearn import mixture
import numpy as np
from collections import OrderedDict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
# Add your import statement


In [2]:

# Read the (unlabelled) training documents and the labelled test set.
train = pd.read_csv("D:/clustering/Train_Data.csv")
test = pd.read_csv("D:/clustering/Test_Data.csv")

# Documents and gold labels used later to evaluate each clustering model.
test_text, test_label = test["text"], test["label"]

# Preview both frames; each bare expression is rendered because
# InteractiveShell.ast_node_interactivity is set to "all" in the import cell.
train.head()
test.head()

Unnamed: 0,text
0,Would you rather get a gift that you knew what...
1,Is the internet ruining people's ability to co...
2,Permanganate?\r\nSuppose permanganate was used...
3,If Rock-n-Roll is really the work of the devil...
4,Has anyone purchased software to watch TV on y...


Unnamed: 0,label,text
0,3,"No desire to visit mother in jail, am I a bad ..."
1,7,what types of desirable products/materials can...
2,2,what is teleportation? why an unknown indian i...
3,1,Do you have to read the whole Bible to get int...
4,3,6 yr old son with a Deviated Septum!!!!?\r\nMy...




Define a function `cluster_kmean(train, test_text, test_label)` as follows:
- It takes three inputs:
    - `train` is a list of documents for training
    - `test_text` is a list of documents for testing
    - `test_label` holds the labels corresponding to the documents in `test_text`
- First, it creates TF-IDF weights from the training documents.
- It then uses K-means to cluster the documents in `train` into 4 clusters, once with cosine distance (NLTK `KMeansClusterer`) and once with Euclidean distance (scikit-learn `KMeans`).
    
- It tests the clustering model performance using `test_label` as follows:
  - Predict the cluster ID for each document in `test_text`.
  - Apply the `majority vote` rule to dynamically map the predicted cluster IDs to the labels in `test_label`.
  - Print out the cross tabulation between cluster IDs and class labels.
  - Print out the classification report for the test subset.
  
  


    

In [3]:
def _majority_vote_report(test_label, predicted_cluster, title):
    """Map cluster IDs to class labels by majority vote and print the evaluation.

    Each predicted cluster is assigned the class label that occurs most often
    among the test documents placed in that cluster. Because the mapping is
    taken row-wise from the crosstab (one row per predicted cluster), every
    cluster ID that appears in `predicted_cluster` is guaranteed to have a
    label, so the lookup below cannot fail with a KeyError.

    Parameters
    ----------
    test_label : array-like
        True class label of each test document.
    predicted_cluster : sequence of int
        Cluster ID predicted for each test document (same order and length
        as `test_label`).
    title : str
        Heading printed before the crosstab (e.g. 'FOR COSINE').

    Prints, in order: the cluster -> label mapping, `title`, the
    cluster x class crosstab and the classification report.
    """
    confusion_df = pd.DataFrame(
        list(zip(np.asarray(test_label), predicted_cluster)),
        columns=["actual_class", "cluster"],
    )
    crosstab = pd.crosstab(index=confusion_df["cluster"],
                           columns=confusion_df["actual_class"])

    # axis=1: for every cluster (row) pick the class (column) with the
    # highest count, i.e. the majority class of that cluster.
    cluster_to_label = crosstab.idxmax(axis=1).to_dict()
    print(cluster_to_label)

    predicted_target = [cluster_to_label[c] for c in predicted_cluster]
    print(title)
    print(crosstab)
    print(metrics.classification_report(test_label, predicted_target,
                                        zero_division=0))


def cluster_kmean(train, test_text, test_label):
    """Cluster documents with K-means (cosine and Euclidean) and evaluate both.

    A TF-IDF model is fitted on `train`. The training documents are then
    clustered into 4 clusters twice: with NLTK's `KMeansClusterer` using
    cosine distance, and with scikit-learn's `KMeans` (Euclidean distance).
    For each model the test documents are assigned to clusters, cluster IDs
    are mapped to class labels by majority vote, and the crosstab and
    classification report are printed.

    Parameters
    ----------
    train : iterable of str
        Training documents.
    test_text : iterable of str
        Test documents.
    test_label : array-like
        Class labels aligned with `test_text`.

    Returns
    -------
    None
        All results are printed.
    """
    num_clusters = 4
    top_n_words = 20

    tfidf = TfidfVectorizer(stop_words="english", min_df=2)
    tfidf_matrix = tfidf.fit_transform(train)
    # Both models are evaluated on the same test representation, so it is
    # computed once up front.
    test_tfidf_matrix = tfidf.transform(test_text)

    #****COSINE DISTANCE****

    clusterer = KMeansClusterer(num_clusters, cosine_distance, repeats=30)
    # Only the fitted centroids are needed afterwards, so the per-document
    # training assignments are not requested.
    clusterer.cluster(tfidf_matrix.toarray(), assign_clusters=False)

    # Show the highest-weighted vocabulary terms of every centroid.
    centroids = np.array(clusterer.means())
    sorted_centroids = centroids.argsort()[:, ::-1]
    voc_lookup = tfidf.get_feature_names_out()
    for i in range(num_clusters):
        top_words = [voc_lookup[word_index]
                     for word_index in sorted_centroids[i, :top_n_words]]
        print("Cluster %d:\n %s " % (i, "; ".join(top_words)))

    predicted_cluster = [clusterer.classify(x)
                         for x in test_tfidf_matrix.toarray()]
    _majority_vote_report(test_label, predicted_cluster, 'FOR COSINE')

    #****EUCLIDEAN DISTANCE****

    km = KMeans(n_clusters=num_clusters, n_init=30).fit(tfidf_matrix)
    predicted_clusters_eu = km.predict(test_tfidf_matrix)
    _majority_vote_report(test_label, predicted_clusters_eu, 'FOR EUCLIDEAN')

In [4]:
# Run both K-means variants (cosine and Euclidean) and print their evaluation.
cluster_kmean(train["text"], test_text=test_text, test_label=test_label)

Cluster 0:
 water; energy; light; earth; number; mass; nthe; air; speed; 10; used; gas; sun; force; equation; heat; space; does; universe; cell 
Cluster 1:
 com; job; business; money; www; credit; work; nhttp; pay; good; want; company; need; help; know; don; like; http; looking; home 
Cluster 2:
 like; just; know; don; help; people; think; weight; time; really; good; want; need; body; feel; day; make; doctor; life; eat 
Cluster 3:
 god; people; jesus; bible; believe; religion; christians; think; just; christian; church; world; like; life; know; question; man; does; don; say 
{0: 2, 1: 7, 2: 3, 3: 1}
FOR COSINE
actual_class    1    2    3    7
cluster                         
0               6  214   12    5
1              39   33   20  212
2              93   56  319   44
3             194   11    4   12
              precision    recall  f1-score   support

           1       0.88      0.58      0.70       332
           2       0.90      0.68      0.78       314
           3       0.

##  Clustering by Gaussian Mixture Model


In [7]:
def cluster_gmm(train, test_text, test_label):
    """Cluster documents with a Gaussian Mixture Model and evaluate on the test set.

    A TF-IDF model is fitted on `train`, a 4-component GMM with diagonal
    covariance is fitted on the dense TF-IDF matrix, and the test documents
    are assigned to components. Component IDs are mapped to class labels by
    majority vote (the most frequent true label within each component).

    Parameters
    ----------
    train : iterable of str
        Training documents.
    test_text : iterable of str
        Test documents.
    test_label : array-like
        Class labels aligned with `test_text`.

    Returns
    -------
    None
        Prints, in order: the BIC on the training data, the fitted model,
        the cluster x class crosstab and the classification report.
    """
    number_of_clusters = 4
    covariance_type = 'diag'

    tfidf = TfidfVectorizer(stop_words="english", min_df=5)
    # GaussianMixture is fed a dense array here; densify once and reuse it
    # for both fitting and the BIC computation.
    train_dense = tfidf.fit_transform(train).toarray()

    gmm = mixture.GaussianMixture(n_components=number_of_clusters,
                                  covariance_type=covariance_type,
                                  random_state=32, n_init=30)
    gmm.fit(train_dense)

    print(gmm.bic(train_dense))
    print(gmm)

    predicted = gmm.predict(tfidf.transform(test_text).toarray())

    confusion_df = pd.DataFrame(
        list(zip(np.asarray(test_label), predicted)),
        columns=["actual_class", "cluster"],
    )
    crosstab = pd.crosstab(index=confusion_df["cluster"],
                           columns=confusion_df["actual_class"])
    print(crosstab)

    # axis=1: majority class per predicted cluster (one entry per row), so
    # every cluster ID present in `predicted` has a label.
    cluster_to_label = crosstab.idxmax(axis=1).to_dict()

    predicted_target = [cluster_to_label[c] for c in predicted]
    # zero_division=0 keeps the report consistent with cluster_kmean and
    # avoids warnings for labels that no cluster maps to.
    print(metrics.classification_report(test_label, predicted_target,
                                        zero_division=0))

In [8]:
# Fit the Gaussian Mixture Model on the training text and print its evaluation.
cluster_gmm(train["text"], test_text=test_text, test_label=test_label)

-213115521.69835934
GaussianMixture(covariance_type='diag', n_components=4, n_init=30,
                random_state=32)
actual_class    1    2    3    7
cluster                         
0              11   18  244   20
1             231   15   49   34
2              20    9   14  159
3              70  272   48   60
              precision    recall  f1-score   support

           1       0.70      0.70      0.70       332
           2       0.60      0.87      0.71       314
           3       0.83      0.69      0.75       355
           7       0.79      0.58      0.67       273

    accuracy                           0.71      1274
   macro avg       0.73      0.71      0.71      1274
weighted avg       0.73      0.71      0.71      1274



## Clustering by LDA


In [11]:
def cluster_lda(train, test_text, test_label):
    """Fit a 4-topic LDA model on `train` and evaluate it on the test set.

    Documents are represented as raw term counts (not TF-IDF). After
    fitting, the 30 highest-weighted words of every topic are printed. Each
    test document is assigned the topic with the largest weight in its
    topic distribution, topics are mapped to class labels by majority vote,
    and the topic x class crosstab and classification report are printed.

    Parameters
    ----------
    train : iterable of str
        Training documents.
    test_text : iterable of str
        Test documents.
    test_label : iterable
        Class labels aligned with `test_text`.

    Returns
    -------
    None
        All results are printed.
    """
    n_topics = 4
    n_top_words = 30

    # Bag-of-words counts for the training and test documents.
    count_vectorizer = CountVectorizer(min_df=5, stop_words='english')
    train_counts = count_vectorizer.fit_transform(train)
    vocabulary = count_vectorizer.get_feature_names_out()
    test_counts = count_vectorizer.transform(test_text)

    lda_model = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=40,
        verbose=1,
        evaluate_every=1,
        n_jobs=1,
        random_state=2,
    )
    lda_model.fit(train_counts)

    # Print the top words of each topic, highest weight first.
    for topic_no, word_weights in enumerate(lda_model.components_):
        print ("Topic %d:" % (topic_no))

        ranked_word_ids = word_weights.argsort()[::-1]
        top_terms = []
        for word_id in ranked_word_ids[:n_top_words]:
            top_terms.append(vocabulary[word_id])
        print(top_terms)
        print("\n")

    # Dominant topic of each test document = column with the largest weight.
    doc_topic_df = pd.DataFrame(lda_model.transform(test_counts))
    predicted_topic = doc_topic_df.idxmax(axis=1)

    label_topic_pairs = list(zip(test_label, predicted_topic))
    confusion_df = pd.DataFrame(label_topic_pairs,
                                columns=["actual_class", "cluster"])
    topic_by_class = pd.crosstab(index=confusion_df["cluster"],
                                 columns=confusion_df["actual_class"])
    print(topic_by_class)

    # Majority vote: each topic takes the most frequent true label among
    # the test documents assigned to it.
    topic_to_label = topic_by_class.idxmax(axis=1).to_dict()
    predicted_labels = [topic_to_label[topic] for topic in predicted_topic]
    print(metrics.classification_report      (test_label, predicted_labels))
        

In [12]:
# Fit the LDA topic model on the training text and print its evaluation.
cluster_lda(train["text"], test_text=test_text, test_label=test_label)


iteration: 1 of max_iter: 40, perplexity: 3345.6230
iteration: 2 of max_iter: 40, perplexity: 3110.1062
iteration: 3 of max_iter: 40, perplexity: 2935.3820
iteration: 4 of max_iter: 40, perplexity: 2811.3937
iteration: 5 of max_iter: 40, perplexity: 2726.2071
iteration: 6 of max_iter: 40, perplexity: 2661.6236
iteration: 7 of max_iter: 40, perplexity: 2609.6799
iteration: 8 of max_iter: 40, perplexity: 2570.9029
iteration: 9 of max_iter: 40, perplexity: 2543.0075
iteration: 10 of max_iter: 40, perplexity: 2521.5307
iteration: 11 of max_iter: 40, perplexity: 2504.1573
iteration: 12 of max_iter: 40, perplexity: 2489.9569
iteration: 13 of max_iter: 40, perplexity: 2477.9848
iteration: 14 of max_iter: 40, perplexity: 2468.1776
iteration: 15 of max_iter: 40, perplexity: 2460.0644
iteration: 16 of max_iter: 40, perplexity: 2453.9584
iteration: 17 of max_iter: 40, perplexity: 2449.3203
iteration: 18 of max_iter: 40, perplexity: 2445.5774
iteration: 19 of max_iter: 40, perplexity: 2442.0304
it