In [2]:
#import libraries

import numpy as np 
from sklearn.cluster import KMeans 
from gensim.models import Word2Vec 
from tabulate import tabulate 
from collections import Counter 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used below: the English stop-word list, the
# Punkt tokenizer data behind word_tokenize, and WordNet for the lemmatizer.
# 'punkt_tab' is the Punkt data format that newer NLTK releases load in place
# of the pickled 'punkt' package; requesting both keeps the notebook runnable
# on old and new NLTK versions.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

# Shared helpers for preprocess_text: the stop words are held in a set for
# fast membership tests, and a single lemmatizer instance is reused.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Convert a raw document into a list of cleaned, lemmatized tokens.

    The text is lower-cased and split with ``word_tokenize``. Tokens that are
    not purely alphanumeric, or that appear in the module-level ``stop_words``
    set, are discarded; the remaining tokens are passed through the
    module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        The document to preprocess.

    Returns
    -------
    list of str
        The surviving tokens, lemmatized, in their original order.
    """
    cleaned = []
    for word in word_tokenize(text.lower()):
        # Skip punctuation / mixed-symbol tokens and stop words.
        if not word.isalnum() or word in stop_words:
            continue
        cleaned.append(lemmatizer.lemmatize(word))
    return cleaned

# Create the documents: a toy corpus of five short sentences.
dataset = ["I love playing football on the weekends", 
           "I enjoy hiking and camping in the mountains", 
           "I like to read books and watch movies", 
           "I prefer playing video games over sports", 
           "I love listening to music and going to concerts"] 

# Preprocess the dataset (one token list per document).
preprocessed_dataset = [preprocess_text(doc) for doc in dataset]

# Train the Word2Vec model.
# A fixed seed together with a single worker thread makes training repeatable
# across runs; with several workers the thread scheduling order perturbs the
# learned vectors. (Full reproducibility across interpreter launches may also
# require setting PYTHONHASHSEED, per the gensim documentation.)
WORD2VEC_SEED = 42
word2vec_model = Word2Vec(sentences=preprocessed_dataset, vector_size=100,
                          window=5, min_count=1, workers=1,
                          seed=WORD2VEC_SEED)


def embed_document(tokens, keyed_vectors):
    """Return the mean word vector of ``tokens``.

    Parameters
    ----------
    tokens : list of str
        Preprocessed tokens of one document.
    keyed_vectors : gensim KeyedVectors
        Trained word vectors (``model.wv``); tokens missing from it are ignored.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``keyed_vectors.vector_size``. A document with no
        in-vocabulary tokens maps to the zero vector instead of NaN, so the
        stacked embedding matrix always stays rectangular and finite.
    """
    vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not vectors:
        # np.mean([]) would yield a NaN scalar and break the (n_docs, dim) shape.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)


# Create document embeddings: one averaged vector per document.
X = np.array([embed_document(doc, word2vec_model.wv) for doc in preprocessed_dataset])

# Perform clustering.
k = 2  # Define the number of clusters
# random_state pins the centroid initialisation so the cluster assignment is
# the same on every run; n_init is set explicitly because its default differs
# between scikit-learn versions.
km = KMeans(n_clusters=k, n_init=10, random_state=42)

# Fit the model and predict the cluster of each document in one step.
y_pred = km.fit_predict(X)

# Tabulate the document and predicted cluster.
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([doc, cluster] for doc, cluster in zip(dataset, y_pred))
print(tabulate(table_data, headers="firstrow"))

# Evaluate the result.
def purity_score(labels_true, labels_pred):
    """Compute clustering purity against reference labels.

    Each cluster is credited with the size of its most frequent true class;
    purity is the sum of those majority counts divided by the number of
    samples. The value lies in (0, 1] and does not depend on how the cluster
    ids are numbered.

    Parameters
    ----------
    labels_true : sequence
        Reference (ground-truth) class label of each sample.
    labels_pred : sequence
        Predicted cluster id of each sample, in the same order.

    Returns
    -------
    float
        The purity of the clustering.

    Raises
    ------
    ValueError
        If the two sequences differ in length or are empty.
    """
    labels_true = list(labels_true)
    labels_pred = list(labels_pred)
    if len(labels_true) != len(labels_pred):
        raise ValueError("labels_true and labels_pred must have the same length")
    if not labels_true:
        raise ValueError("purity is undefined for an empty labelling")

    # For every cluster, count how often each true class occurs in it.
    class_counts_per_cluster = {}
    for true_label, cluster_id in zip(labels_true, labels_pred):
        class_counts_per_cluster.setdefault(cluster_id, Counter())[true_label] += 1

    majority_total = sum(max(counts.values())
                         for counts in class_counts_per_cluster.values())
    return majority_total / len(labels_true)


# NOTE(review): purity needs ground-truth labels, and the dataset ships none.
# The labels below are hand-assigned for illustration only
# (0 = playing sports/games, 1 = other leisure activities), one per document
# in `dataset` order. Replace them with the real reference labels.
true_labels = [0, 1, 1, 0, 1]

# Calculate purity.
# Summing the cluster sizes and dividing by the sample count is always 1.0,
# so the predicted labels are compared against the reference labels instead.
total_samples = len(y_pred)
cluster_label_counts = Counter(y_pred)  # cluster sizes, kept for inspection
purity = purity_score(true_labels, y_pred)
print("Purity:", purity)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            0
I enjoy hiking and camping in the mountains                        1
I like to read books and watch movies                              1
I prefer playing video games over sports                           0
I love listening to music and going to concerts                    1
Purity: 1.0




In [3]:
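# Purity changed from 0.6 to 1.0.
# NOTE(review): a purity of exactly 1.0 should be confirmed against ground-truth
# labels; dividing the summed cluster sizes by the sample count always gives 1.0.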