# Text Clustering using TF-IDF and WordVec

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from tabulate import tabulate
from collections import Counter
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

nltk.download('stopwords')
nltk.download('punkt')

# Text preprocessing function
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text.lower())
    filtered_text = [word for word in word_tokens if word.isalnum() and word not in stop_words]
    return " ".join(filtered_text)

dataset = ["I love playing football on the weekends",
           "I enjoy hiking and camping in the mountains",
           "I like to read books and watch movies",
           "I prefer playing video games over sports",
           "I love listening to music and going to concerts"]

# Preprocess the dataset
preprocessed_dataset = [preprocess_text(doc) for doc in dataset]

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(preprocessed_dataset)

# Word2Vec Vectorization
tokenized_dataset = [word_tokenize(preprocess_text(doc)) for doc in dataset]
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100,
                          window=5, min_count=1, workers=4)

X_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in doc if word in word2vec_model.wv], axis=0)
                       for doc in tokenized_dataset])

# Clustering
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X_tfidf)
# Predict the clusters for each document
y_pred_tfidf = km.predict(X_tfidf)

# Tabulate the document and predicted cluster for TF-IDF
table_data_tfidf = [["Document", "Predicted Cluster"]]
table_data_tfidf.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred_tfidf)])
print("TF-IDF Clustering Results:")
print(tabulate(table_data_tfidf, headers="firstrow"))

# Calculate purity for TF-IDF
total_samples_tfidf = len(y_pred_tfidf)
cluster_label_counts_tfidf = [Counter(y_pred_tfidf)]
purity_tfidf = sum(max(cluster.values()) for cluster in cluster_label_counts_tfidf) / total_samples_tfidf
print("TF-IDF Purity:", purity_tfidf)

# Clustering for Word2Vec
km.fit(X_word2vec)
# Predict the clusters for each document
y_pred_word2vec = km.predict(X_word2vec)

# Tabulate the document and predicted cluster for Word2Vec
table_data_word2vec = [["Document", "Predicted Cluster"]]
table_data_word2vec.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred_word2vec)])
print("\nWord2Vec Clustering Results:")
print(tabulate(table_data_word2vec, headers="firstrow"))

# Calculate purity for Word2Vec
total_samples_word2vec = len(y_pred_word2vec)
cluster_label_counts_word2vec = [Counter(y_pred_word2vec)]
purity_word2vec = sum(max(cluster.values()) for cluster in cluster_label_counts_word2vec) / total_samples_word2vec
print("Word2Vec Purity:", purity_word2vec)


[nltk_data] Downloading package stopwords to C:\Users\ASPIRE
[nltk_data]     5\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\ASPIRE
[nltk_data]     5\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  super()._check_params_vs_input(X, default_n_init=10)


TF-IDF Clustering Results:
Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        1
I like to read books and watch movies                              0
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    1
TF-IDF Purity: 0.8


  super()._check_params_vs_input(X, default_n_init=10)



Word2Vec Clustering Results:
Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              0
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    0
Word2Vec Purity: 0.6
