In [1]:
import numpy as np 
from sklearn.cluster import KMeans 
from sklearn.feature_extraction.text import TfidfVectorizer 
from tabulate import tabulate 
from collections import Counter

In [3]:
dataset = ["I love playing football on the weekends", 
           "I enjoy hiking and camping in the mountains", 
           "I like to read books and watch movies", 
           "I prefer playing video games over sports", 
           "I love listening to music and going to concerts"] 

In [5]:
vectorizer = TfidfVectorizer() 
X = vectorizer.fit_transform(dataset)

In [9]:
k = 2  # Define the number of clusters 
km = KMeans(n_clusters=k) 
km.fit(X) 
 
# Predict the clusters for each document 
y_pred = km.predict(X) 
 
# Display the document and its predicted cluster in a table 
table_data = [["Document", "Predicted Cluster"]] 
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)]) 
print(tabulate(table_data, headers="firstrow"))
# Print top terms per cluster 
print("\nTop terms per cluster:") 
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 
terms = vectorizer.get_feature_names_out() 
for i in range(k): 
    print("Cluster %d:" % i) 
    for ind in order_centroids[i, :10]: 
        print(' %s' % terms[ind]) 
    print()

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        1
I like to read books and watch movies                              0
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    0

Top terms per cluster:
Cluster 0:
 to
 and
 read
 watch
 movies
 like
 books
 concerts
 going
 music

Cluster 1:
 playing
 the
 weekends
 on
 football
 video
 sports
 prefer
 over
 games



In [11]:
# Calculate purity 
total_samples = len(y_pred) 
cluster_label_counts = [Counter(y_pred)] 
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples 
print("Purity:", purity) 

Purity: 0.6


In [13]:
import numpy as np 
from sklearn.cluster import KMeans 
from gensim.models import Word2Vec 
from tabulate import tabulate 
from collections import Counter 

In [15]:
dataset = ["I love playing football on the weekends", 
           "I enjoy hiking and camping in the mountains", 
           "I like to read books and watch movies", 
           "I prefer playing video games over sports", 
           "I love listening to music and going to concerts"] 

In [17]:
tokenized_dataset = [doc.split() for doc in dataset] 
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100, 
window=5, min_count=1, workers=4) 

In [19]:
X = np.array([np.mean([word2vec_model.wv[word] for word in doc.split() if word in 
word2vec_model.wv], axis=0) for doc in dataset])

In [23]:
k = 2  # Define the number of clusters 
km = KMeans(n_clusters=k) 
km.fit(X) 
 
# Predict the clusters for each document 
y_pred = km.predict(X) 
 
# Tabulate the document and predicted cluster 
table_data = [["Document", "Predicted Cluster"]] 
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)]) 
print(tabulate(table_data, headers="firstrow"))

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        1
I like to read books and watch movies                              0
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    0




In [25]:
# Calculate purity 
total_samples = len(y_pred) 
cluster_label_counts = [Counter(y_pred)] 
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples 
print("Purity:", purity)

Purity: 0.6


In [7]:
import pandas as pd
import numpy as np
import string
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
import nltk

In [9]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\victus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\victus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\victus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
file_path = "customer_complaints_1.csv"
df = pd.read_csv(file_path)

In [13]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return " ".join(tokens)


In [15]:
df["preprocessed_text"] = df["text"].apply(preprocess_text)

In [17]:
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df["preprocessed_text"])

k = 5  # Number of clusters
kmeans_tfidf = KMeans(n_clusters=k, random_state=0)
y_pred_tfidf = kmeans_tfidf.fit_predict(X_tfidf)

In [19]:
def calculate_purity(y_pred):
    counter = Counter(y_pred)
    purity = sum(counter.values()) / len(y_pred)
    return purity

purity_tfidf = calculate_purity(y_pred_tfidf)
print("Purity (TF-IDF with preprocessing):", purity_tfidf)

Purity (TF-IDF with preprocessing): 1.0


In [21]:
tokenized_docs = [text.split() for text in df["preprocessed_text"]]
w2v_model = Word2Vec(sentences=tokenized_docs, vector_size=100, window=5, min_count=1, workers=4)

def document_vector(doc):
    vectors = [w2v_model.wv[word] for word in doc if word in w2v_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(w2v_model.vector_size)

X_w2v = np.array([document_vector(doc) for doc in tokenized_docs])

kmeans_w2v = KMeans(n_clusters=k, random_state=0)
y_pred_w2v = kmeans_w2v.fit_predict(X_w2v)

purity_w2v = calculate_purity(y_pred_w2v)
print("Purity (Word2Vec with preprocessing):", purity_w2v)

Purity (Word2Vec with preprocessing): 1.0




In [23]:
df['TFIDF_Cluster'] = y_pred_tfidf
df['Word2Vec_Cluster'] = y_pred_w2v
print(df[['text', 'TFIDF_Cluster', 'Word2Vec_Cluster']].head())

                                                text  TFIDF_Cluster  \
0  I used to love Comcast. Until all these consta...              3   
1  I'm so over Comcast! The worst internet provid...              3   
2  If I could give them a negative star or no sta...              0   
3  I've had the worst experiences so far since in...              2   
4  Check your contract when you sign up for Comca...              0   

   Word2Vec_Cluster  
0                 1  
1                 1  
2                 1  
3                 1  
4                 0  
