In [63]:
# TF-IDF Vectorizer with Preprocessing

import numpy as np
import string
import nltk
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return ' '.join(tokens)

# Original documents
dataset = [
    "I love playing football on the weekends",
    "I enjoy hiking and camping in the mountains",
    "I like to read books and watch movies",
    "I prefer playing video games over sports",
    "I love listening to music and going to concerts"
]

# Apply preprocessing
preprocessed_dataset = [preprocess(doc) for doc in dataset]

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_dataset)

# Clustering
k = 2
km = KMeans(n_clusters=k)
km.fit(X)
y_pred = km.predict(X)

# Display results
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# Purity calculation
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)


Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              1
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    1
Purity: 0.8


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [65]:
# Word2Vec Vectorizer with Preprocessing

import numpy as np
import string
import nltk
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from tabulate import tabulate
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_tokens(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = text.split()
    return [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]

# Dataset and preprocessing
dataset = [
    "I love playing football on the weekends",
    "I enjoy hiking and camping in the mountains",
    "I like to read books and watch movies",
    "I prefer playing video games over sports",
    "I love listening to music and going to concerts"
]

tokenized_dataset = [preprocess_tokens(doc) for doc in dataset]

# Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100, window=5, min_count=1, workers=4)

# Document vectors
X = np.array([
    np.mean([word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv], axis=0)
    for tokens in tokenized_dataset
])

# Clustering
k = 2
km = KMeans(n_clusters=k)
km.fit(X)
y_pred = km.predict(X)

# Display results
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# Purity calculation
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)


Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              1
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    0
Purity: 0.6


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [79]:
import pandas as pd
import string
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from collections import Counter

# Basic stopwords list
basic_stopwords = set([
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
    'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
    'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
    'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
    'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
    'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both',
    'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
    'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just',
    'don', 'should', 'now'
])

# Text preprocessing
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = text.split()
    tokens = [word for word in tokens if word not in basic_stopwords]
    return ' '.join(tokens)

# Purity calculation function
def calculate_purity(y_true):
    counter = Counter(y_true)
    return sum(counter.values()) / len(y_true)

# Load data
df = pd.read_csv('customer_complaints_1.csv')
texts = df['text'].dropna().tolist()
preprocessed_texts = [preprocess_text(t) for t in texts]

### TF-IDF CLUSTERING
vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer.fit_transform(preprocessed_texts)

k = 3
km_tfidf = KMeans(n_clusters=k, random_state=42)
y_pred_tfidf = km_tfidf.fit_predict(X_tfidf)
df['cluster_tfidf'] = y_pred_tfidf

print("Top TF-IDF terms per cluster:")
order_centroids = km_tfidf.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print(f"Cluster {i}:")
    for ind in order_centroids[i, :10]:
        print(f"  {terms[ind]}")
    print()

purity_tfidf = calculate_purity(y_pred_tfidf)
print(f"Purity (TF-IDF): {purity_tfidf:.2f}")

### WORD2VEC CLUSTERING
tokenized_texts = [t.split() for t in preprocessed_texts]
w2v_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)

# Average vector for each document
X_w2v = np.array([
    np.mean([w2v_model.wv[word] for word in words if word in w2v_model.wv], axis=0)
    for words in tokenized_texts
])

km_w2v = KMeans(n_clusters=k, random_state=42)
y_pred_w2v = km_w2v.fit_predict(X_w2v)
df['cluster_w2v'] = y_pred_w2v

purity_w2v = calculate_purity(y_pred_w2v)
print(f"Purity (Word2Vec): {purity_w2v:.2f}")

# Show first few rows
print(df[['text', 'cluster_tfidf', 'cluster_w2v']].head())


Top TF-IDF terms per cluster:
Cluster 0:
  service
  customer
  since
  adding
  boxes
  second
  speed
  protocol
  investigating
  malfunction

Cluster 1:
  internet
  comcast
  mbps
  service
  would
  day
  xfinity
  contract
  call
  get

Cluster 2:
  internet
  rude
  contract
  years
  comcast
  joke
  extra
  10
  local
  im

Purity (TF-IDF): 1.00
Purity (Word2Vec): 1.00
                                                text  cluster_tfidf  \
0  I used to love Comcast. Until all these consta...              1   
1  I'm so over Comcast! The worst internet provid...              2   
2  If I could give them a negative star or no sta...              0   
3  I've had the worst experiences so far since in...              0   
4  Check your contract when you sign up for Comca...              2   

   cluster_w2v  
0            1  
1            2  
2            2  
3            2  
4            1  


