In [1]:
import numpy as np
import string
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# preprocess_text() calls nltk.word_tokenize, which needs the Punkt tokenizer
# data in addition to the stopword list and WordNet; without it the first call
# fails with a LookupError on a machine that has never downloaded it.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Text preprocessing function
def preprocess_text(text):
    """Normalise a raw document into a space-joined string of lemmas.

    Steps, in order: lowercase, strip ASCII punctuation, drop digit
    characters, tokenize with NLTK, discard English stopwords, lemmatize
    each remaining token with WordNet.

    Args:
        text: The document as a ``str``.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    lowered = text.lower()
    without_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    # The filter works character by character: digits go, everything else stays.
    without_digits = ''.join(ch for ch in without_punct if not ch.isdigit())

    raw_tokens = nltk.word_tokenize(without_digits)
    english_stopwords = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    lemmas = []
    for token in raw_tokens:
        if token in english_stopwords:
            continue
        lemmas.append(wordnet.lemmatize(token))
    return ' '.join(lemmas)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Five short hobby sentences used as the toy corpus for the clustering demo.
dataset = [
    "I love playing football on the weekends",
    "I enjoy hiking and camping in the mountains",
    "I like to read books and watch movies",
    "I prefer playing video games over sports",
    "I love listening to music and going to concerts",
]
# Run every document through the shared cleaning pipeline, keeping input order.
preprocessed_dataset = list(map(preprocess_text, dataset))


In [3]:
# Learn the vocabulary and IDF weights first, then project the same documents
# into the TF-IDF space (one sparse row per document).
vectorizer = TfidfVectorizer().fit(preprocessed_dataset)
X = vectorizer.transform(preprocessed_dataset)


In [5]:
k = 2
# Pin the seed and the number of restarts: an unseeded KMeans produces a
# different partition (and a different score below) on every run.
km = KMeans(n_clusters=k, n_init=10, random_state=42)
y_pred = km.fit_predict(X)

# Display results
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# Calculate purity (as per lab's method)
# NOTE: no ground-truth labels are available here, so this value is simply the
# share of documents that fall into the largest cluster. It is not the standard
# purity metric, which compares predicted clusters against true classes.
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("\nPurity:", purity)


Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            0
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              0
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    0

Purity: 0.8


In [6]:
from gensim.models import Word2Vec

# Preprocess and tokenize
tokenized_dataset = [preprocess_text(doc).split() for doc in dataset]

# Step 3: Train Word2Vec model
# A fixed seed plus a single worker thread keeps training repeatable; with
# several workers the thread scheduling changes the learned vectors per run.
word2vec_model = Word2Vec(
    sentences=tokenized_dataset,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

# Step 4: Create document embeddings (mean of the word vectors of each document)
doc_vectors = []
for doc in tokenized_dataset:
    known_words = [word for word in doc if word in word2vec_model.wv]
    if known_words:
        doc_vectors.append(np.mean(word2vec_model.wv[known_words], axis=0))
    else:
        # A document with no usable tokens would otherwise average an empty
        # list (NaN); represent it with a zero vector instead.
        doc_vectors.append(np.zeros(word2vec_model.vector_size))
X = np.array(doc_vectors)

# Step 5: Perform clustering
# `k` is the cluster count defined in the TF-IDF clustering cell.
km = KMeans(n_clusters=k, n_init=10, random_state=42)
y_pred = km.fit_predict(X)

# Display results
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# Calculate purity
# NOTE: without ground-truth labels this is the share of documents in the
# largest cluster, not the standard purity metric.
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("\nPurity:", purity)

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              1
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    0

Purity: 0.6




In [2]:
# ------------------------- Exercise 1 + 2 Combined Solution -------------------------
import string
from collections import Counter
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources used by preprocess_text: tokenizer models,
# the stopword lists and the WordNet data for lemmatization.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# ------------------------- Text Preprocessing Function -------------------------
@lru_cache(maxsize=1)
def _english_stopwords():
    """Return the NLTK English stopword list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _shared_lemmatizer():
    """Return a single WordNetLemmatizer instance reused across calls."""
    return WordNetLemmatizer()


def preprocess_text(text):
    """Clean one document and return its lemmas joined by single spaces.

    Steps, in order: lowercase, strip ASCII punctuation, drop digit
    characters, tokenize with NLTK, discard English stopwords, lemmatize
    the remaining tokens with WordNet.

    Args:
        text: The raw document. Missing values (``None``/NaN) yield an empty
            string; other non-string values are converted with ``str()``.

    Returns:
        The cleaned document as a ``str``; ``""`` when nothing is left.
    """
    # Handle NaN values
    if pd.isna(text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers (character-level filter: every digit is dropped)
    text = ''.join(ch for ch in text if not ch.isdigit())

    # Nothing left to tokenize: skip the NLTK calls entirely.
    if not text.strip():
        return ""

    # Tokenize and remove stopwords. The stopword set and the lemmatizer are
    # cached so they are not rebuilt for every document of a large corpus.
    stop_words = _english_stopwords()
    lemmatizer = _shared_lemmatizer()
    tokens = [word for word in nltk.word_tokenize(text) if word not in stop_words]
    # Lemmatization
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)

# ------------------------- Exercise 1: TF-IDF & Word2Vec with Preprocessing -------------------------
def exercise1(random_state=42):
    """Cluster five sample sentences using TF-IDF and Word2Vec features.

    For each representation the documents are grouped with KMeans (k=2) and
    the lab's "purity" score is printed.

    NOTE: no ground-truth labels exist for this data, so the printed score is
    the share of documents in the largest cluster rather than the standard
    purity metric.

    Args:
        random_state: Integer seed used for both KMeans runs and for Word2Vec
            training so that repeated runs print the same scores.

    Returns:
        None. Results are printed.
    """
    # Sample dataset
    dataset = [
        "I love playing football on the weekends",
        "I enjoy hiking and camping in the mountains",
        "I like to read books and watch movies",
        "I prefer playing video games over sports",
        "I love listening to music and going to concerts"
    ]

    # Preprocess data
    preprocessed_data = [preprocess_text(doc) for doc in dataset]

    # ---------- TF-IDF ----------
    vectorizer = TfidfVectorizer()
    X_tfidf = vectorizer.fit_transform(preprocessed_data)

    # Clustering (seeded; an unseeded KMeans changes its partition between runs)
    k = 2
    km = KMeans(n_clusters=k, n_init=10, random_state=random_state)
    y_pred = km.fit_predict(X_tfidf)

    # Calculate purity (lab's method)
    cluster_counts = Counter(y_pred)
    purity = max(cluster_counts.values()) / sum(cluster_counts.values())
    print(f"TF-IDF Purity: {purity:.2f}")

    # ---------- Word2Vec ----------
    # Tokenize preprocessed data
    tokenized_data = [doc.split() for doc in preprocessed_data]

    # Train Word2Vec model. A fixed seed and a single worker keep training
    # repeatable; multiple workers introduce thread-ordering jitter.
    model = Word2Vec(
        sentences=tokenized_data,
        vector_size=100,
        window=5,
        min_count=1,
        workers=1,
        seed=random_state
    )

    # Create document embeddings: mean word vector per document
    embeddings = []
    for doc in tokenized_data:
        valid_words = [word for word in doc if word in model.wv]
        if valid_words:
            embeddings.append(np.mean(model.wv[valid_words], axis=0))
        else:
            embeddings.append(np.zeros(model.vector_size))  # Handle empty docs
    embeddings = np.array(embeddings)

    # Clustering
    km = KMeans(n_clusters=k, n_init=10, random_state=random_state)
    y_pred = km.fit_predict(embeddings)

    # Calculate purity
    cluster_counts = Counter(y_pred)
    purity = max(cluster_counts.values()) / sum(cluster_counts.values())
    print(f"Word2Vec Purity: {purity:.2f}\n")

# ------------------------- Exercise 2: Customer Complaints Analysis -------------------------
def exercise2(csv_path='customer_complaints_1.csv', random_state=42):
    """Cluster customer complaints using TF-IDF and Word2Vec features.

    Reads the complaints from a CSV file, clusters them with KMeans (k=3) on
    each representation, prints the lab's "purity" score, the top TF-IDF
    terms per cluster and the first three rows with their cluster ids.

    NOTE: no ground-truth labels are used, so the printed score is the share
    of documents in the largest cluster rather than the standard purity
    metric.

    Args:
        csv_path: Path of the CSV file to load; it must contain a ``text``
            column. Defaults to the file used in the lab.
        random_state: Integer seed used for both KMeans runs and for Word2Vec
            training so that repeated runs give the same clusters.

    Returns:
        None. Results are printed.

    Raises:
        KeyError: If the CSV file has no ``text`` column.
    """
    # Load data
    df = pd.read_csv(csv_path)
    if 'text' not in df.columns:
        raise KeyError(f"'text' column not found in {csv_path!r}")
    texts = df['text'].fillna('').tolist()  # Handle NaN

    # Preprocess data
    preprocessed_texts = [preprocess_text(text) for text in texts]

    # ---------- TF-IDF ----------
    tfidf_vectorizer = TfidfVectorizer(max_features=1000)
    X_tfidf = tfidf_vectorizer.fit_transform(preprocessed_texts)

    # Clustering (seeded; an unseeded KMeans changes its partition between runs)
    k = 3
    km = KMeans(n_clusters=k, n_init=10, random_state=random_state)
    y_pred_tfidf = km.fit_predict(X_tfidf)

    # Calculate purity
    cluster_counts = Counter(y_pred_tfidf)
    purity = max(cluster_counts.values()) / sum(cluster_counts.values())
    print(f"\nTF-IDF Purity (Complaints): {purity:.2f}")

    # Show top terms: sort each centroid's weights in descending order and
    # list the ten highest-weighted vocabulary entries.
    print("\nTop TF-IDF Terms per Cluster:")
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = tfidf_vectorizer.get_feature_names_out()
    for i in range(k):
        print(f"Cluster {i}:")
        for ind in order_centroids[i, :10]:
            print(f"  {terms[ind]}")
        print()

    # ---------- Word2Vec ----------
    # Tokenize data
    tokenized_data = [doc.split() for doc in preprocessed_texts]

    # Train Word2Vec model. A fixed seed and a single worker keep training
    # repeatable; multiple workers introduce thread-ordering jitter.
    model = Word2Vec(
        sentences=tokenized_data,
        vector_size=100,
        window=5,
        min_count=1,
        workers=1,
        seed=random_state
    )

    # Create embeddings: mean word vector per document, zeros for empty docs
    embeddings = []
    for doc in tokenized_data:
        valid_words = [word for word in doc if word in model.wv]
        if valid_words:
            embeddings.append(np.mean(model.wv[valid_words], axis=0))
        else:
            embeddings.append(np.zeros(model.vector_size))
    embeddings = np.array(embeddings)

    # Clustering
    km = KMeans(n_clusters=k, n_init=10, random_state=random_state)
    y_pred_w2v = km.fit_predict(embeddings)

    # Calculate purity
    cluster_counts = Counter(y_pred_w2v)
    purity = max(cluster_counts.values()) / sum(cluster_counts.values())
    print(f"Word2Vec Purity (Complaints): {purity:.2f}")

    # Add clusters to DataFrame
    df['cluster_tfidf'] = y_pred_tfidf
    df['cluster_w2v'] = y_pred_w2v
    print("\nSample Complaints with Clusters:")
    print(df[['text', 'cluster_tfidf', 'cluster_w2v']].head(3))

# ------------------------- Execute Both Exercises -------------------------
# Run the two exercises in order, printing a heading before each one.
for heading, run_exercise in (
    ("Exercise 1 Results:", exercise1),
    ("\nExercise 2 Results:", exercise2),
):
    print(heading)
    run_exercise()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Exercise 1 Results:
TF-IDF Purity: 0.80
Word2Vec Purity: 0.80


Exercise 2 Results:

TF-IDF Purity (Complaints): 0.58

Top TF-IDF Terms per Cluster:
Cluster 0:
  contract
  xfinity
  signed
  fee
  second
  know
  year
  rude
  box
  local

Cluster 1:
  service
  internet
  customer
  comcast
  would
  mbps
  speed
  tech
  month
  cable

Cluster 2:
  rude
  rep
  day
  service
  call
  helpful
  overwhelming
  cutting
  pas
  ignorant

Word2Vec Purity (Complaints): 0.79

Sample Complaints with Clusters:
                                                text  cluster_tfidf  \
0  I used to love Comcast. Until all these consta...              1   
1  I'm so over Comcast! The worst internet provid...              1   
2  If I could give them a negative star or no sta...              1   

   cluster_w2v  
0            0  
1            2  
2            2  


