In [1]:
# Import
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sentence_transformers import SentenceTransformer
import numpy as np

ModuleNotFoundError: No module named 'sentence_transformers'

In [None]:
# Dummy e-mails: a small hand-written sample corpus used as input for the
# feature-extraction and clustering steps. Each list entry is one e-mail text;
# the list order determines the row order of the derived feature matrices and
# therefore the position of each cluster label.
emails = [
    "Please schedule a meeting for project Alpha next week.",
    "The server is down, urgent fix needed.",
    "Reminder: Submit your project Alpha report by Friday.",
    "Customer complaint: Product arrived damaged.",
    "Can we reschedule the meeting to Monday?",
    "Issue: Application crash when uploading files.",
    "Follow up on customer support ticket.",
]

In [None]:
# 2. Feature extraction
# Option A: TF-IDF features, one row per e-mail; English stop words are removed
# before the vocabulary is built.
tfidf = TfidfVectorizer(stop_words="english")
X_tfidf = tfidf.fit_transform(emails)

# Option B: modern sentence embeddings (optional).
# sentence-transformers is a heavy, optional dependency, and the reduction,
# clustering and evaluation cells visible in this notebook only read the TF-IDF
# features. A missing package therefore must not abort the notebook: the import
# is retried here and Option B is skipped (model and X_embed are set to None)
# when it is unavailable.
try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    model = None
    X_embed = None
    print("sentence-transformers is not installed; skipping Option B (X_embed = None).")
else:
    model = SentenceTransformer("all-MiniLM-L6-v2")  # small and fast model
    X_embed = model.encode(emails)

In [None]:
# 3. Dimensionality reduction (for TF-IDF)
# Target is 50 dimensions, but TruncatedSVD raises a ValueError when n_components
# exceeds the number of TF-IDF features, and a matrix cannot yield more
# informative components than min(n_samples, n_features). For a small corpus
# the target is therefore capped just below the smaller matrix dimension
# (never below 1).
n_components = max(1, min(50, min(X_tfidf.shape) - 1))

# The default solver is randomized, so the seed is fixed to make X_reduced (and
# everything derived from it) reproducible across runs.
svd = TruncatedSVD(n_components=n_components, random_state=42)
X_reduced = svd.fit_transform(X_tfidf)

In [None]:
# 4. Clustering (with k-means)
k = 3  # number of clusters; still needs to be optimized

# n_init is passed explicitly: its default changed from 10 to "auto" in
# scikit-learn 1.4 (with a FutureWarning in 1.2-1.3), so leaving it implicit
# gives version-dependent cluster assignments even with a fixed random_state.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)

# fit_predict fits the model and returns one cluster label per row of X_reduced.
clusters = kmeans.fit_predict(X_reduced)

In [None]:
# 5. Evaluation (silhouette score for the k-means result)
# The score is computed on the same reduced features the clustering was fitted
# on, using the labels that clustering produced.
score = silhouette_score(X=X_reduced, labels=clusters)

# Report the per-e-mail cluster assignment first, then the overall score.
print("Cluster Labels:", clusters)
print("Silhouette Score:", score)
