In [3]:
# Import
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sentence_transformers import SentenceTransformer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Dummy e-mails: a tiny hand-written corpus used as input for the
# feature-extraction cells. Keep the order stable -- anything computed per
# document downstream is positional with respect to this list.
emails = [
    "Please schedule a meeting for project Alpha next week.",  # 0
    "The server is down, urgent fix needed.",                  # 1
    "Reminder: Submit your project Alpha report by Friday.",   # 2
    "Customer complaint: Product arrived damaged.",            # 3
    "Can we reschedule the meeting to Monday?",                # 4
    "Issue: Application crash when uploading files.",          # 5
    "Follow up on customer support ticket.",                   # 6
]

In [5]:
# 2. Feature extraction
# Two alternative numeric representations of the same e-mail list are built
# here; each is fitted/encoded on the full `emails` corpus.
TFIDF_STOP_WORDS = "english"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"  # small and fast model

# Option A: TF-IDF with the built-in English stop-word list removed.
tfidf = TfidfVectorizer(stop_words=TFIDF_STOP_WORDS)
X_tfidf = tfidf.fit_transform(emails)

# Option B: modern embeddings from a pretrained sentence-transformers model.
# NOTE(review): constructing the model appears to fetch weights from the
# Hugging Face Hub on first use, so this cell may need network access.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
X_embed = model.encode(emails)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [7]:
# 3. Dimensionality reduction (for TF-IDF)
# Ask for at most 20 latent dimensions, but never more than the TF-IDF matrix
# can support: its rank is bounded by min(n_samples, n_features), so with a
# handful of documents a larger request cannot add information.
n_svd_components = min(20, *X_tfidf.shape)
# random_state is fixed because the default solver is randomized; without a
# seed the projection is not reproducible across kernel restarts.
svd = TruncatedSVD(n_components=n_svd_components, random_state=42)
X_reduced = svd.fit_transform(X_tfidf)

In [8]:
# 4. Clustering (with k-means)
k = 3  # number of clusters; placeholder value that still needs to be tuned
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# "auto" in version 1.4 (and warns about it on 1.2/1.3), so leaving it implicit
# makes the resulting labels depend on the installed library version.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
# One integer cluster label per row of X_reduced.
clusters = kmeans.fit_predict(X_reduced)



In [9]:
# 5. Evaluation (silhouette score for k-means)
# The score is computed on the same reduced matrix the clustering was fitted
# on, using the labels assigned above.
score = silhouette_score(X_reduced, clusters)

# Report the assignment first, then the quality metric.
for caption, value in (
    ("Cluster Labels:", clusters),
    ("Silhouette Score:", score),
):
    print(caption, value)


Cluster Labels: [2 0 2 1 2 1 1]
Silhouette Score: 0.05099901710506782
