In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Fetch the NLTK resources this notebook relies on: 'punkt' (tokenizer data,
# presumably what nltk.word_tokenize needs) and 'stopwords' (the corpus read
# via stopwords.words('english')).
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' for
# word_tokenize -- confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HanDong\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HanDong\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def preprocess_text(text):
    """Normalize a raw document into a space-joined string of stemmed tokens.

    The text is lowercased, stripped of every character that is not a-z,
    0-9 or whitespace, tokenized with ``nltk.word_tokenize``, filtered
    against NLTK's English stop-word list and finally Porter-stemmed.

    Args:
        text: Raw document string.

    Returns:
        str: The surviving stemmed tokens joined by single spaces.
    """
    text = text.lower()
    # Punctuation is deleted rather than replaced by a space, so a hyphenated
    # term such as "state-of-the-art" collapses into a single token.
    text = re.sub(r'[^a-z0-9\s]', '', text)
    tokens = nltk.word_tokenize(text)

    # Build the stop-word set once per call. The previous version called
    # stopwords.words('english') inside the comprehension, rebuilding the
    # list and scanning it linearly for every token.
    english_stopwords = set(stopwords.words('english'))

    stemmer = PorterStemmer()
    return " ".join(
        stemmer.stem(token) for token in tokens if token not in english_stopwords
    )

In [3]:
def vectorize_text(documents):
    """Fit a default ``TfidfVectorizer`` on ``documents`` and return the result.

    Args:
        documents: Iterable of document strings to fit and transform.

    Returns:
        The value returned by ``TfidfVectorizer().fit_transform(documents)``.
        The fitted vectorizer itself is discarded.
    """
    return TfidfVectorizer().fit_transform(documents)

In [4]:
def perform_clustering(X, num_clusters, random_state=42):
    """Cluster the rows of ``X`` with K-Means and return one label per row.

    Args:
        X: Feature matrix accepted by ``KMeans.fit_predict``.
        num_clusters: Number of clusters to form.
        random_state: Seed forwarded to ``KMeans``. Defaults to 42, the value
            that used to be hard-coded, so existing callers are unaffected.

    Returns:
        The cluster labels returned by ``KMeans.fit_predict(X)``.
    """
    # n_init is left at the library default, which is not the same across
    # scikit-learn releases; results may differ between versions even with a
    # fixed seed.
    kmeans = KMeans(n_clusters=num_clusters, random_state=random_state)
    return kmeans.fit_predict(X)

In [5]:
def print_cluster_assignments(cluster_assignments, documents):
    """Print one line per document showing its 1-based number, cluster and text.

    Args:
        cluster_assignments: Iterable of cluster ids, one per document.
        documents: Indexable collection of document texts, looked up by the
            position of each cluster id.
    """
    for position, label in enumerate(cluster_assignments, start=1):
        # Look the text up by index so that a too-short ``documents``
        # still raises instead of being silently truncated.
        text = documents[position - 1]
        print(f"Document {position} -> Cluster {label}: {text}")

In [6]:
def evaluate_clustering(X, cluster_assignments):
    """Return the silhouette score of a clustering.

    Args:
        X: Feature matrix that was clustered.
        cluster_assignments: Cluster label for each row of ``X``.

    Returns:
        The value of ``silhouette_score(X, cluster_assignments)``.
    """
    return silhouette_score(X, cluster_assignments)

In [7]:
# Toy corpus: eleven one-sentence documents used as clustering input.
documents = [
    "Machine learning techniques are used in artificial intelligence applications.",
    "Natural language processing is a subfield of artificial intelligence.",
    "Data mining and machine learning are used to analyze large datasets.",
    "Deep learning models have achieved state-of-the-art results in various tasks.",
    "Text classification is a common application of natural language processing.",
    "Recommendation systems use machine learning to suggest relevant items.",
    "Sentiment analysis aims to determine the sentiment expressed in text data.",
    "Clustering algorithms group similar data points together.",
    "Supervised learning requires labeled training data.",
    "Unsupervised learning explores patterns in unlabeled data.",
    "Reinforcement learning is used to train agents in dynamic environments.",
]

In [8]:
# Step 1: normalize every raw document (lowercase, strip punctuation,
# drop stop words, stem).
preprocessed_docs = list(map(preprocess_text, documents))

# Step 2: turn the cleaned documents into TF-IDF features.
X = vectorize_text(preprocessed_docs)

# Step 3: group the documents with K-Means; 3 clusters is an assumption.
cluster_assignments = perform_clustering(X, num_clusters=3)

# Step 4: show each original (unprocessed) document next to its cluster.
print_cluster_assignments(cluster_assignments, documents)

# Step 5: score the clustering with the silhouette score.
score = evaluate_clustering(X, cluster_assignments)
print("\nSilhouette Score:", score)

Document 1 -> Cluster 2: Machine learning techniques are used in artificial intelligence applications.
Document 2 -> Cluster 0: Natural language processing is a subfield of artificial intelligence.
Document 3 -> Cluster 2: Data mining and machine learning are used to analyze large datasets.
Document 4 -> Cluster 1: Deep learning models have achieved state-of-the-art results in various tasks.
Document 5 -> Cluster 0: Text classification is a common application of natural language processing.
Document 6 -> Cluster 2: Recommendation systems use machine learning to suggest relevant items.
Document 7 -> Cluster 0: Sentiment analysis aims to determine the sentiment expressed in text data.
Document 8 -> Cluster 1: Clustering algorithms group similar data points together.
Document 9 -> Cluster 1: Supervised learning requires labeled training data.
Document 10 -> Cluster 1: Unsupervised learning explores patterns in unlabeled data.
Document 11 -> Cluster 1: Reinforcement learning is used to tra