In [None]:
# --- Step 1: Imports ---
import wikipediaapi
import nltk
import ssl
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# NLTK fetches its corpora over HTTPS. On Python installs without usable root
# certificates that fails with CERTIFICATE_VERIFY_FAILED, so certificate
# verification is switched off -- but only while the downloads below run. The
# previous context factory is restored afterwards so that the rest of the
# kernel keeps verifying HTTPS certificates.
# SECURITY NOTE: with this workaround the NLTK downloads are not authenticated.
# Prefer fixing the local certificate store and deleting the override.
_saved_https_context = getattr(ssl, '_create_default_https_context', None)
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build has no unverified-context factory; keep the defaults.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

try:
    # nltk.download() signals failure through its return value instead of
    # raising, so keep the result for every package.
    _nltk_results = {
        package: nltk.download(package)
        for package in ('stopwords', 'wordnet', 'omw-1.4')
    }
finally:
    # Always undo the override, even if a download raised unexpectedly.
    if _saved_https_context is not None:
        ssl._create_default_https_context = _saved_https_context

_failed_packages = [name for name, ok in _nltk_results.items() if not ok]
if _failed_packages:
    # Do not abort: the data may already be installed locally from an earlier
    # run, in which case the later cells still work.
    print(f"WARNING: NLTK could not download: {_failed_packages}. "
          "Later cells will fail unless this data is already installed.")
else:
    print("NLTK data downloaded successfully using the manual method.")
    print("Libraries imported and NLTK data downloaded successfully!")

In [None]:
# --- Step 2: Fetch Wikipedia Articles ---

# Articles we would like to cluster. We've chosen topics in astronomy,
# biology, and computer science.
requested_titles = [
    "Galaxy", "Black hole", "Supernova", # Astronomy
    "DNA", "Photosynthesis", "Evolution", # Biology
    "Machine learning", "Artificial intelligence", "Computer programming" # Computer Science
]

# Initialize the Wikipedia API
wiki_api = wikipediaapi.Wikipedia('MyClusteringProject/1.0', 'en')

# `article_titles` and `documents` are filled in lock-step, so that
# article_titles[i] is always the title of documents[i]. Later cells map
# cluster labels back to titles by position; if a missing page were skipped in
# only one of the two lists, every following title would be attributed to the
# wrong document.
article_titles = []
documents = []
for title in requested_titles:
    page = wiki_api.page(title)
    if page.exists():
        article_titles.append(title)
        documents.append(page.text)
        print(f"Successfully fetched: {title}")
    else:
        print(f"Could not find page: {title}")

In [None]:
# --- Step 3: Preprocess the Text ---

# Built once at cell level so preprocess_text() does not rebuild them per call.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    Steps: lowercase, replace every character that is not a-z or whitespace
    with a space, split on whitespace, drop English stop words, lemmatize
    each remaining token.

    Args:
        text: The raw document as a string.

    Returns:
        A single string containing the surviving tokens joined by one space
        (an empty string if nothing survives).
    """
    # Lowercase
    text = text.lower()
    # Replace punctuation and numbers with a space instead of deleting them.
    # Deleting would glue neighbouring words together ("well-defined" ->
    # "welldefined") and turn contractions into tokens such as "dont" that
    # the stop-word filter below cannot match.
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Tokenize (split() also collapses the runs of spaces introduced above)
    words = text.split()
    # Remove stop words and lemmatize
    processed_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(processed_words)

processed_documents = [preprocess_text(doc) for doc in documents]
print("Text preprocessing complete.")

In [None]:
# --- Step 4: Convert Text to Vectors ---

# TF-IDF vectorizer restricted to the top 1000 features.
vectorizer = TfidfVectorizer(max_features=1000)

# Fit the vectorizer on the cleaned corpus and build the document-term
# matrix in a single call.
tfidf_matrix = vectorizer.fit_transform(processed_documents)

# Show the matrix, a confirmation message and the matrix shape, one per line.
print(
    tfidf_matrix,
    "TF-IDF matrix created successfully.",
    f"Shape of the matrix: {tfidf_matrix.shape}",
    sep="\n",
)

In [None]:
# --- Step 5: Run K-Means ---

# Number of clusters to look for.
k = 3

# Build and fit the model in one expression (fit() returns the estimator).
# Passing init='k-means++' explicitly would be functionally identical to this
# call. random_state is fixed so repeated runs give the same clustering.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(tfidf_matrix)

# One cluster id per document, in the same order as the rows of tfidf_matrix.
labels = kmeans.labels_
print(labels)

In [None]:
# --- Step 6: Analyze the Results ---

# Bucket the article titles by assigned cluster. Every cluster id gets an
# entry up front so a cluster without documents is still reported.
clusters = {cluster_id: [] for cluster_id in range(k)}
for title, label in zip(article_titles, labels):
    clusters[label].append(title)

# For each centroid: feature indices ordered from largest to smallest weight.
order_centroids = np.argsort(kmeans.cluster_centers_, axis=1)[:, ::-1]
terms = vectorizer.get_feature_names_out()

for cluster_id, titles in clusters.items():
    print(f"--- Cluster {cluster_id} ---")
    print(f"Documents: {titles}")

    # The ten highest-weighted terms of this cluster's centroid.
    top_terms = list(terms[order_centroids[cluster_id, :10]])
    print(f"Top Keywords: {top_terms}\n")

In [None]:
# --- Step 7: Putting the Model to Work - Predicting on New Documents ---
# With the vectorizer and the K-Means model fitted in the earlier steps, a
# brand new, unseen document can be pushed through the same pipeline and
# assigned to one of the topic clusters.

# --- The unseen document to categorize ---
new_text = (
    "An algorithm is a set of well-defined instructions designed to perform "
    "a specific task or solve a computational problem. "
    "In computer science, the study of algorithms is fundamental to creating "
    "efficient and scalable software. "
    "Data structures, such as arrays and hash tables, are used to organize "
    "data in a way that allows these algorithms to access and manipulate it "
    "effectively."
)

# --- Clean it with the same preprocess_text function used for the corpus ---
processed_new_text = preprocess_text(new_text)
print(f"Cleaned Text: {processed_new_text}")

# --- Vectorize with the already-fitted vectorizer ---
# IMPORTANT: call .transform() here, not .fit_transform(), so the text is
# encoded with the vocabulary learned from the original documents.
new_documents = [processed_new_text]
new_tfidf_vector = vectorizer.transform(new_documents)

print(f"\nShape of the new vector: {new_tfidf_vector.shape}")

# --- Ask the fitted model which cluster the new vector falls into ---
predicted_label = kmeans.predict(new_tfidf_vector)
cluster_id = predicted_label[0]

print(f"\nThe new document belongs to cluster: {cluster_id}")