In [1]:
import wikipediaapi
# Natural Language Toolkit for text processing
import nltk
# SSL for secure connections
import ssl
# Regular expressions for text cleaning
import re
# Numerical operations
import numpy as np
#stopwords for removing common words
from nltk.corpus import stopwords
#lemmatizer use for text normalization
from nltk.stem import WordNetLemmatizer
#TF-IDF vectorizer for converting text to numerical data
from sklearn.feature_extraction.text import TfidfVectorizer
#KMeans for clustering
from sklearn.cluster import KMeans
#Silhouette score for evaluating clustering quality
from sklearn.metrics import silhouette_score

try:
 _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
 ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kaifzaki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kaifzaki/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/kaifzaki/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
# import wikipediaapi

# wiki_api = wikipediaapi.Wikipedia('MyClustteringProject','en')
# page = wiki_api.page("machine learning")
# print(page.text)

In [3]:
import wikipediaapi

artical_titles = ["Galax","Black hole","supernova",
                  "DNA","Photosynthesis","Evolution",
                  "machine learning","Artificial intelligence","computer programming"]

wiki_api = wikipediaapi.Wikipedia('MyClustteringProject/1.0','en')

documents = []
for title in artical_titles:
    page = wiki_api.page(title)
    if page.exists():
        documents.append(page.text)
        print(f"Fetched article: {title}")
    else:
        print(f"Article not found: {title}")
print(f"Total articles fetched: {len(documents)}")



Fetched article: Galax
Fetched article: Black hole
Fetched article: supernova
Fetched article: DNA
Fetched article: Photosynthesis
Fetched article: Evolution
Fetched article: machine learning
Fetched article: Artificial intelligence
Fetched article: computer programming
Total articles fetched: 9


In [4]:
stop_words = set(stopwords.words('english'))
print(f"Stop Words :{stop_words}")
print(f"Number of stopwords: {len(stop_words)}")

lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize
    words = text.split()
    # Remove stopwords and lemmatize
    processd_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(processd_words)

preprocessed_documents = [preprocess_text(doc) for doc in documents]
print("Preprocessing completed.")

Stop Words :{'at', "i'll", 'doesn', "she's", 'both', 'we', 'again', 'shouldn', "they've", 'don', 'more', 'you', 'the', 'no', 'haven', 'on', 'do', 'such', 'should', "wasn't", 'until', "we'd", "they'd", "weren't", 'most', 'been', 'her', 'after', 'is', 'yourselves', 'as', 'below', "didn't", "i'd", 'into', "aren't", 'who', 'if', 'before', 'not', 'for', "he'd", 'needn', 'when', "shan't", 'themselves', 'y', 'than', 'itself', 'm', "doesn't", 'very', 'during', 'isn', "haven't", 'having', "mustn't", 'have', "we'll", 'this', 'because', 'few', 'their', 'how', 'in', 'an', 'was', 'only', 'some', 'hasn', 'of', 'out', 'they', 'ourselves', 'had', "she'll", 're', 'did', 'he', "that'll", 'up', 'hadn', 'whom', 'wasn', "don't", "you're", 'above', 'being', 'will', 'be', 'it', "shouldn't", 'from', 'just', 'o', 'where', 'what', 'same', 'hers', "you've", 'here', 'between', "wouldn't", 'down', 'which', 'there', "they'll", "you'd", 'couldn', "mightn't", 'why', 'myself', 'too', 'through', 'nor', 'll', 'aren', "s

In [5]:
# Step 4 : Covert Text to Vectors

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=1000) # Limit to top 1000 features

# Create the TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)

print(tfidf_matrix)

print("TF-IDF matrix created successfully")
print(f"Shape of the matrix: {tfidf_matrix.shape}")

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4703 stored elements and shape (9, 1000)>
  Coords	Values
  (0, 368)	0.8512306812056797
  (0, 677)	0.1792181330104813
  (0, 189)	0.04480453325262033
  (0, 836)	0.03627313109833374
  (0, 857)	0.022402266626310164
  (0, 953)	0.02510569981063823
  (0, 869)	0.020116546437431444
  (0, 604)	0.0444834922189141
  (0, 398)	0.04480453325262033
  (0, 676)	0.020116546437431444
  (0, 621)	0.03627313109833374
  (0, 582)	0.01813656554916687
  (0, 506)	0.016390098632453266
  (0, 445)	0.04023309287486289
  (0, 547)	0.014827830739638035
  (0, 255)	0.020116546437431444
  (0, 242)	0.014827830739638035
  (0, 948)	0.02965566147927607
  (0, 513)	0.020116546437431444
  (0, 38)	0.014827830739638035
  (0, 983)	0.016390098632453266
  (0, 498)	0.07531709943191468
  (0, 822)	0.022402266626310164
  (0, 710)	0.01813656554916687
  (0, 491)	0.01813656554916687
  :	:
  (8, 167)	0.007752299703076014
  (8, 180)	0.015504599406152028
  (8, 522)	0.023256899109228

In [6]:
k =3
kmeans = KMeans(n_clusters=k, random_state=42,n_init=5)
## the below line is fuctionally identical to the above line
# kmeans = KMeans(n_clusters=k, random_state=42, n_init="10"
kmeans.fit(tfidf_matrix)

# Get cluster labels for each document
labels = kmeans.labels_
print(f"Cluster labels: {labels}")


Cluster labels: [1 2 2 1 1 1 0 0 0]


In [7]:
wcss = kmeans.inertia_

sil_score = silhouette_score(tfidf_matrix, labels)

print(f"Within-Cluster Sum of Squares (WCSS): {wcss}")
print(f"Silhouette Score: {sil_score}")


Within-Cluster Sum of Squares (WCSS): 4.8690322755338284
Silhouette Score: 0.08007362002750984
