In [69]:
import os
import re
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
import pyswarms as ps
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, Normalizer
from sklearn.utils.extmath import randomized_svd

In [79]:
def purity_score(y_true, y_pred):
    """Compute clustering purity.

    Each predicted cluster is credited with the count of its most frequent
    true class; purity is the sum of those counts divided by the number of
    samples.

    Labels may be arbitrary hashable/sortable values (ints with gaps,
    strings, ...). They are re-indexed internally, so a clustering that
    leaves some cluster id unused no longer raises IndexError.

    Args:
        y_true: 1-D sequence of reference class labels.
        y_pred: 1-D sequence of predicted cluster labels, same length.

    Returns:
        Purity in [0, 1] as a numpy float.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"({y_true.size} != {y_pred.size})"
        )
    if y_true.size == 0:
        raise ValueError("purity_score requires at least one sample")

    # Map raw labels onto 0..k-1 so they are always valid matrix indices.
    true_classes, true_idx = np.unique(y_true, return_inverse=True)
    pred_clusters, pred_idx = np.unique(y_pred, return_inverse=True)
    true_idx = np.ravel(true_idx)
    pred_idx = np.ravel(pred_idx)

    contingency_matrix = np.zeros(
        (true_classes.size, pred_clusters.size), dtype=np.int64
    )
    # np.add.at accumulates correctly when the same (row, col) pair repeats.
    np.add.at(contingency_matrix, (true_idx, pred_idx), 1)

    return np.sum(np.amax(contingency_matrix, axis=0)) / y_true.size

def evaluate_clustering(y_true, y_pred):
    """Score a clustering against reference labels.

    Args:
        y_true: reference class labels.
        y_pred: predicted cluster labels.

    Returns:
        dict with the keys 'Purity', 'NMI' and 'ARI' (inserted in that
        order), holding the purity, normalized mutual information and
        adjusted Rand index respectively.
    """
    scores = {}
    scores['Purity'] = purity_score(y_true, y_pred)
    scores['NMI'] = normalized_mutual_info_score(y_true, y_pred)
    scores['ARI'] = adjusted_rand_score(y_true, y_pred)
    return scores
    
def preprocess_text(texts):
    """Clean, tokenize, filter and lemmatize a collection of documents.

    For every document the regex substitutions below are applied in order,
    the text is lower-cased and tokenized with NLTK's word_tokenize, and only
    purely alphabetic tokens of length >= 3 that are not English stop words
    are kept; kept tokens are lemmatized with WordNetLemmatizer.

    Args:
        texts: iterable of strings.

    Returns:
        list of strings, one per input document, with the surviving lemmas
        joined by single spaces.
    """
    # Fetch the NLTK resources used below (quiet: no progress output).
    for resource in ('punkt', 'stopwords', 'wordnet'):
        nltk.download(resource, quiet=True)

    lemmatize = WordNetLemmatizer().lemmatize
    stop_words = set(stopwords.words('english'))

    # (pattern, replacement) pairs; the order is significant.
    substitutions = (
        (r'\b\d+\b', ' '),        # stand-alone numbers
        (r'[^\w\s]', ''),         # punctuation / symbols
        (r'0x[0-9a-fA-F]+', ''),  # hex literals
        (r'[_]+', ' '),           # underscore runs
        (r'\s+', ' '),            # collapse whitespace
    )

    def clean(document):
        for pattern, replacement in substitutions:
            document = re.sub(pattern, replacement, document)
        kept = []
        for token in word_tokenize(document.lower()):
            if not token.isalpha() or token in stop_words or len(token) < 3:
                continue
            kept.append(lemmatize(token))
        return ' '.join(kept)

    return [clean(text) for text in texts]



class SphericalKMeans:
    """K-means on the unit sphere using cosine similarity for assignment.

    Rows are L2-normalized, centroids are initialised from a Euclidean
    KMeans (k-means++) fit on the normalized data, and then refined by
    alternating cosine-similarity assignment and normalized-mean updates.

    Args:
        n_clusters: number of clusters.
        max_iter: maximum number of assignment/update rounds.
        random_state: forwarded to the KMeans used for initialisation;
            None (the default) keeps the previous unseeded behaviour.
    """

    def __init__(self, n_clusters, max_iter=100, random_state=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state

    @staticmethod
    def _assign(X_norm, centers):
        """Return, per row, the index of the most cosine-similar center."""
        return np.argmax(cosine_similarity(X_norm, centers), axis=1)

    def fit_predict(self, X):
        """Cluster X and return an array with one cluster index per row.

        Args:
            X: dense array-like or any object exposing ``toarray()`` (e.g. a
                scipy sparse matrix). Sparse input is densified.

        Returns:
            numpy array of cluster indices in ``range(n_clusters)``.
        """
        normalizer = Normalizer(norm='l2')
        X_norm = normalizer.fit_transform(X.toarray() if hasattr(X, 'toarray') else X)

        kmeans = KMeans(n_clusters=self.n_clusters, init='k-means++',
                        random_state=self.random_state)
        centers = kmeans.fit(X_norm).cluster_centers_

        labels = None
        for _ in range(self.max_iter):
            labels = self._assign(X_norm, centers)

            # Start from the current centers so an empty cluster simply keeps
            # its previous center instead of producing a NaN mean (and the
            # accompanying RuntimeWarning).
            new_centers = np.array(centers, dtype=float, copy=True)
            for i in range(self.n_clusters):
                members = X_norm[labels == i]
                if members.shape[0] > 0:
                    new_centers[i] = members.mean(axis=0)

            # Project the updated centers back onto the unit sphere.
            new_centers = normalizer.transform(new_centers)

            if np.allclose(centers, new_centers):
                break
            centers = new_centers

        if labels is None:
            # max_iter <= 0: no refinement round ran; fall back to assigning
            # against the initial centers rather than failing on an unbound name.
            labels = self._assign(X_norm, centers)

        return labels

In [22]:

# NOTE(review): this class is defined twice in the notebook (an earlier cell
# holds the same definition); the later-executed cell wins. Keep only one.
class SphericalKMeans:
    """K-means on the unit sphere using cosine similarity for assignment.

    Rows are L2-normalized, centroids are initialised from a Euclidean
    KMeans (k-means++) fit on the normalized data, and then refined by
    alternating cosine-similarity assignment and normalized-mean updates.

    Args:
        n_clusters: number of clusters.
        max_iter: maximum number of assignment/update rounds.
        random_state: forwarded to the KMeans used for initialisation;
            None (the default) keeps the previous unseeded behaviour.
    """

    def __init__(self, n_clusters, max_iter=100, random_state=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state

    @staticmethod
    def _assign(X_norm, centers):
        """Return, per row, the index of the most cosine-similar center."""
        return np.argmax(cosine_similarity(X_norm, centers), axis=1)

    def fit_predict(self, X):
        """Cluster X and return an array with one cluster index per row.

        Args:
            X: dense array-like or any object exposing ``toarray()`` (e.g. a
                scipy sparse matrix). Sparse input is densified.

        Returns:
            numpy array of cluster indices in ``range(n_clusters)``.
        """
        normalizer = Normalizer(norm='l2')
        X_norm = normalizer.fit_transform(X.toarray() if hasattr(X, 'toarray') else X)

        kmeans = KMeans(n_clusters=self.n_clusters, init='k-means++',
                        random_state=self.random_state)
        centers = kmeans.fit(X_norm).cluster_centers_

        labels = None
        for _ in range(self.max_iter):
            labels = self._assign(X_norm, centers)

            # Start from the current centers so an empty cluster simply keeps
            # its previous center instead of producing a NaN mean (and the
            # accompanying RuntimeWarning).
            new_centers = np.array(centers, dtype=float, copy=True)
            for i in range(self.n_clusters):
                members = X_norm[labels == i]
                if members.shape[0] > 0:
                    new_centers[i] = members.mean(axis=0)

            # Project the updated centers back onto the unit sphere.
            new_centers = normalizer.transform(new_centers)

            if np.allclose(centers, new_centers):
                break
            centers = new_centers

        if labels is None:
            # max_iter <= 0: no refinement round ran; fall back to assigning
            # against the initial centers rather than failing on an unbound name.
            labels = self._assign(X_norm, centers)

        return labels
        


def run_experiments():
    """Benchmark KM, SKM and LSA+KM on five 20 Newsgroups category groups.

    For each group the posts (headers, footers and quotes removed) are
    preprocessed, TF-IDF vectorized and clustered three ways, with the
    number of clusters equal to the number of newsgroups in the group.

    Returns:
        dict mapping "<group>_KM", "<group>_SKM" and "<group>_LSAKM" to the
        metrics dict produced by evaluate_clustering.
    """
    newsgroup_sets = {
        'Computer': [
            'comp.graphics',
            'comp.os.ms-windows.misc',
            'comp.sys.ibm.pc.hardware',
            'comp.sys.mac.hardware',
            'comp.windows.x',
        ],
        'Politics': [
            'talk.politics.misc',
            'talk.politics.guns',
            'talk.politics.mideast',
        ],
        'Miscellaneous': [
            'misc.forsale',
            'talk.politics.misc',
            'talk.religion.misc',
            'comp.os.ms-windows.misc',
        ],
        'Religion': [
            'talk.religion.misc',
            'alt.atheism',
            'soc.religion.christian',
        ],
        'Science': [
            'sci.crypt',
            'sci.electronics',
            'sci.med',
            'sci.space',
        ],
    }

    scores = {}

    for group, newsgroups in newsgroup_sets.items():
        corpus = fetch_20newsgroups(
            subset='all',
            categories=newsgroups,
            remove=('headers', 'footers', 'quotes'),
        )
        k = len(newsgroups)

        cleaned = preprocess_text(corpus.data)
        tfidf = TfidfVectorizer(
            stop_words='english',
            min_df=2,
            max_df=0.95,
            token_pattern=r'\b[a-zA-Z]{3,}\b',
        )
        X = tfidf.fit_transform(cleaned)
        y_true = LabelEncoder().fit_transform(corpus.target)

        # 1. plain k-means on the TF-IDF matrix
        km_labels = KMeans(n_clusters=k, init='k-means++').fit_predict(X)
        scores[f"{group}_KM"] = evaluate_clustering(y_true, km_labels)

        # 2. spherical k-means
        skm_labels = SphericalKMeans(n_clusters=k).fit_predict(X)
        scores[f"{group}_SKM"] = evaluate_clustering(y_true, skm_labels)

        # 3. k-means in a 5-dimensional, L2-normalized LSA space
        reducer = make_pipeline(TruncatedSVD(n_components=5), Normalizer(copy=False))
        X_reduced = reducer.fit_transform(X)
        lsa_labels = KMeans(n_clusters=k, init='k-means++').fit_predict(X_reduced)
        scores[f"{group}_LSAKM"] = evaluate_clustering(y_true, lsa_labels)

    return scores


# Single pass over all 20 Newsgroups groups; results depend on the unseeded
# k-means initialisations, so the numbers change between executions.
final_results = run_experiments()


# Print every metric of every "<group>_<method>" entry with 4 decimals.
for name, metrics in final_results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")




Computer_KM:
Purity: 0.3318
NMI: 0.1359
ARI: 0.0392

Computer_SKM:
Purity: 0.3950
NMI: 0.1655
ARI: 0.0983

Computer_LSAKM:
Purity: 0.3881
NMI: 0.1575
ARI: 0.0963

Politics_KM:
Purity: 0.4640
NMI: 0.1205
ARI: 0.0435

Politics_SKM:
Purity: 0.6290
NMI: 0.3314
ARI: 0.2838

Politics_LSAKM:
Purity: 0.5878
NMI: 0.2881
ARI: 0.1865

Miscellaneous_KM:
Purity: 0.5894
NMI: 0.4066
ARI: 0.2277

Miscellaneous_SKM:
Purity: 0.7006
NMI: 0.5224
ARI: 0.4341

Miscellaneous_LSAKM:
Purity: 0.7101
NMI: 0.4987
ARI: 0.4704

Religion_KM:
Purity: 0.4493
NMI: 0.0322
ARI: 0.0172

Religion_SKM:
Purity: 0.4806
NMI: 0.0630
ARI: 0.0500

Religion_LSAKM:
Purity: 0.5507
NMI: 0.0826
ARI: 0.0925

Science_KM:
Purity: 0.4828
NMI: 0.3122
ARI: 0.1032

Science_SKM:
Purity: 0.7082
NMI: 0.4873
ARI: 0.3343

Science_LSAKM:
Purity: 0.7098
NMI: 0.3932
ARI: 0.3689


In [24]:
import numpy as np
from collections import defaultdict

# Repeat the whole 20 Newsgroups benchmark n_runs times and report the
# mean / standard deviation of each metric per "<group>_<method>" key.
n_runs = 5
# method name -> metric name -> list of values, one per run
aggregated_results = defaultdict(lambda: defaultdict(list))

for run in range(n_runs):
    print(f"\n Running iteration {run + 1}...")
    run_results = run_experiments()
    for method_name, metrics in run_results.items():
        for metric_name, value in metrics.items():
            aggregated_results[method_name][metric_name].append(value)

print("\n Final Aggregated Results:")
for method_name, metric_values in aggregated_results.items():
    print(f"\n {method_name}")
    for metric_name, values in metric_values.items():
        mean_val = np.mean(values)
        # np.std is called with its default ddof (population std, not sample std).
        std_val = np.std(values)
        print(f"{metric_name}: Mean = {mean_val:.4f}, Std = {std_val:.4f}")



 Running iteration 1...





 Running iteration 2...





 Running iteration 3...





 Running iteration 4...





 Running iteration 5...





 Final Aggregated Results:

 Computer_KM
Purity: Mean = 0.2672, Std = 0.0640
NMI: Mean = 0.0679, Std = 0.0609
ARI: Mean = 0.0177, Std = 0.0190

 Computer_SKM
Purity: Mean = 0.4144, Std = 0.0156
NMI: Mean = 0.1733, Std = 0.0117
ARI: Mean = 0.1100, Std = 0.0104

 Computer_LSAKM
Purity: Mean = 0.4029, Std = 0.0228
NMI: Mean = 0.1722, Std = 0.0160
ARI: Mean = 0.1119, Std = 0.0258

 Politics_KM
Purity: Mean = 0.4670, Std = 0.0642
NMI: Mean = 0.1480, Std = 0.0925
ARI: Mean = 0.0591, Std = 0.0401

 Politics_SKM
Purity: Mean = 0.5903, Std = 0.0463
NMI: Mean = 0.2997, Std = 0.0680
ARI: Mean = 0.2179, Std = 0.0717

 Politics_LSAKM
Purity: Mean = 0.5904, Std = 0.0305
NMI: Mean = 0.2582, Std = 0.0300
ARI: Mean = 0.2033, Std = 0.0402

 Miscellaneous_KM
Purity: Mean = 0.6278, Std = 0.0276
NMI: Mean = 0.4312, Std = 0.0219
ARI: Mean = 0.2893, Std = 0.0183

 Miscellaneous_SKM
Purity: Mean = 0.6952, Std = 0.0409
NMI: Mean = 0.4835, Std = 0.0213
ARI: Mean = 0.4390, Std = 0.0309

 Miscellaneous_LSAKM
Pur

In [26]:
#bbc news
def run_experiments_custom_dataset(csv_path="C:/Users/asus/Downloads/FinalProject/dataset.csv"):
    """Benchmark KM, SKM and LSA+KM on a CSV news dataset.

    Args:
        csv_path: path of a latin-1 encoded CSV with a 'news' column (text)
            and a 'type' column (class label). Defaults to the location that
            was previously hard-coded so existing calls keep working.

    Returns:
        dict with keys "KM", "SKM" and "LSAKM", each mapping to the metrics
        dict produced by evaluate_clustering.
    """
    df = pd.read_csv(csv_path, encoding='latin1')

    texts = df['news'].astype(str).tolist()
    labels = df['type'].astype(str).tolist()
    y_true = LabelEncoder().fit_transform(labels)
    # One cluster per distinct class label.
    n_clusters = len(np.unique(y_true))

    processed_texts = preprocess_text(texts)

    vectorizer = TfidfVectorizer(stop_words='english',
                 min_df=2,
                 max_df=0.95,
                 token_pattern=r'\b[a-zA-Z]{3,}\b')

    X = vectorizer.fit_transform(processed_texts)

    results = {}

    # Plain k-means on the TF-IDF matrix.
    km = KMeans(n_clusters=n_clusters, init='k-means++')
    y_pred = km.fit_predict(X)
    results["KM"] = evaluate_clustering(y_true, y_pred)

    # Spherical k-means.
    skm = SphericalKMeans(n_clusters=n_clusters)
    y_pred = skm.fit_predict(X)
    results["SKM"] = evaluate_clustering(y_true, y_pred)

    # k-means in an L2-normalized LSA space (at most 5 components).
    svd = TruncatedSVD(n_components=min(5, X.shape[1]))
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X_lsa = lsa.fit_transform(X)
    km_lsa = KMeans(n_clusters=n_clusters, init='k-means++')
    y_pred = km_lsa.fit_predict(X_lsa)
    results["LSAKM"] = evaluate_clustering(y_true, y_pred)

    return results


# Single run on the CSV news dataset (default path inside the function).
results = run_experiments_custom_dataset()


# Print each method's metrics with 4 decimals.
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")





KM:
Purity: 0.8980
NMI: 0.7614
ARI: 0.7606

SKM:
Purity: 0.7991
NMI: 0.6957
ARI: 0.6514

LSAKM:
Purity: 0.9330
NMI: 0.8131
ARI: 0.8470




In [28]:
import numpy as np
from collections import defaultdict

# Repeat the CSV news benchmark n_runs times and report mean / std per metric.
n_runs = 5
# method name -> metric name -> list of values, one per run
aggregated_results = defaultdict(lambda: defaultdict(list))

for run in range(n_runs):
    print(f"\n Running iteration {run + 1}...")
    run_results = run_experiments_custom_dataset()
    for method_name, metrics in run_results.items():
        for metric_name, value in metrics.items():
            aggregated_results[method_name][metric_name].append(value)

print("\n Final Aggregated Results:")
for method_name, metric_values in aggregated_results.items():
    print(f"\n {method_name}")
    for metric_name, values in metric_values.items():
        mean_val = np.mean(values)
        # np.std is called with its default ddof (population std, not sample std).
        std_val = np.std(values)
        print(f"{metric_name}: Mean = {mean_val:.4f}, Std = {std_val:.4f}")


 Running iteration 1...





 Running iteration 2...





 Running iteration 3...





 Running iteration 4...





 Running iteration 5...





 Final Aggregated Results:

 KM
Purity: Mean = 0.8004, Std = 0.1135
NMI: Mean = 0.6793, Std = 0.1097
ARI: Mean = 0.5986, Std = 0.1919

 SKM
Purity: Mean = 0.8604, Std = 0.0944
NMI: Mean = 0.7431, Std = 0.0934
ARI: Mean = 0.7361, Std = 0.1416

 LSAKM
Purity: Mean = 0.9325, Std = 0.0004
NMI: Mean = 0.8120, Std = 0.0009
ARI: Mean = 0.8457, Std = 0.0010




In [56]:
#SMS
def run_experiments_SMS_dataset(csv_path="C:/Users/asus/Downloads/FinalProject/archive (3)/spam.csv"):
    """Benchmark KM, SKM and LSA+KM on an SMS spam CSV.

    Args:
        csv_path: path of a latin-1 encoded CSV whose 'v1' column holds the
            label ('ham' or 'spam') and whose 'v2' column holds the message
            text. Defaults to the location that was previously hard-coded.

    Returns:
        dict with keys "KM", "SKM" and "LSAKM", each mapping to the metrics
        dict produced by evaluate_clustering.

    Raises:
        ValueError: if the label column contains anything other than
            'ham' / 'spam' (previously such rows silently became NaN and
            distorted the cluster count).
    """
    df = pd.read_csv(csv_path, encoding='latin1')[['v1', 'v2']]
    df.columns = ['label', 'text']

    label_mapping = {'ham': 0, 'spam': 1}
    encoded = df['label'].map(label_mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(f"Unexpected label(s) in SMS dataset: {unknown}")
    y_true = encoded.astype(int).tolist()

    # astype(str) guards against non-string cells (e.g. NaN), matching the
    # other dataset loaders in this notebook.
    processed_texts = preprocess_text(df['text'].astype(str).tolist())

    vectorizer = TfidfVectorizer(stop_words='english',
                 min_df=2,
                 max_df=0.95,
                 token_pattern=r'\b[a-zA-Z]{3,}\b')

    X = vectorizer.fit_transform(processed_texts)

    n_clusters = len(np.unique(y_true))
    results = {}

    # NOTE(review): KM and LSAKM are seeded (random_state=42) here while SKM
    # is not, unlike the other dataset functions; with a fixed seed repeated
    # runs of KM yield identical scores.
    km = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    y_pred = km.fit_predict(X)
    results["KM"] = evaluate_clustering(y_true, y_pred)

    skm = SphericalKMeans(n_clusters=n_clusters)
    y_pred = skm.fit_predict(X)
    results["SKM"] = evaluate_clustering(y_true, y_pred)

    # k-means in an L2-normalized LSA space (at most 10 components).
    svd = TruncatedSVD(n_components=min(10, X.shape[1]))
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X_lsa = lsa.fit_transform(X)
    km_lsa = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    y_pred = km_lsa.fit_predict(X_lsa)
    results["LSAKM"] = evaluate_clustering(y_true, y_pred)
    return results


# Single run on the SMS spam dataset (default path inside the function).
results = run_experiments_SMS_dataset()


# Print each method's metrics with 4 decimals.
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")





KM:
Purity: 0.8659
NMI: 0.0205
ARI: -0.0510

SKM:
Purity: 0.8819
NMI: 0.3562
ARI: 0.4994

LSAKM:
Purity: 0.8659
NMI: 0.0818
ARI: 0.0185


In [34]:
# Repeat the SMS spam benchmark n_runs times and report mean / std per metric.
n_runs = 5
# method name -> metric name -> list of values, one per run
aggregated_results = defaultdict(lambda: defaultdict(list))

for run in range(n_runs):
    print(f"\n Running iteration {run + 1}...")
    run_results = run_experiments_SMS_dataset()
    for method_name, metrics in run_results.items():
        for metric_name, value in metrics.items():
            aggregated_results[method_name][metric_name].append(value)

print("\n Final Aggregated Results:")
for method_name, metric_values in aggregated_results.items():
    print(f"\n {method_name}")
    for metric_name, values in metric_values.items():
        mean_val = np.mean(values)
        # np.std is called with its default ddof (population std, not sample std).
        std_val = np.std(values)
        print(f"{metric_name}: Mean = {mean_val:.4f}, Std = {std_val:.4f}")


 Running iteration 1...

 Running iteration 2...

 Running iteration 3...

 Running iteration 4...

 Running iteration 5...

 Final Aggregated Results:

 KM
Purity: Mean = 0.8705, Std = 0.0090
NMI: Mean = 0.0400, Std = 0.0649
ARI: Mean = 0.0733, Std = 0.1202

 SKM
Purity: Mean = 0.8683, Std = 0.0048
NMI: Mean = 0.1615, Std = 0.1410
ARI: Mean = 0.1800, Std = 0.2344

 LSAKM
Purity: Mean = 0.8725, Std = 0.0131
NMI: Mean = 0.1562, Std = 0.1299
ARI: Mean = 0.1427, Std = 0.2218


In [58]:
#DMOZ dataset
def run_experiments_DMOZ_dataset(csv_path="C:/Users/asus/Downloads/FinalProject/archive (5)/dmoz.csv",
                                 samples_per_class=300):
    """Benchmark KM, SKM and LSA+KM on a class-balanced DMOZ sample.

    Args:
        csv_path: path of a latin-1 encoded CSV with 'category', 'title' and
            'desc' columns. Defaults to the previously hard-coded location.
        samples_per_class: maximum number of rows drawn per category
            (seeded with random_state=42); smaller categories are kept whole.

    Returns:
        dict with keys "KM", "SKM" and "LSAKM", each mapping to the metrics
        dict produced by evaluate_clustering.
    """
    df = pd.read_csv(csv_path, encoding='latin1')
    df = df.rename(columns={
                'category': 'Class Index',
                'title': 'Title',
                'desc': 'Description'
            })

    # Sample each class explicitly instead of groupby(...).apply(...): apply
    # on a frame that still contains the grouping column triggers a pandas
    # deprecation warning on every call.
    df_balanced = pd.concat(
        [
            group.sample(n=min(samples_per_class, len(group)), random_state=42)
            for _, group in df.groupby('Class Index')
        ]
    ).reset_index(drop=True)

    # fillna('') so a missing title or description does not turn the whole
    # concatenation into NaN (which astype(str) would render as "nan").
    texts = (
        df_balanced['Title'].fillna('') + ' ' + df_balanced['Description'].fillna('')
    ).astype(str).tolist()
    labels = df_balanced['Class Index'].tolist()

    processed_texts = preprocess_text(texts)

    vectorizer = TfidfVectorizer(stop_words='english',
                 min_df=2,
                 max_df=0.95,
                 token_pattern=r'\b[a-zA-Z]{3,}\b')

    X = vectorizer.fit_transform(processed_texts)
    label_encoder = LabelEncoder()
    y_true = label_encoder.fit_transform(labels)

    n_clusters = len(np.unique(y_true))
    results = {}

    # Plain k-means on the TF-IDF matrix.
    km = KMeans(n_clusters=n_clusters, init='k-means++')
    y_pred = km.fit_predict(X)
    results["KM"] = evaluate_clustering(y_true, y_pred)

    # Spherical k-means.
    skm = SphericalKMeans(n_clusters=n_clusters)
    y_pred = skm.fit_predict(X)
    results["SKM"] = evaluate_clustering(y_true, y_pred)

    # k-means in an L2-normalized LSA space (at most 15 components).
    svd = TruncatedSVD(n_components=min(15, X.shape[1]))
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X_lsa = lsa.fit_transform(X)
    km_lsa = KMeans(n_clusters=n_clusters, init='k-means++')
    y_pred = km_lsa.fit_predict(X_lsa)
    results["LSAKM"] = evaluate_clustering(y_true, y_pred)
    return results
    
# Single run on the balanced DMOZ sample (default path inside the function).
results = run_experiments_DMOZ_dataset()


# Print each method's metrics with 4 decimals.
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")


  .apply(lambda x: x.sample(n=min(N, len(x)), random_state=42))



KM:
Purity: 0.2297
NMI: 0.1517
ARI: 0.0309

SKM:
Purity: 0.2951
NMI: 0.1803
ARI: 0.0766

LSAKM:
Purity: 0.3169
NMI: 0.1880
ARI: 0.0976


In [60]:
# Repeat the DMOZ benchmark n_runs times and report mean / std per metric.
n_runs = 5
# method name -> metric name -> list of values, one per run
aggregated_results = defaultdict(lambda: defaultdict(list))

for run in range(n_runs):
    print(f"\n Running iteration {run + 1}...")
    run_results = run_experiments_DMOZ_dataset()
    for method_name, metrics in run_results.items():
        for metric_name, value in metrics.items():
            aggregated_results[method_name][metric_name].append(value)

print("\n Final Aggregated Results:")
for method_name, metric_values in aggregated_results.items():
    print(f"\n {method_name}")
    for metric_name, values in metric_values.items():
        mean_val = np.mean(values)
        # np.std is called with its default ddof (population std, not sample std).
        std_val = np.std(values)
        print(f"{metric_name}: Mean = {mean_val:.4f}, Std = {std_val:.4f}")


 Running iteration 1...


  .apply(lambda x: x.sample(n=min(N, len(x)), random_state=42))



 Running iteration 2...


  .apply(lambda x: x.sample(n=min(N, len(x)), random_state=42))



 Running iteration 3...


  .apply(lambda x: x.sample(n=min(N, len(x)), random_state=42))



 Running iteration 4...


  .apply(lambda x: x.sample(n=min(N, len(x)), random_state=42))



 Running iteration 5...


  .apply(lambda x: x.sample(n=min(N, len(x)), random_state=42))



 Final Aggregated Results:

 KM
Purity: Mean = 0.2451, Std = 0.0102
NMI: Mean = 0.1670, Std = 0.0088
ARI: Mean = 0.0317, Std = 0.0038

 SKM
Purity: Mean = 0.3039, Std = 0.0035
NMI: Mean = 0.1955, Std = 0.0082
ARI: Mean = 0.0846, Std = 0.0040

 LSAKM
Purity: Mean = 0.3277, Std = 0.0053
NMI: Mean = 0.2019, Std = 0.0067
ARI: Mean = 0.1110, Std = 0.0041


In [44]:
#BBC Sport
def run_clustering(name, X, y, n_clusters):
    """Cluster X three ways and print the evaluation metrics.

    Runs k-means, spherical k-means and k-means on a 7-component
    L2-normalized LSA projection, then prints each method's metrics.
    This variant returns nothing.

    Args:
        name: dataset name, used only in the printed banner.
        X: feature matrix accepted by KMeans / TruncatedSVD.
        y: reference labels.
        n_clusters: number of clusters for every method.
    """
    print(f"\n=== Results on {name} Dataset ===")

    def make_kmeans():
        # Same configuration for the raw and the LSA-space run.
        return KMeans(n_clusters=n_clusters, init='k-means++', n_init=10)

    scores = {}

    # KMeans
    scores['KMeans'] = evaluate_clustering(y, make_kmeans().fit_predict(X))

    # SphericalKMeans
    spherical = SphericalKMeans(n_clusters=n_clusters)
    scores['SphericalKMeans'] = evaluate_clustering(y, spherical.fit_predict(X))

    # LSA + KMeans
    reducer = make_pipeline(TruncatedSVD(n_components=7), Normalizer(copy=False))
    X_reduced = reducer.fit_transform(X)
    scores['LSA+KMeans'] = evaluate_clustering(y, make_kmeans().fit_predict(X_reduced))

    for method in scores:
        print(f"\n{method}:")
        for metric, value in scores[method].items():
            print(f"{metric}: {value:.4f}")


# Load the BBCSport data from header-less CSV files.
# NOTE(review): presumably bbcsport_mtx.csv is a precomputed document-term
# matrix (one row per document) — confirm against the data source.
bbcsport_X = pd.read_csv(r'C:\Users\asus\Downloads\FinalProject\archive (2)\bbcsport_mtx.csv', header=None).values
# First column of the classes file is taken as the reference label per row.
bbcsport_y = pd.read_csv(r'C:\Users\asus\Downloads\FinalProject\archive (2)\bbcsport_classes.csv', header=None)[0].values
# One cluster per distinct class label.
bbcsport_n_clusters = len(np.unique(bbcsport_y))

run_clustering("BBCSport", bbcsport_X, bbcsport_y, bbcsport_n_clusters)





=== Results on BBCSport Dataset ===





KMeans:
Purity: 0.3813
NMI: 0.0582
ARI: 0.0275

SphericalKMeans:
Purity: 0.8046
NMI: 0.6996
ARI: 0.5755

LSA+KMeans:
Purity: 0.7938
NMI: 0.6723
ARI: 0.5817


In [50]:
def run_clustering(name, X, y, n_clusters):
    """Cluster X three ways and return the evaluation metrics.

    Runs k-means, spherical k-means and k-means on a 7-component
    L2-normalized LSA projection. This redefinition replaces the earlier
    printing variant of the same name.

    Args:
        name: dataset name (accepted for call compatibility; not used here).
        X: feature matrix accepted by KMeans / TruncatedSVD.
        y: reference labels.
        n_clusters: number of clusters for every method.

    Returns:
        dict with keys 'KMeans', 'SphericalKMeans' and 'LSA+KMeans', each
        mapping to the metrics dict produced by evaluate_clustering.
    """
    def make_kmeans():
        # Same configuration for the raw and the LSA-space run.
        return KMeans(n_clusters=n_clusters, init='k-means++', n_init=10)

    scores = {}

    # KMeans
    scores['KMeans'] = evaluate_clustering(y, make_kmeans().fit_predict(X))

    # SphericalKMeans
    spherical = SphericalKMeans(n_clusters=n_clusters)
    scores['SphericalKMeans'] = evaluate_clustering(y, spherical.fit_predict(X))

    # LSA + KMeans
    reducer = make_pipeline(TruncatedSVD(n_components=7), Normalizer(copy=False))
    X_reduced = reducer.fit_transform(X)
    scores['LSA+KMeans'] = evaluate_clustering(y, make_kmeans().fit_predict(X_reduced))

    return scores

# Repeat the BBCSport benchmark n_runs times and report mean / std per metric.
# Relies on bbcsport_X / bbcsport_y / bbcsport_n_clusters from an earlier cell.
n_runs = 5
# method name -> metric name -> list of values, one per run
aggregated_results = defaultdict(lambda: defaultdict(list))

for run in range(n_runs):
    print(f"\n Running iteration {run + 1}...")
    run_results = run_clustering("BBCSport", bbcsport_X, bbcsport_y, bbcsport_n_clusters)
    for method_name, metrics in run_results.items():
        for metric_name, value in metrics.items():
            aggregated_results[method_name][metric_name].append(value)

print("\n Final Aggregated Results:")
for method_name, metric_values in aggregated_results.items():
    print(f"\n {method_name}")
    for metric_name, values in metric_values.items():
        mean_val = np.mean(values)
        # np.std is called with its default ddof (population std, not sample std).
        std_val = np.std(values)
        print(f"{metric_name}: Mean = {mean_val:.4f}, Std = {std_val:.4f}")


 Running iteration 1...





 Running iteration 2...





 Running iteration 3...





 Running iteration 4...





 Running iteration 5...





 Final Aggregated Results:

 KMeans
Purity: Mean = 0.4814, Std = 0.0582
NMI: Mean = 0.2235, Std = 0.1057
ARI: Mean = 0.0913, Std = 0.0764

 SphericalKMeans
Purity: Mean = 0.8152, Std = 0.0756
NMI: Mean = 0.6999, Std = 0.0893
ARI: Mean = 0.6538, Std = 0.1111

 LSA+KMeans
Purity: Mean = 0.7938, Std = 0.0000
NMI: Mean = 0.6730, Std = 0.0000
ARI: Mean = 0.5826, Std = 0.0000


In [65]:
#AG dataset
def run_AG_dataset(csv_path="C:/Users/asus/Downloads/FinalProject/archive (4)/train.csv",
                   n_samples=1180):
    """Benchmark KM, SKM and LSA+KM on a random sample of the AG news CSV.

    Args:
        csv_path: path of a latin-1 encoded CSV with 'Title', 'Description'
            and 'Class Index' columns. Defaults to the previously hard-coded
            location.
        n_samples: number of rows to draw (seeded with random_state=42);
            capped at the number of rows available.

    Returns:
        dict with keys "KM", "SKM" and "LSAKM", each mapping to the metrics
        dict produced by evaluate_clustering.
    """
    df = pd.read_csv(csv_path, encoding='latin1')

    # Cap the sample size so small files do not make DataFrame.sample raise.
    df = df.sample(n=min(n_samples, len(df)), random_state=42)

    # fillna('') so a missing title or description does not turn the whole
    # concatenation into NaN (which astype(str) would render as "nan").
    texts = (df['Title'].fillna('') + ' ' + df['Description'].fillna('')).astype(str).tolist()
    labels = df['Class Index'].tolist()
    label_encoder = LabelEncoder()
    y_true = label_encoder.fit_transform(labels)
    n_clusters = len(np.unique(y_true))

    processed_texts = preprocess_text(texts)

    vectorizer = TfidfVectorizer(stop_words='english',
                 min_df=2,
                 max_df=0.95,
                 token_pattern=r'\b[a-zA-Z]{3,}\b')

    X = vectorizer.fit_transform(processed_texts)

    results = {}

    # Plain k-means on the TF-IDF matrix.
    km = KMeans(n_clusters=n_clusters, init='k-means++')
    y_pred = km.fit_predict(X)
    results["KM"] = evaluate_clustering(y_true, y_pred)

    # Spherical k-means.
    skm = SphericalKMeans(n_clusters=n_clusters)
    y_pred = skm.fit_predict(X)
    results["SKM"] = evaluate_clustering(y_true, y_pred)

    # k-means in an L2-normalized LSA space (at most 5 components).
    svd = TruncatedSVD(n_components=min(5, X.shape[1]))
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X_lsa = lsa.fit_transform(X)
    km_lsa = KMeans(n_clusters=n_clusters, init='k-means++')
    y_pred = km_lsa.fit_predict(X_lsa)
    results["LSAKM"] = evaluate_clustering(y_true, y_pred)

    return results


# Single run on the AG news sample (default path inside the function).
results = run_AG_dataset()


# Print each method's metrics with 4 decimals.
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")




KM:
Purity: 0.4034
NMI: 0.1110
ARI: 0.0513

SKM:
Purity: 0.5949
NMI: 0.3060
ARI: 0.2850

LSAKM:
Purity: 0.6695
NMI: 0.3770
ARI: 0.3874




In [40]:
# Repeat the AG news benchmark n_runs times and report mean / std per metric.
n_runs = 5
# method name -> metric name -> list of values, one per run
aggregated_results = defaultdict(lambda: defaultdict(list))

for run in range(n_runs):
    print(f"\n Running iteration {run + 1}...")
    run_results = run_AG_dataset()
    for method_name, metrics in run_results.items():
        for metric_name, value in metrics.items():
            aggregated_results[method_name][metric_name].append(value)

print("\n Final Aggregated Results:")
for method_name, metric_values in aggregated_results.items():
    print(f"\n {method_name}")
    for metric_name, values in metric_values.items():
        mean_val = np.mean(values)
        # np.std is called with its default ddof (population std, not sample std).
        std_val = np.std(values)
        print(f"{metric_name}: Mean = {mean_val:.4f}, Std = {std_val:.4f}")


 Running iteration 1...





 Running iteration 2...





 Running iteration 3...





 Running iteration 4...





 Running iteration 5...





 Final Aggregated Results:

 KM
Purity: Mean = 0.3939, Std = 0.0600
NMI: Mean = 0.0928, Std = 0.0547
ARI: Mean = 0.0602, Std = 0.0395

 SKM
Purity: Mean = 0.5661, Std = 0.1110
NMI: Mean = 0.2732, Std = 0.1228
ARI: Mean = 0.2484, Std = 0.1121

 LSAKM
Purity: Mean = 0.6814, Std = 0.0362
NMI: Mean = 0.3960, Std = 0.0326
ARI: Mean = 0.4067, Std = 0.0430




In [81]:
def run_experiments_webkb_classic(folder_path):
    """Benchmark KM, SKM and LSA+KM on every CSV file in a folder.

    Each CSV must provide a 'raw_text' column (document text) and a 'label'
    column (class label). Files are processed in sorted name order so the
    result ordering is deterministic across platforms (os.listdir alone
    returns entries in arbitrary order).

    Args:
        folder_path: directory containing the dataset CSV files.

    Returns:
        dict mapping each file name without its extension to a dict with
        keys "KM", "SKM" and "LSAKM", each holding the metrics dict produced
        by evaluate_clustering.
    """
    results = {}

    for file in sorted(os.listdir(folder_path)):
        if not file.endswith(".csv"):
            continue

        file_path = os.path.join(folder_path, file)
        print(f"\nProcessing {file}...")

        df = pd.read_csv(file_path)
        texts = df['raw_text'].astype(str).tolist()
        labels = df['label'].tolist()
        y_true = LabelEncoder().fit_transform(labels)
        # One cluster per distinct class label.
        n_clusters = len(np.unique(y_true))

        processed_texts = preprocess_text(texts)

        vectorizer = TfidfVectorizer(
            stop_words='english',
            min_df=2,
            max_df=0.95,
            token_pattern=r'\b[a-zA-Z]{3,}\b'
        )
        X = vectorizer.fit_transform(processed_texts)

        dataset_results = {}

        # Plain k-means on the TF-IDF matrix.
        km = KMeans(n_clusters=n_clusters, init='k-means++')
        y_pred = km.fit_predict(X)
        dataset_results["KM"] = evaluate_clustering(y_true, y_pred)

        # Spherical k-means.
        skm = SphericalKMeans(n_clusters=n_clusters)
        y_pred = skm.fit_predict(X)
        dataset_results["SKM"] = evaluate_clustering(y_true, y_pred)

        # k-means in an L2-normalized LSA space (at most 15 components).
        svd = TruncatedSVD(n_components=min(15, X.shape[1]))
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X_lsa = lsa.fit_transform(X)
        km_lsa = KMeans(n_clusters=n_clusters, init='k-means++')
        y_pred = km_lsa.fit_predict(X_lsa)
        dataset_results["LSAKM"] = evaluate_clustering(y_true, y_pred)

        # splitext strips only the trailing extension; str.replace would
        # also remove a ".csv" occurring in the middle of the name.
        results[os.path.splitext(file)[0]] = dataset_results

    return results

# Single pass over every CSV in the WebKB folder.
results = run_experiments_webkb_classic("C:/Users/asus/Downloads/FinalProject/WebKB")

# results is nested: dataset name -> method name -> metric name -> value.
print("\n=== Final Results ===")
for dataset, algos in results.items():
    print(f"\nDataset: {dataset}")
    for algo_name, metrics in algos.items():
        print(f"  {algo_name}:")
        for metric, value in metrics.items():
            print(f"    {metric}: {value:.4f}")




Processing Cornell.csv...





Processing Texas.csv...





Processing Washington.csv...





Processing Wisconsin.csv...

=== Final Results ===

Dataset: Cornell
  KM:
    Purity: 0.4974
    NMI: 0.1517
    ARI: 0.0869
  SKM:
    Purity: 0.6754
    NMI: 0.3468
    ARI: 0.2454
  LSAKM:
    Purity: 0.6545
    NMI: 0.3534
    ARI: 0.3402

Dataset: Texas
  KM:
    Purity: 0.5668
    NMI: 0.1233
    ARI: 0.1482
  SKM:
    Purity: 0.6631
    NMI: 0.2494
    ARI: 0.1682
  LSAKM:
    Purity: 0.7112
    NMI: 0.3703
    ARI: 0.2740

Dataset: Washington
  KM:
    Purity: 0.6638
    NMI: 0.3156
    ARI: 0.2777
  SKM:
    Purity: 0.6812
    NMI: 0.2806
    ARI: 0.3069
  LSAKM:
    Purity: 0.7293
    NMI: 0.3776
    ARI: 0.3296

Dataset: Wisconsin
  KM:
    Purity: 0.6566
    NMI: 0.2904
    ARI: 0.1724
  SKM:
    Purity: 0.6792
    NMI: 0.2966
    ARI: 0.2953
  LSAKM:
    Purity: 0.6906
    NMI: 0.3569
    ARI: 0.3090




In [95]:
import numpy as np
from collections import defaultdict

# Repeat the WebKB benchmark n_runs times and report mean / std per metric.
n_runs = 5
# dataset name -> method name -> metric name -> list of values, one per run
aggregated_results = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

for run in range(n_runs):
    print(f"\n Running iteration {run + 1}...")
    run_results = run_experiments_webkb_classic("C:/Users/asus/Downloads/FinalProject/WebKB")

    
    for dataset_name, methods in run_results.items():
        for method_name, metrics in methods.items():
            for metric_name, value in metrics.items():
                aggregated_results[dataset_name][method_name][metric_name].append(value)

print("\nFinal Aggregated Results:")
for dataset_name, methods in aggregated_results.items():
    print(f"\n=== {dataset_name} ===")
    for method_name, metrics in methods.items():
        print(f"\nMethod: {method_name}")
        for metric_name, values in metrics.items():
            mean_val = np.mean(values)
            # np.std is called with its default ddof (population std, not sample std).
            std_val = np.std(values)
            print(f"{metric_name}: Mean = {mean_val:.4f}, Std = {std_val:.4f}")



 Running iteration 1...

Processing Cornell.csv...





Processing Texas.csv...





Processing Washington.csv...





Processing Wisconsin.csv...





 Running iteration 2...

Processing Cornell.csv...





Processing Texas.csv...





Processing Washington.csv...





Processing Wisconsin.csv...





 Running iteration 3...

Processing Cornell.csv...





Processing Texas.csv...





Processing Washington.csv...





Processing Wisconsin.csv...





 Running iteration 4...

Processing Cornell.csv...





Processing Texas.csv...





Processing Washington.csv...





Processing Wisconsin.csv...





 Running iteration 5...

Processing Cornell.csv...





Processing Texas.csv...





Processing Washington.csv...





Processing Wisconsin.csv...





Final Aggregated Results:

=== Cornell ===

Method: KM
Purity: Mean = 0.6168, Std = 0.0493
NMI: Mean = 0.3071, Std = 0.0602
ARI: Mean = 0.2069, Std = 0.0621

Method: SKM
Purity: Mean = 0.6461, Std = 0.0240
NMI: Mean = 0.3173, Std = 0.0397
ARI: Mean = 0.2366, Std = 0.0333

Method: LSAKM
Purity: Mean = 0.6712, Std = 0.0130
NMI: Mean = 0.3809, Std = 0.0286
ARI: Mean = 0.3389, Std = 0.0625

=== Texas ===

Method: KM
Purity: Mean = 0.6257, Std = 0.0251
NMI: Mean = 0.1910, Std = 0.0450
ARI: Mean = 0.1749, Std = 0.0662

Method: SKM
Purity: Mean = 0.6727, Std = 0.0520
NMI: Mean = 0.2316, Std = 0.0940
ARI: Mean = 0.2235, Std = 0.1234

Method: LSAKM
Purity: Mean = 0.6877, Std = 0.0307
NMI: Mean = 0.3354, Std = 0.0440
ARI: Mean = 0.2279, Std = 0.0464

=== Washington ===

Method: KM
Purity: Mean = 0.6568, Std = 0.0446
NMI: Mean = 0.2801, Std = 0.0638
ARI: Mean = 0.2540, Std = 0.0977

Method: SKM
Purity: Mean = 0.6934, Std = 0.0160
NMI: Mean = 0.3109, Std = 0.0400
ARI: Mean = 0.2733, Std = 0.0295


In [85]:
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
import pyswarms as ps
from deap import base, creator, tools, algorithms



class ImprovedGAKM_NMI:
    """Genetic search for cluster centres, scored by NMI against known labels.

    Every individual is an ``(n_clusters, n_features)`` array of candidate
    centres. Its fitness is the normalized mutual information between
    ``y_true`` and the nearest-centre assignment of the data.

    NOTE(review): the fitness reads ``y_true``, so the search is supervised;
    metrics later computed against the same ``y_true`` are optimistically
    biased.

    Parameters
    ----------
    n_clusters : int
        Number of centres per individual.
    y_true : array-like
        Reference labels used by the fitness function.
    pop_size : int, default 20
        Number of individuals kept per generation.
    n_gen : int, default 50
        Number of generations.
    mutation_rate : float, default 0.2
        Probability that a newly created child is perturbed with Gaussian noise.
    """

    def __init__(self, n_clusters, y_true, pop_size=20, n_gen=50, mutation_rate=0.2):
        self.n_clusters = n_clusters
        self.pop_size = pop_size
        self.n_gen = n_gen
        self.y_true = y_true
        self.mutation_rate = mutation_rate

    @staticmethod
    def _assign(centers, X):
        """Return, for every row of ``X``, the index of its nearest centre (Euclidean)."""
        distances = np.array([np.linalg.norm(X - center, axis=1) for center in centers])
        return np.argmin(distances, axis=0)

    def _evaluate(self, centers, X):
        """Return the NMI of the assignment induced by ``centers``.

        Returns ``-1`` when the assignment uses fewer than ``n_clusters``
        distinct labels, so degenerate individuals rank last.
        """
        labels = self._assign(centers, X)
        if len(np.unique(labels)) < self.n_clusters:
            return -1
        return normalized_mutual_info_score(self.y_true, labels)

    def fit_predict(self, X):
        """Run the genetic search and return nearest-centre labels for ``X``.

        ``X`` may be a dense array or any object exposing ``toarray()``.
        Returns a 1-D integer array with one label per row of ``X``.
        """
        X = X.toarray() if hasattr(X, 'toarray') else X
        n_samples = X.shape[0]

        # A single cluster has no crossover point; the answer is trivial.
        if self.n_clusters == 1:
            return np.zeros(n_samples, dtype=int)

        # Initial individuals are random subsets of the data rows.
        population = [
            X[np.random.choice(n_samples, self.n_clusters, replace=False)]
            for _ in range(self.pop_size)
        ]

        best_centers = None
        # Start below the -1 penalty so that even an all-degenerate first
        # generation records a best individual instead of leaving None.
        best_score = -np.inf

        for _ in range(self.n_gen):
            scores = [self._evaluate(centers, X) for centers in population]
            order = np.argsort(scores)[::-1]
            population = [population[i] for i in order]

            top_score = scores[order[0]]
            if top_score > best_score:
                best_score = top_score
                best_centers = population[0]

            # Elitism: the better half survives unchanged and breeds the rest.
            parents = population[:self.pop_size // 2]
            new_population = list(parents)

            while len(new_population) < self.pop_size:
                # numpy's RNG is used throughout so the class does not rely on
                # the stdlib `random` module having been imported by a later cell.
                first, second = np.random.choice(len(parents), 2, replace=False)
                p1, p2 = parents[first], parents[second]

                # Single-point crossover on whole centres.
                crossover_point = np.random.randint(1, self.n_clusters)
                child = np.vstack((p1[:crossover_point], p2[crossover_point:]))

                if np.random.random() < self.mutation_rate:
                    noise = np.random.normal(0, 0.1, child.shape)
                    # NOTE(review): clipping to [0, 1] presumes features in that
                    # range (e.g. TF-IDF); confirm for other inputs.
                    child = np.clip(child + noise, 0, 1)

                new_population.append(child)

            population = new_population

        if best_centers is None:
            # Only reachable when n_gen == 0: fall back to an initial individual.
            best_centers = population[0]

        return self._assign(best_centers, X)


class ImprovedGAKM:
    """Genetic search for cluster centres, scored by the silhouette coefficient.

    Every individual is an ``(n_clusters, n_features)`` array of candidate
    centres; its fitness is ``silhouette_score`` of the nearest-centre
    assignment. Individuals that leave a cluster empty score ``-inf``.

    Parameters
    ----------
    n_clusters : int
        Number of centres per individual.
    pop_size : int, default 15
        Number of individuals per generation.
    n_gen : int, default 30
        Number of generations.
    mutation_rate : float, default 0.2
        Probability that a newly created child is perturbed with Gaussian noise.
    """

    def __init__(self, n_clusters, pop_size=15, n_gen=30, mutation_rate=0.2):
        self.n_clusters = n_clusters
        self.pop_size = pop_size
        self.n_gen = n_gen
        self.mutation_rate = mutation_rate

    @staticmethod
    def _assign(centers, X):
        """Return, for every row of ``X``, the index of its nearest centre (Euclidean)."""
        distances = np.array([np.linalg.norm(X - center, axis=1) for center in centers])
        return np.argmin(distances, axis=0)

    def fit_predict(self, X):
        """Run the genetic search and return nearest-centre labels for ``X``.

        ``X`` may be a dense array or any object exposing ``toarray()``.
        If no individual ever produces ``n_clusters`` non-empty clusters, the
        labels of an arbitrary surviving individual are returned (they then
        use fewer than ``n_clusters`` distinct values).
        """
        X = X.toarray() if hasattr(X, 'toarray') else X
        n_samples, n_features = X.shape

        if self.n_clusters == 1:
            return np.zeros(n_samples, dtype=int)

        # Initial individuals are random subsets of the data rows.
        population = [
            X[np.random.choice(n_samples, self.n_clusters, replace=False)]
            for _ in range(self.pop_size)
        ]

        best_centers = None
        best_score = -np.inf

        for _ in range(self.n_gen):
            scores = []
            for centers in population:
                labels = self._assign(centers, X)

                if len(np.unique(labels)) == self.n_clusters:
                    score = silhouette_score(X, labels)
                    scores.append(score)

                    if score > best_score:
                        best_score = score
                        best_centers = centers
                else:
                    scores.append(-np.inf)

            # `-self.pop_size//2` parses as `(-pop_size)//2`, i.e. the
            # ceil(pop_size / 2) best-scoring individuals survive.
            selected_indices = np.argsort(scores)[-self.pop_size//2:]
            new_population = [population[i] for i in selected_indices]
            n_survivors = len(new_population)

            while len(new_population) < self.pop_size:
                # numpy's RNG is used so the class does not rely on the stdlib
                # `random` module having been imported by a later cell.
                # Only the survivors breed; children never become parents
                # within the same generation.
                first, second = np.random.choice(n_survivors, 2, replace=False)
                parent1, parent2 = new_population[first], new_population[second]

                crossover_point = np.random.randint(1, self.n_clusters)
                child = np.vstack((
                    parent1[:crossover_point],
                    parent2[crossover_point:]
                ))

                if np.random.random() < self.mutation_rate:
                    # Out-of-place add: an in-place `+=` raises a casting
                    # error when X (and therefore child) has an integer dtype.
                    child = child + np.random.normal(0, 0.5, size=child.shape)

                new_population.append(child)

            population = new_population

        if best_centers is None:
            # No valid individual was ever seen (or n_gen == 0); fall back
            # rather than failing on `None` below.
            best_centers = population[0]

        return self._assign(best_centers, X)


In [21]:
#20newsgroup
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
import pyswarms as ps
from deap import base, creator, tools, algorithms
import random




def run_experiments():
    """Cluster five 20-Newsgroups topic groups with GAKM and collect the metrics.

    For each topic group the posts are fetched (headers, footers and quotes
    removed), passed through ``preprocess_text``, TF-IDF vectorised and
    clustered with ``ImprovedGAKM_NMI`` using one cluster per newsgroup.

    Returns a dict keyed ``"<group>_<algorithm>"`` whose values are the dicts
    produced by ``evaluate_clustering``; a failed run is recorded as all-zero
    metrics after printing the error.
    """
    topic_groups = {
        'Computer': ['comp.graphics', 'comp.os.ms-windows.misc',
                     'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
                     'comp.windows.x'],
        'Politics': ['talk.politics.misc', 'talk.politics.guns',
                     'talk.politics.mideast'],
        'Miscellaneous': ['misc.forsale', 'talk.politics.misc', 'talk.religion.misc', 'comp.os.ms-windows.misc'],
        'Religion': ['talk.religion.misc', 'alt.atheism', 'soc.religion.christian'],
        'Science': ['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'],
    }
    tfidf_options = {
        'stop_words': 'english',
        'min_df': 2,
        'max_df': 0.95,
        'token_pattern': r'\b[a-zA-Z]{3,}\b',
    }

    results = {}

    for name, subcats in topic_groups.items():
        print(f"\n=== Processing {name} category ===")
        dataset = fetch_20newsgroups(
            subset='all',
            categories=subcats,
            remove=('headers', 'footers', 'quotes'),
        )
        n_clusters = len(subcats)

        cleaned_docs = preprocess_text(dataset.data)
        X = TfidfVectorizer(**tfidf_options).fit_transform(cleaned_docs)
        y_true = dataset.target

        candidates = {
            'GAKM': ImprovedGAKM_NMI(n_clusters=n_clusters, y_true=y_true),
        }

        for label, model in candidates.items():
            result_key = f"{name}_{label}"
            try:
                print(f"Running {label}...")
                predicted = model.fit_predict(X)
                results[result_key] = evaluate_clustering(y_true, predicted)
                print(f"{label} completed for {name}")
            except Exception as err:
                # Best effort: report the failure and keep going with zeros.
                print(f"Error in {label} for {name}: {str(err)}")
                results[result_key] = {'Purity': 0, 'NMI': 0, 'ARI': 0}

    return results


# Run the 20-Newsgroups GAKM experiment and print every metric to four
# decimals. `results` is left behind as a notebook-level variable.
if __name__ == "__main__":
    results = run_experiments()
    print("\n=== Final Results ===")
    for name, metrics in results.items():
        print(f"\n{name}:")
        for metric, value in metrics.items():
            print(f"{metric}: {value:.4f}")


=== Processing Computer category ===
Running GAKM...
GAKM completed for Computer

=== Processing Politics category ===
Running GAKM...
GAKM completed for Politics

=== Processing Miscellaneous category ===
Running GAKM...
GAKM completed for Miscellaneous

=== Processing Religion category ===
Running GAKM...
GAKM completed for Religion

=== Processing Science category ===
Running GAKM...
GAKM completed for Science

=== Final Results ===

Computer_GAKM:
Purity: 0.3682
NMI: 0.0778
ARI: 0.0549

Politics_GAKM:
Purity: 0.5337
NMI: 0.1119
ARI: 0.1156

Miscellaneous_GAKM:
Purity: 0.5822
NMI: 0.2308
ARI: 0.2310

Religion_GAKM:
Purity: 0.4719
NMI: 0.0338
ARI: 0.0443

Science_GAKM:
Purity: 0.4643
NMI: 0.1082
ARI: 0.1011


In [25]:
#BBC news
def run_experiments_BBC_dataset():
    """Cluster the BBC news CSV with GAKM and return the evaluation metrics.

    Reads the ``news`` (text) and ``type`` (label) columns, label-encodes the
    types, preprocesses and TF-IDF vectorises the texts, then runs
    ``ImprovedGAKM_NMI`` with one cluster per distinct label.

    Returns a dict mapping the algorithm name to the dict produced by
    ``evaluate_clustering``; a failed run is recorded as all-zero metrics.
    """
    frame = pd.read_csv("C:/Users/asus/Downloads/FinalProject/dataset.csv", encoding='latin1')

    raw_texts = frame['news'].astype(str).tolist()
    raw_labels = frame['type'].astype(str).tolist()
    y_true = LabelEncoder().fit_transform(raw_labels)
    n_clusters = len(np.unique(y_true))

    tfidf = TfidfVectorizer(
        stop_words='english',
        min_df=2,
        max_df=0.95,
        token_pattern=r'\b[a-zA-Z]{3,}\b',
    )
    X = tfidf.fit_transform(preprocess_text(raw_texts))

    results = {}
    candidates = {
        'GAKM': ImprovedGAKM_NMI(n_clusters=n_clusters, y_true=y_true),
    }

    for label, model in candidates.items():
        try:
            print(f"Running {label}...")
            predicted = model.fit_predict(X)
            results[f"{label}"] = evaluate_clustering(y_true, predicted)
            print(f"{label} completed")
        except Exception as err:
            # Best effort: report the failure and record zeros.
            print(f"Error in {label} : {str(err)}")
            results[f"{label}"] = {'Purity': 0, 'NMI': 0, 'ARI': 0}

    return results


# Run the BBC news GAKM experiment; `results` stays a notebook-level variable.
results = run_experiments_BBC_dataset()

# Print every metric of every method to four decimals.
print("\n=== Final Results ===")
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")


Running GAKM...
GAKM completed

=== Final Results ===

GAKM:
Purity: 0.6643
NMI: 0.4178
ARI: 0.4109


In [27]:
#SMS
# Load the SMS spam CSV, keeping only the label column (v1) and the text column (v2).
df = pd.read_csv("C:/Users/asus/Downloads/FinalProject/archive (3)/spam.csv", encoding='latin1')[['v1', 'v2']]
df.columns = ['label', 'text']

# Clean the messages with the shared preprocessing helper.
processed_texts = preprocess_text(df['text'].tolist())


# TF-IDF over alphabetic tokens of 3+ letters that occur in at least two
# documents and in at most 95% of them.
vectorizer = TfidfVectorizer(stop_words='english',
             min_df=2,
             max_df=0.95,
             token_pattern=r'\b[a-zA-Z]{3,}\b')

X = vectorizer.fit_transform(processed_texts)

# Encode the labels as integers; a label outside the mapping becomes NaN.
# NOTE: this overwrites df['label'] in place, so the mapping is not re-runnable
# on the same `df`.
label_mapping = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(label_mapping)

y_true = df['label'].tolist()

# One cluster per distinct label value.
n_clusters = len(np.unique(y_true))

results = {}

# Methods to evaluate, keyed by the name used in the printed report.
algorithms = {
    'GAKM': ImprovedGAKM_NMI(n_clusters=n_clusters, y_true=y_true)
            
}
        
for algo_name, algo in algorithms.items():
    try:
        print(f"Running {algo_name}...")
        y_pred = algo.fit_predict(X)
        results[f"{algo_name}"] = evaluate_clustering(y_true, y_pred)
        print(f"{algo_name} completed")
    except Exception as e:
        # Best effort: report the failure and record zeros instead of stopping.
        print(f"Error in {algo_name} : {str(e)}")
        results[f"{algo_name}"] = {'Purity': 0, 'NMI': 0, 'ARI': 0}
    

# Print every metric of every method to four decimals.
print("\n=== Final Results ===")
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")


Running GAKM...
GAKM completed

=== Final Results ===

GAKM:
Purity: 0.8659
NMI: 0.1133
ARI: 0.2713


In [29]:
#DMOZ dataset

# Load the DMOZ CSV and rename its columns to the names used below.
df = pd.read_csv("C:/Users/asus/Downloads/FinalProject/archive (5)/dmoz.csv", encoding='latin1')
df = df.rename(columns={
            'category': 'Class Index',
            'title': 'Title',
            'desc': 'Description'
        })

# Keep at most N rows per class (all rows of classes smaller than N).
N = 300
# Sample every class separately and concatenate the pieces. This avoids
# `groupby(...).apply(...)` operating on the grouping column, which is
# deprecated and, in newer pandas, drops 'Class Index' from the result even
# though it is read again below.
df_balanced = pd.concat(
    [
        group.sample(n=min(N, len(group)), random_state=42)
        for _, group in df.groupby('Class Index')
    ]
).reset_index(drop=True)


# Replace missing titles/descriptions with '' before concatenating; otherwise a
# single NaN makes the whole sum NaN and the document text becomes 'nan'.
texts = (
    df_balanced['Title'].fillna('').astype(str)
    + ' '
    + df_balanced['Description'].fillna('').astype(str)
).tolist()
labels = df_balanced['Class Index'].tolist()


processed_texts = preprocess_text(texts)


# TF-IDF over alphabetic tokens of 3+ letters that occur in at least two
# documents and in at most 95% of them.
vectorizer = TfidfVectorizer(stop_words='english',
             min_df=2,
             max_df=0.95,
             token_pattern=r'\b[a-zA-Z]{3,}\b')

X = vectorizer.fit_transform(processed_texts)

label_encoder = LabelEncoder()
y_true = label_encoder.fit_transform(labels)

# One cluster per distinct class.
n_clusters = len(np.unique(y_true))

results = {}

# Methods to evaluate, keyed by the name used in the printed report.
algorithms = {
    'GAKM': ImprovedGAKM_NMI(n_clusters=n_clusters, y_true=y_true)
}

for algo_name, algo in algorithms.items():
    try:
        print(f"Running {algo_name}...")
        y_pred = algo.fit_predict(X)
        results[f"{algo_name}"] = evaluate_clustering(y_true, y_pred)
        print(f"{algo_name} completed")
    except Exception as e:
        # Best effort: report the failure and record zeros instead of stopping.
        print(f"Error in {algo_name} : {str(e)}")
        results[f"{algo_name}"] = {'Purity': 0, 'NMI': 0, 'ARI': 0}


# Print every metric of every method to four decimals.
print("\n=== Final Results ===")
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")


  .apply(lambda x: x.sample(n=min(N, len(x)), random_state=42))


Running GAKM...
GAKM completed

=== Final Results ===

GAKM:
Purity: 0.2100
NMI: 0.0932
ARI: 0.0329


In [31]:
#BBC Sport
def run_GAKM_BBCS(name, X, y_true, n_clusters):
    """Run GAKM on a ready-made feature matrix and print the metrics.

    Parameters
    ----------
    name : str
        Dataset name, used only in the printed header.
    X : array-like
        Feature matrix handed to ``ImprovedGAKM_NMI.fit_predict``.
    y_true : array-like
        Reference labels (used both by the GAKM fitness and for evaluation).
    n_clusters : int
        Number of clusters to search for.

    Prints the results and returns ``None``.
    """
    print(f"\n=== Results on {name} Dataset ===")

    candidates = {
        'GAKM': ImprovedGAKM_NMI(n_clusters=n_clusters, y_true=y_true),
    }
    results = {}

    for label, model in candidates.items():
        try:
            print(f"Running {label}...")
            predicted = model.fit_predict(X)
            results[f"{label}"] = evaluate_clustering(y_true, predicted)
            print(f"{label} completed")
        except Exception as err:
            # Best effort: report the failure and record zeros.
            print(f"Error in {label} : {str(err)}")
            results[f"{label}"] = {'Purity': 0, 'NMI': 0, 'ARI': 0}

    for method, scores in results.items():
        print(f"\n{method}:")
        for metric_name, metric_value in scores.items():
            print(f"{metric_name}: {metric_value:.4f}")


# Load the BBCSport feature matrix and class column from header-less CSV files.
# NOTE(review): presumably one row per document in both files — confirm the
# matrix orientation against the files.
bbcsport_X = pd.read_csv(r'C:\Users\asus\Downloads\FinalProject\archive (2)\bbcsport_mtx.csv', header=None).values
bbcsport_y = pd.read_csv(r'C:\Users\asus\Downloads\FinalProject\archive (2)\bbcsport_classes.csv', header=None)[0].values
# One cluster per distinct class value.
bbcsport_n_clusters = len(np.unique(bbcsport_y))

run_GAKM_BBCS("BBCSport", bbcsport_X, bbcsport_y, bbcsport_n_clusters)


=== Results on BBCSport Dataset ===
Running GAKM...
GAKM completed

GAKM:
Purity: 0.6133
NMI: 0.3077
ARI: 0.2559


In [37]:
#AG dataset
def run_AG_dataset():
    """Cluster a class-balanced sample of the AG news training CSV with GAKM.

    At most 295 rows per class are sampled (fewer if the smallest class is
    smaller), ``Title`` and ``Description`` are joined into one text, and the
    TF-IDF matrix is clustered with ``ImprovedGAKM_NMI`` using one cluster per
    class.

    Returns a dict mapping the algorithm name to the dict produced by
    ``evaluate_clustering``; a failed run is recorded as all-zero metrics.
    """
    df = pd.read_csv("C:/Users/asus/Downloads/FinalProject/archive (4)/train.csv", encoding='latin1')

    class_counts = df['Class Index'].value_counts()
    min_class_count = class_counts.min()

    samples_per_class = min(295, min_class_count)

    # Sample every class separately and concatenate the pieces. This avoids
    # `groupby(...).apply(...)` operating on the grouping column, which is
    # deprecated and, in newer pandas, drops 'Class Index' from the result
    # even though it is read again below.
    df = pd.concat([
        group.sample(n=samples_per_class, random_state=42)
        for _, group in df.groupby('Class Index')
    ])
    texts = (df['Title'] + ' ' + df['Description']).astype(str).tolist()
    labels = df['Class Index'].tolist()
    label_encoder = LabelEncoder()
    y_true = label_encoder.fit_transform(labels)
    n_clusters = len(np.unique(y_true))

    processed_texts = preprocess_text(texts)

    # TF-IDF over alphabetic tokens of 3+ letters that occur in at least two
    # documents and in at most 95% of them.
    vectorizer = TfidfVectorizer(stop_words='english',
                 min_df=2,
                 max_df=0.95,
                 token_pattern=r'\b[a-zA-Z]{3,}\b')

    X = vectorizer.fit_transform(processed_texts)

    results = {}

    algorithms = {
        'GAKM': ImprovedGAKM_NMI(n_clusters=n_clusters, y_true=y_true)
    }

    for algo_name, algo in algorithms.items():
        try:
            print(f"Running {algo_name}...")
            y_pred = algo.fit_predict(X)
            results[f"{algo_name}"] = evaluate_clustering(y_true, y_pred)
            print(f"{algo_name} completed")
        except Exception as e:
            # Best effort: report the failure and record zeros.
            print(f"Error in {algo_name} : {str(e)}")
            results[f"{algo_name}"] = {'Purity': 0, 'NMI': 0, 'ARI': 0}

    return results


# Run the AG news experiment; `results` stays a notebook-level variable.
results = run_AG_dataset()


# Print every metric of every method to four decimals.
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

  df = df.groupby('Class Index', group_keys=False).apply(


Running GAKM...
GAKM completed

GAKM:
Purity: 0.4415
NMI: 0.1009
ARI: 0.0767


In [89]:
import os
import random

def run_experiments_webkb_dataset(webkb_folder_path):
    """Run GAKM on every ``.csv`` file found directly inside a folder.

    Each file must provide a ``raw_text`` column (documents) and a ``label``
    column (classes). The texts are preprocessed, TF-IDF vectorised and
    clustered with ``ImprovedGAKM_NMI`` using one cluster per distinct label.

    Parameters
    ----------
    webkb_folder_path : str
        Directory containing the per-dataset CSV files.

    Returns
    -------
    dict
        Maps the file name without its ``.csv`` extension to the dict produced
        by ``evaluate_clustering``; a failed run is recorded as all-zero
        metrics after printing the error.
    """
    results = {}

    # os.listdir returns entries in arbitrary order; sort so that processing
    # and reporting order are the same on every run and platform.
    for file in sorted(os.listdir(webkb_folder_path)):
        if not file.endswith(".csv"):
            continue

        file_path = os.path.join(webkb_folder_path, file)
        print(f"\nProcessing {file}...")

        df = pd.read_csv(file_path)
        texts = df['raw_text'].astype(str).tolist()
        labels = df['label'].tolist()
        y_true = LabelEncoder().fit_transform(labels)
        n_clusters = len(np.unique(y_true))

        processed_texts = preprocess_text(texts)

        # TF-IDF over alphabetic tokens of 3+ letters that occur in at least
        # two documents and in at most 95% of them.
        vectorizer = TfidfVectorizer(
            stop_words='english',
            min_df=2,
            max_df=0.95,
            token_pattern=r'\b[a-zA-Z]{3,}\b'
        )
        X = vectorizer.fit_transform(processed_texts)

        try:
            algo = ImprovedGAKM_NMI(n_clusters=n_clusters, y_true=y_true)
            print(f"Running GAKM on {file}...")
            y_pred = algo.fit_predict(X)
            metrics = evaluate_clustering(y_true, y_pred)
            # The message used to say "SCPSO" although GAKM is what ran here.
            print(f"GAKM completed on {file}")
        except Exception as e:
            # Best effort: report the failure and record zeros.
            print(f"Error in GAKM on {file}: {str(e)}")
            metrics = {'Purity': 0, 'NMI': 0, 'ARI': 0}

        # Strip only the trailing extension (str.replace would remove every
        # ".csv" occurrence inside the name).
        dataset_name = os.path.splitext(file)[0]
        results[dataset_name] = metrics

    return results

# Run GAKM on every WebKB CSV in the folder; `results` stays a notebook-level variable.
results = run_experiments_webkb_dataset("C:/Users/asus/Downloads/FinalProject/WebKB")

# Print every metric of every dataset to four decimals.
print("\n=== Final Results on WebKB ===")
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")


Processing Cornell.csv...
Running GAKM on Cornell.csv...
SCPSO completed on Cornell.csv

Processing Texas.csv...
Running GAKM on Texas.csv...
SCPSO completed on Texas.csv

Processing Washington.csv...
Running GAKM on Washington.csv...
SCPSO completed on Washington.csv

Processing Wisconsin.csv...
Running GAKM on Wisconsin.csv...
SCPSO completed on Wisconsin.csv

=== Final Results on WebKB ===

Cornell:
Purity: 0.6963
NMI: 0.3588
ARI: 0.3216

Texas:
Purity: 0.7112
NMI: 0.3225
ARI: 0.4426

Washington:
Purity: 0.6987
NMI: 0.2782
ARI: 0.2115

Wisconsin:
Purity: 0.7170
NMI: 0.3202
ARI: 0.3210


In [73]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_kernels
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import normalize


class SCPSO:
    """Spectral clustering whose centroids are found by particle swarm optimisation.

    The data are embedded with eigenvectors of the symmetric normalized graph
    Laplacian built from an RBF affinity; a swarm of candidate centroid sets
    is then optimised in that embedding and every point is assigned to its
    nearest centroid.

    Parameters
    ----------
    n_clusters : int, default 4
        Number of clusters (and of embedding dimensions).
    n_particles : int, default 20
        Swarm size.
    max_iter : int, default 100
        Number of PSO iterations.
    w, c1, c2 : float
        Inertia, cognitive and social coefficients of the velocity update.
    gamma : float, default 0.01
        Width parameter passed to ``rbf_kernel``.
    """

    def __init__(self, n_clusters=4, n_particles=20, max_iter=100, w=0.7, c1=1.5, c2=1.5, gamma=0.01):
        self.n_clusters = n_clusters
        self.n_particles = n_particles
        self.max_iter = max_iter
        self.w = w
        self.c1 = c1
        self.c2 = c2
        self.gamma = gamma

    @staticmethod
    def _centroid_distances(X_embedding, centroids):
        """Return the (n_samples, n_centroids) matrix of Euclidean distances."""
        return np.linalg.norm(X_embedding[:, None, :] - centroids[None, :, :], axis=2)

    def _compute_laplacian(self, X):
        """Return the symmetric normalized Laplacian ``I - D^-1/2 W D^-1/2``.

        ``W`` is the RBF affinity of ``X``. The degree matrix is diagonal, so
        its inverse square root is applied as a row/column scaling instead of
        building and inverting a dense n x n matrix.
        """
        W = rbf_kernel(X, gamma=self.gamma)
        inv_sqrt_degree = 1.0 / np.sqrt(W.sum(axis=1))
        normalized_affinity = W * inv_sqrt_degree[:, None] * inv_sqrt_degree[None, :]
        return np.eye(W.shape[0]) - normalized_affinity

    def _compute_embedding(self, L):
        """Return the row-normalized spectral embedding of ``L``.

        Uses the eigenvectors belonging to the 2nd..(n_clusters + 1)-th
        smallest eigenvalues (the first one is skipped).
        """
        # L is symmetric up to rounding; eigh guarantees real eigenpairs in
        # ascending eigenvalue order, whereas the general eig solver may
        # return complex values and gives no ordering.
        symmetric_L = (L + L.T) / 2.0
        _, eigvecs = np.linalg.eigh(symmetric_L)
        embedding = eigvecs[:, 1:self.n_clusters + 1]
        return normalize(embedding)

    def _pso_init(self, X_embedding):
        """Create the initial swarm.

        Returns ``(particles, velocities, pbest, pbest_scores)`` where each
        particle is an ``(n_clusters, n_features)`` centroid set drawn
        uniformly from [0, 1), velocities start at zero and personal-best
        scores start at ``-inf``.
        """
        # NOTE(review): the embedding rows are normalized and may contain
        # negative coordinates, while particles start in [0, 1) — confirm this
        # initial range is intended.
        n_features = X_embedding.shape[1]
        particles = np.random.rand(self.n_particles, self.n_clusters, n_features)
        velocities = np.zeros_like(particles)
        pbest = particles.copy()
        pbest_scores = np.full(self.n_particles, -np.inf)
        return particles, velocities, pbest, pbest_scores

    def _compute_fitness(self, X_embedding, centroids):
        """Score a centroid set; larger is better.

        The score is the mean (over clusters) distance of a cluster's members
        to their nearest *other* centroid, divided by the mean distance of
        members to their own centroid. Centroid sets that leave any cluster
        empty score ``-inf``.
        """
        distances = self._centroid_distances(X_embedding, centroids)
        labels = np.argmin(distances, axis=1)

        # Every cluster must own at least one point; this also guarantees the
        # per-cluster means below are taken over non-empty selections.
        if len(np.unique(labels)) < self.n_clusters:
            return -np.inf

        intra_dist = np.mean([np.mean(distances[labels == i, i]) for i in range(self.n_clusters)])

        inter_distances = []
        for i in range(self.n_clusters):
            mask = labels == i
            # `or [np.inf]` covers n_clusters == 1, where no other centroid exists.
            min_dist = np.min([np.mean(distances[mask, j]) for j in range(self.n_clusters) if j != i] or [np.inf])
            inter_distances.append(min_dist)

        inter_dist = np.mean(inter_distances)
        # The small constant avoids division by zero when all members sit on
        # their centroid.
        return inter_dist / (intra_dist + 1e-10)

    def _pso_optimize(self, X_embedding):
        """Run the swarm for ``max_iter`` iterations and return the best centroid set.

        If no particle ever obtains a finite score, the first initial particle
        is returned.
        """
        particles, velocities, pbest, pbest_scores = self._pso_init(X_embedding)
        gbest = pbest[0].copy()
        gbest_score = -np.inf

        for _ in range(self.max_iter):
            # Evaluate the swarm and refresh personal/global bests.
            for i in range(self.n_particles):
                current_score = self._compute_fitness(X_embedding, particles[i])

                if current_score > pbest_scores[i]:
                    pbest_scores[i] = current_score
                    pbest[i] = particles[i].copy()

                if current_score > gbest_score:
                    gbest_score = current_score
                    gbest = particles[i].copy()

            # Standard velocity/position update with one random scalar per term.
            for i in range(self.n_particles):
                r1, r2 = np.random.rand(2)
                velocities[i] = (self.w * velocities[i] +
                                 self.c1 * r1 * (pbest[i] - particles[i]) +
                                 self.c2 * r2 * (gbest - particles[i]))
                particles[i] += velocities[i]

        return gbest

    def fit_predict(self, X):
        """Cluster ``X`` and return one integer label per row.

        ``X`` may be a dense array or any object exposing ``toarray()``.
        """
        if hasattr(X, 'toarray'):
            X = X.toarray()

        L = self._compute_laplacian(X)
        X_embedding = self._compute_embedding(L)
        centroids = self._pso_optimize(X_embedding)

        distances = self._centroid_distances(X_embedding, centroids)
        return np.argmin(distances, axis=1)

In [75]:
from sklearn.metrics import normalized_mutual_info_score
import numpy as np
from tqdm import tqdm  

def find_optimal_gamma(X, y_true, n_clusters, gamma_range):
    """Pick the RBF ``gamma`` whose SCPSO clustering has the highest NMI.

    A fresh ``SCPSO`` (default swarm settings) is fitted for every candidate
    and scored with ``normalized_mutual_info_score`` against ``y_true``.

    NOTE(review): the selection uses the ground-truth labels, so metrics later
    reported for the chosen gamma on the same data are optimistically biased.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``SCPSO.fit_predict``.
    y_true : array-like
        Reference labels.
    n_clusters : int
        Number of clusters.
    gamma_range : iterable of float
        Candidate gamma values.

    Returns
    -------
    (best_gamma, best_nmi)
        ``(None, -1)`` when no candidate could be evaluated.
    """
    best_nmi = -1
    best_gamma = None

    for gamma in tqdm(gamma_range, desc='Finding gamma'):
        try:
            model = SCPSO(n_clusters=n_clusters, gamma=gamma)
            y_pred = model.fit_predict(X)
            current_nmi = normalized_mutual_info_score(y_true, y_pred)
        except Exception as exc:
            # Skip this candidate but say why. A bare `except:` would also
            # swallow KeyboardInterrupt and hide every real failure.
            print(f"Skipping gamma={gamma}: {exc}")
            continue

        if current_nmi > best_nmi:
            best_nmi = current_nmi
            best_gamma = gamma

    if best_gamma is None:
        print("Warning: no gamma candidate could be evaluated; returning (None, -1)")

    return best_gamma, best_nmi

In [11]:
def run_AG_dataset():
    """Cluster a class-balanced sample of the AG news training CSV with SCPSO.

    At most 295 rows per class are sampled (fewer if the smallest class is
    smaller), ``Title`` and ``Description`` are joined into one text and
    TF-IDF vectorised. The RBF gamma is chosen by ``find_optimal_gamma`` from
    20 log-spaced candidates in [1e-3, 1e1], then ``SCPSO`` is run with it.

    Returns a dict mapping the algorithm name to the dict produced by
    ``evaluate_clustering``; a failed run is recorded as all-zero metrics.
    """
    df = pd.read_csv("C:/Users/asus/Downloads/FinalProject/archive (4)/train.csv", encoding='latin1')

    class_counts = df['Class Index'].value_counts()
    min_class_count = class_counts.min()
    samples_per_class = min(295, min_class_count)

    # Sample every class separately and concatenate the pieces. This avoids
    # `groupby(...).apply(...)` operating on the grouping column, which is
    # deprecated and, in newer pandas, drops 'Class Index' from the result
    # even though it is read again below.
    df = pd.concat([
        group.sample(n=samples_per_class, random_state=42)
        for _, group in df.groupby('Class Index')
    ])

    texts = (df['Title'] + ' ' + df['Description']).astype(str).tolist()
    labels = df['Class Index'].tolist()
    label_encoder = LabelEncoder()
    y_true = label_encoder.fit_transform(labels)
    n_clusters = len(np.unique(y_true))

    processed_texts = preprocess_text(texts)

    # TF-IDF over alphabetic tokens of 3+ letters that occur in at least two
    # documents and in at most 95% of them.
    vectorizer = TfidfVectorizer(stop_words='english',
                 min_df=2,
                 max_df=0.95,
                 token_pattern=r'\b[a-zA-Z]{3,}\b')
    X = vectorizer.fit_transform(processed_texts)

    gamma_candidates = np.logspace(-3, 1, 20)

    best_gamma, best_nmi = find_optimal_gamma(
        X=X,
        y_true=y_true,
        n_clusters=n_clusters,
        gamma_range=gamma_candidates
    )

    print(f"\nbest gamma: {best_gamma:.5f}")

    results = {}

    algorithms = {
        'SCPSO': SCPSO(n_clusters=n_clusters, n_particles=30, max_iter=50, gamma=best_gamma)
    }

    for algo_name, algo in algorithms.items():
        try:
            print(f"Running {algo_name}...")
            y_pred = algo.fit_predict(X)
            results[f"{algo_name}"] = evaluate_clustering(y_true, y_pred)
            print(f"{algo_name} completed")
        except Exception as e:
            # Best effort: report the failure and record zeros.
            print(f"Error in {algo_name} : {str(e)}")
            results[f"{algo_name}"] = {'Purity': 0, 'NMI': 0, 'ARI': 0}

    return results


# Run the AG news experiment; `results` stays a notebook-level variable.
results = run_AG_dataset()


# Print every metric of every method to four decimals.
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")



  df = df.groupby('Class Index', group_keys=False).apply(
Finding gamma: 100%|██████████| 20/20 [01:44<00:00,  5.21s/it]



best gamma: 0.33598
Running SCPSO...
SCPSO completed

SCPSO:
Purity: 0.6483
NMI: 0.3835
ARI: 0.4042


In [14]:
#BBC Sport
def run_BBCS(name, X, y_true, n_clusters):
    """Tune gamma, run SCPSO on a ready-made feature matrix and print the metrics.

    Parameters
    ----------
    name : str
        Dataset name, used only in the printed header.
    X : array-like
        Feature matrix handed to ``find_optimal_gamma`` and ``SCPSO``.
    y_true : array-like
        Reference labels (used for the gamma search and for evaluation).
    n_clusters : int
        Number of clusters.

    Prints the chosen gamma and the results; returns ``None``.
    """
    print(f"\n=== Results on {name} Dataset ===")

    # 20 log-spaced gamma candidates between 1e-3 and 1e1.
    best_gamma, _ = find_optimal_gamma(
        X=X,
        y_true=y_true,
        n_clusters=n_clusters,
        gamma_range=np.logspace(-3, 1, 20),
    )
    print(f"\nbest gamma: {best_gamma:.5f}")

    candidates = {
        'SCPSO': SCPSO(n_clusters=n_clusters, n_particles=30, max_iter=50, gamma=best_gamma),
    }
    results = {}

    for label, model in candidates.items():
        try:
            print(f"Running {label}...")
            predicted = model.fit_predict(X)
            results[f"{label}"] = evaluate_clustering(y_true, predicted)
            print(f"{label} completed")
        except Exception as err:
            # Best effort: report the failure and record zeros.
            print(f"Error in {label} : {str(err)}")
            results[f"{label}"] = {'Purity': 0, 'NMI': 0, 'ARI': 0}

    for method, scores in results.items():
        print(f"\n{method}:")
        for metric_name, metric_value in scores.items():
            print(f"{metric_name}: {metric_value:.4f}")


# Load the BBCSport feature matrix and class column from header-less CSV files.
# NOTE(review): presumably one row per document in both files — confirm the
# matrix orientation against the files.
bbcsport_X = pd.read_csv(r'C:\Users\asus\Downloads\FinalProject\archive (2)\bbcsport_mtx.csv', header=None).values
bbcsport_y = pd.read_csv(r'C:\Users\asus\Downloads\FinalProject\archive (2)\bbcsport_classes.csv', header=None)[0].values
# One cluster per distinct class value.
bbcsport_n_clusters = len(np.unique(bbcsport_y))

run_BBCS("BBCSport", bbcsport_X, bbcsport_y, bbcsport_n_clusters)



=== Results on BBCSport Dataset ===


Finding gamma: 100%|██████████| 20/20 [01:14<00:00,  3.70s/it]



best gamma: 0.00100
Running SCPSO...
SCPSO completed

SCPSO:
Purity: 0.4559
NMI: 0.2120
ARI: 0.1327


In [43]:
#SMS
# Load the SMS spam CSV, keeping only the label column (v1) and the text column (v2).
df = pd.read_csv("C:/Users/asus/Downloads/FinalProject/archive (3)/spam.csv", encoding='latin1')[['v1', 'v2']]
df.columns = ['label', 'text']

# Clean the messages with the shared preprocessing helper.
processed_texts = preprocess_text(df['text'].tolist())


# TF-IDF over alphabetic tokens of 3+ letters that occur in at least two
# documents and in at most 95% of them.
vectorizer = TfidfVectorizer(stop_words='english',
             min_df=2,
             max_df=0.95,
             token_pattern=r'\b[a-zA-Z]{3,}\b')

X = vectorizer.fit_transform(processed_texts)

# Encode the labels as integers; a label outside the mapping becomes NaN.
# NOTE: this overwrites df['label'] in place.
label_mapping = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(label_mapping)

y_true = df['label'].tolist()

# One cluster per distinct label value.
n_clusters = len(np.unique(y_true))

results = {}

# First pass: SCPSO with a fixed, hand-picked gamma of 0.1.
algorithms = {
    'SCPSO': SCPSO(n_clusters=n_clusters, n_particles=30, max_iter=50, gamma=0.1)
            
}
        
for algo_name, algo in algorithms.items():
    try:
        print(f"Running {algo_name}...")
        y_pred = algo.fit_predict(X)
        results[f"{algo_name}"] = evaluate_clustering(y_true, y_pred)
        print(f"{algo_name} completed")
    except Exception as e:
        # Best effort: report the failure and record zeros instead of stopping.
        print(f"Error in {algo_name} : {str(e)}")
        results[f"{algo_name}"] = {'Purity': 0, 'NMI': 0, 'ARI': 0}
    

# Print every metric of every method to four decimals.
print("\n=== Final Results ===")
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")


# Second pass: search 20 log-spaced gamma candidates between 1e-3 and 1e1.
# `best_gamma` and `best_nmi` are left behind as notebook-level variables.
gamma_candidates = np.logspace(-3, 1, 20)  

best_gamma, best_nmi = find_optimal_gamma(
    X=X,
    y_true=y_true,
    n_clusters=n_clusters,
    gamma_range=gamma_candidates
)

print(f"\nbest gamma: {best_gamma:.5f}")



Running SCPSO...
SCPSO completed

=== Final Results ===

SCPSO:
Purity: 0.8659
NMI: 0.0282
ARI: -0.0779


Finding gamma: 100%|██████████| 20/20 [45:16<00:00, 135.81s/it]


best gamma: 0.01129





In [45]:
#SMS
# Load the SMS spam CSV, keeping only the label column (v1) and the text column (v2).
df = pd.read_csv("C:/Users/asus/Downloads/FinalProject/archive (3)/spam.csv", encoding='latin1')[['v1', 'v2']]
df.columns = ['label', 'text']

# Clean the messages with the shared preprocessing helper.
processed_texts = preprocess_text(df['text'].tolist())


# TF-IDF over alphabetic tokens of 3+ letters that occur in at least two
# documents and in at most 95% of them.
vectorizer = TfidfVectorizer(stop_words='english',
             min_df=2,
             max_df=0.95,
             token_pattern=r'\b[a-zA-Z]{3,}\b')

X = vectorizer.fit_transform(processed_texts)

# Encode the labels as integers; a label outside the mapping becomes NaN.
# NOTE: this overwrites df['label'] in place.
label_mapping = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(label_mapping)

y_true = df['label'].tolist()

# One cluster per distinct label value.
n_clusters = len(np.unique(y_true))

results = {}

# NOTE: `best_gamma` is not defined in this cell; it must already exist in the
# kernel (an earlier SMS cell assigns it from find_optimal_gamma), so this cell
# fails with NameError on a fresh kernel if run on its own.
algorithms = {
    'SCPSO': SCPSO(n_clusters=n_clusters, n_particles=30, max_iter=50, gamma=best_gamma)
            
}
        
for algo_name, algo in algorithms.items():
    try:
        print(f"Running {algo_name}...")
        y_pred = algo.fit_predict(X)
        results[f"{algo_name}"] = evaluate_clustering(y_true, y_pred)
        print(f"{algo_name} completed")
    except Exception as e:
        # Best effort: report the failure and record zeros instead of stopping.
        print(f"Error in {algo_name} : {str(e)}")
        results[f"{algo_name}"] = {'Purity': 0, 'NMI': 0, 'ARI': 0}
    

# Print every metric of every method to four decimals.
print("\n=== Final Results ===")
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

Running SCPSO...
SCPSO completed

=== Final Results ===

SCPSO:
Purity: 0.8659
NMI: 0.0709
ARI: -0.0780


In [71]:
#BBC news
def run_experiments_BBC_dataset():
    """Cluster the BBC news CSV with SCPSO and return the evaluation metrics.

    Reads the ``news`` (text) and ``type`` (label) columns, label-encodes the
    types, preprocesses and TF-IDF vectorises the texts, picks the RBF gamma
    with ``find_optimal_gamma`` from 20 log-spaced candidates in [1e-3, 1e1]
    and runs ``SCPSO`` with one cluster per distinct label.

    Returns a dict mapping the algorithm name to the dict produced by
    ``evaluate_clustering``; a failed run is recorded as all-zero metrics.
    """
    frame = pd.read_csv("C:/Users/asus/Downloads/FinalProject/dataset.csv", encoding='latin1')

    raw_texts = frame['news'].astype(str).tolist()
    raw_labels = frame['type'].astype(str).tolist()
    y_true = LabelEncoder().fit_transform(raw_labels)
    n_clusters = len(np.unique(y_true))

    tfidf = TfidfVectorizer(
        stop_words='english',
        min_df=2,
        max_df=0.95,
        token_pattern=r'\b[a-zA-Z]{3,}\b',
    )
    X = tfidf.fit_transform(preprocess_text(raw_texts))

    best_gamma, _ = find_optimal_gamma(
        X=X,
        y_true=y_true,
        n_clusters=n_clusters,
        gamma_range=np.logspace(-3, 1, 20),
    )
    print(f"\nbest gamma: {best_gamma:.5f}")

    results = {}
    candidates = {
        'SCPSO': SCPSO(n_clusters=n_clusters, n_particles=30, max_iter=50, gamma=best_gamma),
    }

    for label, model in candidates.items():
        try:
            print(f"Running {label}...")
            predicted = model.fit_predict(X)
            results[f"{label}"] = evaluate_clustering(y_true, predicted)
            print(f"{label} completed")
        except Exception as err:
            # Best effort: report the failure and record zeros.
            print(f"Error in {label} : {str(err)}")
            results[f"{label}"] = {'Purity': 0, 'NMI': 0, 'ARI': 0}

    return results


# Run the BBC news SCPSO experiment; `results` stays a notebook-level variable.
results = run_experiments_BBC_dataset()

# Print every metric of every method to four decimals.
print("\n=== Final Results ===")
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")



Finding gamma: 100%|██████████| 20/20 [02:42<00:00,  8.15s/it]



best gamma: 0.00162
Running SCPSO...
SCPSO completed

=== Final Results ===

SCPSO:
Purity: 0.7052
NMI: 0.5819
ARI: 0.5045


In [21]:
#20newsgroup
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
import pyswarms as ps
from deap import base, creator, tools, algorithms
import random




def run_experiments(categories=None):
    """Tune gamma and run SCPSO on groups of 20 Newsgroups categories.

    For every group the posts are fetched (headers, footers and quotes
    removed), cleaned with ``preprocess_text``, vectorised with TF-IDF and
    clustered into one cluster per newsgroup of the group.

    Parameters
    ----------
    categories : dict[str, list[str]] or None
        Maps a group name to the newsgroup names it contains. ``None``
        (the default) uses the five built-in groups: Computer, Politics,
        Miscellaneous, Religion and Science.

    Returns
    -------
    dict
        ``"<group>_<algorithm>"`` -> the dict returned by
        ``evaluate_clustering``. A run that raises is reported on stdout and
        stored as all-zero scores instead of aborting the remaining groups.
    """
    if categories is None:
        categories = {
            'Computer': ['comp.graphics', 'comp.os.ms-windows.misc',
                         'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
                         'comp.windows.x'],
            'Politics': ['talk.politics.misc', 'talk.politics.guns',
                         'talk.politics.mideast'],
            'Miscellaneous': ['misc.forsale', 'talk.politics.misc', 'talk.religion.misc', 'comp.os.ms-windows.misc'],
            'Religion': ['talk.religion.misc', 'alt.atheism', 'soc.religion.christian'],
            'Science': ['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space']
        }

    # The candidate grid does not depend on the group, so build it once.
    gamma_candidates = np.logspace(-3, 1, 20)

    results = {}

    for name, subcats in categories.items():
        print(f"\n=== Processing {name} category ===")
        dataset = fetch_20newsgroups(subset='all', categories=subcats,
                                     remove=('headers', 'footers', 'quotes'))
        n_clusters = len(subcats)

        processed_texts = preprocess_text(dataset.data)

        vectorizer = TfidfVectorizer(stop_words='english',
                                     min_df=2,
                                     max_df=0.95,
                                     token_pattern=r'\b[a-zA-Z]{3,}\b')

        X = vectorizer.fit_transform(processed_texts)
        y_true = dataset.target

        # NOTE(review): find_optimal_gamma is handed y_true, so gamma appears
        # to be tuned against the ground-truth labels; the scores reported
        # below are presumably optimistic -- confirm against its definition.
        best_gamma, best_nmi = find_optimal_gamma(
            X=X,
            y_true=y_true,
            n_clusters=n_clusters,
            gamma_range=gamma_candidates
        )

        print(f"\nbest gamma: {best_gamma:.5f}")

        algorithms = {
            'SCPSO': SCPSO(n_clusters=n_clusters, n_particles=30, max_iter=50, gamma=best_gamma)
        }

        for algo_name, algo in algorithms.items():
            try:
                print(f"Running {algo_name}...")
                y_pred = algo.fit_predict(X)
                results[f"{name}_{algo_name}"] = evaluate_clustering(y_true, y_pred)
                print(f"{algo_name} completed for {name}")
            except Exception as e:
                # Best effort: keep going with the other groups.
                print(f"Error in {algo_name} for {name}: {str(e)}")
                results[f"{name}_{algo_name}"] = {'Purity': 0, 'NMI': 0, 'ARI': 0}

    return results


# Run the 20 Newsgroups experiment for every category group and print the
# scores; `results` stays available as a notebook-level variable afterwards.
if __name__ == "__main__":
    results = run_experiments()
    print("\n=== Final Results ===")
    for name, metrics in results.items():
        print(f"\n{name}:")
        for metric, value in metrics.items():
            # Four decimals per metric.
            print(f"{metric}: {value:.4f}")


=== Processing Computer category ===


Finding gamma: 100%|██████████| 20/20 [39:39<00:00, 118.96s/it]



best gamma: 2.33572
Running SCPSO...
SCPSO completed for Computer

=== Processing Politics category ===


Finding gamma: 100%|██████████| 20/20 [07:47<00:00, 23.39s/it]



best gamma: 0.00428
Running SCPSO...
SCPSO completed for Politics

=== Processing Miscellaneous category ===


Finding gamma: 100%|██████████| 20/20 [14:55<00:00, 44.78s/it]



best gamma: 0.54556
Running SCPSO...
SCPSO completed for Miscellaneous

=== Processing Religion category ===


Finding gamma: 100%|██████████| 20/20 [06:17<00:00, 18.87s/it]



best gamma: 0.02976
Running SCPSO...
SCPSO completed for Religion

=== Processing Science category ===


Finding gamma: 100%|██████████| 20/20 [21:56<00:00, 65.80s/it]



best gamma: 0.33598
Running SCPSO...
SCPSO completed for Science

=== Final Results ===

Computer_SCPSO:
Purity: 0.3210
NMI: 0.1627
ARI: 0.0862

Politics_SCPSO:
Purity: 0.5337
NMI: 0.2059
ARI: 0.1049

Miscellaneous_SCPSO:
Purity: 0.6973
NMI: 0.5330
ARI: 0.4806

Religion_SCPSO:
Purity: 0.5219
NMI: 0.0993
ARI: 0.0920

Science_SCPSO:
Purity: 0.4226
NMI: 0.2515
ARI: 0.1063


In [15]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_kernels
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import normalize
# SCPSO clustering model, followed by the DMOZ experiment that uses it
class SCPSO:
    """Spectral clustering whose centroids are searched by particle swarm optimisation.

    ``fit_predict`` builds an RBF affinity matrix, forms the symmetric
    normalised Laplacian, embeds the samples with its eigenvectors
    (row-normalised), lets a particle swarm search for a set of centroids in
    that embedding and finally labels every sample with its nearest centroid.

    Parameters
    ----------
    n_clusters : int
        Number of clusters (and of centroids carried by each particle).
    n_particles : int
        Swarm size.
    max_iter : int
        Number of PSO iterations.
    w, c1, c2 : float
        Inertia, cognitive and social coefficients of the velocity update.
    gamma : float
        ``gamma`` passed to ``rbf_kernel`` when building the affinity matrix.
    random_state : int or None
        Seed for the swarm. With an int the run is reproducible and the
        global NumPy random state is left untouched; with ``None`` the
        global ``np.random`` generator is used.
    """

    def __init__(self, n_clusters=4, n_particles=20, max_iter=100, w=0.7, c1=1.5, c2=1.5, gamma=0.01, random_state=None):
        self.random_state = random_state
        self.n_clusters = n_clusters
        self.n_particles = n_particles
        self.max_iter = max_iter
        self.w = w
        self.c1 = c1
        self.c2 = c2
        self.gamma = gamma

    def _make_rng(self):
        """Return the random source for one optimisation run.

        A private ``RandomState`` seeded with ``random_state`` yields the same
        stream that ``np.random.seed(random_state)`` would, without
        overwriting the global generator. Without a seed the global
        ``np.random`` module (which exposes the same ``rand``) is used.
        """
        if self.random_state is None:
            return np.random
        return np.random.RandomState(self.random_state)

    def _compute_laplacian(self, X):
        """Return the symmetric normalised Laplacian ``I - D^-1/2 W D^-1/2``.

        ``W`` is the RBF kernel of ``X``. The degree matrix is diagonal, so
        its inverse square root is applied as a vector scaling instead of
        building and inverting a dense n x n matrix.
        """
        W = rbf_kernel(X, gamma=self.gamma)
        # Every row contains the self-similarity exp(0) = 1, so degrees are > 0.
        inv_sqrt_degree = 1.0 / np.sqrt(np.sum(W, axis=1))
        scaled = W * inv_sqrt_degree[:, None] * inv_sqrt_degree[None, :]
        return np.eye(W.shape[0]) - scaled

    def _compute_embedding(self, L):
        """Embed the samples with eigenvectors of ``L`` and row-normalise them.

        The eigenvector of the smallest eigenvalue is skipped and the next
        ``n_clusters`` are used (fewer when ``L`` is too small, in which case
        a warning is printed).
        """
        # eigh returns eigenvalues in ascending order, so columns are sorted.
        _, eigvecs = np.linalg.eigh(L)

        max_components = eigvecs.shape[1] - 1
        n_embedding_components = min(self.n_clusters, max_components)

        if n_embedding_components < self.n_clusters:
            print(f"[Warning] Only {n_embedding_components} eigenvectors available for embedding. Clustering will still use {self.n_clusters} clusters.")

        embedding = eigvecs[:, 1:n_embedding_components + 1].real
        return normalize(embedding)

    def _pso_init(self, X_embedding, rng=None):
        """Create the swarm: uniform [0, 1) positions, zero velocities.

        Returns ``(particles, velocities, pbest, pbest_scores)`` where each
        particle has shape ``(n_clusters, n_features)`` and every personal
        best score starts at ``-inf``.
        """
        if rng is None:
            rng = self._make_rng()
        n_features = X_embedding.shape[1]
        particles = rng.rand(self.n_particles, self.n_clusters, n_features)
        velocities = np.zeros_like(particles)
        pbest = particles.copy()
        pbest_scores = np.full(self.n_particles, -np.inf)
        return particles, velocities, pbest, pbest_scores

    def _centroid_distances(self, X_embedding, centroids):
        """Return the ``(n_samples, n_clusters)`` Euclidean distance matrix."""
        centroids = np.asarray(centroids)[:self.n_clusters]
        diffs = X_embedding[:, None, :] - centroids[None, :, :]
        return np.linalg.norm(diffs, axis=2)

    def _compute_fitness(self, X_embedding, centroids):
        """Score one centroid set; larger is better.

        Samples are assigned to their nearest centroid. The score is the
        ratio of the mean, over clusters, of the smallest average distance
        from a cluster's members to another centroid, divided by the mean,
        over clusters, of the average distance to the own centroid. Returns
        ``-inf`` when any cluster ends up empty.
        """
        distances = self._centroid_distances(X_embedding, centroids)
        labels = np.argmin(distances, axis=1)

        # Fewer distinct labels than clusters means at least one is empty.
        if len(np.unique(labels)) < self.n_clusters:
            return -np.inf

        # member_means[i, j]: mean distance from members of cluster i to centroid j.
        member_means = np.vstack(
            [distances[labels == i].mean(axis=0) for i in range(self.n_clusters)]
        )
        intra_dist = np.mean(np.diag(member_means))

        # Mask the own-centroid column; with a single cluster this leaves inf,
        # matching the "no other centroid" case.
        other_means = member_means.copy()
        np.fill_diagonal(other_means, np.inf)
        inter_dist = np.mean(other_means.min(axis=1))

        return inter_dist / (intra_dist + 1e-10)

    def _pso_optimize(self, X_embedding):
        """Run the swarm for ``max_iter`` iterations and return the best centroid set.

        If no particle ever achieves a finite fitness, the initial position
        of particle 0 is returned.
        """
        rng = self._make_rng()
        particles, velocities, pbest, pbest_scores = self._pso_init(X_embedding, rng)
        gbest = pbest[0].copy()
        gbest_score = -np.inf

        for _ in range(self.max_iter):
            for i in range(self.n_particles):
                current_score = self._compute_fitness(X_embedding, particles[i])
                if current_score > pbest_scores[i]:
                    pbest_scores[i] = current_score
                    pbest[i] = particles[i].copy()
                if current_score > gbest_score:
                    gbest_score = current_score
                    gbest = particles[i].copy()

            # One (r1, r2) pair per particle, drawn in the same order as a
            # per-particle rand(2) would produce them.
            r = rng.rand(self.n_particles, 2)
            cognitive = (self.c1 * r[:, 0])[:, None, None] * (pbest - particles)
            social = (self.c2 * r[:, 1])[:, None, None] * (gbest - particles)
            velocities = self.w * velocities + cognitive + social
            particles += velocities

        return gbest

    def fit_predict(self, X):
        """Cluster ``X`` and return one integer label per sample.

        ``X`` may be a dense array or any object with ``toarray()`` (e.g. a
        SciPy sparse matrix), which is densified first.
        """
        if hasattr(X, 'toarray'):
            X = X.toarray()

        L = self._compute_laplacian(X)
        X_embedding = self._compute_embedding(L)
        centroids = self._pso_optimize(X_embedding)

        return np.argmin(self._centroid_distances(X_embedding, centroids), axis=1)

# --- DMOZ experiment -------------------------------------------------------
# Load the DMOZ dump and give its columns the names used below.
df = pd.read_csv("C:/Users/asus/Downloads/FinalProject/archive (5)/dmoz.csv", encoding='latin1')
df = df.rename(columns={
            'category': 'Class Index',
            'title': 'Title',
            'desc': 'Description'
        })

# Keep at most N rows per class. Sampling each group explicitly (instead of
# groupby(...).apply) selects the same rows in the same order while avoiding
# the pandas warning about apply operating on the grouping column.
N = 300
df_balanced = pd.concat(
    [
        class_rows.sample(n=min(N, len(class_rows)), random_state=42)
        for _, class_rows in df.groupby('Class Index')
    ]
).reset_index(drop=True)

# Fill missing fields before concatenating: str + NaN is NaN, which would
# otherwise turn the whole document into the literal text 'nan'.
texts = (
    df_balanced['Title'].fillna('').astype(str)
    + ' '
    + df_balanced['Description'].fillna('').astype(str)
).tolist()
labels = df_balanced['Class Index'].tolist()

processed_texts = preprocess_text(texts)

vectorizer = TfidfVectorizer(stop_words='english',
             min_df=2,
             max_df=0.95,
             token_pattern=r'\b[a-zA-Z]{3,}\b')

X = vectorizer.fit_transform(processed_texts)

label_encoder = LabelEncoder()
y_true = label_encoder.fit_transform(labels)

# One cluster per distinct class; gamma is picked from a log-spaced grid.
# NOTE(review): find_optimal_gamma receives y_true, so gamma appears to be
# tuned against the ground-truth labels -- confirm against its definition.
n_clusters = len(np.unique(y_true))
gamma_candidates = np.logspace(-3, 1, 20)
best_gamma, best_nmi = find_optimal_gamma(
    X=X,
    y_true=y_true,
    n_clusters=n_clusters,
    gamma_range=gamma_candidates
)

print(f"\nbest gamma: {best_gamma:.5f}")
results = {}

algorithms = {
    'SCPSO': SCPSO(n_clusters=n_clusters, n_particles=30, max_iter=50, gamma=best_gamma, random_state=42)
}

for algo_name, algo in algorithms.items():
    try:
        print(f"Running {algo_name}...")
        y_pred = algo.fit_predict(X)
        results[f"{algo_name}"] = evaluate_clustering(y_true, y_pred)
        print(f"{algo_name} completed")
    except Exception as e:
        # Best effort: report the failure and record all-zero scores.
        print(f"Error in {algo_name} : {str(e)}")
        results[f"{algo_name}"] = {'Purity': 0, 'NMI': 0, 'ARI': 0}


print("\n=== Final Results ===")
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

  .apply(lambda x: x.sample(n=min(N, len(x)), random_state=42))
Finding gamma: 100%|██████████| 20/20 [03:39<00:00, 10.98s/it]



best gamma: 0.04833
Running SCPSO...
SCPSO completed

=== Final Results ===

SCPSO:
Purity: 0.1762
NMI: 0.0881
ARI: 0.0260


In [77]:
import os
def run_experiments_webkb_dataset(webkb_folder_path):
    """Tune gamma and run SCPSO on every CSV file of a WebKB folder.

    Each ``*.csv`` file must provide a ``raw_text`` and a ``label`` column.
    Files are processed in sorted name order so that the run (and the order
    of the returned dict) does not depend on the directory listing order.

    Parameters
    ----------
    webkb_folder_path : str
        Directory containing the per-university CSV files.

    Returns
    -------
    dict
        File name without its extension -> the dict returned by
        ``evaluate_clustering``. A failing SCPSO run is reported on stdout
        and stored as all-zero scores.
    """
    results = {}
    gamma_candidates = np.logspace(-3, 1, 20)

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    csv_files = sorted(
        entry for entry in os.listdir(webkb_folder_path) if entry.endswith(".csv")
    )

    for file in csv_files:
        file_path = os.path.join(webkb_folder_path, file)
        print(f"\nProcessing {file}...")

        # Load data
        df = pd.read_csv(file_path)
        texts = df['raw_text'].astype(str).tolist()
        labels = df['label'].tolist()
        y_true = LabelEncoder().fit_transform(labels)
        n_clusters = len(np.unique(y_true))

        # Preprocess
        processed_texts = preprocess_text(texts)

        # TF-IDF vectorization
        vectorizer = TfidfVectorizer(
            stop_words='english',
            min_df=2,
            max_df=0.95,
            token_pattern=r'\b[a-zA-Z]{3,}\b'
        )
        X = vectorizer.fit_transform(processed_texts)

        # Find best gamma for SCPSO
        # NOTE(review): the search is given y_true, so gamma appears to be
        # tuned against the ground-truth labels -- confirm.
        best_gamma, best_nmi = find_optimal_gamma(
            X=X,
            y_true=y_true,
            n_clusters=n_clusters,
            gamma_range=gamma_candidates
        )
        print(f"Best gamma for {file}: {best_gamma:.5f}")

        # Run SCPSO
        try:
            algo = SCPSO(n_clusters=n_clusters, n_particles=30, max_iter=50, gamma=best_gamma)
            print(f"Running SCPSO on {file}...")
            y_pred = algo.fit_predict(X)
            metrics = evaluate_clustering(y_true, y_pred)
            print(f"SCPSO completed on {file}")
        except Exception as e:
            print(f"Error in SCPSO on {file}: {str(e)}")
            metrics = {'Purity': 0, 'NMI': 0, 'ARI': 0}

        # Save result under the file name with only the trailing extension
        # removed (str.replace would also strip ".csv" inside the name).
        dataset_name = os.path.splitext(file)[0]
        results[dataset_name] = metrics

    return results

# Run SCPSO on every WebKB CSV in the folder and keep the per-file scores in
# the notebook-level `results` dict.
results = run_experiments_webkb_dataset("C:/Users/asus/Downloads/FinalProject/WebKB")

# One section per dataset, four decimals per metric.
print("\n=== Final Results on WebKB ===")
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")



Processing Cornell.csv...


Finding gamma: 100%|██████████| 20/20 [00:15<00:00,  1.31it/s]


Best gamma for Cornell.csv: 0.54556
Running SCPSO on Cornell.csv...
SCPSO completed on Cornell.csv

Processing Texas.csv...


Finding gamma: 100%|██████████| 20/20 [00:14<00:00,  1.37it/s]


Best gamma for Texas.csv: 0.54556
Running SCPSO on Texas.csv...
SCPSO completed on Texas.csv

Processing Washington.csv...


Finding gamma: 100%|██████████| 20/20 [00:16<00:00,  1.24it/s]


Best gamma for Washington.csv: 2.33572
Running SCPSO on Washington.csv...
SCPSO completed on Washington.csv

Processing Wisconsin.csv...


Finding gamma: 100%|██████████| 20/20 [00:16<00:00,  1.18it/s]


Best gamma for Wisconsin.csv: 2.33572
Running SCPSO on Wisconsin.csv...
SCPSO completed on Wisconsin.csv

=== Final Results on WebKB ===

Cornell:
Purity: 0.5183
NMI: 0.1719
ARI: 0.1029

Texas:
Purity: 0.5989
NMI: 0.2286
ARI: 0.0502

Washington:
Purity: 0.6856
NMI: 0.3187
ARI: 0.2524

Wisconsin:
Purity: 0.6642
NMI: 0.3279
ARI: 0.2362


In [66]:
# AG News: mean and standard deviation of the SCPSO scores over repeated runs
def run_AG_dataset(csv_path="C:/Users/asus/Downloads/FinalProject/archive (4)/train.csv", gamma=0.33598):
    """Run SCPSO once on a class-balanced sample of the AG News training set.

    Every class contributes the same number of rows: 295, or the size of the
    smallest class when that is lower. The sample is drawn with a fixed seed,
    so repeated calls cluster the same documents and differ only through the
    unseeded SCPSO swarm.

    Parameters
    ----------
    csv_path : str
        CSV file with ``Class Index``, ``Title`` and ``Description`` columns.
    gamma : float
        RBF ``gamma`` handed to SCPSO.

    Returns
    -------
    dict
        Algorithm name -> the dict returned by ``evaluate_clustering``. A run
        that raises is reported on stdout and stored as all-zero scores.
    """
    df = pd.read_csv(csv_path, encoding='latin1')

    class_counts = df['Class Index'].value_counts()
    min_class_count = class_counts.min()
    samples_per_class = min(295, min_class_count)

    # Sample each class explicitly: same rows and order as
    # groupby(...).apply(sample), without the pandas warning about apply
    # operating on the grouping column.
    df = pd.concat(
        [
            class_rows.sample(n=samples_per_class, random_state=42)
            for _, class_rows in df.groupby('Class Index')
        ]
    )

    # Fill missing fields first: str + NaN is NaN, which would otherwise turn
    # the whole document into the literal text 'nan'.
    texts = (
        df['Title'].fillna('').astype(str)
        + ' '
        + df['Description'].fillna('').astype(str)
    ).tolist()
    labels = df['Class Index'].tolist()
    label_encoder = LabelEncoder()
    y_true = label_encoder.fit_transform(labels)
    n_clusters = len(np.unique(y_true))

    processed_texts = preprocess_text(texts)

    vectorizer = TfidfVectorizer(stop_words='english',
                                 min_df=2,
                                 max_df=0.95,
                                 token_pattern=r'\b[a-zA-Z]{3,}\b')
    X = vectorizer.fit_transform(processed_texts)

    results = {}

    algorithms = {
        'SCPSO': SCPSO(n_clusters=n_clusters, n_particles=30, max_iter=50, gamma=gamma)
    }

    for algo_name, algo in algorithms.items():
        try:
            print(f"Running {algo_name}...")
            y_pred = algo.fit_predict(X)
            results[f"{algo_name}"] = evaluate_clustering(y_true, y_pred)
            print(f"{algo_name} completed")
        except Exception as e:
            # Best effort: report the failure and record all-zero scores.
            print(f"Error in {algo_name} : {str(e)}")
            results[f"{algo_name}"] = {'Purity': 0, 'NMI': 0, 'ARI': 0}

    return results


# Repeat the AG News experiment and collect every score:
# all_results[method][metric] is the list of values over the runs.
n_runs = 5  
all_results = defaultdict(lambda: defaultdict(list))

for run in range(n_runs):
    print(f"\n Running iteration {run + 1}/{n_runs}...")
    # Each call reloads, re-samples (fixed seed) and re-vectorises the data.
    run_result = run_AG_dataset()
    for method, metrics in run_result.items():
        for metric_name, value in metrics.items():
            all_results[method][metric_name].append(value)

print("\n Final Aggregated Results over", n_runs, "runs:")
for method, metric_values in all_results.items():
    print(f"\n Method: {method}")
    for metric_name, values in metric_values.items():
        mean_val = np.mean(values)
        # np.std defaults to ddof=0, i.e. the population standard deviation.
        std_val = np.std(values)
        print(f"{metric_name}: Mean = {mean_val:.4f}, Std = {std_val:.4f}")



 Running iteration 1/5...


  df = df.groupby('Class Index', group_keys=False).apply(


Running SCPSO...
SCPSO completed

 Running iteration 2/5...


  df = df.groupby('Class Index', group_keys=False).apply(


Running SCPSO...
SCPSO completed

 Running iteration 3/5...


  df = df.groupby('Class Index', group_keys=False).apply(


Running SCPSO...
SCPSO completed

 Running iteration 4/5...


  df = df.groupby('Class Index', group_keys=False).apply(


Running SCPSO...
SCPSO completed

 Running iteration 5/5...


  df = df.groupby('Class Index', group_keys=False).apply(


Running SCPSO...
SCPSO completed

 Final Aggregated Results over 5 runs:

 Method: SCPSO
Purity: Mean = 0.5258, Std = 0.0636
NMI: Mean = 0.2895, Std = 0.0529
ARI: Mean = 0.2586, Std = 0.0725


In [70]:
#BBC Sport
# Repeat SCPSO on the BBC Sport data and aggregate the scores.
# NOTE(review): bbcsport_n_clusters, bbcsport_X and bbcsport_y are not defined
# in this cell; presumably an earlier cell creates them -- confirm the
# notebook still works after Restart & Run All.
n_runs = 5
# aggregated_results[method][metric] -> list of values, one per run.
aggregated_results = defaultdict(lambda: defaultdict(list))

for run in range(n_runs):
    print(f"\n Running iteration {run+1}/{n_runs}...")
    
    results = {}
    # A fresh, unseeded model per run; gamma is fixed at 0.001.
    algorithms = {
        'SCPSO': SCPSO(n_clusters=bbcsport_n_clusters, n_particles=30, max_iter=50, gamma=0.001)
    }
    
    for algo_name, algo in algorithms.items():
        try:
            print(f"Running {algo_name}...")
            y_pred = algo.fit_predict(bbcsport_X)
            results[algo_name] = evaluate_clustering(bbcsport_y, y_pred)
            print(f"{algo_name} completed")
        except Exception as e:
            # A failed run is recorded as zeros and therefore lowers the
            # aggregated mean below.
            print(f"Error in {algo_name}: {str(e)}")
            results[algo_name] = {'Purity': 0, 'NMI': 0, 'ARI': 0}
    
    
    for method_name, metrics in results.items():
        for metric_name, value in metrics.items():
            aggregated_results[method_name][metric_name].append(value)


print("\n Final Aggregated Results over", n_runs, "runs:")
for method_name, metric_values in aggregated_results.items():
    print(f"\n➡ Method: {method_name}")
    for metric_name, values in metric_values.items():
        mean_val = np.mean(values)
        # np.std defaults to ddof=0, i.e. the population standard deviation.
        std_val = np.std(values)
        print(f"{metric_name}: Mean = {mean_val:.4f}, Std = {std_val:.4f}")



 Running iteration 1/5...
Running SCPSO...
SCPSO completed

 Running iteration 2/5...
Running SCPSO...
SCPSO completed

 Running iteration 3/5...
Running SCPSO...
SCPSO completed

 Running iteration 4/5...
Running SCPSO...
SCPSO completed

 Running iteration 5/5...
Running SCPSO...
SCPSO completed

 Final Aggregated Results over 5 runs:

➡ Method: SCPSO
Purity: Mean = 0.4795, Std = 0.0363
NMI: Mean = 0.2144, Std = 0.0381
ARI: Mean = 0.1498, Std = 0.0432


In [72]:
#SMS
# Repeat SCPSO on the SMS spam collection and aggregate the scores.
n_runs = 5  
# aggregated_results[method][metric] -> list of values, one per run.
aggregated_results = defaultdict(lambda: defaultdict(list))

# Keep only the label (v1) and message text (v2) columns.
df = pd.read_csv("C:/Users/asus/Downloads/FinalProject/archive (3)/spam.csv", encoding='latin1')[['v1', 'v2']]
df.columns = ['label', 'text']

# Preprocessing and vectorisation happen once, outside the run loop.
processed_texts = preprocess_text(df['text'].tolist())

vectorizer = TfidfVectorizer(stop_words='english',
             min_df=2,
             max_df=0.95,
             token_pattern=r'\b[a-zA-Z]{3,}\b')

X = vectorizer.fit_transform(processed_texts)

# Labels other than 'ham'/'spam' would map to NaN here.
label_mapping = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(label_mapping)
y_true = df['label'].tolist()
n_clusters = len(np.unique(y_true))

for run in range(n_runs):
    print(f"\n Running iteration {run+1}/{n_runs}...")

    # A fresh, unseeded model per run; gamma is fixed at 0.01129.
    algorithms = {
        'SCPSO': SCPSO(n_clusters=n_clusters, n_particles=30, max_iter=50, gamma=0.01129)
    }

    for algo_name, algo in algorithms.items():
        try:
            print(f"Running {algo_name}...")
            y_pred = algo.fit_predict(X)
            metrics = evaluate_clustering(y_true, y_pred)
            print(f"{algo_name} completed")
        except Exception as e:
            # A failed run is recorded as zeros and therefore lowers the
            # aggregated mean below.
            print(f"Error in {algo_name}: {str(e)}")
            metrics = {'Purity': 0, 'NMI': 0, 'ARI': 0}

        for metric_name, value in metrics.items():
            aggregated_results[algo_name][metric_name].append(value)

# NOTE(review): with heavily imbalanced labels purity can equal the share of
# the majority class; read it together with NMI and ARI -- confirm on the data.
print("\n Final Aggregated Results over", n_runs, "runs:")
for method_name, metric_values in aggregated_results.items():
    print(f"\n➡ Method: {method_name}")
    for metric_name, values in metric_values.items():
        mean_val = np.mean(values)
        # np.std defaults to ddof=0, i.e. the population standard deviation.
        std_val = np.std(values)
        print(f"{metric_name}: Mean = {mean_val:.4f}, Std = {std_val:.4f}")




 Running iteration 1/5...
Running SCPSO...
SCPSO completed

 Running iteration 2/5...
Running SCPSO...
SCPSO completed

 Running iteration 3/5...
Running SCPSO...
SCPSO completed

 Running iteration 4/5...
Running SCPSO...
SCPSO completed

 Running iteration 5/5...
Running SCPSO...
SCPSO completed

 Final Aggregated Results over 5 runs:

➡ Method: SCPSO
Purity: Mean = 0.8659, Std = 0.0000
NMI: Mean = 0.0737, Std = 0.0051
ARI: Mean = -0.0748, Std = 0.0054


In [13]:
#BBC news
def run_experiments_BBC_dataset():
    """Run SCPSO once on the BBC News CSV with a fixed gamma of 0.00162.

    The ``news`` column supplies the documents and the ``type`` column the
    ground-truth classes; one cluster is requested per distinct class.

    Returns
    -------
    dict
        Algorithm name -> the dict returned by ``evaluate_clustering``. A run
        that raises is reported on stdout and stored as all-zero scores.
    """
    frame = pd.read_csv(
        "C:/Users/asus/Downloads/FinalProject/dataset.csv",
        encoding='latin1',
    )

    documents = frame['news'].astype(str).tolist()
    class_names = frame['type'].astype(str).tolist()
    targets = LabelEncoder().fit_transform(class_names)
    k = len(np.unique(targets))

    cleaned_docs = preprocess_text(documents)

    tfidf = TfidfVectorizer(
        stop_words='english',
        min_df=2,
        max_df=0.95,
        token_pattern=r'\b[a-zA-Z]{3,}\b',
    )
    features = tfidf.fit_transform(cleaned_docs)

    # Models are built before the guarded section, so a constructor error
    # still propagates.
    models = {
        'SCPSO': SCPSO(n_clusters=k, n_particles=30, max_iter=50, gamma=0.00162),
    }

    scores = {}
    for label, model in models.items():
        key = f"{label}"
        try:
            print(f"Running {label}...")
            assignments = model.fit_predict(features)
            scores[key] = evaluate_clustering(targets, assignments)
            print(f"{label} completed")
        except Exception as err:
            print(f"Error in {label} : {str(err)}")
            scores[key] = {'Purity': 0, 'NMI': 0, 'ARI': 0}

    return scores



# Repeat the BBC News experiment and collect every score:
# all_results[method][metric] is the list of values over the runs.
n_runs = 5  
all_results = defaultdict(lambda: defaultdict(list))

for run in range(n_runs):
    print(f"\n Running iteration {run + 1}/{n_runs}...")
    # Each call reloads and re-vectorises the data before clustering.
    run_result = run_experiments_BBC_dataset()
    for method, metrics in run_result.items():
        for metric_name, value in metrics.items():
            all_results[method][metric_name].append(value)

print("\n Final Aggregated Results over", n_runs, "runs:")
for method, metric_values in all_results.items():
    print(f"\n Method: {method}")
    for metric_name, values in metric_values.items():
        mean_val = np.mean(values)
        # np.std defaults to ddof=0, i.e. the population standard deviation.
        std_val = np.std(values)
        print(f"{metric_name}: Mean = {mean_val:.4f}, Std = {std_val:.4f}")


 Running iteration 1/5...
Running SCPSO...
SCPSO completed

 Running iteration 2/5...
Running SCPSO...
SCPSO completed

 Running iteration 3/5...
Running SCPSO...
SCPSO completed

 Running iteration 4/5...
Running SCPSO...
SCPSO completed

 Running iteration 5/5...
Running SCPSO...
SCPSO completed

 Final Aggregated Results over 5 runs:

 Method: SCPSO
Purity: Mean = 0.6009, Std = 0.0508
NMI: Mean = 0.4493, Std = 0.0364
ARI: Mean = 0.3379, Std = 0.0673


In [15]:
# 20 Newsgroups (Computer group): repeated SCPSO runs with a fixed gamma
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
import pyswarms as ps
from deap import base, creator, tools, algorithms
import random




def run_experiments():
    """Run SCPSO once on the 20 Newsgroups 'Computer' group (gamma = 2.33572).

    Posts are fetched without headers, footers and quotes, cleaned with
    ``preprocess_text``, vectorised with TF-IDF and clustered into one
    cluster per newsgroup.

    Returns
    -------
    dict
        ``"<group>_<algorithm>"`` -> the dict returned by
        ``evaluate_clustering``. A run that raises is reported on stdout and
        stored as all-zero scores.
    """
    category_groups = {
        'Computer': [
            'comp.graphics',
            'comp.os.ms-windows.misc',
            'comp.sys.ibm.pc.hardware',
            'comp.sys.mac.hardware',
            'comp.windows.x',
        ],
    }

    scores = {}

    for group_name, newsgroups in category_groups.items():
        print(f"\n=== Processing {group_name} category ===")
        corpus = fetch_20newsgroups(
            subset='all',
            categories=newsgroups,
            remove=('headers', 'footers', 'quotes'),
        )
        k = len(newsgroups)

        cleaned_docs = preprocess_text(corpus.data)

        tfidf = TfidfVectorizer(
            stop_words='english',
            min_df=2,
            max_df=0.95,
            token_pattern=r'\b[a-zA-Z]{3,}\b',
        )
        features = tfidf.fit_transform(cleaned_docs)
        targets = corpus.target

        # Models are built before the guarded section, so a constructor
        # error still propagates.
        models = {
            'SCPSO': SCPSO(n_clusters=k, n_particles=30, max_iter=50, gamma=2.33572),
        }

        for label, model in models.items():
            key = f"{group_name}_{label}"
            try:
                print(f"Running {label}...")
                assignments = model.fit_predict(features)
                scores[key] = evaluate_clustering(targets, assignments)
                print(f"{label} completed for {group_name}")
            except Exception as err:
                print(f"Error in {label} for {group_name}: {str(err)}")
                scores[key] = {'Purity': 0, 'NMI': 0, 'ARI': 0}

    return scores

# Repeat the experiment defined by the current `run_experiments` and collect
# every score: all_results[method][metric] is the list of values over the runs.
n_runs = 5  
all_results = defaultdict(lambda: defaultdict(list))

for run in range(n_runs):
    print(f"\n Running iteration {run + 1}/{n_runs}...")
    # Each call re-fetches and re-vectorises the data before clustering.
    run_result = run_experiments()
    for method, metrics in run_result.items():
        for metric_name, value in metrics.items():
            all_results[method][metric_name].append(value)

print("\n Final Aggregated Results over", n_runs, "runs:")
for method, metric_values in all_results.items():
    print(f"\n Method: {method}")
    for metric_name, values in metric_values.items():
        mean_val = np.mean(values)
        # np.std defaults to ddof=0, i.e. the population standard deviation.
        std_val = np.std(values)
        print(f"{metric_name}: Mean = {mean_val:.4f}, Std = {std_val:.4f}")



 Running iteration 1/5...

=== Processing Computer category ===
Running SCPSO...
SCPSO completed for Computer

 Running iteration 2/5...

=== Processing Computer category ===
Running SCPSO...
SCPSO completed for Computer

 Running iteration 3/5...

=== Processing Computer category ===
Running SCPSO...
SCPSO completed for Computer

 Running iteration 4/5...

=== Processing Computer category ===
Running SCPSO...
SCPSO completed for Computer

 Running iteration 5/5...

=== Processing Computer category ===
Running SCPSO...
SCPSO completed for Computer

 Final Aggregated Results over 5 runs:

 Method: Computer_SCPSO
Purity: Mean = 0.3369, Std = 0.0144
NMI: Mean = 0.1573, Std = 0.0179
ARI: Mean = 0.0853, Std = 0.0238


In [17]:
# 20 Newsgroups (Politics group): repeated SCPSO runs with a fixed gamma
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
import pyswarms as ps
from deap import base, creator, tools, algorithms
import random




def run_experiments():
    """Run SCPSO once on the 20 Newsgroups 'Politics' group (gamma = 0.00428).

    Posts are fetched without headers, footers and quotes, cleaned with
    ``preprocess_text``, vectorised with TF-IDF and clustered into one
    cluster per newsgroup.

    Returns
    -------
    dict
        ``"<group>_<algorithm>"`` -> the dict returned by
        ``evaluate_clustering``. A run that raises is reported on stdout and
        stored as all-zero scores.
    """
    category_groups = {
        'Politics': [
            'talk.politics.misc',
            'talk.politics.guns',
            'talk.politics.mideast',
        ],
    }

    scores = {}

    for group_name, newsgroups in category_groups.items():
        print(f"\n=== Processing {group_name} category ===")
        corpus = fetch_20newsgroups(
            subset='all',
            categories=newsgroups,
            remove=('headers', 'footers', 'quotes'),
        )
        k = len(newsgroups)

        cleaned_docs = preprocess_text(corpus.data)

        tfidf = TfidfVectorizer(
            stop_words='english',
            min_df=2,
            max_df=0.95,
            token_pattern=r'\b[a-zA-Z]{3,}\b',
        )
        features = tfidf.fit_transform(cleaned_docs)
        targets = corpus.target

        # Models are built before the guarded section, so a constructor
        # error still propagates.
        models = {
            'SCPSO': SCPSO(n_clusters=k, n_particles=30, max_iter=50, gamma=0.00428),
        }

        for label, model in models.items():
            key = f"{group_name}_{label}"
            try:
                print(f"Running {label}...")
                assignments = model.fit_predict(features)
                scores[key] = evaluate_clustering(targets, assignments)
                print(f"{label} completed for {group_name}")
            except Exception as err:
                print(f"Error in {label} for {group_name}: {str(err)}")
                scores[key] = {'Purity': 0, 'NMI': 0, 'ARI': 0}

    return scores

# Repeat the experiment defined by the current `run_experiments` and collect
# every score: all_results[method][metric] is the list of values over the runs.
n_runs = 5  
all_results = defaultdict(lambda: defaultdict(list))

for run in range(n_runs):
    print(f"\n Running iteration {run + 1}/{n_runs}...")
    # Each call re-fetches and re-vectorises the data before clustering.
    run_result = run_experiments()
    for method, metrics in run_result.items():
        for metric_name, value in metrics.items():
            all_results[method][metric_name].append(value)

print("\n Final Aggregated Results over", n_runs, "runs:")
for method, metric_values in all_results.items():
    print(f"\n Method: {method}")
    for metric_name, values in metric_values.items():
        mean_val = np.mean(values)
        # np.std defaults to ddof=0, i.e. the population standard deviation.
        std_val = np.std(values)
        print(f"{metric_name}: Mean = {mean_val:.4f}, Std = {std_val:.4f}")



 Running iteration 1/5...

=== Processing Politics category ===
Running SCPSO...
SCPSO completed for Politics

 Running iteration 2/5...

=== Processing Politics category ===
Running SCPSO...
SCPSO completed for Politics

 Running iteration 3/5...

=== Processing Politics category ===
Running SCPSO...
SCPSO completed for Politics

 Running iteration 4/5...

=== Processing Politics category ===
Running SCPSO...
SCPSO completed for Politics

 Running iteration 5/5...

=== Processing Politics category ===
Running SCPSO...
SCPSO completed for Politics

 Final Aggregated Results over 5 runs:

 Method: Politics_SCPSO
Purity: Mean = 0.5078, Std = 0.0265
NMI: Mean = 0.1700, Std = 0.0431
ARI: Mean = 0.0947, Std = 0.0211


In [19]:
# 20 Newsgroups (Miscellaneous group): repeated SCPSO runs with a fixed gamma
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
import pyswarms as ps
from deap import base, creator, tools, algorithms
import random




def run_experiments():
    """Run SCPSO once on the 20 Newsgroups 'Miscellaneous' group (gamma = 0.54556).

    Posts are fetched without headers, footers and quotes, cleaned with
    ``preprocess_text``, vectorised with TF-IDF and clustered into one
    cluster per newsgroup.

    Returns
    -------
    dict
        ``"<group>_<algorithm>"`` -> the dict returned by
        ``evaluate_clustering``. A run that raises is reported on stdout and
        stored as all-zero scores.
    """
    category_groups = {
        'Miscellaneous': [
            'misc.forsale',
            'talk.politics.misc',
            'talk.religion.misc',
            'comp.os.ms-windows.misc',
        ],
    }

    scores = {}

    for group_name, newsgroups in category_groups.items():
        print(f"\n=== Processing {group_name} category ===")
        corpus = fetch_20newsgroups(
            subset='all',
            categories=newsgroups,
            remove=('headers', 'footers', 'quotes'),
        )
        k = len(newsgroups)

        cleaned_docs = preprocess_text(corpus.data)

        tfidf = TfidfVectorizer(
            stop_words='english',
            min_df=2,
            max_df=0.95,
            token_pattern=r'\b[a-zA-Z]{3,}\b',
        )
        features = tfidf.fit_transform(cleaned_docs)
        targets = corpus.target

        # Models are built before the guarded section, so a constructor
        # error still propagates.
        models = {
            'SCPSO': SCPSO(n_clusters=k, n_particles=30, max_iter=50, gamma=0.54556),
        }

        for label, model in models.items():
            key = f"{group_name}_{label}"
            try:
                print(f"Running {label}...")
                assignments = model.fit_predict(features)
                scores[key] = evaluate_clustering(targets, assignments)
                print(f"{label} completed for {group_name}")
            except Exception as err:
                print(f"Error in {label} for {group_name}: {str(err)}")
                scores[key] = {'Purity': 0, 'NMI': 0, 'ARI': 0}

    return scores


# Repeat the experiment defined by the current `run_experiments` and collect
# every score: all_results[method][metric] is the list of values over the runs.
n_runs = 5  
all_results = defaultdict(lambda: defaultdict(list))

for run in range(n_runs):
    print(f"\n Running iteration {run + 1}/{n_runs}...")
    # Each call re-fetches and re-vectorises the data before clustering.
    run_result = run_experiments()
    for method, metrics in run_result.items():
        for metric_name, value in metrics.items():
            all_results[method][metric_name].append(value)

print("\n Final Aggregated Results over", n_runs, "runs:")
for method, metric_values in all_results.items():
    print(f"\n Method: {method}")
    for metric_name, values in metric_values.items():
        mean_val = np.mean(values)
        # np.std defaults to ddof=0, i.e. the population standard deviation.
        std_val = np.std(values)
        print(f"{metric_name}: Mean = {mean_val:.4f}, Std = {std_val:.4f}")


 Running iteration 1/5...

=== Processing Miscellaneous category ===
Running SCPSO...
SCPSO completed for Miscellaneous

 Running iteration 2/5...

=== Processing Miscellaneous category ===
Running SCPSO...
SCPSO completed for Miscellaneous

 Running iteration 3/5...

=== Processing Miscellaneous category ===
Running SCPSO...
SCPSO completed for Miscellaneous

 Running iteration 4/5...

=== Processing Miscellaneous category ===
Running SCPSO...
SCPSO completed for Miscellaneous

 Running iteration 5/5...

=== Processing Miscellaneous category ===
Running SCPSO...
SCPSO completed for Miscellaneous

 Final Aggregated Results over 5 runs:

 Method: Miscellaneous_SCPSO
Purity: Mean = 0.5555, Std = 0.0761
NMI: Mean = 0.3742, Std = 0.0778
ARI: Mean = 0.3188, Std = 0.1100


In [21]:
# 20 Newsgroups: SCPSO on the "Religion" category group
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
import pyswarms as ps
from deap import base, creator, tools, algorithms
import random




def run_experiments():
    """Cluster the 20 Newsgroups 'Religion' group with SCPSO and score it.

    On every call the posts of the listed newsgroups are fetched with
    headers, footers and quotes removed, cleaned by ``preprocess_text``,
    turned into a TF-IDF matrix and handed to each configured algorithm.

    Returns:
        dict: maps ``"<category>_<algorithm>"`` to the metrics produced by
        ``evaluate_clustering``. When an algorithm raises, the error is
        printed and Purity, NMI and ARI are all recorded as 0 for that key.
    """
    category_groups = {
        'Religion': ['talk.religion.misc', 'alt.atheism', 'soc.religion.christian'],
    }
    tfidf_options = dict(
        stop_words='english',
        min_df=2,
        max_df=0.95,
        token_pattern=r'\b[a-zA-Z]{3,}\b',
    )
    scores = {}

    for name in category_groups:
        newsgroups = category_groups[name]
        print(f"\n=== Processing {name} category ===")
        corpus = fetch_20newsgroups(
            subset='all',
            categories=newsgroups,
            remove=('headers', 'footers', 'quotes'),
        )
        cleaned_docs = preprocess_text(corpus.data)
        doc_term = TfidfVectorizer(**tfidf_options).fit_transform(cleaned_docs)
        labels = corpus.target

        # One cluster is requested per newsgroup in the category.
        clusterers = {
            'SCPSO': SCPSO(n_clusters=len(newsgroups), n_particles=30,
                           max_iter=50, gamma=0.02976),
        }

        for algo_name, algo in clusterers.items():
            result_key = f"{name}_{algo_name}"
            try:
                print(f"Running {algo_name}...")
                assignments = algo.fit_predict(doc_term)
                scores[result_key] = evaluate_clustering(labels, assignments)
                print(f"{algo_name} completed for {name}")
            except Exception as e:
                # Best effort: report the failure and store zeroed metrics.
                print(f"Error in {algo_name} for {name}: {str(e)}")
                scores[result_key] = dict.fromkeys(('Purity', 'NMI', 'ARI'), 0)

    return scores

# Number of independent repetitions of the experiment.
n_runs = 5
# method -> metric name -> list holding one score per repetition
all_results = defaultdict(lambda: defaultdict(list))

for run in range(n_runs):
    print(f"\n Running iteration {run + 1}/{n_runs}...")
    run_result = run_experiments()
    for method in run_result:
        for metric_name, value in run_result[method].items():
            all_results[method][metric_name].append(value)

# Summarise every metric by its mean and standard deviation across the runs.
print("\n Final Aggregated Results over", n_runs, "runs:")
for method in all_results:
    print(f"\n Method: {method}")
    for metric_name, values in all_results[method].items():
        mean_val, std_val = np.mean(values), np.std(values)
        print(f"{metric_name}: Mean = {mean_val:.4f}, Std = {std_val:.4f}")


 Running iteration 1/5...

=== Processing Religion category ===
Running SCPSO...
SCPSO completed for Religion

 Running iteration 2/5...

=== Processing Religion category ===
Running SCPSO...
SCPSO completed for Religion

 Running iteration 3/5...

=== Processing Religion category ===
Running SCPSO...
SCPSO completed for Religion

 Running iteration 4/5...

=== Processing Religion category ===
Running SCPSO...
SCPSO completed for Religion

 Running iteration 5/5...

=== Processing Religion category ===
Running SCPSO...
SCPSO completed for Religion

 Final Aggregated Results over 5 runs:

 Method: Religion_SCPSO
Purity: Mean = 0.4526, Std = 0.0167
NMI: Mean = 0.0499, Std = 0.0105
ARI: Mean = 0.0273, Std = 0.0130


In [23]:
# 20 Newsgroups: SCPSO on the "Science" category group
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
import pyswarms as ps
from deap import base, creator, tools, algorithms
import random




def run_experiments():
    """Cluster the 20 Newsgroups 'Science' group with SCPSO and score it.

    On every call the posts of the listed newsgroups are fetched with
    headers, footers and quotes removed, cleaned by ``preprocess_text``,
    turned into a TF-IDF matrix and handed to each configured algorithm.

    Returns:
        dict: maps ``"<category>_<algorithm>"`` to the metrics produced by
        ``evaluate_clustering``. When an algorithm raises, the error is
        printed and Purity, NMI and ARI are all recorded as 0 for that key.
    """
    category_groups = {
        'Science': ['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'],
    }
    tfidf_options = dict(
        stop_words='english',
        min_df=2,
        max_df=0.95,
        token_pattern=r'\b[a-zA-Z]{3,}\b',
    )
    scores = {}

    for name in category_groups:
        newsgroups = category_groups[name]
        print(f"\n=== Processing {name} category ===")
        corpus = fetch_20newsgroups(
            subset='all',
            categories=newsgroups,
            remove=('headers', 'footers', 'quotes'),
        )
        cleaned_docs = preprocess_text(corpus.data)
        doc_term = TfidfVectorizer(**tfidf_options).fit_transform(cleaned_docs)
        labels = corpus.target

        # One cluster is requested per newsgroup in the category.
        clusterers = {
            'SCPSO': SCPSO(n_clusters=len(newsgroups), n_particles=30,
                           max_iter=50, gamma=0.33598),
        }

        for algo_name, algo in clusterers.items():
            result_key = f"{name}_{algo_name}"
            try:
                print(f"Running {algo_name}...")
                assignments = algo.fit_predict(doc_term)
                scores[result_key] = evaluate_clustering(labels, assignments)
                print(f"{algo_name} completed for {name}")
            except Exception as e:
                # Best effort: report the failure and store zeroed metrics.
                print(f"Error in {algo_name} for {name}: {str(e)}")
                scores[result_key] = dict.fromkeys(('Purity', 'NMI', 'ARI'), 0)

    return scores


# Number of independent repetitions of the experiment.
n_runs = 5
# method -> metric name -> list holding one score per repetition
all_results = defaultdict(lambda: defaultdict(list))

for run in range(n_runs):
    print(f"\n Running iteration {run + 1}/{n_runs}...")
    run_result = run_experiments()
    for method in run_result:
        for metric_name, value in run_result[method].items():
            all_results[method][metric_name].append(value)

# Summarise every metric by its mean and standard deviation across the runs.
print("\n Final Aggregated Results over", n_runs, "runs:")
for method in all_results:
    print(f"\n Method: {method}")
    for metric_name, values in all_results[method].items():
        mean_val, std_val = np.mean(values), np.std(values)
        print(f"{metric_name}: Mean = {mean_val:.4f}, Std = {std_val:.4f}")


 Running iteration 1/5...

=== Processing Science category ===
Running SCPSO...
SCPSO completed for Science

 Running iteration 2/5...

=== Processing Science category ===
Running SCPSO...
SCPSO completed for Science

 Running iteration 3/5...

=== Processing Science category ===
Running SCPSO...
SCPSO completed for Science

 Running iteration 4/5...

=== Processing Science category ===
Running SCPSO...
SCPSO completed for Science

 Running iteration 5/5...

=== Processing Science category ===
Running SCPSO...
SCPSO completed for Science

 Final Aggregated Results over 5 runs:

 Method: Science_SCPSO
Purity: Mean = 0.4865, Std = 0.0692
NMI: Mean = 0.2065, Std = 0.0585
ARI: Mean = 0.1279, Std = 0.0392


In [99]:
import os
def run_experiments_webkb_dataset(webkb_folder_path):
    """Run SCPSO on every ``.csv`` file found in a WebKB folder.

    Each CSV is expected to provide a ``raw_text`` and a ``label`` column.
    The texts are cleaned with ``preprocess_text``, vectorised with TF-IDF,
    a gamma value is chosen by ``find_optimal_gamma`` from 20 log-spaced
    candidates in [1e-3, 1e1], and SCPSO is then run with that gamma.

    Args:
        webkb_folder_path: directory containing the WebKB CSV files.

    Returns:
        dict: ``{dataset_name: {"SCPSO": metrics}}`` where ``dataset_name``
        is the file name without its extension and ``metrics`` is the
        result of ``evaluate_clustering``. If SCPSO raises, the error is
        printed and Purity, NMI and ARI are recorded as 0.
    """
    results = {}
    gamma_candidates = np.logspace(-3, 1, 20)

    # os.listdir returns entries in arbitrary order; sort so that the
    # processing order (and the order of the result dict) is reproducible.
    for file in sorted(os.listdir(webkb_folder_path)):
        if not file.endswith(".csv"):
            continue

        file_path = os.path.join(webkb_folder_path, file)
        print(f"\nProcessing {file}...")

        df = pd.read_csv(file_path)
        texts = df['raw_text'].astype(str).tolist()
        labels = df['label'].tolist()
        y_true = LabelEncoder().fit_transform(labels)
        n_clusters = len(np.unique(y_true))

        processed_texts = preprocess_text(texts)

        vectorizer = TfidfVectorizer(
            stop_words='english',
            min_df=2,
            max_df=0.95,
            token_pattern=r'\b[a-zA-Z]{3,}\b'
        )
        X = vectorizer.fit_transform(processed_texts)

        # NOTE(review): gamma is selected with access to y_true, so the
        # metrics reported below are presumably optimistic - confirm that
        # this is intended for the comparison.
        best_gamma, _ = find_optimal_gamma(
            X=X,
            y_true=y_true,
            n_clusters=n_clusters,
            gamma_range=gamma_candidates
        )
        print(f"Best gamma for {file}: {best_gamma:.5f}")

        try:
            algo = SCPSO(n_clusters=n_clusters, n_particles=30, max_iter=50, gamma=best_gamma)
            print(f"Running SCPSO on {file}...")
            y_pred = algo.fit_predict(X)
            metrics = evaluate_clustering(y_true, y_pred)
            print(f"SCPSO completed on {file}")
        except Exception as e:
            # Best effort: report the failure and store zeroed metrics.
            print(f"Error in SCPSO on {file}: {str(e)}")
            metrics = {'Purity': 0, 'NMI': 0, 'ARI': 0}

        # Strip only the trailing extension; str.replace would also remove
        # a ".csv" occurring in the middle of the file name.
        dataset_name = os.path.splitext(file)[0]
        results[dataset_name] = {"SCPSO": metrics}

    return results




# Folder holding the WebKB CSV files. The WEBKB_DIR environment variable
# overrides the default so the notebook can run on machines other than the
# one it was written on.
webkb_dir = os.environ.get("WEBKB_DIR", "C:/Users/asus/Downloads/FinalProject/WebKB")

# Number of independent repetitions of the whole WebKB experiment.
n_runs = 5
# dataset -> method -> metric name -> list holding one score per repetition
aggregated_results = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

for run in range(n_runs):
    print(f"\n Running iteration {run + 1}...")
    run_results = run_experiments_webkb_dataset(webkb_dir)

    for dataset_name, methods in run_results.items():
        for method_name, metrics in methods.items():
            for metric_name, value in metrics.items():
                aggregated_results[dataset_name][method_name][metric_name].append(value)

# Report the mean and standard deviation of every metric across the runs.
print("\nFinal Aggregated Results:")
for dataset_name, methods in aggregated_results.items():
    print(f"\n=== {dataset_name} ===")
    for method_name, metrics in methods.items():
        print(f"\nMethod: {method_name}")
        for metric_name, values in metrics.items():
            mean_val = np.mean(values)
            std_val = np.std(values)
            print(f"{metric_name}: Mean = {mean_val:.4f}, Std = {std_val:.4f}")


 Running iteration 1...

Processing Cornell.csv...


Finding gamma: 100%|██████████| 20/20 [00:15<00:00,  1.31it/s]


Best gamma for Cornell.csv: 0.07848
Running SCPSO on Cornell.csv...
SCPSO completed on Cornell.csv

Processing Texas.csv...


Finding gamma: 100%|██████████| 20/20 [00:14<00:00,  1.35it/s]


Best gamma for Texas.csv: 0.00264
Running SCPSO on Texas.csv...
SCPSO completed on Texas.csv

Processing Washington.csv...


Finding gamma: 100%|██████████| 20/20 [00:15<00:00,  1.30it/s]


Best gamma for Washington.csv: 0.88587
Running SCPSO on Washington.csv...
SCPSO completed on Washington.csv

Processing Wisconsin.csv...


Finding gamma: 100%|██████████| 20/20 [00:16<00:00,  1.22it/s]


Best gamma for Wisconsin.csv: 3.79269
Running SCPSO on Wisconsin.csv...
SCPSO completed on Wisconsin.csv

 Running iteration 2...

Processing Cornell.csv...


Finding gamma: 100%|██████████| 20/20 [00:14<00:00,  1.36it/s]


Best gamma for Cornell.csv: 3.79269
Running SCPSO on Cornell.csv...
SCPSO completed on Cornell.csv

Processing Texas.csv...


Finding gamma: 100%|██████████| 20/20 [00:15<00:00,  1.30it/s]


Best gamma for Texas.csv: 3.79269
Running SCPSO on Texas.csv...
SCPSO completed on Texas.csv

Processing Washington.csv...


Finding gamma: 100%|██████████| 20/20 [00:15<00:00,  1.32it/s]


Best gamma for Washington.csv: 1.43845
Running SCPSO on Washington.csv...
SCPSO completed on Washington.csv

Processing Wisconsin.csv...


Finding gamma: 100%|██████████| 20/20 [00:16<00:00,  1.23it/s]


Best gamma for Wisconsin.csv: 1.43845
Running SCPSO on Wisconsin.csv...
SCPSO completed on Wisconsin.csv

 Running iteration 3...

Processing Cornell.csv...


Finding gamma: 100%|██████████| 20/20 [00:14<00:00,  1.36it/s]


Best gamma for Cornell.csv: 0.04833
Running SCPSO on Cornell.csv...
SCPSO completed on Cornell.csv

Processing Texas.csv...


Finding gamma: 100%|██████████| 20/20 [00:15<00:00,  1.32it/s]


Best gamma for Texas.csv: 3.79269
Running SCPSO on Texas.csv...
SCPSO completed on Texas.csv

Processing Washington.csv...


Finding gamma: 100%|██████████| 20/20 [00:15<00:00,  1.30it/s]


Best gamma for Washington.csv: 0.00695
Running SCPSO on Washington.csv...
SCPSO completed on Washington.csv

Processing Wisconsin.csv...


Finding gamma: 100%|██████████| 20/20 [00:16<00:00,  1.24it/s]


Best gamma for Wisconsin.csv: 0.07848
Running SCPSO on Wisconsin.csv...
SCPSO completed on Wisconsin.csv

 Running iteration 4...

Processing Cornell.csv...


Finding gamma: 100%|██████████| 20/20 [00:13<00:00,  1.46it/s]


Best gamma for Cornell.csv: 2.33572
Running SCPSO on Cornell.csv...
SCPSO completed on Cornell.csv

Processing Texas.csv...


Finding gamma: 100%|██████████| 20/20 [00:14<00:00,  1.39it/s]


Best gamma for Texas.csv: 1.43845
Running SCPSO on Texas.csv...
SCPSO completed on Texas.csv

Processing Washington.csv...


Finding gamma: 100%|██████████| 20/20 [00:14<00:00,  1.36it/s]


Best gamma for Washington.csv: 2.33572
Running SCPSO on Washington.csv...
SCPSO completed on Washington.csv

Processing Wisconsin.csv...


Finding gamma: 100%|██████████| 20/20 [00:16<00:00,  1.19it/s]


Best gamma for Wisconsin.csv: 1.43845
Running SCPSO on Wisconsin.csv...
SCPSO completed on Wisconsin.csv

 Running iteration 5...

Processing Cornell.csv...


Finding gamma: 100%|██████████| 20/20 [00:15<00:00,  1.32it/s]


Best gamma for Cornell.csv: 0.02976
Running SCPSO on Cornell.csv...
SCPSO completed on Cornell.csv

Processing Texas.csv...


Finding gamma: 100%|██████████| 20/20 [00:14<00:00,  1.33it/s]


Best gamma for Texas.csv: 3.79269
Running SCPSO on Texas.csv...
SCPSO completed on Texas.csv

Processing Washington.csv...


Finding gamma: 100%|██████████| 20/20 [00:15<00:00,  1.28it/s]


Best gamma for Washington.csv: 1.43845
Running SCPSO on Washington.csv...
SCPSO completed on Washington.csv

Processing Wisconsin.csv...


Finding gamma: 100%|██████████| 20/20 [00:16<00:00,  1.19it/s]


Best gamma for Wisconsin.csv: 2.33572
Running SCPSO on Wisconsin.csv...
SCPSO completed on Wisconsin.csv

Final Aggregated Results:

=== Cornell ===

Method: SCPSO
Purity: Mean = 0.5634, Std = 0.0686
NMI: Mean = 0.2318, Std = 0.1192
ARI: Mean = 0.1300, Std = 0.1176

=== Texas ===

Method: SCPSO
Purity: Mean = 0.6642, Std = 0.0542
NMI: Mean = 0.2676, Std = 0.0768
ARI: Mean = 0.3682, Std = 0.1088

=== Washington ===

Method: SCPSO
Purity: Mean = 0.6725, Std = 0.0526
NMI: Mean = 0.3110, Std = 0.0798
ARI: Mean = 0.2840, Std = 0.1012

=== Wisconsin ===

Method: SCPSO
Purity: Mean = 0.6091, Std = 0.0785
NMI: Mean = 0.2748, Std = 0.0918
ARI: Mean = 0.2464, Std = 0.0903


In [35]:
from scipy.stats import wilcoxon
import numpy as np


# Names of the compared clustering methods (the keys of the tables below).
methods = ['GAKM', 'KM', 'SCPSO', 'SKM', 'NMF-FR', 'LSAKM']


# Hand-entered accuracy scores: ten values per method.
# Presumably one value per dataset, in the same dataset order for every
# method - TODO confirm the order against the source results.
accuracy = {
    'GAKM':   [0.680, 0.533, 0.582, 0.471, 0.464, 0.441, 0.613, 0.664, 0.210, 0.865],
    'KM':     [0.331, 0.464, 0.633, 0.449, 0.482, 0.461, 0.601, 0.898, 0.229, 0.865],
    'SCPSO':  [0.321, 0.533, 0.697, 0.521, 0.422, 0.648, 0.455, 0.705, 0.176, 0.865],
    'SKM':    [0.395, 0.629, 0.661, 0.471, 0.607, 0.516, 0.802, 0.799, 0.288, 0.881],
    'NMF-FR': [0.432, 0.586, 0.777, 0.530, 0.575, 0.639, 0.765, 0.907, 0.376, 0.868],
    'LSAKM':  [0.388, 0.587, 0.678, 0.521, 0.529, 0.696, 0.793, 0.933, 0.316, 0.865]
}


# Hand-entered NMI scores, same layout as `accuracy`.
# NOTE(review): GAKM's first NMI (0.770) is far above its other NMI values
# and its ARI at the same position (0.054) - possibly a transcription error
# (0.077?); verify against the source results.
nmi = {
    'GAKM':   [0.770, 0.111, 0.230, 0.033, 0.108, 0.100, 0.307, 0.417, 0.093, 0.113],
    'KM':     [0.107, 0.120, 0.406, 0.032, 0.405, 0.169, 0.385, 0.761, 0.178, 0.297],
    'SCPSO':  [0.162, 0.205, 0.530, 0.099, 0.251, 0.383, 0.212, 0.581, 0.088, 0.070],
    'SKM':    [0.163, 0.332, 0.452, 0.057, 0.348, 0.399, 0.771, 0.695, 0.179, 0.300],
    'NMF-FR': [0.175, 0.238, 0.531, 0.068, 0.333, 0.361, 0.654, 0.763, 0.231, 0.124],
    'LSAKM':  [0.133, 0.288, 0.473, 0.082, 0.220, 0.415, 0.673, 0.815, 0.204, 0.183]
}


# Hand-entered ARI scores, same layout as `accuracy`.
ari = {
    'GAKM':   [0.054, 0.115, 0.231, 0.044, 0.101, 0.076, 0.255, 0.410, 0.032, 0.271],
    'KM':     [0.042, 0.095, 0.280, 0.047, 0.168, 0.102, 0.243, 0.760, 0.032, 0.050],
    'SCPSO':  [0.086, 0.104, 0.480, 0.092, 0.106, 0.404, 0.132, 0.504, 0.026, -0.078],
    'SKM':    [0.123, 0.190, 0.425, 0.035, 0.309, 0.379, 0.751, 0.651, 0.082, 0.499],
    'NMF-FR': [0.139, 0.254, 0.529, 0.070, 0.299, 0.373, 0.578, 0.784, 0.151, 0.014],
    'LSAKM':  [0.091, 0.198, 0.427, 0.063, 0.200, 0.435, 0.582, 0.848, 0.116, 0.018]
}


def run_wilcoxon(metric_dict, metric_name, reference='NMF-FR', alpha=0.05):
    """Print Wilcoxon signed-rank p-values of one method against all others.

    The scores of ``reference`` are compared, as paired samples, with the
    scores of every other method in ``metric_dict`` (in dictionary order),
    and one line per comparison is printed.

    Args:
        metric_dict: maps method name to a sequence of scores; all sequences
            must have the same length because the samples are paired.
        metric_name: label used in the printed heading.
        reference: key of the method every other method is compared with.
        alpha: threshold below which a p-value is labelled "(significant)".

    Raises:
        KeyError: if ``reference`` is not a key of ``metric_dict``.

    Note:
        The printed p-values are not corrected for multiple comparisons.
    """
    print(f"\n=== Wilcoxon Test for {metric_name} ===")
    baseline = metric_dict[reference]
    for method, scores in metric_dict.items():
        if method == reference:
            continue
        p = wilcoxon(baseline, scores).pvalue
        verdict = '(significant)' if p < alpha else '(not significant)'
        print(f"{reference} vs {method}: p-value = {p:.6f} {verdict}")


# Run the comparison once per metric table, in the order Accuracy, NMI, ARI.
for _table, _label in ((accuracy, "Accuracy"), (nmi, "NMI"), (ari, "ARI")):
    run_wilcoxon(_table, _label)



=== Wilcoxon Test for Accuracy ===
NMF-FR vs GAKM: p-value = 0.083984 (not significant)
NMF-FR vs KM: p-value = 0.001953 (significant)
NMF-FR vs SCPSO: p-value = 0.007812 (significant)
NMF-FR vs SKM: p-value = 0.130859 (not significant)
NMF-FR vs LSAKM: p-value = 0.375000 (not significant)

=== Wilcoxon Test for NMI ===
NMF-FR vs GAKM: p-value = 0.083984 (not significant)
NMF-FR vs KM: p-value = 0.160156 (not significant)
NMF-FR vs SCPSO: p-value = 0.037109 (significant)
NMF-FR vs SKM: p-value = 0.556641 (not significant)
NMF-FR vs LSAKM: p-value = 0.845703 (not significant)

=== Wilcoxon Test for ARI ===
NMF-FR vs GAKM: p-value = 0.027344 (significant)
NMF-FR vs KM: p-value = 0.009766 (significant)
NMF-FR vs SCPSO: p-value = 0.009766 (significant)
NMF-FR vs SKM: p-value = 0.625000 (not significant)
NMF-FR vs LSAKM: p-value = 0.375000 (not significant)
