# Wikipedia NLP Analysis

## Overview
This notebook performs comprehensive NLP analysis on a corpus of Wikipedia articles, including:
- Text preprocessing and cleaning
- Document clustering (KMeans, DBSCAN, OPTICS)
- Word embeddings (GloVe)
- Dimensionality reduction and visualization
- Multi-label classification

## Dataset
- **Source**: Wikipedia articles
- **Size**: 289 documents
- **Features**: Article text, categories, metadata


In [None]:
import os
import re
import wikipediaapi as wiki_api

class WikipediaReader():
    """Collect Wikipedia pages and save their text as local files.

    Pages are gathered in ``self.pages`` through ``add_article`` or
    ``crawl_pages`` and written to ``self.article_path`` by ``process``.
    """

    # Case-insensitive substrings; a category containing any of them is
    # dropped by ``get_categories``.
    _EXCLUDED_CATEGORY_MARKERS = (
        'articles',
        'pages',
        'wikipedia',
        'cs1',
        'webarchive',
        'dmy dates',
        'short description',
        'commons category',
    )

    def __init__(self, dir = "articles"):
        """Create the API client and make sure the article directory exists.

        :param dir: Directory, relative to the current working directory,
            that ``process`` writes the article files into.
        """
        self.pages = set()
        self.article_path = os.path.join("./", dir)
        self.wiki = wiki_api.Wikipedia(user_agent = 'jmoses126@gmail.com',
                language = 'en',
                extract_format=wiki_api.ExtractFormat.WIKI)
        try:
            os.makedirs(self.article_path, exist_ok=True)
        except OSError as e:
            # Construction stays best-effort (the reader is also used only
            # for category lookups), but the failure is no longer silent.
            print(f'Could not create {self.article_path}: {e}')

    def reset(self):
        """Reset the reader by clearing all stored pages."""
        self.pages = set()

    def _get_page_title(self, article):
        """Return ``article`` with each whitespace run replaced by ``_``."""
        return re.sub(r'\s+', '_', article)

    def add_article(self, article):
        """Look up ``article`` and remember the page if it exists.

        :param article: Article title; whitespace is converted to ``_``.
        :return: The page object when the page exists, otherwise ``None``
            (also ``None`` when the lookup raised; the error is printed).
        """
        try:
            page = self.wiki.page(self._get_page_title(article))
            if page.exists():
                self.pages.add(page)
                return page
        except Exception as e:
            print(e)
        return None

    def list(self):
        """Return the set of pages collected so far."""
        return self.pages

    def process(self, update=False):
        """Write the text of every collected page to ``<title>.txt``.

        The file name is the page title with whitespace replaced by ``_``
        and the characters ``(``, ``)`` and ``:`` removed.

        :param update: When true, overwrite files that already exist.
        """
        for page in self.pages:
            filename = re.sub('\\s+', '_', f'{page.title}')
            filename = re.sub(r'[\(\):]','', filename)
            file_path = os.path.join(self.article_path, f'{filename}.txt')
            if update or not os.path.exists(file_path):
                print(f'Downloading {page.title} ...')
                content = page.text
                # The preprocessing step reads these files back as UTF-8, so
                # write them as UTF-8 instead of the platform default.
                with open(file_path, 'w', encoding='utf-8') as file:
                    file.write(content)
            else:
                print(f'Not updating {page.title} ...')

    def crawl_pages(self, article, depth = 3, total_number = 1000):
        """Add ``article`` and the pages it links to, recursively.

        :param article: Title of the page to start from.
        :param depth: Number of link levels to follow; 1 adds only the
            direct links of ``article``.
        :param total_number: No further links are added once
            ``len(self.pages)`` reaches this value.
        """
        print(f'Crawl {total_number} :: {article}')

        page = self.add_article(article)
        childs = set()

        if page:
            for child in page.links:
                if len(self.pages) < total_number:
                    print(f'Add article {len(self.pages)}/{total_number} {child}')
                    self.add_article(child)
                    childs.add(child)

        depth -= 1
        if depth > 0:
            for child in sorted(childs):
                if len(self.pages) < total_number:
                    # Hand the caller's limit down unchanged. Passing the
                    # current page count here made every nested crawl start
                    # at its limit, so deeper levels were never added.
                    self.crawl_pages(child, depth, total_number)

    def get_categories(self, title):
        """Return the topical categories of ``title`` as ``{name: 1}``.

        Category names are lower-cased with the ``Category:`` prefix removed;
        categories containing any marker in ``_EXCLUDED_CATEGORY_MARKERS``
        are skipped. The lookup goes through ``add_article``, so the page is
        also added to ``self.pages``.

        :param title: Article title.
        :return: Dict mapping category name to 1; empty when the page was
            not found or has no remaining categories.
        """
        page = self.add_article(title)
        if not page:
            return {}

        categories = [
            name.replace('Category:', '').lower()
            for name in page.categories
            if not any(marker in name.lower()
                       for marker in self._EXCLUDED_CATEGORY_MARKERS)
        ]
        return dict.fromkeys(categories, 1)

: 

In [None]:
import sys
print(sys.executable)


In [None]:
import nltk
from  nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from time import time

class WikipediaCorpus(CategorizedPlaintextCorpusReader):
    """Categorized plaintext corpus reader with a few summary helpers."""

    def __init__(self, root, fileids, cat_pattern=None, **kwargs):
        """
        Initialize the WikipediaCorpus reader.

        :param root: The root directory for corpus files
        :param fileids: File pattern(s) to match
        :param cat_pattern: Regex pattern to extract categories from filenames
        """
        super().__init__(root, fileids, cat_pattern=cat_pattern, **kwargs)

    def vocab(self):
        """Return a frequency distribution over the normalized corpus words.

        Each word is lower-cased and every run of characters other than
        letters, digits, ``,``, ``;`` and ``.`` is replaced by one space.
        """
        # Read from this instance; the earlier version read the module-level
        # ``corpus`` variable, which fails or counts the wrong corpus
        # whenever that name is unbound or rebound.
        return nltk.FreqDist(
            re.sub(r'[^A-Za-z0-9,;\.]+', ' ', word).lower()
            for word in self.words()
        )

    def max_words(self):
        """Return the word count of the longest document (0 if there are none)."""
        return max((len(self.words(doc)) for doc in self.fileids()), default=0)

    def describe(self, fileids=None, categories=None):
        """Return corpus-wide counts plus the time taken to compute them.

        :param fileids: Accepted but currently unused.
        :param categories: Accepted but currently unused.
        :return: Dict with keys ``files``, ``paras``, ``sents``, ``words``,
            ``vocab``, ``max_words`` and ``time`` (seconds).
        """
        started = time()

        return {
            'files': len(self.fileids()),
            'paras': len(self.paras()),
            'sents': len(self.sents()),
            'words': len(self.words()),
            'vocab': len(self.vocab()),
            'max_words': self.max_words(),
            'time': time()-started
            }

# Select every file except those whose path starts with ".ipynb" (Jupyter's
# hidden checkpoint directory). The previous pattern, r'[^\\.ipynb].*', was a
# character class, so it also skipped any file whose name begins with one of
# the characters \ . i p y n b.
# NOTE(review): cat_pattern r'[.*]' is a character class matching a literal
# '.' or '*' and has no capture group; it is kept unchanged because the
# intended category scheme cannot be determined from this cell — confirm.
corpus = WikipediaCorpus(root='articles', fileids=r'(?!\.ipynb).*', cat_pattern=r'[.*]')
print(corpus.fileids())

In [None]:
import ssl

# Fix SSL certificate issue for NLTK downloads on macOS
# NOTE(security): the assignment in the else-branch replaces ssl's default
# HTTPS context factory with the unverified one for the rest of this process,
# not only for the NLTK download below.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This ssl module has no _create_unverified_context; leave defaults alone.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download required NLTK data
# NOTE(review): only 'punkt_tab' is fetched here, while TextPreprocessor also
# uses the NLTK stopwords corpus and WordNetLemmatizer — confirm that data is
# installed separately.
nltk.download('punkt_tab', quiet=True)

class WikipediaPlaintextCorpus(CategorizedPlaintextCorpusReader):
    """Corpus reader over every ``.txt`` file below a root directory."""

    def __init__(self, root_path):
        """Open the corpus rooted at ``root_path``.

        :param root_path: Directory that holds the article text files.
        """
        # The base reader requires a file pattern in addition to the root.
        super().__init__(root_path, r'.*\.txt', cat_pattern=r'.*')

    def vocab(self):
        """Return a frequency distribution over the normalized corpus words."""
        normalized = (
            re.sub(r'[^A-Za-z0-9,;\.]+', ' ', token).lower()
            for token in self.words()
        )
        return nltk.FreqDist(normalized)

    def max_words(self):
        """Return the word count of the longest document (0 if there are none)."""
        longest = 0
        for fileid in self.fileids():
            longest = max(longest, len(self.words(fileid)))
        return longest

    def describe(self, fileids=None, categories=None):
        """Return corpus-wide counts plus the seconds spent computing them.

        ``fileids`` and ``categories`` are accepted but not used.
        """
        started = time()

        summary = {}
        summary['files'] = len(self.fileids())
        summary['paras'] = len(self.paras())
        summary['sents'] = len(self.sents())
        summary['words'] = len(self.words())
        summary['vocab'] = len(self.vocab())
        summary['max_words'] = self.max_words()
        # Measured last so it covers all of the counts above.
        summary['time'] = time() - started
        return summary

# Open the downloaded articles as a corpus and print its summary statistics.
# This rebinds the module-level name `corpus`.
root_path = './articles'
corpus = WikipediaPlaintextCorpus(root_path)
print(corpus.describe())

## 1. Setup and Configuration
Load required libraries and define custom classes for the analysis pipeline.


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class WikipediaCorpusTransformer(BaseEstimator, TransformerMixin):
    """
    Pipeline entry step: opens the Wikipedia corpus under ``root_path`` and
    emits its file IDs. The input ``X`` is ignored by every method.
    """
    def __init__(self, root_path):
        self.root_path = root_path
        self.corpus = None

    def fit(self, X=None, y=None):
        """Open the corpus found at ``self.root_path``."""
        self.corpus = WikipediaPlaintextCorpus(self.root_path)
        return self

    def _loaded_corpus(self):
        """Return the corpus, opening it first when ``fit`` has not run yet."""
        if self.corpus is None:
            self.fit()
        return self.corpus

    def transform(self, X=None):
        """Return the file IDs of the corpus."""
        return self._loaded_corpus().fileids()

    def get_corpus(self):
        """Return the corpus object itself."""
        return self._loaded_corpus()

In [None]:
class Categorizer(BaseEstimator, TransformerMixin):
    """
    Assigns categories to documents based on Wikipedia page categories.

    :param wikipedia_reader: Object exposing ``get_categories(title)``.
    """
    def __init__(self, wikipedia_reader):
        # scikit-learn reads constructor arguments back through attributes of
        # the same name (get_params, clone, the pipeline repr), so the
        # argument is stored under its own name.
        self.wikipedia_reader = wikipedia_reader

    @property
    def reader(self):
        """Alias for ``wikipedia_reader``, kept for existing callers."""
        return self.wikipedia_reader

    @reader.setter
    def reader(self, value):
        self.wikipedia_reader = value

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Transform file IDs to include category information.

        :param X: Iterable of file IDs such as ``'Some_Article.txt'``.
        :return: List of dicts with keys ``fileid``, ``title`` and
            ``categories``, one per input file ID and in the same order.
        """
        categorized_docs = []
        for fileid in X:
            # Extract article title from filename
            title = fileid.replace('.txt', '').replace('_', ' ')
            categorized_docs.append({
                'fileid': fileid,
                'title': title,
                'categories': self.reader.get_categories(title),
            })
        return categorized_docs


class TextPreprocessor(BaseEstimator, TransformerMixin):
    """
    Preprocesses text from the corpus files with comprehensive text cleaning.

    Parameters:
    -----------
    root_path : str
        Path to directory containing text files
    lowercase : bool, default=True
        Convert text to lowercase
    remove_stopwords : bool, default=True
        Remove common stopwords using NLTK
    lemmatize : bool, default=True
        Apply lemmatization using WordNetLemmatizer
    remove_punctuation : bool, default=True
        Remove punctuation and special characters
    remove_numbers : bool, default=False
        Remove numeric digits
    min_token_length : int, default=3
        Minimum length for tokens to keep
    """
    def __init__(self, root_path, lowercase=True, remove_stopwords=True, 
                 lemmatize=True, remove_punctuation=True, 
                 remove_numbers=False, min_token_length=3):
        self.root_path = root_path
        self.lowercase = lowercase
        self.remove_stopwords = remove_stopwords
        self.lemmatize = lemmatize
        self.remove_punctuation = remove_punctuation
        self.remove_numbers = remove_numbers
        self.min_token_length = min_token_length

        # Initialize lemmatizer and stopwords
        if self.lemmatize:
            from nltk.stem import WordNetLemmatizer
            self.lemmatizer = WordNetLemmatizer()
        if self.remove_stopwords:
            from nltk.corpus import stopwords
            self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def _clean_text(self, text):
        """Apply the character-level cleaning steps and return the result.

        Order: optional lower-casing, URL removal, e-mail removal, optional
        digit removal, optional punctuation removal, whitespace collapsing.
        """
        # Lowercase
        if self.lowercase:
            text = text.lower()

        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)

        # Remove numbers if requested
        if self.remove_numbers:
            text = re.sub(r'\d+', '', text)

        # Remove punctuation if requested
        if self.remove_punctuation:
            text = re.sub(r'[^\w\s]', ' ', text)

        # Remove extra whitespace
        return re.sub(r'\s+', ' ', text).strip()

    def _process_tokens(self, text):
        """Split on whitespace, filter and lemmatize; return the joined tokens."""
        # Filter short tokens
        tokens = [t for t in text.split() if len(t) >= self.min_token_length]

        # Remove stopwords
        if self.remove_stopwords:
            tokens = [t for t in tokens if t not in self.stop_words]

        # Lemmatize
        if self.lemmatize:
            tokens = [self.lemmatizer.lemmatize(t) for t in tokens]

        return ' '.join(tokens)

    def transform(self, X):
        """Read and preprocess text from files.

        :param X: Iterable of file IDs, or of dicts carrying ``fileid`` and
            optionally ``categories``.
        :return: List of dicts with keys ``text``, ``fileid`` and
            ``categories``. A file that cannot be read is reported and
            skipped, so the result can be shorter than ``X``.
        """
        processed_docs = []
        for doc in X:
            is_record = isinstance(doc, dict)
            fileid = doc['fileid'] if is_record else doc
            filepath = os.path.join(self.root_path, fileid)

            # Only reading the file is best-effort. Errors in the cleaning
            # steps (for example missing NLTK data) now propagate instead of
            # being printed once per file and leaving an empty result.
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    raw_text = f.read()
            except (OSError, UnicodeDecodeError) as e:
                print(f"Error processing {fileid}: {e}")
                continue

            processed_docs.append({
                'text': self._process_tokens(self._clean_text(raw_text)),
                'fileid': fileid,
                'categories': doc.get('categories', {}) if is_record else {}
            })

        return processed_docs


class TextTokenizer(BaseEstimator, TransformerMixin):
    """
    Splits each document's text into word tokens with ``nltk.word_tokenize``.
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return one ``{'tokens', 'fileid', 'categories'}`` dict per document.

        Dict inputs supply ``text`` plus optional ``fileid`` and
        ``categories``; any other input is treated as the text itself.
        """
        tokenized_docs = []
        for doc in X:
            if isinstance(doc, dict):
                text = doc['text']
                fileid = doc.get('fileid', '')
                categories = doc.get('categories', {})
            else:
                text, fileid, categories = doc, '', {}

            tokenized_docs.append({
                'tokens': nltk.word_tokenize(text),
                'fileid': fileid,
                'categories': categories,
            })
        return tokenized_docs


class BagOfWordVectorizer(BaseEstimator, TransformerMixin):
    """
    Turns tokenized documents into per-document token-count dictionaries.
    """
    def __init__(self, corpus):
        self.corpus = corpus
        self.vocabulary = None

    def fit(self, X, y=None):
        """Store ``corpus.vocab()`` in ``self.vocabulary`` and return ``self``."""
        self.vocabulary = self.corpus.vocab()
        return self

    def transform(self, X):
        """Return one ``{'vector', 'fileid', 'categories'}`` dict per document.

        ``vector`` maps each token to its count within that document. Dict
        inputs supply ``tokens`` plus optional ``fileid`` and ``categories``;
        any other input is treated as the token sequence itself.
        """
        vectors = []
        for doc in X:
            if isinstance(doc, dict):
                tokens = doc['tokens']
                fileid = doc.get('fileid', '')
                categories = doc.get('categories', {})
            else:
                tokens, fileid, categories = doc, '', {}

            counts = nltk.FreqDist(tokens)
            vectors.append({
                'vector': dict(counts),
                'fileid': fileid,
                'categories': categories,
            })
        return vectors


In [None]:
from sklearn.pipeline import Pipeline

root_path = './articles'

# End-to-end feature pipeline: file IDs -> categories -> cleaned text ->
# tokens -> per-document token counts.
# Constructing WikipediaReader() creates a Wikipedia API client, and the
# vectorizer step is given its own corpus reader over the same root path.
pipeline = Pipeline([
    ('corpus', WikipediaCorpusTransformer(root_path=root_path)),
    ('categorizer', Categorizer(WikipediaReader())),
    ('preprocessor', TextPreprocessor(root_path=root_path)),
    ('tokenizer', TextTokenizer()),
    ('vectorizer', BagOfWordVectorizer(WikipediaPlaintextCorpus(root_path))),
])

In [None]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer

# Run the pipeline to get the vectorized documents
# The corpus step ignores its input, which is why None is passed here.
transformed_data = pipeline.fit_transform(None)

# Create a DataFrame with the results, including preprocessed text
X = pd.DataFrame([{
    'fileid': doc['fileid'],
    'bow': doc['vector'],
    'categories': doc['categories']
} for doc in transformed_data])

# Add preprocessed text by reading from the pipeline's preprocessor output
# We need to re-run just the preprocessing steps to get the text
# NOTE(review): this calls the categorizer a second time, which looks up the
# categories of every article again through the Wikipedia reader.
preprocessed_data = pipeline.named_steps['preprocessor'].transform(
    pipeline.named_steps['categorizer'].transform(
        pipeline.named_steps['corpus'].transform(None)
    )
)

# Add the preprocessed text to the DataFrame
# The assignment relies on preprocessed_data having one entry per row of X,
# in the same order as transformed_data.
X['preprocessed'] = [doc['text'] for doc in preprocessed_data]

# Add tokens from the tokenizer output
tokenized_data = pipeline.named_steps['tokenizer'].transform(preprocessed_data)
X['tokens'] = [doc['tokens'] for doc in tokenized_data]

# Now create the feature matrix
# sparse=False makes DictVectorizer return a dense array with one column per
# distinct token.
vectorizer = DictVectorizer(sparse=False)
x_train = vectorizer.fit_transform(X['bow'].to_numpy())

## 2. Document Clustering Analysis
Apply various clustering algorithms to discover document groups.

### 2.1 KMeans Clustering
Partition-based clustering that assigns every document to one of a predefined number of clusters.


In [None]:
from sklearn.cluster import KMeans

# Create and fit KMeans model
# x_train is the dense bag-of-words matrix; random_state=0 fixes the
# centroid initialisation so the labels are reproducible.
kmeans = KMeans(n_clusters=8, random_state=0)
kmeans.fit(x_train)

# Verify that labels were created
print(f"Number of documents clustered: {len(kmeans.labels_)}")
print(f"Cluster assignments: {kmeans.labels_}")

In [None]:
import matplotlib.pyplot as plt

# Plot each document's cluster label (y) against its row position (x).
plt.ylabel('Clusters')
plt.xlabel('Document ID')
plt.plot(kmeans.labels_, 'o')

In [None]:
# Histogram of the cluster labels; bins=8 mirrors the n_clusters=8 used to
# fit `kmeans`.
plt.hist(x=kmeans.labels_,  bins=8, density=False)
plt.grid(True)
plt.show()


In [None]:
def get_cluster_articles(c_id, lables):
    """Return the article titles of the documents assigned to cluster ``c_id``.

    ``lables`` holds one cluster label per row of the global DataFrame ``X``;
    a title is the row's file ID without ``.txt`` and with ``_`` turned into
    spaces.
    """
    titles = []
    for position, label in enumerate(lables):
        if label == c_id:
            fileid = X['fileid'][position]
            titles.append(fileid.replace('.txt', '').replace('_', ' '))
    return titles

print(get_cluster_articles(0, kmeans.labels_))

In [None]:
def evaluate_clustering(X, labels, method_name="Clustering"):
    """
    Evaluate clustering quality using multiple metrics.

    Samples labelled -1 (noise in DBSCAN/OPTICS) are excluded before the
    metrics are computed.

    Parameters:
    -----------
    X : array-like
        Feature matrix used for clustering
    labels : array-like
        Cluster labels assigned to each sample (lists are accepted)
    method_name : str
        Name of clustering method for display

    Returns:
    --------
    dict : Dictionary with keys 'method', 'n_clusters', 'n_noise',
        'n_samples', 'silhouette', 'davies_bouldin' and 'calinski_harabasz'.
        The three scores are None when fewer than two non-noise samples or
        fewer than two clusters remain.
    """
    import numpy as np

    # Coerce so plain Python lists support the element-wise comparisons below.
    labels = np.asarray(labels)

    # Filter out noise points (label -1) for DBSCAN/OPTICS
    mask = labels >= 0
    n_noise = int((labels == -1).sum())
    n_samples = len(labels)

    def _result(n_clusters, silhouette=None, davies_bouldin=None,
                calinski_harabasz=None):
        # Single place that defines the shape of the returned dictionary.
        return {
            'method': method_name,
            'n_clusters': n_clusters,
            'n_noise': n_noise,
            'n_samples': n_samples,
            'silhouette': silhouette,
            'davies_bouldin': davies_bouldin,
            'calinski_harabasz': calinski_harabasz
        }

    if mask.sum() < 2:
        print(f"{method_name}: Insufficient non-noise samples for evaluation")
        return _result(0)

    # Get unique clusters (excluding noise)
    unique_clusters = len(np.unique(labels[mask]))

    if unique_clusters < 2:
        print(f"{method_name}: Only {unique_clusters} cluster(s) found")
        return _result(unique_clusters)

    # Imported only once scores are actually needed, so the early returns
    # above do not depend on scikit-learn.
    from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

    # Calculate metrics only on non-noise points
    features = X if hasattr(X, 'shape') else np.asarray(X)
    X_filtered = features[mask]
    labels_filtered = labels[mask]

    metrics = _result(
        unique_clusters,
        silhouette=silhouette_score(X_filtered, labels_filtered),
        davies_bouldin=davies_bouldin_score(X_filtered, labels_filtered),
        calinski_harabasz=calinski_harabasz_score(X_filtered, labels_filtered),
    )

    # Print formatted results
    print(f"\n{method_name} Evaluation:")
    print(f"  Samples: {metrics['n_samples']} ({metrics['n_noise']} noise)")
    print(f"  Clusters: {metrics['n_clusters']}")
    print(f"  Silhouette Score: {metrics['silhouette']:.4f} (higher is better, range: -1 to 1)")
    print(f"  Davies-Bouldin Index: {metrics['davies_bouldin']:.4f} (lower is better)")
    print(f"  Calinski-Harabasz Score: {metrics['calinski_harabasz']:.2f} (higher is better)")

    return metrics


# Example usage (commented out - uncomment to test after running clustering)
# metrics_kmeans = evaluate_clustering(x_train, kmeans.labels_, "KMeans")


In [None]:
def plot_elbow(X, k_range=range(2, 21), random_state=42):
    """
    Plot elbow curve to determine optimal number of clusters for KMeans.

    One KMeans model is fitted per value in ``k_range``. When at least three
    values are tested, the k with the largest discrete second difference of
    the inertia curve is marked on the plot and printed as the suggestion.

    Parameters:
    -----------
    X : array-like
        Feature matrix for clustering
    k_range : range or list
        Range of k values to test (default: 2 to 20)
    random_state : int
        Random state for reproducibility

    Returns:
    --------
    list : Inertia values for each k
    """
    from sklearn.cluster import KMeans
    import matplotlib.pyplot as plt
    import numpy as np

    inertias = []
    k_values = list(k_range)

    print("Calculating inertia for different k values...")
    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=random_state, n_init=10)
        kmeans.fit(X)
        inertias.append(kmeans.inertia_)
        print(f"  k={k}: inertia={kmeans.inertia_:.2f}")

    # Plot elbow curve
    plt.figure(figsize=(10, 6))
    plt.plot(k_values, inertias, 'bo-', linewidth=2, markersize=8)
    plt.xlabel('Number of Clusters (k)', fontsize=12)
    plt.ylabel('Inertia (Within-cluster sum of squares)', fontsize=12)
    plt.title('Elbow Method for Optimal k', fontsize=14, fontweight='bold')
    plt.grid(True, alpha=0.3)
    plt.xticks(k_values)

    # Try to identify elbow point using second derivative
    inertias_arr = np.array(inertias)
    if len(inertias_arr) > 2:
        # second_deriv[j] = y[j+2] - 2*y[j+1] + y[j], i.e. the curvature at
        # position j+1 of the inertia curve.
        second_deriv = np.diff(inertias_arr, n=2)
        # The sharpest bend therefore sits at index argmax + 1; the earlier
        # "+ 2" offset reported the k one step after the elbow.
        elbow_idx = int(np.argmax(second_deriv)) + 1
        elbow_k = k_values[elbow_idx]

        # Highlight elbow point
        plt.axvline(x=elbow_k, color='r', linestyle='--', alpha=0.7,
                    label=f'Suggested k={elbow_k}')
        plt.legend()
        print(f"\nSuggested optimal k (elbow point): {elbow_k}")

    plt.tight_layout()
    plt.show()

    return inertias


# Example usage (commented out - uncomment to run)
# inertias = plot_elbow(x_train, k_range=range(2, 21))


In [None]:
from wordcloud import WordCloud

def wordcloud_for_cluster(c_id, lables):
    """Show a word cloud of the preprocessed text of one cluster.

    :param c_id: Cluster label to visualise.
    :param lables: One cluster label per row of the global DataFrame ``X``.
    """
    text = ' '.join(X['preprocessed'][i] for i, l in enumerate(lables) if l == c_id)

    # A cluster without documents (or without any remaining text) gives the
    # word cloud nothing to draw, so report it instead of failing.
    if not text.strip():
        print(f'No text available for cluster {c_id}')
        return

    wordcloud = WordCloud(max_font_size=50, max_words=20).generate(text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

wordcloud_for_cluster(0, kmeans.labels_)


In [None]:
from sklearn.feature_extraction import DictVectorizer

# Rebuild the dense bag-of-words matrix from the per-document count dicts and
# inspect it. The commented values below are sample output recorded from an
# earlier run.
vectorizer = DictVectorizer(sparse=False)
x_train = vectorizer.fit_transform(X['bow'])

print(type(x_train))
#numpy.ndarray

print(x_train)
#[[ 15.   0.  10. ...   0.   0.   0.]
# [662.   0. 430. ...   0.   0.   0.]
# [316.   0. 143. ...   0.   0.   0.]
# ...
# [319.   0. 217. ...   0.   0.   0.]
# [158.   0. 147. ...   0.   0.   0.]
# [328.   0. 279. ...   0.   0.   0.]]

print(x_train.shape)
# (272, 52743)

# One feature (column) per distinct token.
print(vectorizer.get_feature_names_out())
# array([',', ',1', '.', ..., 'zy', 'zygomaticus', 'zygote'], dtype=object)

print(len(vectorizer.get_feature_names_out()))
# 52743


In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

def pca_reduce(vec_list, dimensions=2):
    """Fit a PCA with ``dimensions`` components on ``vec_list`` and return the projection."""
    reducer = PCA(dimensions)
    return reducer.fit_transform(vec_list)

def d2_plot(data):
    """Scatter-plot 2-D points: first column on x, second column on y.

    Passing the whole array to ``plt.plot`` drew each column as a separate
    series against the row index instead of plotting the points themselves.
    """
    xs = [point[0] for point in data]
    ys = [point[1] for point in data]
    plt.plot(xs, ys, 'o')

d2_plot(pca_reduce(x_train,2))





In [None]:
def d2_plot(data):
    """Scatter-plot 2-D points with small markers: column 0 on x, column 1 on y.

    Passing the whole array to ``plt.plot`` drew each column as a separate
    series against the row index instead of plotting the points themselves.
    """
    xs = [point[0] for point in data]
    ys = [point[1] for point in data]
    plt.plot(xs, ys, '.')

d2_plot(pca_reduce(x_train,2))

In [None]:
def d3_plot(data):
    """Draw 3-D points, coloured by row position.

    Rows 0-89 are red, rows 90-179 green and all remaining rows blue.
    """
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')

    coloured_segments = (
        (data[:90], 'r'),
        (data[90:180], 'g'),
        (data[180:], 'b'),
    )
    for segment, colour in coloured_segments:
        for point in segment:
            ax.scatter(point[0], point[1], point[2], marker='.', color=colour)

    plt.show()

d3_plot(pca_reduce(x_train,3))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace the bag-of-words features with TF-IDF weights computed from the
# preprocessed text; x_train is rebound twice (list of strings, then matrix).
x_train = X['preprocessed'].tolist()

vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)

print(x_train.shape)
# (272, 40337)

print(x_train)


In [None]:
import gensim.downloader as api
import numpy as np

vocab = corpus.vocab()
vector_lookup = api.load('glove-wiki-gigaword-50')

def word_vector(tokens):
    """Return the embedding vectors of the tokens known to both lookups.

    Tokens missing from the corpus vocabulary or from the embedding model
    are skipped; the remaining vectors are stacked in token order.
    """
    embedded = []
    for token in tokens:
        if token in vocab and token in vector_lookup:
            embedded.append(vector_lookup[token])
    return np.array(embedded)

X['word_vector'] = X['tokens'].apply(word_vector)


In [None]:
# Number of embedded tokens per document.
word_vector_length = np.array([len(vectors) for vectors in X['word_vector']])

print(word_vector_length[:5])
# [760, 157, 7566, 2543, 2086]

# Roughly one bin per 1000 tokens, but never fewer than one bin: when the
# longest document has under 1000 vectors the division would give 0 bins.
bins = max(1, int(np.max(word_vector_length) / 1000))

plt.hist(x=word_vector_length,  bins=bins, density=False)
plt.show()

print(f'Mean: {word_vector_length.mean()}')


In [None]:
def pad_word_vectors(vec_list, padding_value):
    """Flatten each entry and force it to exactly ``padding_value`` values.

    Every entry of ``vec_list`` is flattened to 1-D, zero-padded at the end
    and then cut to ``padding_value`` elements, so shorter entries are padded
    and longer ones truncated. Returns the rows stacked into one array.
    """
    rows = []
    for vec in vec_list:
        flat = np.array(list(vec)).reshape(-1)
        zeros = np.zeros(padding_value, dtype=flat.dtype)
        rows.append(np.concatenate((flat, zeros))[:padding_value])
    return np.array(rows)

def pca_reduce(vec_list, n_components):
    """Fit a PCA with ``n_components`` components on ``vec_list`` and return the projection."""
    reducer = PCA(n_components)
    return reducer.fit_transform(vec_list)

# Use the X DataFrame we already created earlier (no pickle needed!)
x_train = X['word_vector'].to_numpy()

# Every document becomes one flat row of exactly 300000 values (zero-padded
# or truncated), so x_train_padded is dense with 300000 columns.
x_train_padded = pad_word_vectors(x_train, 300000)

# Two independent PCA fits: one 2-D and one 3-D projection.
x_train_2d = pca_reduce(x_train_padded, 2)
x_train_3d = pca_reduce(x_train_padded, 3)

In [None]:
# Fit KMeans on the 2D PCA-reduced vectors to avoid "setting an array element with a sequence" error
# This rebinds the module-level name `model`.
model = KMeans(n_clusters=8, random_state=0).fit(x_train_2d)
print(model)
print(model.get_params())
print(model.labels_)

In [None]:
# Same clustering as the previous cell, but with n_init passed explicitly.
model = KMeans(n_clusters=8, random_state=0, n_init="auto").fit(x_train_2d)

print(model.labels_)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
import pandas as pd

# Rebuild the TF-IDF matrix from the preprocessed text; the DBSCAN cell below
# reads it from x_train.
x_train = X['preprocessed'].tolist()

vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)


### 2.2 DBSCAN Clustering
Density-based clustering that can find arbitrary-shaped clusters and identify noise points.


In [None]:
# DBSCAN Clustering with Parameter Tuning
# DBSCAN works poorly in high-dimensional spaces, so we reduce dimensionality first

from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import numpy as np

# Step 1: Reduce dimensionality to make DBSCAN effective
print("Reducing dimensionality for DBSCAN...")
pca = PCA(n_components=50, random_state=42)
# PCA is given a dense array, so a sparse x_train is densified first.
x_train_pca50 = pca.fit_transform(x_train.toarray() if hasattr(x_train, 'toarray') else x_train)
print(f"Reduced from {x_train.shape[1]} to 50 dimensions")
print(f"Explained variance: {pca.explained_variance_ratio_.sum():.2%}\n")

# Step 2: Find optimal eps using k-distance graph
print("Finding optimal eps parameter using k-distance graph...")
min_samples = 5  # Typical: 2*dim, but we'll use 5 for start
neighbors = NearestNeighbors(n_neighbors=min_samples)
neighbors_fit = neighbors.fit(x_train_pca50)
# NOTE(review): the query set is the training set itself, so each point is
# presumably returned as its own nearest neighbour at distance 0 — confirm
# that the last column is the intended k-distance.
distances, indices = neighbors_fit.kneighbors(x_train_pca50)

# Sort distances to k-th nearest neighbor
k_distances = np.sort(distances[:, min_samples-1], axis=0)

# Plot k-distance graph
plt.figure(figsize=(10, 6))
plt.plot(k_distances)
plt.ylabel(f'{min_samples}-Nearest Neighbor Distance', fontsize=12)
plt.xlabel('Points sorted by distance', fontsize=12)
plt.title('K-Distance Graph for DBSCAN eps Selection', fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.axhline(y=np.percentile(k_distances, 90), color='r', linestyle='--', 
            alpha=0.7, label=f'90th percentile: {np.percentile(k_distances, 90):.2f}')
plt.legend()
plt.tight_layout()
plt.show()

# Use 90th percentile as eps (common heuristic for elbow point)
optimal_eps = np.percentile(k_distances, 90)
print(f"\nSuggested eps (90th percentile): {optimal_eps:.2f}")

# Step 3: Run DBSCAN with tuned parameters
print(f"\nRunning DBSCAN with eps={optimal_eps:.2f}, min_samples={min_samples}...")
dbscan_model = DBSCAN(eps=optimal_eps, min_samples=min_samples)
dbscan_labels = dbscan_model.fit_predict(x_train_pca50)

# Step 4: Evaluate results
print(f"\nDBSCAN Results:")
print(f"  Parameters: eps={optimal_eps:.2f}, min_samples={min_samples}")
unique_labels = set(dbscan_labels)
# Label -1 marks noise and is not counted as a cluster.
n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)
n_noise = list(dbscan_labels).count(-1)
print(f"  Clusters found: {n_clusters}")
print(f"  Noise points: {n_noise} ({n_noise/len(dbscan_labels)*100:.1f}%)")
print(f"  Cluster sizes: {[(label, list(dbscan_labels).count(label)) for label in unique_labels if label != -1]}")

# Step 5: Evaluate clustering quality
# dbscan_metrics is only assigned when at least two clusters were found.
if n_clusters >= 2:
    dbscan_metrics = evaluate_clustering(x_train_pca50, dbscan_labels, "DBSCAN")
else:
    print("\nInsufficient clusters for quality metrics")


In [None]:
from sklearn.decomposition import TruncatedSVD

def pca_reduce(vec_list, n_components):
    """Reduce ``vec_list`` to ``n_components`` dimensions.

    Despite the name, this version uses TruncatedSVD rather than PCA.
    """
    reducer = TruncatedSVD(n_components)
    return reducer.fit_transform(vec_list)

# Project the current x_train to 3-D and cluster it with default DBSCAN settings.
x_train_3d = pca_reduce(x_train, 3)

model = DBSCAN().fit(x_train_3d)

print(model.get_params())
print(model.labels_)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import OPTICS

x_train = X['preprocessed'].tolist()

vectorizer = TfidfVectorizer()
# toarray() returns a regular ndarray; todense() returned an np.matrix, which
# keeps 2-D semantics on every slice and is not a supported input type for
# current scikit-learn estimators.
x_train = vectorizer.fit_transform(x_train).toarray()


### 2.3 OPTICS Clustering
OPTICS (Ordering Points To Identify the Clustering Structure) is a density-based clustering method that can find clusters of varying density.


In [None]:
# OPTICS Clustering with Parameter Tuning
# Like DBSCAN, OPTICS works better with dimensionality reduction

from sklearn.cluster import OPTICS
import matplotlib.pyplot as plt
import numpy as np

# Use the PCA-reduced data from DBSCAN (if not already created, create it)
# NOTE(review): the guard only checks that the name exists; an x_train_pca50
# left over from an earlier run is reused even if x_train has changed since.
if 'x_train_pca50' not in globals():
    from sklearn.decomposition import PCA
    print("Reducing dimensionality for OPTICS...")
    pca = PCA(n_components=50, random_state=42)
    x_train_pca50 = pca.fit_transform(x_train.toarray() if hasattr(x_train, 'toarray') else x_train)
    print(f"Reduced to 50 dimensions\n")

# Try different min_samples values and keep the configuration with the best
# silhouette score.
print("Testing OPTICS with different min_samples values...")
min_samples_options = [5, 10, 15]
best_model = None
best_metrics = None
best_min_samples = None

for min_samples in min_samples_options:
    print(f"\n--- Testing min_samples={min_samples} ---")

    # Run OPTICS with finite max_eps to help with convergence
    optics_model = OPTICS(
        min_samples=min_samples,
        max_eps=10.0,  # Finite max_eps helps prevent single-cluster output
        cluster_method='dbscan',  # Use DBSCAN extraction method
        metric='euclidean'
    )
    optics_labels = optics_model.fit_predict(x_train_pca50)

    # Check results (label -1 marks noise and is not a cluster)
    unique_labels = set(optics_labels)
    n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)
    n_noise = list(optics_labels).count(-1)

    print(f"  Clusters found: {n_clusters}")
    print(f"  Noise points: {n_noise} ({n_noise/len(optics_labels)*100:.1f}%)")

    # Only evaluate when there are at least 2 clusters and fewer clusters
    # than half the number of samples.
    if not (2 <= n_clusters < len(optics_labels) // 2):
        continue

    try:
        metrics = evaluate_clustering(x_train_pca50, optics_labels, f"OPTICS (min_samples={min_samples})")
    except Exception as exc:
        # A bare "except:" would also swallow KeyboardInterrupt/SystemExit;
        # catch ordinary errors only and say what went wrong.
        print(f"  Could not evaluate clustering for min_samples={min_samples}: {exc}")
        continue

    # Compare against None explicitly so a silhouette of exactly 0.0 is
    # treated as a real score rather than as "missing".
    score = metrics['silhouette']
    best_score = None if best_metrics is None else best_metrics.get('silhouette')
    if best_metrics is None or (score is not None and (best_score is None or score > best_score)):
        best_model = optics_model
        best_metrics = metrics
        best_min_samples = min_samples

# Use best model
if best_model is not None:
    print(f"\n{'='*60}")
    print(f"Best OPTICS configuration: min_samples={best_min_samples}")
    print(f"{'='*60}")
    optics_best_labels = best_model.labels_
    
    # Plot reachability distance to visualize cluster structure
    plt.figure(figsize=(12, 6))
    
    # Reachability plot
    # Reachability values and labels are reordered by the model's ordering_.
    space = np.arange(len(x_train_pca50))
    reachability = best_model.reachability_[best_model.ordering_]
    labels = best_model.labels_[best_model.ordering_]
    
    plt.subplot(1, 2, 1)
    # NOTE(review): zip() stops at the shorter input, so only the first 8
    # clusters are drawn; 'orange' also lacks the '.' marker the other
    # entries carry — confirm whether both are intended.
    colors = ['g.', 'r.', 'b.', 'y.', 'c.', 'm.', 'k.', 'orange']
    for klass, color in zip(range(0, max(labels) + 1), colors):
        Xk = space[labels == klass]
        Rk = reachability[labels == klass]
        plt.plot(Xk, Rk, color, alpha=0.5)
    plt.plot(space[labels == -1], reachability[labels == -1], 'k+', alpha=0.1, label='Noise')
    plt.ylabel('Reachability Distance', fontsize=12)
    plt.xlabel('Sample Order', fontsize=12)
    plt.title('Reachability Plot', fontsize=14, fontweight='bold')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    # Cluster distribution
    plt.subplot(1, 2, 2)
    # The range starts at 0, so the "i != -1" filters below never exclude
    # anything; noise (-1) is left out by the range itself.
    cluster_counts = [list(optics_best_labels).count(i) for i in range(max(optics_best_labels) + 1) if i != -1]
    cluster_ids = [i for i in range(max(optics_best_labels) + 1) if i != -1]
    plt.bar(cluster_ids, cluster_counts)
    plt.xlabel('Cluster ID', fontsize=12)
    plt.ylabel('Number of Documents', fontsize=12)
    plt.title('Cluster Size Distribution', fontsize=14, fontweight='bold')
    plt.grid(True, alpha=0.3, axis='y')
    
    plt.tight_layout()
    plt.show()
else:
    # Reached when no tested configuration passed the cluster-count check or
    # could be evaluated.
    print("\nWarning: Could not find suitable OPTICS clustering parameters")
    print("All tested configurations resulted in too few clusters or poor quality")


In [None]:
# NOTE(review): if the cells are run top to bottom, x_train is the dense
# TF-IDF matrix at this point, not the word-vector column — confirm that
# padding it to 300000 columns is intended.
x_train_padded = pad_word_vectors(x_train,300000)

n_clusters = 5
model = KMeans(n_clusters, random_state=0, n_init="auto").fit(x_train_padded )

print(model.labels_)


In [None]:
# Same as the previous cell, but KMeans is fitted on a 3-D reduction of the
# padded rows instead of on the padded rows themselves.
x_train_padded = pad_word_vectors(x_train,300000)
x_train_3d = pca_reduce(x_train_padded,3)

n_clusters = 5
model = KMeans(n_clusters, random_state=0, n_init="auto").fit(x_train_3d)

print(model.labels_)


In [None]:
# NOTE(review): x_train_padded is recomputed here but not used; DBSCAN is
# fitted on the existing x_train_3d — confirm which input was intended.
x_train_padded = pad_word_vectors(x_train,300000)

model = DBSCAN().fit(x_train_3d)

print(model.labels_)

In [None]:
# OPTICS fitted directly on the padded, 300000-column rows.
x_train_padded = pad_word_vectors(x_train,300000)

model = OPTICS(min_samples=10).fit(np.array(x_train_padded))

print(model.labels_)

In [None]:
# OPTICS fitted on a 3-D reduction of the padded rows.
x_train_3d = pca_reduce(x_train_padded,3)

model = OPTICS(min_samples=10).fit(np.array(x_train_3d))

print(model.labels_)


In [None]:
# Same 3-D input with a smaller min_samples.
# NOTE(review): 'minkowski' is believed to be the library's default metric,
# in which case passing it changes nothing — confirm.
model = OPTICS(min_samples=5, metric='minkowski').fit(np.array(x_train_3d))

print(model.get_params())
print(model.labels_)

In [None]:
import wikipediaapi as wiki_api

# English Wikipedia client configured for wiki-format extracts.
wiki = wiki_api.Wikipedia(
    user_agent='jmoses126@gmail.com',
    language='en',
    extract_format=wiki_api.ExtractFormat.WIKI,
)

p = wiki.page('Vehicular_automation')

# Number of categories the API reports for the page.
print(len(p.categories))

# The same categories with the "Category:" text removed from each key.
print([title.replace('Category:', '') for title in p.categories])

In [None]:
import requests

def get_categories(article_name):
    """Return the category names listed on an English Wikipedia article page.

    The article's rendered HTML is downloaded and the ``title="Category:..."``
    attributes inside the ``mw-normal-catlinks`` element are collected.

    Args:
        article_name: Title of the article, e.g. ``'Artificial_intelligence'``.

    Returns:
        ``None`` if the page does not exist; otherwise a list of category
        names, or ``['Uncategorized']`` when no category links are found.

    Raises:
        requests.RequestException: if the page cannot be downloaded,
            including when the request times out.
    """
    # Imported locally because `html` is a very common variable name.
    from html import unescape

    wiki = wiki_api.Wikipedia(language='en', user_agent='jmoses126@gmail.com')
    page = wiki.page(article_name)
    if not page.exists():
        return None

    # Send an explicit User-Agent, and bound the wait so one slow response
    # cannot stall the whole notebook.
    headers = {'User-Agent': 'jmoses126@gmail.com'}
    response = requests.get(page.fullurl, headers=headers, timeout=30)
    page_html = response.text

    catlinks_regexp = re.compile(r'id="mw-normal-catlinks".*?</div>', re.DOTALL)
    catnames_regexp = re.compile(r'title="Category:(.*?)"')

    catlinks_match = catlinks_regexp.search(page_html)
    if catlinks_match is None:
        return ['Uncategorized']

    # Attribute values are HTML-escaped in the page source (e.g. "&amp;"),
    # so decode them to get the real category names.
    cats = [unescape(name) for name in catnames_regexp.findall(catlinks_match.group(0))]
    return cats or ['Uncategorized']


In [None]:
# Manual check: scrape the categories of one article; the returned value is
# shown as the cell output.
get_categories('Artificial_intelligence')




In [None]:
# Manual check on the same article whose API-reported categories were printed
# in an earlier cell.
get_categories('Vehicular_automation')

In [None]:
# Two-stage pipeline: load the corpus from disk, then attach categories.
# NOTE(review): Categorizer is called with a WikipediaReader argument here;
# confirm that the Categorizer definition in scope accepts one.
pipeline = Pipeline(steps=[
    ('corpus', WikipediaCorpusTransformer(root_path=root_path)),
    ('categorizer', Categorizer(WikipediaReader())),
])

In [None]:
class Categorizer(BaseEstimator, TransformerMixin):
    """Transformer that adds a ``categories`` column scraped from Wikipedia.

    For every value in the ``title`` column, the article's HTML page is
    downloaded and the links inside the ``mw-normal-catlinks`` element are
    extracted. Lookups are best-effort: any failure yields
    ``{'Uncategorized': 1}`` instead of aborting the whole transform.
    """

    # Seconds to wait for a single page download before giving up on it.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.wiki = wiki_api.Wikipedia(language='en', user_agent='jmoses126@gmail.com')
        self.catlinks_regexp = re.compile(r'class="mw-normal-catlinks".*?<\/div>')
        self.catnames_regexp = re.compile(r'<a.*?>(.*?)<\/a>')

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        Present so the transformer can be used with ``Pipeline.fit`` and
        ``fit_transform``; there is nothing to learn from the data.
        """
        return self

    def get_categories(self, article_name):
        """Return ``{category_name: 1, ...}`` for one article title.

        Args:
            article_name: Title of the Wikipedia article to look up.

        Returns:
            A dict mapping each category link text to ``1``, or
            ``{'Uncategorized': 1}`` if the page does not exist, cannot be
            downloaded, or lists no categories.
        """
        # Add proper headers to the request
        headers = {
            'user-agent': 'jmoses126@gmail.com'
        }
        try:
            p = self.wiki.page(article_name)
            # exists() must be *called*; the bound method itself is always truthy.
            if not p.exists():
                return {'Uncategorized': 1}
            r = requests.get(p.fullurl, headers=headers, timeout=self.REQUEST_TIMEOUT)
        except Exception as e:
            # Deliberate best-effort: report the failure and keep processing
            # the remaining articles.
            print(e)
            return {'Uncategorized': 1}

        # The catlinks pattern has no DOTALL flag, so flatten the page first.
        html = r.text.replace('\r', '').replace('\n', '')

        catlinks = self.catlinks_regexp.search(html)
        if catlinks is None:
            return {'Uncategorized': 1}

        # The first link is skipped -- presumably the "Categories" label link
        # that precedes the actual category links.
        cats = self.catnames_regexp.findall(catlinks.group(0))[1:]
        if not cats:
            return {'Uncategorized': 1}
        return dict.fromkeys(cats, 1)

    def transform(self, X):
        """Add a ``categories`` column to ``X`` (modified in place) and return it.

        Args:
            X: DataFrame with a ``title`` column of article titles.
        """
        X['categories'] = X['title'].apply(lambda title: self.get_categories(title))
        return X

In [None]:
# Two-stage pipeline: load the corpus from disk, then attach the scraped
# Wikipedia categories to every article.
pipeline = Pipeline(steps=[
    ('corpus', WikipediaCorpusTransformer(root_path=root_path)),
    ('categorizer', Categorizer()),
])

## 4. Multi-Label Classification
Train a model to predict Wikipedia article categories from text content.

### 4.1 Data Preparation
Split the data into train/test sets and convert each split to spaCy's binary `DocBin` format.


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the articles for evaluation; the fixed seed makes the
# shuffled split reproducible.
train, test = train_test_split(
    X,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
import spacy
from spacy.tokens import DocBin
from copy import copy
from itertools import chain

def convert_multi_label(df, filename, categories=None):
    """Serialize articles to a spaCy ``DocBin`` with multi-label ``cats``.

    Every document receives a ``cats`` dict with one key per known category:
    ``1`` for categories listed on its row and ``0`` for all the others.

    Args:
        df: DataFrame with a ``preprocessed`` text column and a ``categories``
            column holding an iterable of category names per row.
        filename: Path the ``DocBin`` is written to.
        categories: Optional iterable with the complete label set. Defaults
            to every category that occurs in ``df``, in first-seen order.
            Pass the same label set for the train and the test split so both
            files carry identical ``cats`` keys; row categories outside the
            given set are ignored.
    """
    nlp = spacy.load('en_core_web_lg')
    total = len(df.index)
    print(f'{time()}: start processing {filename} with {total} files')

    if categories is None:
        categories = chain.from_iterable(df['categories'])
    # dict.fromkeys de-duplicates while keeping a deterministic label order
    # (a set would reorder the labels from run to run).
    label_template = dict.fromkeys(categories, 0)

    db = DocBin()
    for count, (_, row) in enumerate(df.iterrows(), start=1):
        print(f'Processing {count}/{total}')
        doc = nlp(row['preprocessed'])

        cats = dict(label_template)
        for cat in row['categories']:
            # Never add keys: every doc must expose exactly the same labels.
            if cat in cats:
                cats[cat] = 1
        doc.cats = cats
        db.add(doc)

    print(f'{time()}: finish processing {filename}')
    db.to_disk(filename)

In [None]:
# Serialize the training split to a spaCy DocBin file on disk.
convert_multi_label(train, 'wikipedia_multi_label_train2.spacy')


In [None]:
# Serialize the held-out split to its own file. Reusing the training filename
# here would overwrite the training DocBin written by the previous cell.
convert_multi_label(test, 'wikipedia_multi_label_test2.spacy')


### 4.2 Model Training
Train a multi-label text classification model using spaCy's `textcat_multilabel` component.


In [None]:
# Create Spacy training configuration for multi-label text classification
import spacy
from spacy.training import Example
from spacy.tokens import DocBin
import random
from pathlib import Path

# Directory the trained pipeline will be written to.
output_dir = Path("./wikipedia_textcat_model")
output_dir.mkdir(exist_ok=True)

print("Creating blank Spacy model with textcat component...")
nlp = spacy.blank("en")

# Append the multi-label text categorizer to the otherwise empty pipeline.
textcat = nlp.add_pipe("textcat_multilabel", last=True)

# Read the serialized training documents back from disk.
print("\nLoading training data to get categories...")
train_docbin = DocBin().from_disk("wikipedia_multi_label_train2.spacy")
train_docs = list(train_docbin.get_docs(nlp.vocab))

if not train_docs:
    print("Warning: No training documents found!")
else:
    # Every document carries the same cats keys, so the first one is enough
    # to recover the label set.
    categories = list(train_docs[0].cats)
    print(f"Found {len(categories)} categories:")
    if len(categories) > 10:
        print(f"Categories: {sorted(categories)[:10]}...")
    else:
        print(sorted(categories))

    # Register each label with the textcat component.
    for category in categories:
        textcat.add_label(category)

    print(f"\nAdded {len(categories)} labels to textcat component")

print("\nModel configuration complete!")
print(f"Pipeline components: {nlp.pipe_names}")


In [None]:
# Train the model
print("Starting training...")
print(f"Training on {len(train_docs)} documents\n")

# Initialize the model from the training examples. initialize() already
# returns the optimizer to train with, so resume_training() -- which is meant
# for continuing an already-trained pipeline -- is not needed.
optimizer = nlp.initialize(
    lambda: [Example.from_dict(doc, {"cats": doc.cats}) for doc in train_docs]
)

# Training parameters
n_iter = 20  # Number of passes over the training data
batch_size = 8
dropout = 0.2

# Training loop: one accumulated loss value is recorded per pass.
losses_history = []
for epoch in range(n_iter):
    # A fresh document order on every pass.
    random.shuffle(train_docs)
    losses = {}

    for start in range(0, len(train_docs), batch_size):
        batch = train_docs[start:start + batch_size]
        # Each stored doc serves as both the input and the source of gold cats.
        examples = [Example.from_dict(doc, {"cats": doc.cats}) for doc in batch]
        nlp.update(examples, drop=dropout, losses=losses, sgd=optimizer)

    epoch_loss = losses.get('textcat_multilabel', 0)
    losses_history.append(epoch_loss)

    # Print progress every 5 iterations
    if (epoch + 1) % 5 == 0:
        print(f"Iteration {epoch + 1}/{n_iter} - Loss: {epoch_loss:.4f}")

print(f"\nTraining complete! Final loss: {losses_history[-1]:.4f}")

# Save the trained model
print(f"\nSaving model to {output_dir}...")
nlp.to_disk(output_dir)
print("Model saved!")

# Plot training loss; the x-axis follows the recorded history so both
# sequences always have the same length.
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(losses_history) + 1), losses_history, 'b-', linewidth=2)
plt.xlabel('Iteration', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.title('Training Loss Over Time', fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()


### 4.3 Model Evaluation
Evaluate the trained model on the test set and analyze performance.


In [None]:
# Evaluate the model on test data
print("Evaluating model on test set...")

# Load the trained model
nlp_trained = spacy.load(output_dir)

# Prefer the held-out split. If it has not been written yet, fall back to the
# training file and say so, because scores measured on training data are
# optimistic.
test_path = Path("wikipedia_multi_label_test2.spacy")
if not test_path.exists():
    test_path = Path("wikipedia_multi_label_train2.spacy")
    print(f"Warning: held-out test file not found; evaluating on {test_path} instead")

test_docbin = DocBin().from_disk(test_path)
test_docs_raw = list(test_docbin.get_docs(nlp_trained.vocab))
if not test_docs_raw:
    raise ValueError(f"No documents found in {test_path}")

print(f"Loaded {len(test_docs_raw)} test documents")

# Evaluate on test set
from sklearn.metrics import classification_report, hamming_loss, accuracy_score
import numpy as np

# Prepare data for evaluation. All documents in a file share the same cats
# keys, so the first document defines the column order.
y_true = []
y_pred = []
category_names = sorted(test_docs_raw[0].cats.keys())

print(f"\nEvaluating on {len(category_names)} categories...")

for doc in test_docs_raw:
    # Get true labels
    y_true.append([doc.cats[cat] for cat in category_names])

    # Get predictions. A label the model was never trained on has no score;
    # count it as 0 instead of failing with a KeyError.
    predicted = nlp_trained(doc.text)
    y_pred.append([predicted.cats.get(cat, 0.0) for cat in category_names])

# Convert to numpy arrays
y_true = np.array(y_true)
y_pred_probs = np.array(y_pred)

# Convert probabilities to binary predictions (threshold = 0.5)
y_pred_binary = (y_pred_probs > 0.5).astype(int)

# Calculate metrics
print("\n" + "="*70)
print("MULTI-LABEL CLASSIFICATION RESULTS")
print("="*70)

# Overall metrics
hamming = hamming_loss(y_true, y_pred_binary)
accuracy_subset = accuracy_score(y_true, y_pred_binary)

print(f"\nOverall Metrics:")
print(f"  Hamming Loss: {hamming:.4f} (lower is better)")
print(f"  Subset Accuracy: {accuracy_subset:.4f} (exact match of all labels)")

# Per-label metrics
print(f"\nPer-Category Performance:")
print("-" * 70)

# Calculate precision, recall, F1 for each category
from sklearn.metrics import precision_recall_fscore_support

precisions, recalls, f1s, supports = precision_recall_fscore_support(
    y_true, y_pred_binary, average=None, zero_division=0
)

# Create results dataframe (one row per category, same order as category_names)
results_df = pd.DataFrame({
    'Category': category_names,
    'Precision': precisions,
    'Recall': recalls,
    'F1-Score': f1s,
    'Support': supports
})

# Sort by F1 score
results_df = results_df.sort_values('F1-Score', ascending=False)

# Show top 10 and bottom 10
print("\nTop 10 Categories (by F1-Score):")
print(results_df.head(10).to_string(index=False))

print("\nBottom 10 Categories (by F1-Score):")
print(results_df.tail(10).to_string(index=False))

# Macro and micro averages
precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
    y_true, y_pred_binary, average='macro', zero_division=0
)
precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
    y_true, y_pred_binary, average='micro', zero_division=0
)

print("\n" + "="*70)
print("Average Metrics:")
print(f"  Macro-averaged F1: {f1_macro:.4f} (unweighted mean)")
print(f"  Micro-averaged F1: {f1_micro:.4f} (weighted by support)")
print(f"  Macro-averaged Precision: {precision_macro:.4f}")
print(f"  Macro-averaged Recall: {recall_macro:.4f}")
print("="*70)


In [None]:
# Visualize classification results
import matplotlib.pyplot as plt
import numpy as np

# 2x2 dashboard summarising the per-category evaluation results.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax1, ax2), (ax3, ax4) = axes

# Panel 1: horizontal bars for the 20 best categories. results_df is already
# sorted by F1, and the y-axis is inverted so the best one is on top.
top_20 = results_df.head(20)
bar_positions = range(len(top_20))
ax1.barh(bar_positions, top_20['F1-Score'].values)
ax1.set_yticks(bar_positions)
ax1.set_yticklabels(top_20['Category'].values, fontsize=8)
ax1.set_xlabel('F1-Score', fontsize=10)
ax1.set_title('Top 20 Categories by F1-Score', fontsize=12, fontweight='bold')
ax1.grid(True, alpha=0.3, axis='x')
ax1.invert_yaxis()

# Panel 2: one bubble per category -- position is recall/precision, size is
# support, colour is F1. The dashed diagonal marks precision == recall.
scatter = ax2.scatter(
    results_df['Recall'],
    results_df['Precision'],
    s=results_df['Support']*10,
    alpha=0.6,
    c=results_df['F1-Score'],
    cmap='viridis',
)
ax2.set_xlabel('Recall', fontsize=10)
ax2.set_ylabel('Precision', fontsize=10)
ax2.set_title('Precision vs Recall (bubble size = support)', fontsize=12, fontweight='bold')
ax2.grid(True, alpha=0.3)
ax2.plot([0, 1], [0, 1], 'r--', alpha=0.3)
plt.colorbar(scatter, ax=ax2, label='F1-Score')

# Panel 3: histogram of per-category F1 with both averages as vertical lines.
ax3.hist(results_df['F1-Score'].values, bins=20, edgecolor='black', alpha=0.7)
ax3.axvline(f1_macro, color='r', linestyle='--', linewidth=2, label=f'Macro Avg: {f1_macro:.3f}')
ax3.axvline(f1_micro, color='g', linestyle='--', linewidth=2, label=f'Micro Avg: {f1_micro:.3f}')
ax3.set_xlabel('F1-Score', fontsize=10)
ax3.set_ylabel('Number of Categories', fontsize=10)
ax3.set_title('Distribution of F1-Scores Across Categories', fontsize=12, fontweight='bold')
ax3.legend()
ax3.grid(True, alpha=0.3, axis='y')

# Panel 4: support of the (at most) 30 most frequent categories.
support_sorted = results_df.sort_values('Support', ascending=False)
top_support = support_sorted['Support'].head(30).values
ax4.bar(range(len(top_support)), top_support)
ax4.set_xlabel('Category (sorted by support)', fontsize=10)
ax4.set_ylabel('Number of Test Samples', fontsize=10)
ax4.set_title('Top 30 Categories by Test Set Support', fontsize=12, fontweight='bold')
ax4.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print("\n" + "="*70)
print("Visualization complete!")
print("="*70)


In [None]:
# Print a side-by-side view of true vs. predicted categories for the first
# few evaluation documents.
banner = "=" * 70
print(banner)
print("EXAMPLE PREDICTIONS")
print(banner)

num_examples = 5
threshold = 0.5  # minimum score for a category to count as predicted

for i, doc_raw in enumerate(test_docs_raw[:num_examples]):
    # Run the trained pipeline on the stored document's text.
    predicted = nlp_trained(doc_raw.text)

    # Gold labels are those flagged with 1; predictions are every label whose
    # score clears the threshold, strongest first.
    true_cats = [cat for cat in category_names if doc_raw.cats[cat] == 1]
    pred_cats_sorted = sorted(
        ((cat, score) for cat, score in predicted.cats.items() if score > threshold),
        key=lambda pair: pair[1],
        reverse=True,
    )

    print(f"\n--- Example {i+1} ---")
    print(f"Text preview: {doc_raw.text[:150]}...")
    print(f"\nTrue categories ({len(true_cats)}): {', '.join(true_cats[:5])}")
    hidden_true = len(true_cats) - 5
    if hidden_true > 0:
        print(f"  ... and {hidden_true} more")

    print(f"\nPredicted categories ({len(pred_cats_sorted)}):")
    for cat, score in pred_cats_sorted[:5]:
        mark = "✓" if cat in true_cats else "✗"
        print(f"  {mark} {cat}: {score:.3f}")
    hidden_pred = len(pred_cats_sorted) - 5
    if hidden_pred > 0:
        print(f"  ... and {hidden_pred} more")

    # Per-example precision/recall from the overlap of the two label sets;
    # an empty denominator yields 0.
    predicted_names = {name for name, _ in pred_cats_sorted}
    correct = len(set(true_cats) & predicted_names)
    precision_ex = correct / len(pred_cats_sorted) if pred_cats_sorted else 0
    recall_ex = correct / len(true_cats) if true_cats else 0

    print(f"\nExample metrics: Precision={precision_ex:.2f}, Recall={recall_ex:.2f}")

print("\n" + banner)


In [None]:
def estimate_cats(model, article):
    """Print the expected and the model-estimated categories of one article.

    Args:
        model: Directory of a trained spaCy project; the pipeline is loaded
            from its ``model-best`` sub-directory.
        article: Mapping (e.g. a DataFrame row) with ``title``, ``raw`` and
            ``categories`` entries.
    """
    name = article["title"]
    text = article["raw"]
    expected_cats = article["categories"]

    nlp = spacy.load(f'{model}/model-best')
    doc = nlp(text)
    # (label, score) pairs, highest score first.
    estimated_cats = sorted(doc.cats.items(), key=lambda item: float(item[1]), reverse=True)

    # The header used to end with a stray double quote; removed.
    print(f'Article {name} || model {model}')
    print(expected_cats)
    print(estimated_cats)
