## Installing and importing Libraries

In [3]:
!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1


In [4]:
import pandas as pd
import numpy as np
import bz2
import xml.etree.ElementTree as ET
from io import BytesIO
import pandas as pd
import requests
from tqdm import tqdm
import logging
from datetime import datetime

In [7]:
def extract_links(self, text):
    """
    Enhanced link extraction with better pattern matching
    """
    links = set()
    if not text:
        return links
        
    # Improved regex pattern for Wikipedia links
    link_pattern = re.compile(r'\[\[(.*?)\]\]')
    for match in link_pattern.finditer(text):
        link_content = match.group(1)
        
        # Skip empty links
        if not link_content.strip():
            continue
            
        # Handle piped links and section links
        link = link_content.split('|')[0].split('#')[0].strip()
        
        # Handle namespace prefixes
        if ':' in link:
            namespace, rest = link.split(':', 1)
            namespace = namespace.lower()
            if namespace in {'category', 'file', 'image', 'template', 'user', 
                             'wikipedia', 'wp', 'project', 'mediawiki'}:
                continue
            link = rest  # Use the part after namespace
        
        if link:  # Add if not empty after processing
            links.add(link)
    
    return list(links)

def parse_dump(self, file_obj, max_articles=None, block_size=1000):
    """
    More robust parsing with better memory management
    """
    try:
        logger.info("Starting to parse Wikipedia dump...")
        
        decompressor = bz2.BZ2Decompressor()
        buffer = b''
        article_count = 0
        
        # Use incremental parser
        parser = ET.XMLPullParser(['start', 'end'])
        
        while True:
            chunk = file_obj.read(8192)
            if not chunk:
                # Final flush of decompressor
                buffer += decompressor.flush()
                if not buffer:
                    break
            
            buffer += decompressor.decompress(chunk)
            
            while b'<page>' in buffer and b'</page>' in buffer:
                page_start = buffer.find(b'<page>')
                page_end = buffer.find(b'</page>') + len(b'</page>')
                page_xml = buffer[page_start:page_end]
                buffer = buffer[page_end:]
                
                try:
                    parser.feed(page_xml)
                    for event, elem in parser.read_events():
                        if event == 'end' and elem.tag == 'page':
                            yield from self._process_page(elem)
                            article_count += 1
                            elem.clear()  # Free memory
                            
                            # Check block size and max articles
                            if article_count % block_size == 0:
                                yield self._create_dataframes()
                            if max_articles and article_count >= max_articles:
                                return
                except ET.ParseError as e:
                    logger.warning(f"XML parsing error: {e}")
                    continue
        
        # Yield any remaining articles
        if self.current_block_articles or self.current_block_links:
            yield self._create_dataframes()
            
    except Exception as e:
        logger.error(f"Error parsing dump file: {e}")
        raise

In [8]:
# Example usage
if __name__ == "__main__":
    parser = WikipediaDumpParser()
    
    # Process a small sample (1000 articles) for demonstration
    # Remove max_articles parameter to process the entire dump
    articles_file, links_file = parser.process_dump(max_articles=1000, block_size=200)
    
    # Load the results to verify
    articles_df = pd.read_parquet(articles_file)
    links_df = pd.read_parquet(links_file)
    
    print("\nSample Articles:")
    print(articles_df.head())
    
    print("\nSample Links:")
    print(links_df.head())

2025-05-06 18:43:10,273 - INFO - Using dump file URL: https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-pages-articles.xml.bz2
2025-05-06 18:43:10,276 - INFO - Starting download of Wikipedia dump file...
100%|██████████| 321M/321M [02:09<00:00, 2.49MB/s] 
2025-05-06 18:45:19,716 - INFO - Download completed successfully.
2025-05-06 18:45:19,719 - INFO - Starting to parse Wikipedia dump...
2025-05-06 18:45:19,720 - ERROR - Error parsing dump file: no element found: line 1, column 0
2025-05-06 18:45:19,722 - ERROR - Error in process_dump: no element found: line 1, column 0


ParseError: no element found: line 1, column 0 (<string>)