In [2]:
"""
WordNet Vocabulary Extractor for English Vocabulary Quiz App
Extracts words, definitions, examples, synonyms, and difficulty levels
"""

import requests
import tarfile
import io
import os
import re
import json
import csv
import sqlite3
from collections import defaultdict, Counter
from typing import Dict, List, Tuple, Set, Optional
import statistics
from pathlib import Path
import logging

# Setup logging
# Configure the root logger at import time: records at INFO and above are
# emitted, each prefixed with a timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the WordNetExtractor methods below.
logger = logging.getLogger(__name__)


class WordNetExtractor:
    """Extracts and processes WordNet data for vocabulary quiz application"""
    
    def __init__(self, output_dir: str = "vocabulary_data"):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        
        # POS mapping
        self.POS_MAP = {
            'n': 'noun',
            'v': 'verb',
            'a': 'adjective',
            's': 'adjective',  # satellite adjective
            'r': 'adverb'
        }
        
        # Reverse mapping for database
        self.POS_ID_MAP = {
            'noun': 1,
            'verb': 2,
            'adjective': 3,
            'adverb': 4,
            'preposition': 5,
            'conjunction': 6,
            'pronoun': 7,
            'interjection': 8
        }
        
        # Difficulty levels mapping (CEFR)
        self.DIFFICULTY_LEVELS = {
            'A1': 1,  # Beginner
            'A2': 2,  # Elementary
            'B1': 3,  # Intermediate
            'B2': 4,  # Upper Intermediate
            'C1': 5,  # Advanced
            'C2': 6   # Proficient
        }
        
        # Common word frequency list (for difficulty estimation)
        self.common_words = self._load_common_words()
        
    def _load_common_words(self) -> Set[str]:
        """Load common English words for frequency reference"""
        common_words = set()
        try:
            # You can replace this with a better frequency list
            basic_words = {
                'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'I',
                'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at',
                'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her',
                'she', 'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there',
                'their', 'what', 'so', 'up', 'out', 'if', 'about', 'who', 'get',
                'which', 'go', 'me', 'when', 'make', 'can', 'like', 'time', 'no',
                'just', 'him', 'know', 'take', 'people', 'into', 'year', 'your',
                'good', 'some', 'could', 'them', 'see', 'other', 'than', 'then',
                'now', 'look', 'only', 'come', 'its', 'over', 'think', 'also',
                'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first',
                'well', 'way', 'even', 'new', 'want', 'because', 'any', 'these',
                'give', 'day', 'most', 'us'
            }
            common_words.update(basic_words)
        except Exception as e:
            logger.warning(f"Could not load frequency list: {e}")
        
        return common_words
    
    def download_wordnet(self) -> bool:
        """Download the WordNet 3.0 tarball and unpack it.

        The archive is extracted into ``<output_dir>/wordnet_raw``.

        Returns:
            True when the archive was downloaded and extracted. False on a
            non-200 response or on any network, archive or filesystem error;
            the error is logged rather than raised.
        """
        url = "http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz"

        try:
            logger.info("Downloading WordNet 3.0...")
            # stream=True lets the status be checked before the body is
            # fetched; the context manager releases the connection on every
            # exit path.
            with requests.get(url, stream=True, timeout=30) as response:
                if response.status_code != 200:
                    logger.error(f"Failed to download WordNet. Status: {response.status_code}")
                    return False
                archive_bytes = response.content

            # Create extraction directory
            extract_dir = self.output_dir / "wordnet_raw"
            extract_dir.mkdir(parents=True, exist_ok=True)

            # Extract the archive
            logger.info("Extracting WordNet files...")
            with tarfile.open(fileobj=io.BytesIO(archive_bytes), mode='r:gz') as tar:
                if hasattr(tarfile, 'data_filter'):
                    # The archive arrives over plain HTTP, so treat it as
                    # untrusted: the 'data' filter rejects members that would
                    # escape extract_dir (absolute paths, "..", bad links).
                    tar.extractall(extract_dir, filter='data')
                else:
                    # NOTE(review): this interpreter has no extraction
                    # filters, so members are extracted unchecked.
                    tar.extractall(extract_dir)

            logger.info("WordNet downloaded and extracted successfully")
            return True

        except Exception as e:
            # Deliberately broad: this method's contract is to report failure
            # through its return value, whatever went wrong.
            logger.error(f"Error downloading WordNet: {e}")
            return False
    
    def parse_wordnet_data(self) -> Dict:
        """Parse WordNet data files and extract vocabulary"""
        
        wordnet_dir = self.output_dir / "wordnet_raw" / "WordNet-3.0" / "dict"
        if not wordnet_dir.exists():
            logger.error(f"WordNet directory not found: {wordnet_dir}")
            return {}
        
        vocabulary = {}
        
        # Parse each data file
        data_files = [
            ('n', wordnet_dir / "data.noun"),
            ('v', wordnet_dir / "data.verb"),
            ('a', wordnet_dir / "data.adj"),
            ('r', wordnet_dir / "data.adv")
        ]
        
        for pos_code, data_file in data_files:
            if not data_file.exists():
                logger.warning(f"Data file not found: {data_file}")
                continue
            
            pos_name = self.POS_MAP.get(pos_code, pos_code)
            logger.info(f"Processing {pos_name}s from {data_file.name}...")
            
            with open(data_file, 'r', encoding='utf-8', errors='ignore') as f:
                for line_num, line in enumerate(f, 1):
                    if line.startswith('  '):
                        continue
                    
                    try:
                        word_entry = self._parse_synset_line(line, pos_code)
                        if word_entry:
                            for word in word_entry['words']:
                                if word not in vocabulary:
                                    vocabulary[word] = {
                                        'word': word,
                                        'definitions': [],
                                        'examples': [],
                                        'synonyms': defaultdict(set),
                                        'part_of_speech': pos_name,
                                        'difficulty': self._estimate_difficulty(word),
                                        'frequency': 0,
                                        'wordnet_offset': word_entry['offset']
                                    }
                                
                                # Add definition if unique
                                if word_entry['definition']:
                                    vocabulary[word]['definitions'].append(word_entry['definition'])
                                
                                # Add example if exists
                                if word_entry['example']:
                                    vocabulary[word]['examples'].append(word_entry['example'])
                                
                                # Add synonyms (other words in this synset)
                                for synonym in word_entry['words']:
                                    if synonym != word:
                                        vocabulary[word]['synonyms'][pos_name].add(synonym)
                    
                    except Exception as e:
                        logger.debug(f"Error parsing line {line_num}: {e}")
            
            logger.info(f"  Processed {len([w for w in vocabulary.values() if w['part_of_speech'] == pos_name])} {pos_name}s")
        
        logger.info(f"Total unique words extracted: {len(vocabulary)}")
        return vocabulary
    
    def _parse_synset_line(self, line: str, pos_code: str) -> Optional[Dict]:
        """Parse a single synset line from WordNet data file"""
        line = line.strip()
        if not line:
            return None
        
        parts = line.split()
        if len(parts) < 5:
            return None
        
        # Parse synset header
        synset_offset = parts[0]
        lex_filenum = parts[1]
        ss_type = parts[2]  # Should match pos_code
        
        # Number of words in synset (in hex)
        word_count = int(parts[3], 16)
        
        # Extract words
        words = []
        for i in range(word_count):
            word_idx = 4 + i * 2
            if word_idx < len(parts):
                word = parts[word_idx].replace('_', ' ').lower().strip()
                if word:  # Skip empty words
                    words.append(word)
        
        if not words:
            return None
        
        # Extract gloss (definition and example)
        gloss = ""
        if '|' in line:
            gloss = line.split('|', 1)[1].strip()
        
        definition, example = self._extract_definition_and_example(gloss)
        
        return {
            'offset': synset_offset,
            'type': ss_type,
            'words': words,
            'definition': definition,
            'example': example,
            'raw_line': line
        }
    
    def _extract_definition_and_example(self, gloss: str) -> Tuple[str, str]:
        """Extract definition and example from gloss"""
        definition = ""
        example = ""
        
        if not gloss:
            return definition, example
        
        # Clean up the gloss
        gloss = re.sub(r'\s+', ' ', gloss).strip()
        
        # Try to split by semicolon or quotation marks
        if ';' in gloss:
            parts = gloss.split(';', 1)
            definition = parts[0].strip()
            if len(parts) > 1:
                example = parts[1].strip()
                # Clean up example
                example = re.sub(r'^"|"$', '', example)  # Remove quotes
                example = re.sub(r'^\'', '', example)  # Remove single quotes
        elif '"' in gloss:
            # Definition might be in quotes
            match = re.search(r'"([^"]+)"', gloss)
            if match:
                example = match.group(1)
                definition = re.sub(r'"([^"]+)"', '', gloss).strip()
        else:
            definition = gloss
        
        # Clean up definition
        definition = definition.strip(';.,"\' ')
        
        return definition, example
    
    def _estimate_difficulty(self, word: str) -> str:
        """Estimate CEFR difficulty level for a word"""
        # Simple heuristics based on word characteristics
        word = word.lower()
        
        # Check if it's a very common word
        if word in self.common_words or len(word) <= 3:
            return 'A1'
        
        # Check word length
        if len(word) <= 5:
            return 'A2'
        elif len(word) <= 7:
            # Check for common prefixes/suffixes
            common_patterns = ['un', 're', 'dis', 'ing', 'ed', 'ly', 'er', 'est']
            if any(word.startswith(p) or word.endswith(p) for p in common_patterns):
                return 'B1'
            return 'B2'
        elif len(word) <= 9:
            return 'B2'
        elif len(word) <= 11:
            return 'C1'
        else:
            return 'C2'
    
    def filter_and_clean_vocabulary(self, vocabulary: Dict, min_definitions: int = 1) -> List[Dict]:
        """Filter and clean vocabulary data"""
        cleaned_vocabulary = []
        
        logger.info("Filtering and cleaning vocabulary...")
        
        for word, data in vocabulary.items():
            # Skip words that don't meet criteria
            if not self._should_include_word(word, data):
                continue
            
            # Skip words without definitions
            if len(data['definitions']) < min_definitions:
                continue
            
            # Get best definition (first one)
            best_definition = data['definitions'][0] if data['definitions'] else ""
            
            # Get best example (first one)
            best_example = data['examples'][0] if data['examples'] else ""
            
            # Get top synonyms (limit to 5)
            synonyms = []
            for pos_synonyms in data['synonyms'].values():
                synonyms.extend(list(pos_synonyms))
            synonyms = synonyms[:5]
            
            # Create cleaned entry
            entry = {
                'word': word,
                'part_of_speech': data['part_of_speech'],
                'definition': best_definition,
                'example': best_example,
                'difficulty_level': data['difficulty'],
                'synonyms': synonyms,
                'wordnet_offset': data['wordnet_offset'],
                'definition_count': len(data['definitions']),
                'example_count': len(data['examples'])
            }
            
            cleaned_vocabulary.append(entry)
        
        # Sort by word
        cleaned_vocabulary.sort(key=lambda x: x['word'])
        
        logger.info(f"After filtering: {len(cleaned_vocabulary)} words")
        return cleaned_vocabulary
    
    def _should_include_word(self, word: str, data: Dict) -> bool:
        """Determine if a word should be included in the vocabulary"""
        # Skip words with special characters (except hyphens in compound words)
        if re.search(r'[^a-zA-Z\- ]', word):
            return False
        
        # Skip single letters (except 'a' and 'i')
        if len(word) == 1 and word not in ['a', 'i']:
            return False
        
        # Skip proper nouns (capitalized in WordNet)
        if word[0].isupper():
            return False
        
        # Skip words that are just numbers
        if word.replace('-', '').replace(' ', '').isdigit():
            return False
        
        # Skip very obscure or technical words
        if len(word) > 20:
            return False
        
        return True
    
    def export_to_csv(self, vocabulary: List[Dict], filename: str = "vocabulary.csv"):
        """Export vocabulary to CSV file"""
        csv_path = self.output_dir / filename
        
        # Define CSV headers
        fieldnames = [
            'word', 'part_of_speech', 'definition', 'example', 
            'difficulty_level', 'synonyms', 'wordnet_offset'
        ]
        
        with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            
            for entry in vocabulary:
                # Convert synonyms list to string
                entry_copy = entry.copy()
                entry_copy['synonyms'] = ';'.join(entry['synonyms'])
                writer.writerow(entry_copy)
        
        logger.info(f"Exported {len(vocabulary)} words to {csv_path}")
    
    def export_to_sql(self, vocabulary: List[Dict], db_name: str = "vocabulary.db"):
        """Export vocabulary to SQLite database"""
        db_path = self.output_dir / db_name
        
        try:
            conn = sqlite3.connect(db_path)
            cursor = conn.cursor()
            
            # Create tables
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS parts_of_speech (
                    pos_id INTEGER PRIMARY KEY,
                    name TEXT UNIQUE NOT NULL,
                    abbreviation TEXT,
                    description TEXT,
                    sort_order INTEGER DEFAULT 0
                )
            ''')
            
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS difficulty_levels (
                    level_id INTEGER PRIMARY KEY,
                    name TEXT UNIQUE NOT NULL,
                    description TEXT,
                    sort_order INTEGER DEFAULT 0
                )
            ''')
            
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS words (
                    word_id INTEGER PRIMARY KEY AUTOINCREMENT,
                    word TEXT UNIQUE NOT NULL,
                    part_of_speech_id INTEGER,
                    definition TEXT NOT NULL,
                    example_sentence TEXT,
                    phonetic_spelling TEXT,
                    difficulty_level_id INTEGER DEFAULT 1,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    FOREIGN KEY (part_of_speech_id) REFERENCES parts_of_speech(pos_id),
                    FOREIGN KEY (difficulty_level_id) REFERENCES difficulty_levels(level_id)
                )
            ''')
            
            # Create indexes
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_word ON words(word)')
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_difficulty ON words(difficulty_level_id)')
            
            # Insert parts of speech
            for pos_name, pos_id in self.POS_ID_MAP.items():
                cursor.execute('''
                    INSERT OR IGNORE INTO parts_of_speech (pos_id, name) 
                    VALUES (?, ?)
                ''', (pos_id, pos_name))
            
            # Insert difficulty levels
            for level_name, level_id in self.DIFFICULTY_LEVELS.items():
                cursor.execute('''
                    INSERT OR IGNORE INTO difficulty_levels (level_id, name) 
                    VALUES (?, ?)
                ''', (level_id, level_name))
            
            # Insert words
            for entry in vocabulary:
                pos_id = self.POS_ID_MAP.get(entry['part_of_speech'], 1)
                difficulty_id = self.DIFFICULTY_LEVELS.get(entry['difficulty_level'], 3)
                
                cursor.execute('''
                    INSERT OR REPLACE INTO words 
                    (word, part_of_speech_id, definition, example_sentence, difficulty_level_id)
                    VALUES (?, ?, ?, ?, ?)
                ''', (
                    entry['word'],
                    pos_id,
                    entry['definition'],
                    entry['example'],
                    difficulty_id
                ))
            
            conn.commit()
            logger.info(f"Exported {len(vocabulary)} words to SQLite database: {db_path}")
            
            # Show statistics
            cursor.execute("SELECT COUNT(*) FROM words")
            word_count = cursor.fetchone()[0]
            logger.info(f"Database contains {word_count} words")
            
            cursor.execute('''
                SELECT dl.name, COUNT(*) 
                FROM words w 
                JOIN difficulty_levels dl ON w.difficulty_level_id = dl.level_id 
                GROUP BY dl.name
            ''')
            difficulty_stats = cursor.fetchall()
            logger.info("Difficulty distribution:")
            for level, count in difficulty_stats:
                logger.info(f"  {level}: {count} words")
            
            conn.close()
            
        except sqlite3.Error as e:
            logger.error(f"SQLite error: {e}")
    
    def export_to_json(self, vocabulary: List[Dict], filename: str = "vocabulary.json"):
        """Export vocabulary to JSON file"""
        json_path = self.output_dir / filename
        
        with open(json_path, 'w', encoding='utf-8') as jsonfile:
            json.dump(vocabulary, jsonfile, ensure_ascii=False, indent=2)
        
        logger.info(f"Exported {len(vocabulary)} words to {json_path}")
    
    def generate_sample_quizzes(self, vocabulary: List[Dict], num_quizzes: int = 5):
        """Generate sample quiz data for testing"""
        quizzes = []
        
        # Group words by difficulty
        words_by_difficulty = defaultdict(list)
        for entry in vocabulary:
            words_by_difficulty[entry['difficulty_level']].append(entry)
        
        # Create quizzes for each difficulty level
        difficulty_order = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']
        
        for difficulty in difficulty_order[:num_quizzes]:
            if difficulty in words_by_difficulty and words_by_difficulty[difficulty]:
                quiz_words = words_by_difficulty[difficulty][:10]  # Take first 10 words
                
                quiz = {
                    'title': f"{difficulty} Level Vocabulary Test",
                    'description': f"Test your {difficulty} level English vocabulary",
                    'difficulty': difficulty,
                    'words': [w['word'] for w in quiz_words],
                    'questions': self._generate_questions(quiz_words)
                }
                quizzes.append(quiz)
        
        # Save quizzes to JSON
        quizzes_path = self.output_dir / "sample_quizzes.json"
        with open(quizzes_path, 'w', encoding='utf-8') as f:
            json.dump(quizzes, f, ensure_ascii=False, indent=2)
        
        logger.info(f"Generated {len(quizzes)} sample quizzes")
        return quizzes
    
    def _generate_questions(self, quiz_words: List[Dict]) -> List[Dict]:
        """Generate sample questions for quiz"""
        questions = []
        
        for word_data in quiz_words:
            # Question 1: Word to Definition
            q1 = {
                'type': 'word_to_definition',
                'question': f"What does '{word_data['word']}' mean?",
                'correct_answer': word_data['definition'],
                'options': [
                    word_data['definition'],
                    self._get_random_definition(quiz_words, word_data['definition']),
                    self._get_random_definition(quiz_words, word_data['definition']),
                    self._get_random_definition(quiz_words, word_data['definition'])
                ],
                'explanation': f"'{word_data['word']}' means: {word_data['definition']}"
            }
            questions.append(q1)
            
            # Question 2: Fill in the blank (if we have example)
            if word_data['example']:
                # Create fill-in-the-blank question
                blank_example = word_data['example'].replace(
                    word_data['word'], '______'
                )
                q2 = {
                    'type': 'fill_in_blank',
                    'question': f"Complete the sentence: {blank_example}",
                    'correct_answer': word_data['word'],
                    'options': [
                        word_data['word'],
                        *[w['word'] for w in quiz_words if w['word'] != word_data['word']][:3]
                    ],
                    'explanation': f"The correct word is '{word_data['word']}'. Example: {word_data['example']}"
                }
                questions.append(q2)
        
        return questions
    
    def _get_random_definition(self, word_list: List[Dict], exclude: str) -> str:
        """Get a random definition from word list, excluding specific definition"""
        import random
        
        candidates = [w['definition'] for w in word_list 
                     if w['definition'] and w['definition'] != exclude]
        
        if candidates:
            return random.choice(candidates)
        return "Not available"
    
    def run_full_extraction(self):
        """Run the pipeline: download, parse, filter, then export to CSV.

        The run stops early, after logging an error, when the download fails
        or when parsing yields no vocabulary. JSON export, SQLite export and
        sample-quiz generation are not invoked from here.
        """
        logger.info("Starting WordNet vocabulary extraction...")

        # Stage 1 - fetch and unpack WordNet
        downloaded = self.download_wordnet()
        if not downloaded:
            logger.error("Failed to download WordNet. Exiting.")
            return

        # Stage 2 - build the raw per-word data
        parsed = self.parse_wordnet_data()
        if not parsed:
            logger.error("No vocabulary extracted. Exiting.")
            return

        # Stage 3 - filter and flatten, Stage 4 - export (CSV only)
        entries = self.filter_and_clean_vocabulary(parsed)
        self.export_to_csv(entries)

        logger.info("Vocabulary extraction completed successfully!")
        logger.info(f"Output directory: {self.output_dir.absolute()}")



In [3]:

def main():
    """Run the extraction pipeline and print a summary of its output.

    Prints a banner, runs ``WordNetExtractor.run_full_extraction`` with the
    ``vocabulary_data`` output directory, lists the files found directly in
    that directory, and prints follow-up hints that mention only the files
    that actually exist.
    """
    rule = "=" * 60

    print(rule)
    print("WORDNET VOCABULARY EXTRACTOR")
    print("For English Vocabulary Quiz Application")
    print(rule)

    # Create extractor instance
    extractor = WordNetExtractor(output_dir="vocabulary_data")

    # Run full extraction pipeline
    extractor.run_full_extraction()

    # Use the extractor's own directory rather than repeating the path, and
    # sort so the listing is stable between runs.
    output_dir = extractor.output_dir
    generated = []
    if output_dir.exists():
        generated = sorted(p for p in output_dir.iterdir() if p.is_file())

    # Show output files
    print("\n" + rule)
    print("OUTPUT FILES GENERATED:")
    print(rule)

    for file in generated:
        size = file.stat().st_size
        print(f"  {file.name:30} {size:,} bytes")

    # Only advertise artefacts that are really there: the pipeline may not
    # produce the database or the sample quizzes.
    names = {p.name for p in generated}
    steps = []
    importable = [n for n in ("vocabulary.csv", "vocabulary.db") if n in names]
    if importable:
        steps.append(f"Import {' or '.join(importable)} into your application")
    if "sample_quizzes.json" in names:
        steps.append("Use sample_quizzes.json as starter quiz data")
    steps.append("Customize difficulty levels as needed")
    steps.append("Add more words from other sources if required")

    print("\n" + rule)
    print("NEXT STEPS:")
    print(rule)
    for number, step in enumerate(steps, 1):
        print(f"{number}. {step}")
    print(rule)


# Run the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

2026-02-02 02:13:13,895 - INFO - Starting WordNet vocabulary extraction...
2026-02-02 02:13:13,896 - INFO - Downloading WordNet 3.0...


WORDNET VOCABULARY EXTRACTOR
For English Vocabulary Quiz Application


2026-02-02 02:13:15,692 - INFO - Extracting WordNet files...
2026-02-02 02:14:44,085 - INFO - WordNet downloaded and extracted successfully
2026-02-02 02:14:44,087 - INFO - Processing nouns from data.noun...
2026-02-02 02:14:45,499 - INFO -   Processed 117799 nouns
2026-02-02 02:14:45,500 - INFO - Processing verbs from data.verb...
2026-02-02 02:14:45,705 - INFO -   Processed 7433 verbs
2026-02-02 02:14:45,706 - INFO - Processing adjectives from data.adj...
2026-02-02 02:14:46,086 - INFO -   Processed 18634 adjectives
2026-02-02 02:14:46,087 - INFO - Processing adverbs from data.adv...
2026-02-02 02:14:46,154 - INFO -   Processed 3941 adverbs
2026-02-02 02:14:46,154 - INFO - Total unique words extracted: 147807
2026-02-02 02:14:46,155 - INFO - Filtering and cleaning vocabulary...


: 