In [12]:
import json
import time
import os
import re
import nltk
from tqdm.notebook import tqdm
import wikipedia
from wikipedia.exceptions import WikipediaException, DisambiguationError, PageError
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [14]:
# Report the directory the kernel is currently running from, so the relative
# data paths used later in the notebook can be sanity-checked.
current_dir = os.getcwd()
print("✅ Working directory set to:", current_dir)


✅ Working directory set to: /Users/shailshah/Desktop/NEU MSCS/Sem4/IR/Final Project/CS6200InformationRetrievalProject


In [11]:
# Recursively list every file below the current working directory, printing one
# "Found:" line per file (sub-directories themselves are not printed).
for dirpath, _dirnames, filenames in os.walk(os.getcwd()):
    for filename in filenames:
        print("Found:", os.path.join(dirpath, filename))

Found: /Users/shailshah/Desktop/NEU MSCS/Sem4/IR/Final Project/CS6200InformationRetrievalProject/DataExtraction.ipynb
Found: /Users/shailshah/Desktop/NEU MSCS/Sem4/IR/Final Project/CS6200InformationRetrievalProject/requirements.txt
Found: /Users/shailshah/Desktop/NEU MSCS/Sem4/IR/Final Project/CS6200InformationRetrievalProject/SampleDataAndGradeContract.txt
Found: /Users/shailshah/Desktop/NEU MSCS/Sem4/IR/Final Project/CS6200InformationRetrievalProject/README.md
Found: /Users/shailshah/Desktop/NEU MSCS/Sem4/IR/Final Project/CS6200InformationRetrievalProject/.git/ORIG_HEAD
Found: /Users/shailshah/Desktop/NEU MSCS/Sem4/IR/Final Project/CS6200InformationRetrievalProject/.git/config
Found: /Users/shailshah/Desktop/NEU MSCS/Sem4/IR/Final Project/CS6200InformationRetrievalProject/.git/HEAD
Found: /Users/shailshah/Desktop/NEU MSCS/Sem4/IR/Final Project/CS6200InformationRetrievalProject/.git/description
Found: /Users/shailshah/Desktop/NEU MSCS/Sem4/IR/Final Project/CS6200InformationRetrievalPr

In [15]:
import ssl

# SECURITY NOTE: the override below turns off HTTPS certificate verification for
# code that builds its context through ssl._create_default_https_context.
# It is kept because the NLTK downloads apparently fail without it on this
# machine (TODO confirm; installing proper CA certificates is the real fix),
# but it is now limited to the download calls and undone afterwards instead of
# staying in effect for the rest of the kernel session.
_original_https_context = getattr(ssl, "_create_default_https_context", None)
_unverified_https_context = getattr(ssl, "_create_unverified_context", None)

# Older Python builds may lack _create_unverified_context; in that case the
# default context is left untouched, exactly as before.
if _unverified_https_context is not None:
    ssl._create_default_https_context = _unverified_https_context

try:
    # Now download NLTK data
    for _package in ('punkt_tab', 'stopwords', 'wordnet'):
        nltk.download(_package)
finally:
    # Restore certificate verification even if a download raised.
    if _original_https_context is not None:
        ssl._create_default_https_context = _original_https_context

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/shailshah/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shailshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shailshah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
# Load the annotated queries data from JSON file
def load_queries(json_path):
    """Read the annotated-queries JSON file and return its ``'queries'`` entry.

    Args:
        json_path: Path to a JSON file whose top-level object has a
            ``'queries'`` key.

    Returns:
        The value stored under ``'queries'``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
        KeyError: If the top-level object has no ``'queries'`` key.
    """
    # Decode explicitly as UTF-8 rather than the platform default encoding, so
    # non-ASCII text loads the same on every OS. The notebook writes its own
    # JSON output as UTF-8 as well.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data['queries']

# Function to extract page title from Wikipedia URL
def extract_title_from_url(url):
    """Derive a human-readable page title from a Wikipedia article URL.

    The title is everything after the ``/wiki/`` path marker (so titles that
    themselves contain ``/`` are kept whole). If the marker is absent, the
    last path segment is used instead. Any ``#fragment`` or ``?query`` suffix
    and trailing slashes are discarded, percent-escapes are decoded, and
    underscores are turned into spaces.

    Args:
        url: Article URL string, e.g.
            ``https://en.wikipedia.org/wiki/Risk_and_return``.

    Returns:
        The decoded title, e.g. ``"Risk and return"``.
    """
    from urllib.parse import unquote

    # Strip the fragment first, then the query string, then trailing slashes,
    # so none of them end up inside the title.
    path = url.split('#', 1)[0].split('?', 1)[0].rstrip('/')

    _, marker, after_wiki = path.partition('/wiki/')
    raw_title = after_wiki if marker else path.rsplit('/', 1)[-1]

    # Decode %XX escapes before swapping underscores for spaces.
    return unquote(raw_title).replace('_', ' ')

# Function to extract content from Wikipedia using the wikipedia library
def extract_wikipedia_content(url, delay=0.5):
    """Fetch the text of the Wikipedia page that ``url`` points to.

    The page title is derived with ``extract_title_from_url`` and looked up via
    ``wikipedia.page(title, auto_suggest=False)``. All runs of whitespace
    (including newlines) in the page content are collapsed to single spaces.

    Args:
        url: Wikipedia article URL.
        delay: Seconds to sleep before each lookup, to avoid hammering the
            service. Defaults to 0.5; a falsy or non-positive value skips
            the pause.

    Returns:
        ``(content, page_url)`` on success, where ``page_url`` is the URL
        reported by the fetched page object. On any failure a message is
        printed and ``("", url)`` is returned with the input URL.
    """
    # Bind `title` before the try block so every handler below can format its
    # message even when title extraction itself is what failed.
    title = url
    try:
        # Extract the title from the URL
        title = extract_title_from_url(url)

        # Add a small delay to be respectful
        if delay and delay > 0:
            time.sleep(delay)

        # Get the page content
        page = wikipedia.page(title, auto_suggest=False)

        # `\s+` already matches newlines, so one substitution normalizes both
        # line breaks and repeated spaces.
        content = re.sub(r'\s+', ' ', page.content).strip()

        return content, page.url
    # The two specific errors are listed before WikipediaException so they get
    # their own messages instead of the generic API-error one.
    except DisambiguationError as e:
        print(f"Disambiguation error for {title}: {e}")
        return "", url
    except PageError as e:
        print(f"Page not found for {title}: {e}")
        return "", url
    except WikipediaException as e:
        print(f"Wikipedia API error for {title}: {e}")
        return "", url
    except Exception as e:
        # Deliberate best-effort: one bad document must not abort the crawl.
        print(f"Error extracting content for {title}: {str(e)}")
        return "", url


# Normalize raw text into a space-separated string of lemmatized content words
def preprocess_text(text):
    """Clean, tokenize, filter and lemmatize ``text``.

    Steps, in order: remove ``http...`` URLs; turn hyphens, punctuation and
    digit runs into spaces; collapse whitespace, strip and lowercase; tokenize
    with ``word_tokenize``; drop English stopwords and any token that is not
    purely alphabetic; lemmatize each remaining token with
    ``WordNetLemmatizer``.

    Args:
        text: Input string.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # (pattern, replacement) pairs applied sequentially; order matters.
    scrub_steps = (
        (r'http\S+', ''),    # URLs are removed outright
        (r'-', ' '),         # hyphens become word breaks
        (r'[^\w\s]', ' '),   # punctuation
        (r'\d+', ' '),       # numbers
        (r'\s+', ' '),       # collapse whitespace runs
    )
    cleaned = text
    for pattern, replacement in scrub_steps:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip().lower()

    words = word_tokenize(cleaned)

    english_stopwords = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()

    # Keep only alphabetic non-stopwords, lemmatizing each one that survives.
    lemmas = [
        wordnet_lemmatizer.lemmatize(word)
        for word in words
        if word not in english_stopwords and word.isalpha()
    ]

    return ' '.join(lemmas)

In [18]:
from tqdm import tqdm


# Function to build dataset with highly relevant documents
def build_dataset(queries, output_path, min_relevance=3):
    """Fetch and preprocess the relevant documents of each query and save them.

    For every query that has a ``'documents'`` list, each document whose
    ``'relevance_score'`` is present and ``>= min_relevance`` is fetched with
    ``extract_wikipedia_content`` and cleaned with ``preprocess_text``.
    Documents without a URL, or whose fetch returns empty content, are skipped.
    Queries that end up with no documents are left out of the result.

    Args:
        queries: Iterable of query dicts. The keys read here are ``'id'``,
            ``'query'``, ``'narrative'`` and ``'documents'``; each document
            dict is read for ``'relevance_score'``, ``'url'`` and ``'title'``.
        output_path: Destination JSON file. Its parent directory is created
            if it does not exist yet.
        min_relevance: Minimum relevance score a document needs to be kept.

    Returns:
        A list of dicts with keys ``'query_id'``, ``'query'``, ``'narrative'``
        and ``'documents'``. The same list is written to ``output_path`` under
        the top-level key ``'dataset'``.
    """
    dataset = []

    # Source URL -> (preprocessed content, URL reported by the fetched page).
    # Only successful fetches are cached, so a failed lookup is retried the
    # next time the same URL appears, exactly as without the cache.
    fetched = {}

    # Process each query
    for query in tqdm(queries, desc="Processing queries"):

        if 'documents' not in query:
            print(f"Skipping query ID {query.get('id', 'unknown')}: No documents found")
            continue

        # Extract documents with relevance >= min_relevance
        relevant_docs = []
        for doc in query['documents']:
            relevance = doc.get('relevance_score')
            if relevance is None or not relevance >= min_relevance:
                continue

            doc_url = doc.get('url', '')
            doc_title = doc.get('title', '')

            if not doc_url:
                print(f"Skipping document {doc_title}: No URL provided")
                continue

            if doc_url not in fetched:
                # Extract content from Wikipedia
                content, actual_url = extract_wikipedia_content(doc_url)
                if not content:
                    continue
                fetched[doc_url] = (preprocess_text(content), actual_url)

            preprocessed_content, actual_url = fetched[doc_url]
            relevant_docs.append({
                'title': doc_title,
                'url': actual_url,
                'content': preprocessed_content,
                'relevance_score': relevance
            })

        # Only add the query if it has relevant documents
        if relevant_docs:
            dataset.append({
                'query_id': query.get('id'),
                'query': query.get('query', ''),
                'narrative': query.get('narrative', ''),
                'documents': relevant_docs
            })

    # Make sure the target directory exists so a long crawl is not lost to a
    # missing folder at the very last step.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the dataset as JSON
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump({'dataset': dataset}, f, indent=2, ensure_ascii=False)

    print(f"Dataset saved to {output_path}")
    return dataset

# Driver: load the annotated queries, build the dataset, then report on it.
json_path = 'data/ManualAnnotatedQueries.json'
output_path = 'data/WikipediaRelevantDocs.json'

# Read the queries and say how many were found
queries = load_queries(json_path)
print(f"Loaded {len(queries)} queries from {json_path}")

# Keep only documents whose relevance score is at least 3
dataset = build_dataset(queries, output_path, min_relevance=3)

# Summary statistics (the average falls back to 0 when the dataset is empty)
total_docs = sum(map(len, (item['documents'] for item in dataset)))
print(
    f"Total queries with relevant documents: {len(dataset)}",
    f"Total documents extracted: {total_docs}",
    f"Average documents per query: {total_docs / len(dataset) if len(dataset) > 0 else 0:.2f}",
    sep="\n",
)

# Show the first query and its first document as a sample, when there is one
if len(dataset) > 0 and dataset[0].get('documents'):
    sample_query = dataset[0]
    sample_doc = sample_query['documents'][0]

    print(
        "\nSample Query:",
        f"ID: {sample_query.get('query_id')}",
        f"Query: {sample_query.get('query')}",
        f"Narrative: {sample_query.get('narrative', '')[:100]}...",
        sep="\n",
    )

    print(
        "\nSample Document:",
        f"Title: {sample_doc.get('title')}",
        f"URL: {sample_doc.get('url')}",
        f"Content (first 200 chars): {sample_doc.get('content', '')[:200]}...",
        sep="\n",
    )

Loaded 50 queries from data/ManualAnnotatedQueries.json




Page not found for Cold storage (cryptocurrency): Page id "Cold storage (cryptocurrency)" does not match any pages. Try another id!




Page not found for Risk and return: Page id "Risk and return" does not match any pages. Try another id!




Page not found for Apportionment in the United States House of Representatives: Page id "Apportionment in the United States House of Representatives" does not match any pages. Try another id!




Page not found for Meditation and education: Page id "Meditation and education" does not match any pages. Try another id!




Page not found for Neuroscience of meditation: Page id "Neuroscience of meditation" does not match any pages. Try another id!
Page not found for Meditation and neuroplasticity: Page id "Meditation and neuroplasticity" does not match any pages. Try another id!




Page not found for Meditation and the brain: Page id "Meditation and the brain" does not match any pages. Try another id!




Page not found for Sleep and academic performance: Page id "Sleep and academic performance" does not match any pages. Try another id!




Page not found for Sleep and mental health: Page id "Sleep and mental health" does not match any pages. Try another id!




Page not found for 3D printing in medicine: Page id "3D printing in medicine" does not match any pages. Try another id!




Page not found for 3D printing in aerospace: Page id "3D printing in aerospace" does not match any pages. Try another id!
Page not found for 3D printing in automotive: Page id "3D printing in automotive" does not match any pages. Try another id!




Page not found for 3D printing in education: Page id "3D printing in education" does not match any pages. Try another id!




Page not found for Future of 3D printing: Page id "Future of 3D printing" does not match any pages. Try another id!


Processing queries: 100%|███████████████████████| 50/50 [03:03<00:00,  3.67s/it]

Dataset saved to data/WikipediaRelevantDocs.json
Total queries with relevant documents: 50
Total documents extracted: 154
Average documents per query: 3.08

Sample Query:
ID: 1
Query: Cryptocurrency basics
Narrative: The user wants to understand the fundamental concepts of cryptocurrencies, including how they work, ...

Sample Document:
Title: Cryptocurrency
URL: https://en.wikipedia.org/wiki/Cryptocurrency
Content (first 200 chars): cryptocurrency colloquially crypto digital currency designed work computer network reliant central authority government bank uphold maintain individual coin ownership record stored digital ledger bloc...



