# Career Page Analyzer Debugger
This notebook helps debug why certain career pages are or aren't being detected correctly.

In [5]:
# Import required libraries
import asyncio
import json
import sys
import os
from urllib.parse import urlparse
import pandas as pd

# Add scripts directory to path
# Both entries are relative, so they resolve against the kernel's current working
# directory. Presumably one covers launching from the notebook's own folder and the
# other launching from the repository root — TODO confirm.
sys.path.append('../scripts')
sys.path.append('./scripts')

# Import our modules
# (project-local modules, expected to live in the scripts directory added above)
from email_crawler import EmailCareerWebCrawler
from career_page_graphrag import CareerPageClassifier

print("✅ Modules loaded successfully")

✅ Modules loaded successfully


In [6]:
# Initialize the crawler and classifier
# NOTE(review): the recorded output of this cell ends with
#   OSError: Incorrect path_or_model_id: './models/career_classifier'
# so in that run the classifier was never created and the success message below
# was not printed. The model path looks relative to the kernel's working
# directory — confirm where ./models lives and start the kernel from there.
# Every later cell that uses `career_classifier` depends on this cell succeeding.
crawler = EmailCareerWebCrawler()
career_classifier = CareerPageClassifier()

print("✅ Crawler and classifier initialized")

🤖 Loading career page classifier...


OSError: Incorrect path_or_model_id: './models/career_classifier'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

## Step 1: Enter Website URL to Test

In [None]:
# Enter the website URL you want to test
# This is the single input of the notebook: the crawl cell passes it to the crawler
# and the Step 9 debugging cell derives the base domain from it.
TEST_WEBSITE = "http://www.magnoliabakery.com/"

print(f"🎯 Testing website: {TEST_WEBSITE}")

## Step 2: Crawl the Website

In [None]:
# Crawl the website
print("🕷️ Starting crawl...")
print("=" * 80)

# Run the crawl
crawl_result = await crawler.process_single_website(TEST_WEBSITE)

print("\n" + "=" * 80)
print("📊 Crawl Results Summary:")
print(f"   Status: {crawl_result.status}")
print(f"   Pages crawled: {crawl_result.pages_crawled}")
print(f"   Emails found: {len(crawl_result.emails)}")
print(f"   Career links found: {len(crawl_result.career_links)}")
print(f"   Time taken: {crawl_result.time_taken:.1f}s")

## Step 3: Analyze All Career Links Found

In [None]:
# List every career link the crawler reported, with its path and host split out
print(
    f"\n🔗 All Career Links Found ({len(crawl_result.career_links)} total):",
    "=" * 80,
    sep="\n",
)

for i, link in enumerate(crawl_result.career_links, start=1):
    # Split the URL once so its components can be shown on their own lines
    parsed = urlparse(link)
    print(
        f"\n{i}. {link}",
        f"   Path: {parsed.path}",
        f"   Domain: {parsed.netloc}",
        sep="\n",
    )

## Step 4: Clean and Filter Career Links

In [None]:
# Clean the career links: drop duplicates, then drop tracking/analytics URLs
print("🧹 Cleaning career links...")
print("=" * 80)

# Substrings that mark a link as tracking/analytics noise rather than a real page.
# ('wpm@' used to be listed as well, but any link containing it already matches '@'.)
TRACKING_MARKERS = ('@', 'pixel', 'analytics', 'tracking')


def normalize_link(link):
    """Return a comparison key for ``link`` that ignores cosmetic differences.

    Two links get the same key when they differ only in scheme (http/https,
    any letter case), host letter case, a trailing slash on the path, or a
    ``#fragment``. The query string is kept because it can select a
    different page.

    Args:
        link: URL string as reported by the crawler.

    Returns:
        A string key suitable for duplicate detection.
    """
    try:
        parts = urlparse(link.strip())
    except ValueError:
        # Malformed URL (e.g. a broken IPv6 literal): fall back to plain
        # string normalization so one bad link cannot abort the whole cell.
        return link.replace('https://', '').replace('http://', '').rstrip('/')

    key = parts.netloc.lower() + parts.path.rstrip('/')
    if parts.query:
        key += '?' + parts.query
    return key


# Remove duplicates, keeping the first spelling of each page that was seen
unique_links = []
seen_normalized = set()

for link in crawl_result.career_links:
    normalized = normalize_link(link)
    if normalized not in seen_normalized:
        seen_normalized.add(normalized)
        unique_links.append(link)
        print(f"✅ Kept: {link}")
    else:
        print(f"🔄 Duplicate removed: {link}")

# Filter out tracking/analytics URLs
filtered_links = []
for link in unique_links:
    if any(bad in link for bad in TRACKING_MARKERS):
        print(f"❌ Filtered out (tracking): {link}")
    else:
        filtered_links.append(link)

print(f"\n📋 After cleaning: {len(filtered_links)} links remain")

## Step 5: Analyze Each Career Link with the Classifier

In [None]:
# Run every surviving link through the classifier and print what it saw
print("🔍 Detailed Analysis of Each Career Link:", "=" * 80, sep="\n")

all_results = []

for link in filtered_links:
    print(f"\n🔗 Analyzing: {link}", "-" * 60, sep="\n")

    # URL-only features are reported before the classifier is invoked
    url_features = career_classifier.extract_url_features(link)
    print(
        f"\n📊 URL Features:",
        f"   Path: {url_features['path']}",
        f"   Path clean: {url_features['path_clean']}",
        f"   Path parts: {url_features['path_parts']}",
        f"   URL Pattern Score: {url_features['url_score']:.2%}",
        f"   Found patterns: {url_features['found_patterns']}",
        sep="\n",
    )

    # Classification from the URL alone (no title/description supplied here)
    result = career_classifier.classify_url(link)
    all_results.append(result)

    print(
        f"\n🤖 Classification Result:",
        f"   Is Career Page: {result['is_career_page']}",
        f"   Confidence: {result['confidence']:.2%}",
        f"   Model Confidence: {result['model_confidence']:.2%}",
        f"   Model Prediction: {result['model_prediction']}",
        f"   URL Pattern Score: {result['url_pattern_score']:.2%}",
        f"   Requires Content: {result['requires_content']}",
        sep="\n",
    )

    # The text fields the result reports as having been fed to the model
    print(
        f"\n📝 Text analyzed by model:",
        f"   Title: {result['title_used']}",
        f"   Description: {result['description_used']}",
        sep="\n",
    )

## Step 6: Rank All Career Pages

In [None]:
# Rank the cleaned links and show the ranking as a table
print("🏆 Ranking Career Pages:", "=" * 80, sep="\n")

# rank_career_pages is given a list of {'url': ...} dicts
urls_to_rank = [{'url': link} for link in filtered_links]
ranked_pages = career_classifier.rank_career_pages(urls_to_rank)

# One table row per ranked page; percentages are pre-formatted as strings
ranking_data = [
    {
        'Rank': i,
        'URL': page['url'],
        'Is Career': page['is_career_page'],
        'Final Score': f"{page['final_score']:.2%}",
        'Confidence': f"{page['confidence']:.2%}",
        'URL Score': f"{page['url_pattern_score']:.2%}",
        'Patterns': ', '.join(page['found_patterns']) if page['found_patterns'] else 'none',
    }
    for i, page in enumerate(ranked_pages, start=1)
]

df = pd.DataFrame(ranking_data)
display(df)

## Step 7: Determine Best Career Page

In [None]:
# Determine the best career page
print("🎯 Best Career Page Analysis:")
print("=" * 80)

# A page counts as a candidate when the classifier says so, or when its URL
# pattern score alone is above this threshold.
URL_SCORE_THRESHOLD = 0.5

# Filter for actual career pages.
# NOTE(review): the first candidate is treated as the best one, which assumes
# rank_career_pages returns pages ordered best-first — confirm.
career_pages = [
    p for p in ranked_pages
    if p['is_career_page'] or p['url_pattern_score'] > URL_SCORE_THRESHOLD
]

if career_pages:
    best_page = career_pages[0]
    # Same fallback as the ranking table: an empty/missing pattern list is
    # shown as 'none' instead of a blank (or a TypeError on None).
    best_patterns = ', '.join(best_page['found_patterns']) if best_page['found_patterns'] else 'none'

    print(f"\n✅ BEST CAREER PAGE: {best_page['url']}")
    print(f"\nWhy this was chosen:")
    print(f"   1. Final Score: {best_page['final_score']:.2%}")
    print(f"   2. Is Career Page: {best_page['is_career_page']}")
    print(f"   3. Confidence: {best_page['confidence']:.2%}")
    print(f"   4. URL Pattern Score: {best_page['url_pattern_score']:.2%}")
    print(f"   5. Patterns Found: {best_patterns}")

    if len(career_pages) > 1:
        print(f"\n📋 Other candidates:")
        # Show at most three runners-up, numbered from 2
        for i, page in enumerate(career_pages[1:4], 2):
            print(f"   {i}. {page['url']} (Score: {page['final_score']:.2%})")
else:
    print("❌ No career pages identified!")
    print("\nPossible reasons:")
    print("   1. The crawler didn't find the career links")
    print("   2. The URL patterns don't match our known patterns")
    print("   3. The model couldn't classify them without page content")

## Step 8: Show All Emails Found (Bonus)

In [None]:
# Bonus: list the e-mail addresses collected during the crawl (nothing is
# printed when none were found)
if crawl_result.emails:
    print(f"\n📧 Emails Found ({len(crawl_result.emails)}):", "=" * 80, sep="\n")
    # Alphabetical order, one address per line
    print("\n".join(f"   - {email}" for email in sorted(crawl_result.emails)))

## Step 9: Debug Why Career Pages Might Be Missed

In [None]:
# Check if the crawler is missing career pages
print("🐛 Debugging: Checking for Common Career Page Patterns")
print("=" * 80)

# Common career page URLs to check
common_patterns = [
    '/careers', '/jobs', '/join-us', '/join-our-team', '/work-with-us',
    '/employment', '/opportunities', '/hiring', '/recruitment',
    '/pages/careers', '/pages/jobs', '/pages/join-our-team',
    '/about/careers', '/about/jobs', '/company/careers'
]

base_url = TEST_WEBSITE.rstrip('/')
parsed_base = urlparse(base_url)
base_domain = f"{parsed_base.scheme}://{parsed_base.netloc}"


def crawled_path(link):
    """Return the lower-cased URL path of ``link`` without a trailing slash.

    Returns an empty string when the link cannot be parsed, so a single
    malformed link does not abort the check.
    """
    try:
        return urlparse(link).path.rstrip('/').lower()
    except ValueError:
        return ''


# Paths (not whole URLs) are compared, so http/https and host differences
# between TEST_WEBSITE and the crawled links do not hide a match.
crawled_paths = {crawled_path(link) for link in crawl_result.career_links}

print("Checking if these URLs were found by the crawler:")
for pattern in common_patterns:
    # A pattern is "found" only when a crawled path IS that pattern or lies
    # below it. A plain substring test would report '/careers' as found when
    # only '/pages/careers' was crawled, and '/jobs' when only '/jobs-board' was.
    found = any(
        path == pattern or path.startswith(pattern + '/')
        for path in crawled_paths
    )

    if found:
        print(f"✅ {pattern} - FOUND")
    else:
        print(f"❌ {pattern} - NOT FOUND")

# Check all links that were crawled
print(f"\n📄 Total pages visited: {crawl_result.pages_crawled}")
# visited_urls is optional on the result object, hence the hasattr guard
if hasattr(crawl_result, 'visited_urls') and crawl_result.visited_urls:
    print("\nAll URLs visited during crawl:")
    for url in sorted(crawl_result.visited_urls):
        print(f"   - {url}")

## Step 10: Test Specific URL Manually

In [None]:
# Manually classify one specific URL, independent of the crawl results
# Change this to test any specific URL
TEST_CAREER_URL = "https://www.magnoliabakery.com/pages/join-our-team"

print(f"🔬 Manual Test of: {TEST_CAREER_URL}", "=" * 80, sep="\n")

# Dump every URL feature the classifier extracts
features = career_classifier.extract_url_features(TEST_CAREER_URL)
print("\nURL Features:")
for key, value in features.items():
    print(f"   {key}: {value}")

# Classify with a hand-written title and description supplied
result = career_classifier.classify_url(
    TEST_CAREER_URL,
    page_title="Join Our Team",
    meta_description="Career opportunities at our company",
)

# The echoed input text is skipped; everything else in the result is shown
hidden_keys = ('title_used', 'description_used')
print("\nClassification Result:")
for key, value in result.items():
    if key in hidden_keys:
        continue
    print(f"   {key}: {value}")