## **Project 2: Job Board Aggregator**  
*Scrape multiple job boards and create a unified job database with filtering capabilities*

### **Objective**
Build a comprehensive job aggregator that:
- Scrapes Indeed, LinkedIn Jobs, and Glassdoor
- Normalizes job data into a consistent format
- Provides search and filtering capabilities
- Updates job listings daily and marks expired posts as inactive

---

### **Step 1: Project Structure & Dependencies**

```python
# Install dependencies (the leading "!" is Jupyter/IPython shell syntax; omit it when running in a terminal)
!pip install requests beautifulsoup4 selenium pandas flask apscheduler python-dateutil lxml

# Project structure
"""
job_aggregator/
├── scrapers/
│   ├── indeed.py
│   ├── linkedin.py
│   └── glassdoor.py
├── data/
│   └── jobs.db
├── api/
│   └── app.py
├── utils/
│   ├── database.py
│   ├── job_normalizer.py
│   └── date_parser.py
├── scheduler.py
└── main.py
"""
```

### **Step 2: Database Design**

```python
# utils/database.py
import sqlite3
import pandas as pd
from datetime import datetime, timedelta

class JobDatabase:
    """SQLite-backed store for normalized job postings.

    Every method opens its own short-lived connection to ``db_path`` and
    closes it before returning, including when the query raises.
    """

    def __init__(self, db_path='data/jobs.db'):
        """Remember the database location and create the schema if needed.

        Args:
            db_path: Path of the SQLite database file.
        """
        self.db_path = db_path
        self.init_database()

    def init_database(self):
        """Create the ``jobs`` table and its indexes if they do not exist."""
        import os  # local import: avoids relying on new file-level imports

        # sqlite3 creates the database file but not missing parent
        # directories, so make sure the directory exists first.
        parent_dir = os.path.dirname(self.db_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()

            cursor.execute('''
                CREATE TABLE IF NOT EXISTS jobs (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    job_title TEXT NOT NULL,
                    company_name TEXT NOT NULL,
                    location TEXT,
                    salary_min REAL,
                    salary_max REAL,
                    salary_currency TEXT DEFAULT 'USD',
                    job_url TEXT UNIQUE NOT NULL,
                    job_description TEXT,
                    job_type TEXT, -- full-time, part-time, contract, etc.
                    experience_level TEXT, -- entry, mid, senior
                    source TEXT NOT NULL, -- indeed, linkedin, glassdoor
                    posted_date DATE,
                    scraped_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    is_active BOOLEAN DEFAULT TRUE
                )
            ''')

            # Create indexes for performance
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_source ON jobs(source)')
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_location ON jobs(location)')
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_posted_date ON jobs(posted_date)')
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_company ON jobs(company_name)')

            conn.commit()
        finally:
            conn.close()

    def save_job(self, job_data):
        """Insert a job, replacing any existing row with the same ``job_url``.

        Args:
            job_data: Mapping with the required keys ``job_title``,
                ``company_name``, ``location``, ``job_url`` and ``source``;
                the remaining columns are optional.

        Returns:
            The row id of the stored job, or ``None`` if saving failed.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute('''
                INSERT OR REPLACE INTO jobs 
                (job_title, company_name, location, salary_min, salary_max, 
                 salary_currency, job_url, job_description, job_type, 
                 experience_level, source, posted_date, is_active)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                job_data['job_title'],
                job_data['company_name'],
                job_data['location'],
                job_data.get('salary_min'),
                job_data.get('salary_max'),
                job_data.get('salary_currency', 'USD'),
                job_data['job_url'],
                job_data.get('job_description', ''),
                job_data.get('job_type'),
                job_data.get('experience_level'),
                job_data['source'],
                job_data.get('posted_date'),
                True
            ))
            conn.commit()
            return cursor.lastrowid
        except Exception as e:
            # Deliberately best-effort: one malformed record must not abort
            # a whole scraping run, so report it and carry on.
            print(f"Error saving job: {e}")
            return None
        finally:
            conn.close()

    def get_jobs(self, filters=None, limit=100):
        """Return active jobs matching ``filters`` as a DataFrame.

        Args:
            filters: Optional mapping. ``location``, ``job_title`` and
                ``company`` are substring matches, ``source`` is an exact
                match, and ``days_old`` keeps jobs posted within that many
                days (``0`` means posted today).
            limit: Maximum number of rows to return.

        Returns:
            A pandas DataFrame ordered by ``posted_date`` descending.
        """
        base_query = "SELECT * FROM jobs WHERE is_active = TRUE"
        params = []

        if filters:
            if filters.get('location'):
                base_query += " AND location LIKE ?"
                params.append(f"%{filters['location']}%")
            if filters.get('job_title'):
                base_query += " AND job_title LIKE ?"
                params.append(f"%{filters['job_title']}%")
            if filters.get('company'):
                base_query += " AND company_name LIKE ?"
                params.append(f"%{filters['company']}%")
            if filters.get('source'):
                base_query += " AND source = ?"
                params.append(filters['source'])
            # Compare against None: days_old=0 is a valid "today only"
            # filter and must not be dropped by a truthiness check.
            if filters.get('days_old') is not None:
                date_threshold = (datetime.now() - timedelta(days=filters['days_old'])).strftime('%Y-%m-%d')
                base_query += " AND posted_date >= ?"
                params.append(date_threshold)

        base_query += " ORDER BY posted_date DESC LIMIT ?"
        params.append(limit)

        conn = sqlite3.connect(self.db_path)
        try:
            return pd.read_sql_query(base_query, conn, params=params)
        finally:
            conn.close()

    def mark_expired_jobs(self, days_old=30):
        """Mark active jobs posted more than ``days_old`` days ago as inactive.

        Returns:
            The number of rows that were updated.
        """
        date_threshold = (datetime.now() - timedelta(days=days_old)).strftime('%Y-%m-%d')
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute('''
                UPDATE jobs 
                SET is_active = FALSE 
                WHERE posted_date < ? AND is_active = TRUE
            ''', (date_threshold,))
            affected_rows = cursor.rowcount
            conn.commit()
            return affected_rows
        finally:
            conn.close()
```

### **Step 3: Date Parser Utility**

```python
# utils/date_parser.py
from datetime import datetime, timedelta
import re

class DateParser:
    """Convert the posting-date strings shown on job boards to ISO dates."""

    _OUTPUT_FORMAT = '%Y-%m-%d'

    # Substrings meaning "posted within the current day".
    _SAME_DAY_MARKERS = ('hour', 'min', 'just', 'today')

    # Relative units and their length in days. Months and years are
    # approximations (30 and 365 days); exact calendar arithmetic is not
    # needed for a "posted roughly when" value.
    _UNIT_DAYS = (('day', 1), ('week', 7), ('month', 30), ('year', 365))

    # Absolute formats, tried in order (MM/DD/YYYY wins over DD/MM/YYYY
    # when a date is ambiguous).
    _ABSOLUTE_FORMATS = (
        '%Y-%m-%d',
        '%m/%d/%Y',
        '%d/%m/%Y',
        '%B %d, %Y',
        '%b %d, %Y',
        '%d %B %Y',
        '%d %b %Y',
    )

    @staticmethod
    def parse_posted_date(date_str, source):
        """Parse a job board's posting date into ``YYYY-MM-DD``.

        Handles relative phrases ("Just posted", "3 days ago", "2 weeks ago",
        "a month ago", "Yesterday") and several absolute formats.

        Args:
            date_str: Raw date text from the listing; may be empty or None.
            source: Name of the job board. Currently unused; kept so callers
                can pass it and per-source rules can be added later.

        Returns:
            The date as a ``YYYY-MM-DD`` string. Falls back to today's date
            when the text is empty or cannot be parsed.
        """
        now = datetime.now()
        out_fmt = DateParser._OUTPUT_FORMAT

        if not date_str:
            return now.strftime(out_fmt)

        date_str = date_str.lower().strip()

        if 'yesterday' in date_str:
            return (now - timedelta(days=1)).strftime(out_fmt)

        # Handle relative dates (e.g., "2 days ago", "Just posted")
        if 'ago' in date_str or 'just' in date_str or 'today' in date_str:
            if any(marker in date_str for marker in DateParser._SAME_DAY_MARKERS):
                return now.strftime(out_fmt)

            # "a day ago" / "a week ago" carry no digit and mean one unit.
            match = re.search(r'(\d+)', date_str)
            count = int(match.group(1)) if match else 1
            for unit, unit_days in DateParser._UNIT_DAYS:
                if unit in date_str:
                    try:
                        date_obj = now - timedelta(days=count * unit_days)
                    except OverflowError:
                        # Absurdly large counts fall through to the default.
                        break
                    return date_obj.strftime(out_fmt)

        # Handle absolute dates (MM/DD/YYYY, YYYY-MM-DD, etc.)
        for fmt in DateParser._ABSOLUTE_FORMATS:
            try:
                date_obj = datetime.strptime(date_str, fmt)
            except ValueError:
                continue
            return date_obj.strftime(out_fmt)

        # Default to today if parsing fails
        return now.strftime(out_fmt)
```

### **Step 4: Job Data Normalizer**

```python
# utils/job_normalizer.py
import re

class JobNormalizer:
    """Normalize free-text salary, job-type and seniority information."""

    # A number needs at least one digit, so a lone "." (sentence
    # punctuation) can never reach float().
    _NUMBER = r'(\d+(?:\.\d+)?)'
    # The thousands suffix must directly follow the number and end at a
    # word boundary, so the "k" in "week" or "km" is not taken for it.
    _K_SUFFIX = r'\s*(?:(k|thousand)\b)?'
    _RANGE_RE = re.compile(_NUMBER + _K_SUFFIX + r'\s*[-–—]\s*' + _NUMBER + _K_SUFFIX)
    _SINGLE_RE = re.compile(_NUMBER + _K_SUFFIX)

    # Whole-word matching avoids hits inside unrelated words such as
    # "department" (part), "international" (intern) or "misleading" (lead).
    _FULL_TIME_RE = re.compile(r'\bfull(?:[\s-]?time)?\b')
    _PART_TIME_RE = re.compile(r'\bpart(?:[\s-]?time)?\b')
    _INTERN_RE = re.compile(r'\bintern(?:ship)?s?\b')
    _SENIOR_RE = re.compile(r'\b(?:senior|sr|lead|leader|principal|director|manager)s?\b')
    _ENTRY_RE = re.compile(r'\b(?:junior|jr|associate|entry|graduate)s?\b')

    @staticmethod
    def _apply_thousands(value, own_suffix, other_suffix):
        """Scale ``value`` by 1000 when a thousands suffix applies to it.

        A value inherits the other bound's suffix only when it is below
        1000, so "50-70k" becomes 50000-70000 while "50k - 70000" keeps
        70000 unchanged.
        """
        if own_suffix or (other_suffix and value < 1000):
            return value * 1000
        return value

    @staticmethod
    def normalize_salary(salary_text):
        """Extract the minimum and maximum salary from free text.

        Args:
            salary_text: Text such as "$50,000 - $70,000", "50k-70k" or
                "$60,000/year"; may be empty or None.

        Returns:
            ``(min, max)`` as floats; both are equal for a single value and
            both are ``None`` when no number is found. Pay periods (hour,
            week, year) are not interpreted.
        """
        if not salary_text:
            return None, None

        cleaned = salary_text.replace(',', '').replace('$', '').lower()

        # Handle ranges like "$50,000 - $70,000"
        range_match = JobNormalizer._RANGE_RE.search(cleaned)
        if range_match:
            low_text, low_suffix, high_text, high_suffix = range_match.groups()
            min_val = JobNormalizer._apply_thousands(float(low_text), low_suffix, high_suffix)
            max_val = JobNormalizer._apply_thousands(float(high_text), high_suffix, low_suffix)
            return min_val, max_val

        # Handle single values like "$60,000/year"
        single_match = JobNormalizer._SINGLE_RE.search(cleaned)
        if single_match:
            number_text, suffix = single_match.groups()
            val = JobNormalizer._apply_thousands(float(number_text), suffix, None)
            return val, val

        return None, None

    @staticmethod
    def normalize_job_type(job_type_text):
        """Map free-text job type to a standard value.

        Returns:
            One of 'full-time', 'part-time', 'contract' or 'internship';
            ``None`` for empty input. Unrecognized text defaults to
            'full-time'.
        """
        if not job_type_text:
            return None

        text = job_type_text.lower()
        if JobNormalizer._FULL_TIME_RE.search(text):
            return 'full-time'
        if JobNormalizer._PART_TIME_RE.search(text):
            return 'part-time'
        if 'contract' in text or 'freelance' in text:
            return 'contract'
        if JobNormalizer._INTERN_RE.search(text):
            return 'internship'
        return 'full-time'  # Default assumption

    @staticmethod
    def normalize_experience_level(job_title):
        """Infer the experience level from a job title.

        Returns:
            'senior', 'entry' or 'mid'; ``None`` for empty input. Titles
            with no seniority keyword default to 'mid'.
        """
        if not job_title:
            return None

        title = job_title.lower()
        if JobNormalizer._SENIOR_RE.search(title):
            return 'senior'
        if JobNormalizer._ENTRY_RE.search(title):
            return 'entry'
        return 'mid'  # Explicit "mid"/"intermediate" and the default coincide
```

### **Step 5: Indeed Scraper**

```python
# scrapers/indeed.py
import requests
from bs4 import BeautifulSoup
import urllib.parse
from ..utils.date_parser import DateParser
from ..utils.job_normalizer import JobNormalizer

class IndeedScraper:
    """Scrape job listings from Indeed search result pages."""

    def __init__(self):
        """Create an HTTP session that sends a browser-like User-Agent."""
        self.base_url = "https://www.indeed.com"
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def search_jobs(self, query, location, max_pages=5):
        """Collect normalized jobs for a query across result pages.

        Args:
            query: Search keywords.
            location: Location string passed to the search.
            max_pages: Maximum number of result pages to fetch.

        Returns:
            A list of job dicts as produced by ``extract_job_data``.
            Scraping stops at the first page that fails or has no job cards.
        """
        import time  # not part of this module's file-level imports

        jobs = []
        for page in range(max_pages):
            # The search URL pages through results 10 at a time via "start".
            start = page * 10
            search_url = f"{self.base_url}/jobs?q={urllib.parse.quote(query)}&l={urllib.parse.quote(location)}&start={start}"

            try:
                response = self.session.get(search_url, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')

                job_cards = soup.select('div.job_seen_beacon')
                if not job_cards:
                    break

                for card in job_cards:
                    job_data = self.extract_job_data(card)
                    if job_data:
                        jobs.append(job_data)
            except Exception as e:
                print(f"Error scraping Indeed page {page}: {e}")
                break

            # Respect rate limits
            time.sleep(2)

        return jobs

    def extract_job_data(self, job_card):
        """Turn one job card element into a normalized job dict.

        Args:
            job_card: BeautifulSoup element for a single search result.

        Returns:
            A dict with title, company, location, salary bounds, URL,
            posted date, job type, experience level and source, or ``None``
            if the card has no title link or extraction fails.
        """
        try:
            # Job title
            title_elem = job_card.select_one('h2.jobTitle a')
            if not title_elem:
                return None

            # A listing without a link cannot be stored: job_url is required.
            href = title_elem.get('href')
            if not href:
                return None

            job_title = title_elem.get('title') or title_elem.get_text().strip()
            # urljoin handles both relative and already-absolute hrefs.
            job_url = urllib.parse.urljoin(self.base_url, href)

            # Company name
            company_elem = job_card.select_one('[data-testid="company-name"]')
            company_name = company_elem.get_text().strip() if company_elem else "Unknown Company"

            # Location
            location_elem = job_card.select_one('[data-testid="job-location"]')
            location = location_elem.get_text().strip() if location_elem else ""

            # Posted date
            date_elem = job_card.select_one('span[data-testid="myJobsStateDate"]')
            posted_date_str = date_elem.get_text().strip() if date_elem else ""
            posted_date = DateParser.parse_posted_date(posted_date_str, 'indeed')

            # Salary
            salary_elem = job_card.select_one('[data-testid="attribute_snippet_testid"]')
            salary_text = salary_elem.get_text().strip() if salary_elem else ""
            salary_min, salary_max = JobNormalizer.normalize_salary(salary_text)

            # Job type (from metadata if available)
            job_type_elem = job_card.select_one('div.heading6.tapItem-gutter.metadataContainer')
            job_type_text = job_type_elem.get_text().strip() if job_type_elem else ""
            job_type = JobNormalizer.normalize_job_type(job_type_text)

            # Experience level
            experience_level = JobNormalizer.normalize_experience_level(job_title)

            return {
                'job_title': job_title,
                'company_name': company_name,
                'location': location,
                'salary_min': salary_min,
                'salary_max': salary_max,
                'job_url': job_url,
                'posted_date': posted_date,
                'job_type': job_type,
                'experience_level': experience_level,
                'source': 'indeed'
            }
        except Exception as e:
            # Best-effort: a single malformed card must not abort the page.
            print(f"Error extracting job data: {e}")
            return None
```

### **Step 6: LinkedIn Jobs Scraper (Using Selenium)**

```python
# scrapers/linkedin.py
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
from urllib.parse import quote
from ..utils.date_parser import DateParser
from ..utils.job_normalizer import JobNormalizer

class LinkedInScraper:
    """Scrape LinkedIn job search results with a Selenium Chrome driver."""

    def __init__(self, headless=True):
        """Start a Chrome driver.

        Args:
            headless: Run Chrome without a visible window when True.
        """
        options = webdriver.ChromeOptions()
        if headless:
            options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')

        self.driver = webdriver.Chrome(options=options)
        self.wait = WebDriverWait(self.driver, 10)

    def close(self):
        """Quit the browser. Safe to call more than once."""
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        self.driver = None
        try:
            driver.quit()
        except Exception:
            # Nothing useful can be done if the browser is already gone.
            pass

    def __del__(self):
        # Fallback only; callers should prefer close() because finalizers
        # run at an unpredictable time.
        self.close()

    def search_jobs(self, query, location, max_pages=3):
        """Collect normalized jobs for a query.

        Args:
            query: Search keywords.
            location: Location string passed to the search.
            max_pages: Maximum number of scroll/next-page rounds.

        Returns:
            A list of job dicts as produced by ``extract_job_data``. On a
            scraping error the jobs collected so far are returned.
        """
        jobs = []
        search_url = f"https://www.linkedin.com/jobs/search/?keywords={quote(query)}&location={quote(location)}"

        try:
            self.driver.get(search_url)
            time.sleep(3)  # Wait for initial load

            # Number of listing cards already visited. Counting attempts
            # rather than len(jobs) keeps the slice aligned when some cards
            # fail to extract, so none is visited twice.
            processed = 0

            for page in range(max_pages):
                # Scroll to load more jobs
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

                job_listings = self.driver.find_elements(By.CSS_SELECTOR, "div.base-card")

                for listing in job_listings[processed:]:
                    processed += 1
                    # extract_job_data reports its own errors and returns None.
                    job_data = self.extract_job_data(listing)
                    if job_data:
                        jobs.append(job_data)

                # Try to click next page
                try:
                    next_button = self.driver.find_element(By.CSS_SELECTOR, "button[aria-label='Next']")
                    if next_button.is_enabled():
                        next_button.click()
                        time.sleep(3)
                    else:
                        break
                except NoSuchElementException:
                    break

                if len(jobs) >= 50:  # Limit total jobs
                    break

        except Exception as e:
            print(f"Error scraping LinkedIn: {e}")

        return jobs

    def extract_job_data(self, job_element):
        """Click a listing card and read its details.

        Args:
            job_element: Selenium element for one search result card.

        Returns:
            A normalized job dict, or ``None`` if the title never appears
            or extraction fails.
        """
        try:
            # Click to view job details
            job_element.click()
            time.sleep(1)

            # Job title
            try:
                title_elem = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "h2.jobs-unified-top-card__job-title")))
                job_title = title_elem.text.strip()
            except TimeoutException:
                return None

            # Company name
            try:
                company_elem = self.driver.find_element(By.CSS_SELECTOR, "span.jobs-unified-top-card__company-name a")
                company_name = company_elem.text.strip()
            except NoSuchElementException:
                company_name = "Unknown Company"

            # Location
            try:
                location_elem = self.driver.find_element(By.CSS_SELECTOR, "span.jobs-unified-top-card__bullet")
                location = location_elem.text.strip()
            except NoSuchElementException:
                location = ""

            # Posted date
            try:
                date_elem = self.driver.find_element(By.CSS_SELECTOR, "span.jobs-unified-top-card__posted-date")
                posted_date_str = date_elem.text.strip()
                posted_date = DateParser.parse_posted_date(posted_date_str, 'linkedin')
            except NoSuchElementException:
                posted_date = DateParser.parse_posted_date("", 'linkedin')

            # Get current URL for job posting
            job_url = self.driver.current_url

            # Salary (LinkedIn rarely shows this publicly)
            salary_min, salary_max = None, None

            # Job type and experience (inferred)
            job_type = JobNormalizer.normalize_job_type("")
            experience_level = JobNormalizer.normalize_experience_level(job_title)

            return {
                'job_title': job_title,
                'company_name': company_name,
                'location': location,
                'salary_min': salary_min,
                'salary_max': salary_max,
                'job_url': job_url,
                'posted_date': posted_date,
                'job_type': job_type,
                'experience_level': experience_level,
                'source': 'linkedin'
            }
        except Exception as e:
            # Best-effort per card, but report the failure instead of
            # hiding it.
            print(f"Error extracting LinkedIn job data: {e}")
            return None
```

### **Step 7: Scheduler and Main Application**

```python
# scheduler.py
from apscheduler.schedulers.blocking import BlockingScheduler
from scrapers.indeed import IndeedScraper
from scrapers.linkedin import LinkedInScraper
from utils.database import JobDatabase
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class JobAggregatorScheduler:
    """Run all scrapers, store the results and expire old postings."""

    def __init__(self):
        """Open the job database and define the searches to run."""
        self.db = JobDatabase()
        self.search_queries = [
            {'query': 'software engineer', 'location': 'Remote'},
            {'query': 'data scientist', 'location': 'New York, NY'},
            {'query': 'product manager', 'location': 'San Francisco, CA'}
        ]

    def _scrape_source(self, source_name, scraper, searches):
        """Run each search on one scraper and save the jobs it returns.

        A failing search is logged and skipped so the remaining searches
        still run.
        """
        for search in searches:
            try:
                jobs = scraper.search_jobs(search['query'], search['location'])
                for job in jobs:
                    self.db.save_job(job)
            except Exception:
                logger.exception("Scraping %s failed for %s", source_name, search['query'])
                continue
            logger.info("Scraped %d jobs from %s for %s", len(jobs), source_name, search['query'])

    def scrape_all_sources(self):
        """Scrape every source, then mark old jobs as expired.

        Each source is isolated: an error in one (for example Chrome being
        unavailable for the LinkedIn scraper) is logged and does not stop
        the other source or the expiry step.
        """
        logger.info("Starting job aggregation...")

        # Scrape Indeed
        try:
            self._scrape_source('Indeed', IndeedScraper(), self.search_queries)
        except Exception:
            logger.exception("Indeed scraping could not be started")

        # Scrape LinkedIn (more resource intensive)
        linkedin_scraper = None
        try:
            linkedin_scraper = LinkedInScraper()
            # Limit LinkedIn due to restrictions
            self._scrape_source('LinkedIn', linkedin_scraper, self.search_queries[:1])
        except Exception:
            logger.exception("LinkedIn scraping could not be started")
        finally:
            # Release the browser now instead of waiting for the scraper
            # object to be garbage-collected.
            if linkedin_scraper is not None:
                try:
                    linkedin_scraper.driver.quit()
                except Exception:
                    logger.exception("Failed to shut down the LinkedIn browser")

        # Mark expired jobs
        expired_count = self.db.mark_expired_jobs(days_old=30)
        logger.info("Marked %d jobs as expired", expired_count)

        logger.info("Job aggregation completed.")

    def start_scheduler(self):
        """Block the current process and run the aggregation daily at 2 AM."""
        scheduler = BlockingScheduler()
        # Run daily at 2 AM
        scheduler.add_job(self.scrape_all_sources, 'cron', hour=2, minute=0)
        scheduler.start()

# main.py
from scheduler import JobAggregatorScheduler

def main():
    """Run one aggregation pass immediately (manual/testing entry point)."""
    job_scheduler = JobAggregatorScheduler()
    # Single immediate scrape, useful while testing
    job_scheduler.scrape_all_sources()
    # For production, enable the daily schedule instead:
    # job_scheduler.start_scheduler()


if __name__ == "__main__":
    main()
```

### **Step 8: Flask API for Job Search**

```python
# api/app.py
import sqlite3

from flask import Flask, request, jsonify
from utils.database import JobDatabase

# Flask application object; the route decorators below register against it.
app = Flask(__name__)
# Database handle shared by all request handlers; created at import time
# with JobDatabase's default database path.
db = JobDatabase()

def _int_query_arg(name, default=None):
    """Read an optional integer query parameter.

    Returns ``default`` when the parameter is missing or empty and raises
    ``ValueError`` when it is present but not an integer.
    """
    raw_value = request.args.get(name)
    if raw_value is None or raw_value == '':
        return default
    return int(raw_value)


@app.route('/api/jobs', methods=['GET'])
def get_jobs():
    """Search active jobs.

    Query parameters: ``location``, ``job_title``, ``company``, ``source``,
    ``days_old`` (integer) and ``limit`` (integer, default 50).

    Returns:
        JSON ``{"total": <int>, "jobs": [...]}``, or a 400 response with an
        ``error`` message when ``days_old`` or ``limit`` is not an integer.
    """
    filters = {}

    # Extract text filters from query parameters; empty values are ignored.
    for key in ('location', 'job_title', 'company', 'source'):
        value = request.args.get(key)
        if value:
            filters[key] = value

    try:
        days_old = _int_query_arg('days_old')
        limit = _int_query_arg('limit', 50)
    except ValueError:
        # Bad client input is a 400, not an unhandled 500.
        return jsonify({'error': "'days_old' and 'limit' must be integers"}), 400

    if days_old is not None:
        filters['days_old'] = days_old

    # SQLite treats a negative LIMIT as "no limit"; never let a request
    # turn the row cap off.
    limit = max(limit, 1)

    jobs_df = db.get_jobs(filters=filters, limit=limit)

    # NULL numeric columns come back as NaN, which is not valid JSON;
    # convert them to None so they serialize as null.
    jobs_df = jobs_df.astype(object).where(jobs_df.notna(), None)

    # Convert to JSON
    jobs_list = jobs_df.to_dict('records')
    return jsonify({
        'total': len(jobs_list),
        'jobs': jobs_list
    })

@app.route('/api/stats', methods=['GET'])
def get_stats():
    """Return summary statistics about active jobs.

    Returns:
        JSON with ``total_jobs`` (int), ``by_source`` (list of
        ``{"source", "count"}``) and ``top_locations`` (up to 10
        ``{"location", "count"}`` entries, most frequent first).
    """
    conn = sqlite3.connect(db.db_path)
    try:
        # Row objects convert to dicts keyed by column name.
        conn.row_factory = sqlite3.Row

        # Total active jobs. Reading it with sqlite3 yields a plain int,
        # which jsonify can serialize.
        total_jobs = conn.execute(
            "SELECT COUNT(*) as count FROM jobs WHERE is_active = TRUE"
        ).fetchone()[0]

        # Jobs by source
        by_source = [dict(row) for row in conn.execute(
            "SELECT source, COUNT(*) as count FROM jobs WHERE is_active = TRUE GROUP BY source"
        )]

        # Jobs by location (top 10)
        top_locations = [dict(row) for row in conn.execute(
            "SELECT location, COUNT(*) as count FROM jobs WHERE is_active = TRUE AND location != '' GROUP BY location ORDER BY count DESC LIMIT 10"
        )]
    finally:
        conn.close()

    return jsonify({
        'total_jobs': total_jobs,
        'by_source': by_source,
        'top_locations': top_locations
    })

if __name__ == '__main__':
    import os

    # Flask's debug mode enables an interactive debugger that can execute
    # arbitrary code, so it is opt-in via FLASK_DEBUG and, when enabled,
    # the server only listens on localhost.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true')
    bind_host = '127.0.0.1' if debug_enabled else '0.0.0.0'
    app.run(debug=debug_enabled, host=bind_host, port=5000)
```

---

## **Project Deployment Considerations**

### **For Production Deployment:**

1. **Use Docker** for containerization
2. **Implement proper error handling and logging**
3. **Add rate limiting and proxy rotation**
4. **Use environment variables for sensitive data**
5. **Set up monitoring and alerts**
6. **Consider using Scrapy for large-scale scraping**
7. **Implement data validation and cleaning pipelines**

### **Ethical and Legal Compliance:**
- Always respect `robots.txt`
- Implement reasonable delays between requests
- Don't scrape personal information
- Check terms of service for each website
- Consider using official APIs when available

These projects demonstrate professional-grade web scraping implementations that handle real-world complexities including anti-bot measures, data normalization, scheduling, and user interfaces.