In [1]:
import time
import pandas as pd
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Setup WebDriver
def setup_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in background
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)

# Function to scrape LinkedIn with startup filters
def scrape_linkedin_startups():
    driver = setup_driver()
    url = "https://www.linkedin.com/jobs/search/?f_C=urn%3Ali%3Afsd_company%3Aunicorns&keywords=Data%20Science&location=United%20States"
    driver.get(url)

    time.sleep(5)

    for _ in range(3):  # Scroll to load more jobs
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.PAGE_DOWN)
        time.sleep(2)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()

    jobs = []
    cutoff_time = datetime.now() - timedelta(days=1)  # 24-hour filter

    for job_card in soup.find_all("div", class_="base-card"):
        title = job_card.find("h3").text.strip() if job_card.find("h3") else "N/A"
        company = job_card.find("h4").text.strip() if job_card.find("h4") else "N/A"
        location = job_card.find("span", class_="job-search-card__location").text.strip() if job_card.find("span", class_="job-search-card__location") else "N/A"
        link = job_card.find("a")["href"] if job_card.find("a") else "N/A"
        posted_text = job_card.find("time")["datetime"] if job_card.find("time") else "N/A"

        posted_date = datetime.strptime(posted_text, "%Y-%m-%d") if posted_text != "N/A" else datetime.now() - timedelta(days=2)

        if posted_date >= cutoff_time:
            jobs.append({
                "Source": "LinkedIn - Unicorn Startups",
                "Title": title,
                "Company": company,
                "Location": location,
                "URL": link,
                "Posted Date": posted_date.strftime("%Y-%m-%d"),
                "Date Scraped": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

    return jobs

# Function to scrape Amazon jobs
def scrape_amazon():
    driver = setup_driver()
    driver.get("https://www.amazon.jobs/en/search?base_query=Data+Science&loc_query=United+States")

    time.sleep(5)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()

    jobs = []
    cutoff_time = datetime.now() - timedelta(days=1)

    for job_card in soup.find_all("div", class_="job"):
        title = job_card.find("h3").text.strip() if job_card.find("h3") else "N/A"
        location = job_card.find("p", class_="location-and-id").text.strip() if job_card.find("p", class_="location-and-id") else "N/A"
        link = "https://www.amazon.jobs" + job_card.find("a")["href"] if job_card.find("a") else "N/A"
        posted_date = datetime.now()  # Amazon doesn't show posting date, assuming recent.

        if posted_date >= cutoff_time:
            jobs.append({
                "Source": "Amazon",
                "Title": title,
                "Company": "Amazon",
                "Location": location,
                "URL": link,
                "Posted Date": posted_date.strftime("%Y-%m-%d"),
                "Date Scraped": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

    return jobs

# Function to scrape Google jobs
def scrape_google():
    driver = setup_driver()
    driver.get("https://careers.google.com/jobs/results/?q=Data%20Science&location=United%20States")

    time.sleep(5)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()

    jobs = []
    cutoff_time = datetime.now() - timedelta(days=1)

    for job_card in soup.find_all("li", class_="gc-card"):
        title = job_card.find("div", class_="gc-card__title").text.strip() if job_card.find("div", class_="gc-card__title") else "N/A"
        location = job_card.find("div", class_="gc-card__location").text.strip() if job_card.find("div", class_="gc-card__location") else "N/A"
        link = "https://careers.google.com" + job_card.find("a")["href"] if job_card.find("a") else "N/A"
        posted_date = datetime.now()

        if posted_date >= cutoff_time:
            jobs.append({
                "Source": "Google",
                "Title": title,
                "Company": "Google",
                "Location": location,
                "URL": link,
                "Posted Date": posted_date.strftime("%Y-%m-%d"),
                "Date Scraped": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

    return jobs

# Function to scrape TCS jobs
def scrape_tcs():
    driver = setup_driver()
    driver.get("https://ibegin.tcs.com/iBegin/jobs?keywords=Data%20Science&country=US")

    time.sleep(5)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()

    jobs = []
    cutoff_time = datetime.now() - timedelta(days=1)

    for job_card in soup.find_all("div", class_="joblist"):
        title = job_card.find("h2").text.strip() if job_card.find("h2") else "N/A"
        location = job_card.find("p", class_="location").text.strip() if job_card.find("p", class_="location") else "N/A"
        link = "https://ibegin.tcs.com" + job_card.find("a")["href"] if job_card.find("a") else "N/A"
        posted_date = datetime.now()

        if posted_date >= cutoff_time:
            jobs.append({
                "Source": "TCS",
                "Title": title,
                "Company": "TCS",
                "Location": location,
                "URL": link,
                "Posted Date": posted_date.strftime("%Y-%m-%d"),
                "Date Scraped": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

    return jobs

# Function to save jobs to CSV
def save_to_csv(jobs, filename="job_listings.csv"):
    df_new = pd.DataFrame(jobs)

    try:
        df_existing = pd.read_csv(filename)
        df_combined = pd.concat([df_new, df_existing], ignore_index=True)
    except FileNotFoundError:
        df_combined = df_new

    df_combined.to_csv(filename, index=False)

# Main function
def main():
    print("Scraping jobs from multiple sources...")
    jobs = scrape_amazon() + scrape_google() + scrape_tcs() + scrape_linkedin_startups()
    if jobs:
        save_to_csv(jobs)
        print(f"Scraping completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}. Data saved.")
    else:
        print("No new jobs found.")

if __name__ == "__main__":
    main()


Scraping jobs from multiple sources...
Scraping completed at 2025-03-17 21:59:44. Data saved.


In [2]:
import time
import json
import pandas as pd
import schedule
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Load config.json for automation toggle
def load_config():
    try:
        with open("config.json", "r") as f:
            return json.load(f)
    except FileNotFoundError:
        return {"automation_enabled": False}

# Save config.json (toggle automation)
def save_config(config):
    with open("config.json", "w") as f:
        json.dump(config, f, indent=4)

# Setup WebDriver
def setup_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in background
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)

# Function to scrape Amazon jobs
def scrape_amazon():
    driver = setup_driver()
    driver.get("https://www.amazon.jobs/en/search?base_query=Data+Science&loc_query=United+States")
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()

    jobs = []
    cutoff_time = datetime.now() - timedelta(days=1)

    for job_card in soup.find_all("div", class_="job"):
        title = job_card.find("h3").text.strip() if job_card.find("h3") else "N/A"
        location = job_card.find("p", class_="location-and-id").text.strip() if job_card.find("p", class_="location-and-id") else "N/A"
        link = "https://www.amazon.jobs" + job_card.find("a")["href"] if job_card.find("a") else "N/A"
        posted_date = datetime.now().strftime("%Y-%m-%d")

        if datetime.now() >= cutoff_time:
            jobs.append({
                "Source": "Amazon",
                "Title": title,
                "Company": "Amazon",
                "Location": location,
                "URL": link,
                "Posted Date": posted_date,
                "Date Scraped": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

    return jobs

# Add more functions for Google, TCS, Microsoft, and unicorn startups...

# Function to save jobs to CSV (avoid duplicates)
def save_to_csv(jobs, filename="job_listings.csv"):
    df_new = pd.DataFrame(jobs)

    try:
        df_existing = pd.read_csv(filename)
    except FileNotFoundError:
        df_existing = pd.DataFrame(columns=["URL"])  # Empty DataFrame if file doesn't exist

    # Remove duplicates based on job URL
    df_combined = pd.concat([df_new, df_existing]).drop_duplicates(subset=["URL"], keep="first")

    # Save only unique jobs
    df_combined.to_csv(filename, index=False)
    print(f"Saved {len(df_new)} new jobs. Total unique jobs: {len(df_combined)}.")

# Main function to run scraper
def run_scraper():
    print(f"Scraping jobs at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}...")
    jobs = scrape_amazon()  # Add other scrapers here (Google, TCS, Microsoft...)
    
    if jobs:
        save_to_csv(jobs)
        print(f"Scraping completed. {len(jobs)} new jobs attempted to be added.")
    else:
        print("No new jobs found.")

# Schedule job if automation is enabled
def schedule_scraper():
    config = load_config()
    if config.get("automation_enabled", False):
        print("Automation enabled. Running scraper every 12 hours.")
        schedule.every(12).hours.do(run_scraper)
        
        while True:
            schedule.run_pending()
            time.sleep(60)  # Wait 1 min before checking the schedule again

# Enable automation manually
def enable_automation():
    config = load_config()
    config["automation_enabled"] = True
    save_config(config)
    print("Automation enabled. The scraper will run every 12 hours.")

# Disable automation manually
def disable_automation():
    config = load_config()
    config["automation_enabled"] = False
    save_config(config)
    print("Automation disabled.")

# Start the script
if __name__ == "__main__":
    config = load_config()
    
    if config.get("automation_enabled", False):
        schedule_scraper()
    else:
        run_scraper()


Scraping jobs at 2025-03-18 13:16:14...
Saved 10 new jobs. Total unique jobs: 10.
Scraping completed. 10 new jobs attempted to be added.
