In [None]:
import base64
import csv
import os
import time

import pandas as pd
import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Reed API key.
# SECURITY: a credential committed to source should be treated as leaked.
# Supply the key through the REED_API_KEY environment variable; the literal
# below is kept only as a backward-compatible fallback and should be rotated
# and removed from the file.
api_key = os.environ.get("REED_API_KEY") or "532d0640-6b26-4083-974c-1bbd034ae1df"

# Build the HTTP Basic credential: base64("<key>:"), i.e. the key as the
# username followed by an empty password.
encoded_api_key = base64.b64encode(f"{api_key}:".encode()).decode()

# Define API endpoint
url = "https://www.reed.co.uk/api/1.0/search"

# Keywords to search; one paginated API query is issued per entry.
search_keywords = [
    "Data Analyst", "Data Scientist", "Business Intelligence Analyst", "BI Analyst", "Machine Learning Engineer",
    "AI Engineer", "Data Engineer", "Analytics Consultant", "Big Data Analyst", "SQL Analyst", "Power BI Developer"
]

# Set headers with authentication
headers = {"Authorization": f"Basic {encoded_api_key}"}

# File path for saving job data in CSV format.
# NOTE: this is an absolute, machine-specific path; adjust it before running
# on another machine.
csv_path = "/Users/kirancorreya/Downloads/reed_jobs_full_description.csv"

# Step 1: query the Reed search API once per keyword, page through the
# results, and write one CSV row per unique job URL.
with open(csv_path, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Job Title", "Company", "Location", "Minimum Salary", "Maximum Salary", "Contract Type", "Job Type", "Date Posted", "Job URL", "Job Description", "Applications Count", "Industry", "Experience Level", "Search Keyword"])

    seen_jobs = set()  # Job URLs already written, shared across all keywords

    for keyword in search_keywords:
        print(f"Searching for: {keyword}")
        # NOTE(review): Reed's documented search parameter appears to be
        # `locationName`; confirm whether `location` is actually honoured.
        params = {"keywords": keyword, "location": "United Kingdom", "resultsToTake": 100}
        results_skipped = 0

        while True:
            params["resultsToSkip"] = results_skipped
            try:
                # Without a timeout a stalled connection would block forever.
                response = requests.get(url, headers=headers, params=params, timeout=30)
            except requests.RequestException as exc:
                # Treat transport failures like a bad status: report and move
                # on to the next keyword instead of aborting the whole run.
                print(f"Error: request failed for '{keyword}' - {exc}")
                break

            if response.status_code != 200:
                print(f"Error: {response.status_code} - {response.text}")
                break

            job_data = response.json().get("results", [])
            if not job_data:
                break  # An empty page means this keyword is exhausted

            for job in job_data:
                job_url = job.get("jobUrl", "N/A")
                if job_url in seen_jobs:
                    continue  # Skip duplicates
                seen_jobs.add(job_url)

                # The key may be present with a null value, so `.get` with a
                # default is not enough to guarantee a string here.
                description = job.get("jobDescription") or "N/A"

                writer.writerow([
                    job.get("jobTitle", "N/A"),
                    job.get("employerName", "N/A"),
                    job.get("locationName", "N/A"),
                    job.get("minimumSalary", "N/A"),
                    job.get("maximumSalary", "N/A"),
                    job.get("contractType", "N/A"),
                    job.get("jobType", "N/A"),
                    job.get("date", "N/A"),
                    job_url,
                    description.replace("\n", " ").replace(",", " "),
                    job.get("applications", "N/A"),
                    job.get("industry", "N/A"),
                    job.get("experienceRequired", "N/A"),
                    keyword,
                ])

            # Advance by the number of rows actually returned on this page.
            results_skipped += len(job_data)
            time.sleep(2)  # Avoid hitting API rate limits

print(f"Job search completed. Data saved to {csv_path}")

# ---------------------------------------------
# Step 2: Extract Full Job Descriptions with Selenium
# ---------------------------------------------
# Visit every job URL saved in Step 1, scrape the full description text from
# the page, and merge it back into the CSV as "Full Job Description".
options = webdriver.SafariOptions()
# NOTE(review): safaridriver is not known to support a headless mode; confirm
# that this flag has any effect.
options.add_argument('--headless')  # Run in headless mode

driver = webdriver.Safari(options=options)

try:
    df = pd.read_csv(csv_path)

    results = []

    # pandas parses the "N/A" placeholder as a missing value by default, so
    # rows without a real URL are dropped here instead of being sent to the
    # browser.
    for job_url in df['Job URL'].dropna():
        print(f"Accessing job: {job_url}")

        try:
            driver.get(job_url)
            time.sleep(3)

            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'div.job-details-container_jobDetailsContainer__V1Mtj'))
                )
            except TimeoutException:
                print(f"Job details container not found for {job_url}")
                continue

            # Try the primary selector first, then the fallback layout; keep
            # the placeholder when neither element exists on the page.
            job_description = "Description not found"
            for selector in (
                'div.job-details_jobDescription__1dErB',
                'div.description span[itemprop="description"]',
            ):
                try:
                    job_description = driver.find_element(By.CSS_SELECTOR, selector).text
                except NoSuchElementException:
                    continue
                break

            results.append([job_url, job_description])

        except Exception as e:
            # Best-effort scrape: report the failure and carry on with the
            # next URL.
            print(f"Error accessing job: {job_url}\n{e}")
finally:
    # Always end the browser session, even on an interrupt or an unexpected
    # error.
    driver.quit()

df_descriptions = pd.DataFrame(results, columns=["Job URL", "Full Job Description"])
# Left merge keeps every Step 1 row; jobs that were not scraped get a missing
# value in the new column.
df_final = df.merge(df_descriptions, on="Job URL", how="left")
df_final.to_csv(csv_path, index=False)

print(f"Full job descriptions extracted and updated in {csv_path}")
