In [1]:
import asyncio
import json
import os
import re
import threading
import time
from typing import List, Dict
from urllib.parse import quote

import google.generativeai as genai
import nest_asyncio
import requests
import uvicorn
from apify_client import ApifyClient
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.concurrency import run_in_threadpool
from pydantic import BaseModel

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Initialize the ApifyClient. The token is read from the APIFY_TOKEN environment
# variable (optionally provided through a local .env file) instead of being
# hardcoded in the notebook; an empty string is used when it is not set.
load_dotenv()
client = ApifyClient(os.environ.get("APIFY_TOKEN", ""))

class JobQuery(BaseModel):
    """Request body for a job search: the candidate's desired job profile.

    All fields are free-form strings. They are passed to the scrapers and,
    as a whole, to the LLM that filters the scraped postings.
    """

    # Job title to search for, e.g. "Full Stack Engineer".
    position: str
    # Desired experience, e.g. "2 years".
    experience: str
    # Desired salary range, e.g. "70,000 PKR to 120,000 PKR".
    salary: str
    # Work arrangement, e.g. "Remote"; scrape_linkedin maps it to a workType code.
    jobNature: str
    # Location, e.g. "Lahore, Pakistan"; scrapers use the part before the first comma.
    location: str
    # Comma-separated skill list, e.g. "MERN, Node.js, React.js".
    skills: str

In [3]:
def scrape_linkedin(query):
    """Scrape LinkedIn job postings for ``query`` via the Apify actor.

    Args:
        query: Object exposing ``position``, ``location`` and ``jobNature``
            string attributes (a ``JobQuery``).

    Returns:
        list[dict]: One dict per dataset item, restricted to the keys in
        ``data_to_keep`` that the item actually contains.
    """
    # workType codes sent to the actor, keyed by normalized job nature.
    nature_map = {
        "onsite": "1",
        "remote": "2",
        "hybrid": "3",
    }

    # Normalize so that " Remote ", "on-site" and "On Site" are all recognized.
    nature_key = query.jobNature.strip().lower().replace("-", "").replace(" ", "")
    work_type = nature_map.get(nature_key)

    run_input = {
        "title": query.position,
        # Only the part before the first comma is searched ("Lahore, Pakistan" -> "Lahore").
        "location": query.location.split(',')[0],
        "publishedAt": "",
        "rows": 15,
        "proxy": {
            "useApifyProxy": True,
            "apifyProxyGroups": ["RESIDENTIAL"],
        },
    }
    # Previously an unrecognized job nature was sent as workType "0", which is
    # not one of the mapped codes; the filter is now left out instead.
    # NOTE(review): assumes the actor treats a missing workType as "no filter" - confirm.
    if work_type is not None:
        run_input["workType"] = work_type

    print(run_input)

    # Run the Actor and wait for it to finish
    run = client.actor("BHzefUZlZRKWxkTck").call(run_input=run_input)
    extracted_jobs = []

    data_to_keep = ['salary', 'title', 'jobUrl', 'companyName', 'location', 'experienceLevel', 'description', 'contractType']
    for item in client.dataset(run["defaultDatasetId"]).iterate_items():
        job_info = {key: item[key] for key in data_to_keep if key in item}
        print(job_info)
        extracted_jobs.append(job_info)

    return extracted_jobs

In [4]:
def scrape_indeed(query):
    """Scrape Indeed job postings for ``query`` via the Apify actor.

    Args:
        query: Object exposing ``position`` and ``location`` string attributes.

    Returns:
        list[dict]: One dict per dataset item, holding only the wanted fields
        that are present on that item.
    """
    wanted_fields = ('salary', 'positionName', 'url', 'company', 'location', 'experienceLevel', 'description', 'jobType')

    # Actor input; only the text before the first comma of the location is used.
    city, _, _ = query.location.partition(',')
    run_input = {
        "position": query.position,
        "country": "PK",
        "location": city,
        "maxItems": 15,
        "parseCompanyDetails": False,
        "saveOnlyUniqueItems": True,
        "followApplyRedirects": False,
    }
    print(run_input)

    # Blocks until the actor run has finished.
    actor_run = client.actor("hMvNSpz3JnHgl5jkh").call(run_input=run_input)
    dataset_items = client.dataset(actor_run["defaultDatasetId"]).iterate_items()

    jobs = []
    for record in dataset_items:
        trimmed = {}
        for field in wanted_fields:
            if field in record:
                trimmed[field] = record[field]
        print(trimmed)
        jobs.append(trimmed)

    return jobs

In [5]:
def _format_salary(amount, currency):
    """Return ``"<amount> <currency>"`` or ``"N/A"`` when ``amount`` is None."""
    if amount is None:
        return "N/A"
    return f"{amount} {currency or ''}".strip()


def extract_jobs_from_html(html, max_jobs=15):
    """Extract job postings from a Rozee.pk search-results page.

    The function looks for a ``<script>`` tag containing a
    ``var apResp = {...}`` assignment, decodes the assigned JSON object and
    reads the postings from ``response -> jobs -> basic``.

    Args:
        html: HTML text of the search-results page.
        max_jobs: Maximum number of postings to return (``None`` for all).

    Returns:
        list[dict]: Normalized postings with the keys ``job_title``,
        ``company``, ``min_experience_text``, ``max_experience_text``,
        ``jobNature``, ``location``, ``salary_starting_limit``,
        ``salary_ending_limit`` and ``apply_link``. An empty list is returned
        when the payload is missing or cannot be decoded.
    """
    soup = BeautifulSoup(html, 'html.parser')
    assignment = re.compile(r"var apResp\s*=\s*(?=\{)")
    decoder = json.JSONDecoder()

    data = None
    for script in soup.find_all('script'):
        text = script.string
        if not text:
            continue
        match = assignment.search(text)
        if not match:
            continue
        try:
            # raw_decode reads exactly one JSON value starting at the "{", so a
            # "};" inside a string value can no longer cut the payload short
            # (the previous non-greedy regex stopped at the first "};").
            data, _ = decoder.raw_decode(text, match.end())
        except json.JSONDecodeError as e:
            print("JSON decoding failed:", e)
            return []
        break

    if data is None:
        print("Couldn't find apResp data in HTML")
        return []

    # Walk response -> jobs -> basic, tolerating null or missing levels.
    response = data.get("response")
    jobs = response.get("jobs") if isinstance(response, dict) else None
    job_list = jobs.get("basic") if isinstance(jobs, dict) else None
    if not isinstance(job_list, list):
        job_list = []

    extracted_jobs = []
    for job in job_list[:max_jobs]:
        if not isinstance(job, dict):
            continue

        # city_exact may be missing, null or a single string; normalize to a list
        # so a string is not joined character by character.
        cities = job.get("city_exact") or []
        if isinstance(cities, str):
            cities = [cities]
        location_parts = [str(city) for city in cities if city]
        if job.get("country"):
            location_parts.append(str(job["country"]))

        currency = job.get("currency_unit")
        job_info = {
            "job_title": job.get("title", ""),
            "company": job.get("company", ""),
            "min_experience_text": job.get("experience_text", ""),
            "max_experience_text": job.get("max_experience_text", ""),
            "jobNature": job.get("type", ""),
            # Joining the parts avoids a leading ", " when no city is listed.
            "location": ", ".join(location_parts),
            "salary_starting_limit": _format_salary(job.get("salaryN_exact"), currency),
            "salary_ending_limit": _format_salary(job.get("salaryT_exact"), currency),
            "apply_link": "https://www.rozee.pk/" + (job.get("rozeePermaLink") or ""),
        }
        extracted_jobs.append(job_info)

    return extracted_jobs


def scrape_rozee_pk(query):
    """Fetch the Rozee.pk search page for ``query.position`` and parse its jobs.

    Args:
        query: Object exposing a ``position`` string attribute.

    Returns:
        list[dict]: Postings as returned by ``extract_jobs_from_html``; an
        empty list when the HTTP request fails.
    """
    # Percent-encode the search term. The previous replace(' ', '%') produced
    # malformed escapes such as "Full%Stack%Engineer".
    search_term = quote(query.position.strip(), safe="")
    url = f"https://www.rozee.pk/job/jsearch/q/{search_term}"
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        # A timeout keeps a stalled connection from hanging the caller forever.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print("Rozee request failed:", e)
        return []

    return extract_jobs_from_html(response.text)

In [6]:
# query = JobQuery(
#     position="Full Stack Engineer",
#     experience="2 years",
#     salary="70,000 PKR to 120,000 PKR",
#     jobNature="Remote",
#     location="Lahore, Pakistan",
#     skills="full stack, MERN, Node.js, Express.js, React.js, Next.js, Firebase, TailwindCSS, CSS Frameworks, Tokens handling"
# )

# rozee_data = scrape_rozee_pk(query)
# print(rozee_data)

In [7]:
# query = JobQuery(
#     position="Full Stack Engineer",
#     experience="2 years",
#     salary="70,000 PKR to 120,000 PKR",
#     jobNature="Remote",
#     location="Lahore, Pakistan",
#     skills="full stack, MERN, Node.js, Express.js, React.js, Next.js, Firebase, TailwindCSS, CSS Frameworks, Tokens handling"
# )

# indeed_data = scrape_indeed(query)
# print(indeed_data)

In [8]:
# query = JobQuery(
#     position="Full Stack Engineer",
#     experience="2 years",
#     salary="70,000 PKR to 120,000 PKR",
#     jobNature="Remote",
#     location="Lahore, Pakistan",
#     skills="full stack, MERN, Node.js, Express.js, React.js, Next.js, Firebase, TailwindCSS, CSS Frameworks, Tokens handling"
# )

# linkedin_data = scrape_linkedin(query)
# print(linkedin_data)

In [None]:
import os
import json
import google.generativeai as genai

# Set API key: take it from the environment (or a local .env file) rather than
# hardcoding it. setdefault keeps an already-configured key instead of
# overwriting it with an empty string, while still guaranteeing the variable exists.
load_dotenv()
os.environ.setdefault("GOOGLE_API_KEY", "")

def ask_llm(job_data, extracted_data):
    """Ask Gemini which scraped jobs are relevant to the requested profile.

    Args:
        job_data: The requested job profile (a JSON-serializable mapping).
        extracted_data: The scraped postings (a JSON-serializable list).

    Returns:
        str: The raw text of the model's reply, or an empty string when the
        reply carries no text.
    """
    genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
    model = genai.GenerativeModel("models/gemini-1.5-pro-latest")

    # Convert to JSON strings; default=str keeps values that json cannot encode
    # natively from aborting the whole request.
    job_data_str = json.dumps(job_data, indent=2, default=str)
    extracted_data_str = json.dumps(extracted_data, indent=2, default=str)

    # The ```json fence around the example is closed below; it was previously
    # left open, so the rest of the prompt read as part of the code block.
    prompt = f"""
You are an intelligent job filtering assistant.

You will receive:

1. A required job description (position, experience, salary range, job nature, location, required skills).
2. A list of available jobs in JSON format. Each job has fields like title, company, experienceLevel, salary, location, jobType, jobUrl, etc.

Your task:

- Carefully analyze EVERY available job.
- Compare each one against the required job description.
- Identify ALL jobs that are strongly relevant — meaning they match or closely align with the required role, experience, salary, location, job nature, and skills.
- Return **ALL** the relevant jobs (not just one or a few) in the EXACT JSON format shown below.
- Ensure all matching jobs are returned in a valid single JSON object under the key "relevant_jobs". No commentary.


Strict Output Format:
```json
{{
  "relevant_jobs": [
    {{
      "job_title": "Full Stack Engineer",
      "company": "XYZ Pvt Ltd",
      "experience": "2+ years",
      "jobNature": "onsite",
      "location": "Islamabad, Pakistan",
      "salary": "100,000 PKR",
      "apply_link": "https://linkedin.com/job123"
    }},
    {{
      "job_title": "...",
      ...
    }}
  ]
}}
```

---

Given Job Data:
{job_data_str}

---

Available Jobs:
{extracted_data_str}
"""

    response = model.generate_content(prompt)
    try:
        return response.text
    except ValueError as e:
        # NOTE(review): per the google-generativeai docs, `.text` raises
        # ValueError when the response has no text part (e.g. it was blocked).
        print("LLM returned no text:", e)
        return ""


In [10]:
# all_jobs = [*linkedin_data, *indeed_data, *rozee_data]

# print("\n\n📋 ALL JOBS")
# for job in all_jobs:
#     print(json.dumps(job, indent=2))

# relevant_jobs = ask_llm(query.__dict__, all_jobs)


# print("\n\n✅ RELEVANT JOBS")
# print(relevant_jobs)

In [11]:
def restructure_data(llm_response):
    """Extract the ``relevant_jobs`` list from the LLM's raw text reply.

    The reply may wrap the JSON object in a Markdown fence or surround it with
    commentary, so the text is scanned for the first ``{`` that starts a
    decodable JSON object.

    Args:
        llm_response: Raw text returned by the LLM.

    Returns:
        list: The value of ``relevant_jobs`` when it is a list; otherwise an
        empty list (no JSON object found, key missing, or value not a list).
    """
    if not isinstance(llm_response, str):
        print("Error parsing LLM output:", "response is not a string")
        return []

    decoder = json.JSONDecoder()
    start = llm_response.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the first complete JSON value, so
            # trailing text containing braces no longer corrupts the parse
            # (the previous greedy regex captured up to the last "}").
            parsed, end = decoder.raw_decode(llm_response, start)
        except json.JSONDecodeError:
            start = llm_response.find("{", start + 1)
            continue

        if isinstance(parsed, dict):
            jobs = parsed.get("relevant_jobs", [])
            # Always return a list, even if the model emitted null or a scalar.
            return jobs if isinstance(jobs, list) else []
        start = llm_response.find("{", end)

    print("Error parsing LLM output:", "no JSON object found")
    return []


In [12]:
app = FastAPI()


@app.post("/search_jobs")
def search_jobs(query: JobQuery):
    """Scrape all job sources for ``query`` and return the LLM-filtered postings.

    Each source is scraped in turn (LinkedIn, Indeed, Rozee), the combined
    postings are sent to the LLM, and the ``relevant_jobs`` list parsed from
    its reply is returned.
    """
    sources = (
        ("LinkedIn:", scrape_linkedin),
        ("Indeed:", scrape_indeed),
        ("Rozee:", scrape_rozee_pk),
    )

    # Scrape and log one source at a time, accumulating results in source order.
    all_jobs = []
    for label, scrape in sources:
        source_jobs = scrape(query)
        print(label, source_jobs)
        all_jobs.extend(source_jobs)

    print("all_jobs:\n\n\n")
    print(all_jobs)

    llm_reply = ask_llm(query.__dict__, all_jobs)
    print("relevent_jobs:\n\n\n")
    print(llm_reply)

    return restructure_data(llm_reply)

In [None]:
def run_server(host="127.0.0.1", port=8000):
    """Serve ``app`` with uvicorn on ``host``:``port``.

    The call blocks until the server stops, so run it in a background thread
    when starting it from a notebook cell.
    """
    uvicorn.run(app, host=host, port=port)


# NOTE(review): nest_asyncio is applied so uvicorn can start an event loop
# inside the notebook process - confirm it is still required.
nest_asyncio.apply()
# Reuse run_server instead of a duplicate lambda; the notebook serves on 8001.
# daemon=True lets the kernel exit without waiting for the server thread.
thread = threading.Thread(target=run_server, kwargs={"port": 8001}, daemon=True)

thread.start()

INFO:     Started server process [25084]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8001 (Press CTRL+C to quit)


INFO:     127.0.0.1:60558 - "GET /docs HTTP/1.1" 200 OK
INFO:     127.0.0.1:60558 - "GET /openapi.json HTTP/1.1" 200 OK
{'title': 'Full Stack Engineer', 'location': 'Lahore', 'publishedAt': '', 'rows': 15, 'proxy': {'useApifyProxy': True, 'apifyProxyGroups': ['RESIDENTIAL']}, 'workType': '1'}
{'salary': '', 'title': 'Full Stack Engineer', 'jobUrl': 'https://pk.linkedin.com/jobs/view/full-stack-engineer-at-taxgpt-4199391427?trk=public_jobs_topcard-title', 'companyName': 'TaxGPT', 'location': 'Lahore, Punjab, Pakistan', 'experienceLevel': 'Entry level', 'description': "Engineering\n\nFull Stack Engineer\n\nJoin us as a Full Stack Engineer at our Lahore office. Design and develop innovative tax tech solutions for professionals and businesses.\n\nLahore / Pakistan\n\nFull time\n\nApply now\n\nTaxGPT, a cutting-edge Silicon Valley-based firm, is revolutionizing the tax consultation space with its AI-driven solutions. With the backing of some of the world's most prominent venture capitalists