In [6]:
import asyncio
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
import re
import os
from typing import Dict
from openai import AzureOpenAI
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# --- Azure OpenAI settings ---
# Credentials are read from the environment so that no secret is stored in the
# notebook. Any key that was previously committed in this cell must be treated
# as leaked and rotated in the Azure portal.
ENDPOINT   = os.environ.get(
    "AZURE_OPENAI_ENDPOINT",
    "https://fausp-mbmvwtiw-eastus2.cognitiveservices.azure.com/",
)
API_KEY    = os.environ.get("AZURE_OPENAI_API_KEY")
DEPLOYMENT = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini-faus")

if not API_KEY:
    # Fail early with an actionable message instead of a later authentication error.
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY is not set; define it in the environment before running this cell."
    )

# Shared chat-completions client used by the summarising helpers below.
client = AzureOpenAI(
    api_version="2024-12-01-preview",
    azure_endpoint=ENDPOINT,
    api_key=API_KEY
)

# Lower-case phrases whose presence in a job description marks it as remote.
REMOTE_KEYWORDS = [
    "remote",
    "remoto",
    "teletrabajo",
    "work from home",
]

# Lower-case phrases whose presence marks the posting as closed or expired.
INACTIVE_KEYWORDS = [
    "no longer accepting applications",
    "job is no longer available",
    "position filled",
    "job expired",
    "vacante cerrada",
]

def extract_keywords(text, keyword_list):
    """Tell whether at least one keyword occurs in ``text``.

    The text is lower-cased before the substring test; the keywords are
    compared exactly as given, so they are expected to be lower-case already.

    Returns:
        bool: True as soon as one keyword is found, False otherwise.
    """
    haystack = text.lower()
    for candidate in keyword_list:
        if candidate in haystack:
            return True
    return False

def extract_responsibilities(text):
    """Collect the bullet-point items of a job description.

    A line counts as a bullet only when its first non-blank character is
    ``-`` or ``•``. Lines that merely contain a hyphen somewhere (for example
    "full-time position") are ignored, so ordinary prose is no longer reported
    as a responsibility.

    Args:
        text: Raw description text with one item per line.

    Returns:
        list[str]: Bullet texts without their marker, keeping only items
        longer than 20 characters (shorter ones are usually headings or noise).
    """
    bullet_start = re.compile(r'^\s*[-•]')
    items = []
    for line in text.split('\n'):
        if not bullet_start.match(line):
            continue
        # Drop surrounding whitespace first so indented bullets lose their marker too.
        item = line.strip().strip('-• ').strip()
        if len(item) > 20:
            items.append(item)
    return items

def extract_skills(text):
    """Return the known skill keywords mentioned in ``text``.

    Keywords are matched case-insensitively as whole words or phrases. Plain
    substring matching would report "r" for almost any text, "excel" for
    "excellent", "git" for "digital" and "java" for "javascript".

    Args:
        text: Job description text.

    Returns:
        list[str]: Matching keywords, each at most once, in the order they
        appear in the internal keyword list (deterministic between runs).
    """
    skill_keywords = [
        "python", "java", "sql", "machine learning", "deep learning", "tensorflow", "pandas",
        "numpy", "docker", "git", "linux", "cloud", "azure", "aws", "scikit", "data analysis",
        "visualization", "nlp", "pytorch", "r", "power bi", "excel", "spark"
    ]
    text_lower = text.lower()
    found = []
    for kw in skill_keywords:
        # Look-arounds instead of \b so the keyword must not be glued to other word characters.
        pattern = r'(?<!\w)' + re.escape(kw) + r'(?!\w)'
        if re.search(pattern, text_lower):
            found.append(kw)
    return found

def extract_education(text):
    """Return the first education keyword mentioned in ``text``.

    Keywords are matched case-insensitively as whole words, so that "bsc"
    is not found inside "subscribe" nor "grado" inside "integrado".

    Args:
        text: Job description text.

    Returns:
        str | None: The first keyword of the internal list (in list order)
        that occurs in the text, or None when none does.
    """
    edu_keywords = ["licenciatura", "grado", "ingeniería", "doctorado", "maestría", "msc", "bsc"]
    text_lower = text.lower()  # lower-case once instead of once per keyword
    for kw in edu_keywords:
        if re.search(r'(?<!\w)' + re.escape(kw) + r'(?!\w)', text_lower):
            return kw
    return None

async def scrape_linkedin_job_async(link: str) -> Dict:
    """
    Scrape title, company, location and description from a LinkedIn job page.

    Every Playwright timeout (navigation included) is caught and logged; the
    affected field is then returned as an empty string. The browser is always
    closed, even when an unexpected error is raised.

    Args:
        link: URL of the job posting.

    Returns:
        Dict with the keys ``title``, ``company``, ``location``, ``desc``
        (raw strings, possibly empty), ``is_remote``, ``is_active`` (bools
        derived from keywords in the description), ``responsibilities``,
        ``skills`` (lists of str) and ``education`` (str or None).
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            try:
                await page.goto(link, timeout=20000)
            except PlaywrightTimeout:
                # Keep going: whatever has loaded so far may still be usable.
                logger.warning(f"Timeout al navegar a {link}")

            # Short generic wait for the heading
            try:
                await page.wait_for_selector("h1", timeout=5000)
            except PlaywrightTimeout:
                logger.warning(f"Timeout al cargar <h1> en {link}")

            # inner_text with a short timeout; returns "" when the element never shows up
            async def safe_inner_text(selector: str) -> str:
                try:
                    # .first avoids the strict-mode error raised when several elements match
                    return await page.locator(selector).first.inner_text(timeout=5000)
                except PlaywrightTimeout:
                    logger.warning(f"Timeout al buscar selector '{selector}' en {link}")
                    return ""

            # Field scraping
            title    = await safe_inner_text("h1")
            company  = await safe_inner_text("a.topcard__org-name-link")
            location = await safe_inner_text("span.topcard__flavor--bullet")

            # Description (may be long)
            try:
                await page.wait_for_selector("div.description__text", timeout=5000)
                desc = await page.locator("div.description__text").first.inner_text(timeout=5000)
            except PlaywrightTimeout:
                logger.warning(f"Timeout al cargar descripción en {link}")
                desc = ""
        finally:
            await browser.close()

    # Text processing (pure functions, no browser needed any more)
    is_remote   = extract_keywords(desc, REMOTE_KEYWORDS)
    is_active   = not extract_keywords(desc, INACTIVE_KEYWORDS)
    responsibilities = extract_responsibilities(desc)
    skills           = extract_skills(desc)
    education        = extract_education(desc)

    return {
        "title": title,
        "company": company,
        "location": location,
        "desc": desc,
        "is_remote": is_remote,
        "is_active": is_active,
        "responsibilities": responsibilities,
        "skills": skills,
        "education": education
    }

def build_summary_prompt(job: Dict) -> str:
    """Compose the user prompt that asks the model for a short job summary.

    Args:
        job: Mapping with the keys ``title``, ``company``, ``location``,
            ``is_remote``, ``responsibilities``, ``skills`` and ``education``
            (the two list-valued entries are joined with ", ").

    Returns:
        The prompt text, without leading or trailing whitespace.
    """
    prompt_lines = [
        "You are an assistant that reads a job posting and produces a concise single-paragraph summary covering the key highlights.",
        "",
        "Job details:",
        f"• Title: {job['title']}",
        f"• Company: {job['company']}",
        f"• Location: {job['location']}",
        "• Remote: " + ("Yes" if job["is_remote"] else "No"),
        "• Responsibilities: " + ', '.join(job['responsibilities']),
        "• Skills: " + ', '.join(job['skills']),
        f"• Education: {job['education']}",
        "",
        "Task:",
        "Write one brief paragraph (2–3 sentences) that summarizes the role (including whether it’s remote), main responsibilities, required skills, and any standout detail. Use plain, professional English without bullet points.",
    ]
    return "\n".join(prompt_lines).strip()

def summarize_job_with_azure(job: Dict) -> str:
    """Ask the Azure OpenAI deployment for a short summary of a scraped job.

    Args:
        job: Job dictionary as accepted by ``build_summary_prompt``.

    Returns:
        The summary text with surrounding whitespace removed, or an empty
        string when the service returns no choice or no text.
    """
    prompt = build_summary_prompt(job)
    resp = client.chat.completions.create(
        model=DEPLOYMENT,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user",   "content": prompt}
        ],
        temperature=0.3,
        max_tokens=150
    )
    if not resp.choices:
        return ""
    # The SDK types `content` as optional; do not crash on None.
    return (resp.choices[0].message.content or "").strip()

async def scrape_and_summarize(link: str) -> Dict:
    """Scrape one job posting and attach its model-written summary.

    Returns:
        Dict with ``"job"`` (the scraped fields) and ``"summary"`` (str).
    """
    scraped = await scrape_linkedin_job_async(link)
    return {"job": scraped, "summary": summarize_job_with_azure(scraped)}


In [7]:
# Pinned to the version this notebook was last run with, for reproducibility.
%pip install googlesearch-python==1.3.0


Collecting googlesearch-python
  Downloading googlesearch_python-1.3.0-py3-none-any.whl.metadata (3.4 kB)
Downloading googlesearch_python-1.3.0-py3-none-any.whl (5.6 kB)
Installing collected packages: googlesearch-python
Successfully installed googlesearch-python-1.3.0


In [8]:
import os
import logging
import requests
from typing import List
from googlesearch import search   # pip install googlesearch-python

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Words meaning "remote work", keyed by language code. Every term of every
# language is OR-ed into each search query.
LANG_REMOTE_TERMS = {
    "es": ["remoto", "teletrabajo"],
    "en": ["remote"],
    "fr": ["télétravail"],
    "de": ["Fernarbeit", "Homeoffice"],
    "pt": ["remoto", "teletrabalho"]
}

def build_search_query(query: str, localization: str, linkedin_only: bool = True) -> str:
    """Build the search-engine query string for a job search.

    The result has the form ``"<query>" ("<localization>" OR "<remote term>" OR ...)``,
    optionally followed by a ``site:`` restriction to LinkedIn job pages.

    Args:
        query: Keywords describing the job; searched as one quoted phrase.
        localization: Place name; it is OR-ed with the remote-work terms.
        linkedin_only: When True, append ``site:linkedin.com/jobs/view``.

    Returns:
        The query string. Terms keep a stable order (localization first, then
        the remote terms in ``LANG_REMOTE_TERMS`` order) with duplicates
        removed, so the same inputs always yield the same query.
    """
    loc_term = f'"{localization}"'
    quoted_remote = [f'"{t}"' for terms in LANG_REMOTE_TERMS.values() for t in terms]
    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would reorder the terms differently on every interpreter run.
    all_terms = list(dict.fromkeys([loc_term] + quoted_remote))
    base = f'"{query}" ({" OR ".join(all_terms)})'
    if linkedin_only:
        base += " site:linkedin.com/jobs/view"
    return base

def verify_url(url: str, session: requests.Session, timeout: float = 5.0) -> bool:
    """Return True when a HEAD request to ``url`` answers with an HTTP status below 400.

    Redirects are followed. Any exception raised while requesting or
    inspecting the response is treated as "not reachable" and yields False.
    """
    try:
        reachable = session.head(url, timeout=timeout, allow_redirects=True).status_code < 400
    except Exception:
        reachable = False
    return reachable

def run_search_no_api(
    query: str,
    localization: str,
    linkedin_only: bool,
    max_urls: int
) -> List[str]:
    """
    Collect up to ``max_urls`` reachable result URLs from googlesearch.search().

    Duplicate URLs are skipped and each remaining URL is kept only if
    ``verify_url`` reports it as reachable.

    Args:
        query: Job keywords, forwarded to ``build_search_query``.
        localization: Place name, forwarded to ``build_search_query``.
        linkedin_only: Restrict the search to LinkedIn job pages.
        max_urls: Maximum number of URLs to return; values <= 0 yield ``[]``.

    Returns:
        The verified URLs in the order the search returned them.
    """
    if max_urls <= 0:
        return []

    q = build_search_query(query, localization, linkedin_only)
    found, seen = [], set()

    # The context manager releases the pooled connections once verification is done.
    with requests.Session() as session:
        # ask for twice as many results to leave room for duplicates and dead links
        for url in search(q, num_results=max_urls * 2, lang="es"):
            if url in seen:
                continue
            seen.add(url)
            if verify_url(url, session):
                found.append(url)
                if len(found) >= max_urls:
                    break

    return found

def search_jobs_multilingual(
    query: str,
    max_urls: int = 10,
    localization: str = "Buenos Aires",
    antiguedad_maxima: str = "semana",
) -> List[str]:
    """
    Search job postings, preferring LinkedIn and falling back to the whole web.

    1) LinkedIn only (site:linkedin.com/jobs/view)
    2) If that yields nothing, the same query without the site restriction

    Note: ``antiguedad_maxima`` is accepted but not used by this function.
    """
    linkedin_hits = run_search_no_api(query, localization, linkedin_only=True, max_urls=max_urls)
    if not linkedin_hits:
        # Nothing on LinkedIn: widen the search to every site.
        logger.info("No se encontraron resultados en LinkedIn; buscando en toda la web...")
        return run_search_no_api(query, localization, linkedin_only=False, max_urls=max_urls)
    return linkedin_hits


In [9]:
%pip install -U sentence-transformers transformers huggingface_hub



Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.33.2-py3-none-any.whl.metadata (14 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.

In [None]:
import os
from openai import AzureOpenAI
from serpapi import Client
import requests
from bs4 import BeautifulSoup
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer, util
from huggingface_hub import login
import json
import re
import nest_asyncio
import requests

# Patch asyncio to permit nested event loops. Presumably required because the
# notebook kernel already runs a loop while test_all() calls asyncio.run().
nest_asyncio.apply()

import asyncio

# ── Azure OpenAI configuration ─────────────────────────────────────────────────
# Credentials are read from the environment so that no secret is stored in the
# notebook. Any key that was previously committed in this cell must be treated
# as leaked and rotated in the Azure portal.
ENDPOINT = os.environ.get(
    "AZURE_OPENAI_ENDPOINT",
    "https://fausp-mbmvwtiw-eastus2.cognitiveservices.azure.com/",
)
API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
DEPLOYMENT = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini-faus")

if not API_KEY:
    # Fail early with an actionable message instead of a later authentication error.
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY is not set; define it in the environment before running this cell."
    )

client = AzureOpenAI(
    api_version="2024-12-01-preview",
    azure_endpoint=ENDPOINT,
    api_key=API_KEY
)

# Prompt sent to the chat model by generate_linkedin_query(). It is a
# str.format template: {cv_text} is the only placeholder and the literal braces
# of the JSON example are doubled. The model is asked to answer with a JSON
# object holding "area_job" (search keywords) and "cv_information" (the résumé
# rewritten without personal data).
# NOTE(review): steps 1-3 ask for name, email and phone, but the requested
# output structure has no fields for them — confirm whether they are needed.
PROMPT_TEMPLATE = """
You will receive a resume in raw text format. Your task is to:

1. Extract the candidate’s name.
2. Extract the candidate’s email address.
3. Extract the candidate’s phone number.
4. Analyze the candidate’s background (education, skills, experience, roles, industries, etc.) and craft **one concise keyword search string** (single line) for LinkedIn Jobs.
5. Extract a cleaned and structured version of the resume that removes any personal contact information (name, email, phone, address, LinkedIn, etc.).

For de fourth part:
- Contain only the most relevant **job title(s)** plus 3-5 **key skills / industry terms**, all separated by single spaces.
- **DO NOT** use Boolean operators (AND, OR, NOT), parentheses, quotation marks, plus signs, or any other special characters.
- **DO NOT** include personal identifiers (name, email, phone, etc.).
- Write it in the same language that predominates in the résumé (Spanish or English).
- Remember: the calling function will later append `site:linkedin.com/jobs`; you only output the keywords.

For the fifth part:
- Do NOT summarize or omit key content.
- Instead, preserve as much of the original job-related information as possible.
- Reorganize and rephrase disconnected items into full sentences with proper structure and connectors (e.g., “The candidate worked at...”, “They were responsible for...”, “Their skills include...”).
- You may rewrite bullet points and lists as prose, but keep all relevant details intact.
- Do NOT include any personal identifiers or contact information.
- Imagine you are preparing the resume for analysis by an AI model – you want to keep the full context but make it more readable.

You may use the following fields **only if present** in the text:
- Career Objective
- Skills
- Institution
- Degree
- Results
- Field of Study
- Companies
- Job Skills
- Positions
- Responsibilities
- Organizations
- Roles
- Languages
- Proficiency
- Certifications

Respond **only** with a valid JSON object, without additional text or explanations


Exact structure of the output:

{{
"area_job":"...",
"cv_information":"..."
}}

CV TEXT:
{cv_text}
"""

def safe_json_load(content: str):
    """Parse JSON text, repairing output that was cut off before its end.

    The text is parsed as-is first, so valid JSON whose string values happen
    to contain braces is never altered. Only when that fails and there are
    more ``{`` than ``}`` are two repairs attempted, in this order:

    1. close an unterminated string and the missing braces (``"`` + ``}``...);
    2. append only the missing braces (text cut after a complete value).

    Args:
        content: JSON text, possibly truncated.

    Returns:
        The decoded Python object.

    Raises:
        json.JSONDecodeError: If the text cannot be parsed nor repaired.
    """
    try:
        return json.loads(content)
    except json.JSONDecodeError as first_error:
        missing = content.count('{') - content.count('}')
        if missing <= 0:
            raise
        closers = '}' * missing
        for candidate in (content + '"' + closers, content + closers):
            try:
                return json.loads(candidate)
            except json.JSONDecodeError:
                continue
        # No repair worked: report the problem found in the original text.
        raise first_error


def generate_linkedin_query(cv_text: str, max_tokens: int = 2048) -> list:
    """Ask the model for search keywords and an anonymised rewrite of a résumé.

    Args:
        cv_text: Raw résumé text.
        max_tokens: Completion budget. The prompt requests a full rewrite of
            the résumé, which needs far more than a couple of hundred tokens;
            a small budget cuts the JSON answer off in the middle.

    Returns:
        A two-item list ``[area_job, cv_information]``; an item is ``""`` when
        the model's JSON lacks the corresponding key.

    Raises:
        json.JSONDecodeError: If the answer is not JSON and cannot be repaired
            by ``safe_json_load``.
    """
    prompt = PROMPT_TEMPLATE.format(cv_text=cv_text)
    response = client.chat.completions.create(
        model=DEPLOYMENT,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
        max_tokens=max_tokens
    )
    # The SDK types `content` as optional; treat None as empty text.
    content = (response.choices[0].message.content or "").strip()
    # Remove a Markdown code fence (``` or ```json) wrapped around the JSON, if any.
    if content.startswith("```"):
        content = content.removeprefix("```json").removeprefix("```").removesuffix("```").strip()

    data = safe_json_load(content)
    area_job = data.get("area_job", "")
    cv_information = data.get("cv_information", "")

    return [area_job, cv_information]


# ── SerpApi Search ──────────────────────────────────────────────────────────────
SERPAPI_API_KEYS = [
    "1a992f2a6dbaed0c95203a2ed73768f29b4b7f423a5f218c4834e355c5c31918" # tizi
    # "e898c7f95cdb5692528a009eb2ee7d08d24a2f37c22f9623a065a96fd6072892", # mati
    # "141d74f945c81589527847238881362de5f08cc31dae86209dcb2c04d7e5ccc7", # faus
    # "c18acdb7b9b75162b53059cd6f094669c33323e898758841176351ac8a59e8c7" # giaco
]

def pdf_a_string(ruta_pdf):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        ruta_pdf: Path of the PDF file to read.

    Returns:
        str: The extracted text ("" for a document without pages).
    """
    # The context manager closes the document even if text extraction fails.
    with fitz.open(ruta_pdf) as doc:
        return "".join(pagina.get_text() for pagina in doc)

def scrape_job_pages(urls: list) -> dict:
    """Download each URL and map it to the visible text of its HTML.

    A failure for one URL does not stop the others: its entry becomes the
    string ``"Error: <exception>"`` instead of the page text.

    Returns:
        dict: ``{url: text_or_error_message}`` for every URL given.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }

    def _fetch_text(target):
        # Fetch one page and flatten its markup into space-separated text.
        reply = requests.get(target, headers=request_headers, timeout=10)
        reply.raise_for_status()
        return BeautifulSoup(reply.text, "html.parser").get_text(separator=" ", strip=True)

    results = {}
    for target in urls:
        try:
            results[target] = _fetch_text(target)
        except Exception as err:
            results[target] = f"Error: {err}"
    return results

def build_job_description_prompt(job):
    """Compose the prompt asking the model to write a job description.

    Args:
        job: Mapping that may hold ``title``, ``company``, ``location``,
            ``education`` (strings) and ``responsibilities``, ``skills``
            (lists of strings). Missing entries are rendered as empty.

    Returns:
        str: The prompt text, including its leading and trailing whitespace.
    """
    position = job.get('title', '')
    employer = job.get('company', '')
    place = job.get('location', '')
    schooling = job.get('education', '')
    duties = ', '.join(job.get('responsibilities', []))
    abilities = ', '.join(job.get('skills', []))
    return f"""
    You are given the following job information:

    - Job Position: "{position}"
    - Company: "{employer}"
    - Location: "{place}"
    - Education: "{schooling}"
    - Responsibilities: {duties}
    - Skills: {abilities}

    Task:
    Generate a SINGLE, well-written `job_description` in plain, professional English.
    - Use the position, company, location, education, responsibilities, and skills provided.
    - Maintain a natural, human-readable style.
    - Do NOT invent any details beyond what's given.

    Example format:
    "The position is for a [Job Position] at [Company], located in [Location], requiring a candidate with an educational background in [Education]. The role involves key responsibilities such as [Responsibility 1], [Responsibility 2], and [Responsibility N]. The required skills for this role include [Skill 1], [Skill 2], and [Skill N]."

    Now generate the `job_description`.
    """




def filter_url(list_jobs):
    """Generate one model-written job description per scraped job.

    Despite its name, nothing is filtered: each job is turned into a prompt by
    ``build_job_description_prompt`` and sent to the chat deployment.

    Args:
        list_jobs: Sequence of job dictionaries.

    Returns:
        list: The message content of the first choice of each completion,
        in the same order as ``list_jobs``.
    """
    descriptions = []
    for job in list_jobs:
        chat_messages = [
            {"role": "system", "content": "You are a job filtering and summarizing assistant."},
            {"role": "user", "content": build_job_description_prompt(job)}
        ]
        completion = client.chat.completions.create(
            model=DEPLOYMENT,  # or whichever deployment exists in your Azure resource
            temperature=0.3,
            max_tokens=800,
            messages=chat_messages
        )
        descriptions.append(completion.choices[0].message.content)
    return descriptions


def predict(cv_desc, jb_desc, model):
    """Score how similar a résumé text and a job description are.

    Both texts are encoded with ``model.encode(..., convert_to_tensor=True)``
    and compared with ``util.cos_sim``, whose result is returned unchanged.
    """
    cv_vec, job_vec = (
        model.encode(text, convert_to_tensor=True) for text in (cv_desc, jb_desc)
    )
    return util.cos_sim(cv_vec, job_vec)

def test_all(cv_ruth: str, ruth_model, max_urls):
    """Run the whole pipeline for one résumé and print the best-matching job.

    Steps: read the PDF, derive search keywords and an anonymised résumé with
    the chat model, search job links, scrape and summarise each one, generate
    a description per job, score each description against the résumé and
    print the highest-scoring posting.

    Args:
        cv_ruth: Path of the résumé PDF.
        ruth_model: Name or path passed to ``SentenceTransformer``.
        max_urls: Maximum number of job links to evaluate.

    Returns:
        None. Results are printed. When the search finds no links the function
        says so and stops instead of failing further down.
    """
    model = SentenceTransformer(ruth_model)
    cv_text = pdf_a_string(cv_ruth)
    area_job, cv_information = generate_linkedin_query(cv_text)

    print(f"Generated Query: {area_job}")
    print(f"Generated cv: {cv_information}")

    links = search_jobs_multilingual(area_job, max_urls, localization="Buenos Aires", antiguedad_maxima='mes')

    print("Links encontrados:",links)

    if not links:
        # Without links there is nothing to scrape or score.
        print("No se encontraron ofertas; no hay nada que puntuar.")
        return

    data_job = []
    summary_dob = []

    for url in links:
        content_map = asyncio.run(scrape_and_summarize(url))
        data_job.append(content_map["job"])
        summary_dob.append(content_map["summary"])

    # Show the last scraped posting (safe here: `links` is known to be non-empty).
    print(content_map["job"])
    print("ahora si")
    print(content_map["summary"])
    result_filter = filter_url(list_jobs=data_job)

    # Convert each similarity to a plain float so the scores compare and print cleanly.
    score_cv_jd = [float(predict(cv_information, job_desc, model)) for job_desc in result_filter]

    # Index of the first highest score.
    max_index = max(range(len(score_cv_jd)), key=score_cv_jd.__getitem__)
    max_score = score_cv_jd[max_index]

    print("El maximo puesto tiene un score de:", max_score)
    print("Link del puesto:", links[max_index])
    print("Resumen del puesto:", summary_dob[max_index])

# https://in.linkedin.com/jobs/view/ai-ml-engineer-at-oneseven-tech-ost-4241281880
# bueno

In [None]:
# Run the full pipeline on one résumé with the fine-tuned similarity model.
test_all(
    cv_ruth="/content/drive/MyDrive/ApplAI/Curriculum_Tiziano_Martin.pdf",
    ruth_model="/content/drive/MyDrive/ApplAI/modelsSave/mini_finetuned_Allmini",
    max_urls=2,
)