In [1]:
import requests
from dotenv import load_dotenv
import os
import pandas as pd
from huggingface_hub import login
from langchain.agents import load_tools
from langchain.agents import initialize_agent
from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
from pydantic import BaseModel, Field


load_dotenv()


  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
login(os.getenv("HF_TOKEN"))

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
# Search filters for the job query. This dict is sent as the JSON body of the
# POST request issued in the next cell.
# NOTE(review): the per-key meanings noted below are inferred from the key names;
# confirm them against the API documentation.
data = {
  "include_total_results": False,
  # Sort on the `date_posted` field with `desc` enabled.
  "order_by": [
    {
      "desc": True,
      "field": "date_posted"
    }
  ],
  # Presumably limits results to postings at most 15 days old — TODO confirm.
  "posted_at_max_age_days": 15,
  "job_country_code_or": [
    "IT"
  ],
  "job_title_or": [
    "Data Scientist",
    "AI Engineer"
  ],
  "remote": True,
  "easy_apply": True,
  # Presumably the first page with at most 10 results — TODO confirm.
  "page": 0,
  "limit": 10,
  "blur_company_data": False
}

In [4]:
# Load the job postings into `df_jobs`: read the local CSV when it exists,
# otherwise query the job-search endpoint with the `data` payload.
url = "https://api.theirstack.com/v1/jobs/search"
# Single definition of the CSV location (previously repeated inline).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
jobs_csv_path = "/home/giuseppe/projetos-pessoais/linkedin-bot/data/raw/jobs_1.csv"

headers = {
    "Accept": "application/json",
    "Content-Type": "application/json",
    "Authorization": f"Bearer {os.getenv('their_stack_api_key')}",
}

if os.path.exists(jobs_csv_path):
    df_jobs = pd.read_csv(jobs_csv_path)
else:
    # Without a key the header above would read "Bearer None"; fail early with a
    # clear message instead of sending an unauthenticated request.
    if not os.getenv("their_stack_api_key"):
        raise RuntimeError("Environment variable 'their_stack_api_key' is not set.")
    # The timeout keeps the cell from hanging indefinitely on a stalled connection.
    response = requests.post(url, headers=headers, json=data, timeout=30)
    # Surface HTTP errors here rather than as a confusing KeyError on "data" below.
    response.raise_for_status()
    # Decoded response payload (variable name kept from the original cell).
    request = response.json()
    df_jobs = pd.DataFrame(request["data"])

In [5]:
df_jobs.loc[0, "description"]

"### **Accepted Locations**\n\n\nWe accept applicants from the US, Canada, and most countries in LATAM and Europe. We are also accepting candidates from some countries in Africa and Asia. For the complete list of accepted locations, click here. This work is 100% remote.\n\n**Loom Video**\n\n\nOur Founder/CEO, Gabe Greenberg, created an in\\-depth Loom video that we highly recommend you watch! Check it out here: Loom Video\n\n**Overview**\n\n\nJoin our expert annotation team to create training data for the world's most advanced AI models. No previous AI experience is necessary. You'll get your foot in the door with one of the most prominent players in the AI/LLM space today. We're primarily seeking JavaScript/React developers with 3\\+ years of experience to train large AI language models, helping cutting\\-edge generative AI models write better frontend code. Projects typically include discrete, highly variable problems that involve engaging with these models as they learn to code. We 

In [6]:
print(df_jobs.description[0])

### **Accepted Locations**


We accept applicants from the US, Canada, and most countries in LATAM and Europe. We are also accepting candidates from some countries in Africa and Asia. For the complete list of accepted locations, click here. This work is 100% remote.

**Loom Video**


Our Founder/CEO, Gabe Greenberg, created an in\-depth Loom video that we highly recommend you watch! Check it out here: Loom Video

**Overview**


Join our expert annotation team to create training data for the world's most advanced AI models. No previous AI experience is necessary. You'll get your foot in the door with one of the most prominent players in the AI/LLM space today. We're primarily seeking JavaScript/React developers with 3\+ years of experience to train large AI language models, helping cutting\-edge generative AI models write better frontend code. Projects typically include discrete, highly variable problems that involve engaging with these models as they learn to code. We currently have 20

In [12]:
print(df_jobs.description[6])

Mavriq, parte di Moltiply Group, è la tech company a cui appartengono alcuni tra i più importanti brand di comparazione ed intermediazione online in Italia (MutuiOnline.it, Segugio.it, SOStariffe.it, Trovaprezzi.it, Switcho e molti altri) e all’estero (LeLynx.fr, Rastreator, Pricewise, Verivox). I servizi offerti dai brand di Mavriq aiutano con trasparenza i consumatori a trovare ciò di cui hanno bisogno, al miglior prezzo. Siamo un team di circa 1\.000 “smart disruptors” distribuiti in Europa, America Latina ed Asia.

  

Il successo di Mavriq è legato al successo dei nostri team. Per questo, siamo oggi alla ricerca di un nuovo o una nuova team member con cui continuare a scrivere la nostra storia nel mondo della comparazione ed intermediazione internazionale.

  

  

**Posizione:**
--------------

**Il ruolo**


Per supportare la crescita dei progetti di Machine Learning in Mavriq per i brand Segugio.it, PrestitiOnline e altri, siamo alla ricerca di un\-a Junior Data Scientist che, 

In [None]:
text = df_jobs.description[0]

In [None]:
class SummarizeDescription(BaseModel):
    # Structured summary of a single job description. The JSON schema generated
    # from this model is handed to the LLM endpoint as a grammar, so every
    # `description` below is model-facing text and should read cleanly.
    # Documented with `#` comments on purpose: pydantic copies a class docstring
    # into the generated JSON schema, which would alter what the model receives.
    resume: str = Field(
        description="Summarize the relevant information contained in the job description"
    )

    smart_working: bool = Field(
        description="True if the job is remote, false otherwise"
    )

    required: str = Field(
        description="The required skills and qualifications for the job position"
    )
    nice_to_have: str = Field(
        description="The nice to have skills and qualifications for the job position"
    )
    company: str = Field(description="Summarize the company name and its mission")
    location: str = Field(description="Summarize the location of the job position")

    job_responsibilities: list[str] = Field(
        description="The job responsibilities associated with the position"
    )

    # NOTE: the field name `hards_skills` (sic) is part of the schema and is also
    # listed in the prompt, so it is deliberately not renamed here.
    hards_skills: list[str] = Field(
        description="The hard skills required for the job position"
    )
    soft_skills: list[str] = Field(
        description="The soft skills required for the job position"
    )

In [46]:
# NOTE(review): import placed mid-notebook rather than in the first (imports) cell.
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace

# Text-generation endpoint for the Qwen2.5-Coder-32B-Instruct repo, with sampling
# disabled and a mild repetition penalty.
llm = HuggingFaceEndpoint(
    repo_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    task="text-generation",
    do_sample=False,
    repetition_penalty=1.03,
    # The recorded output of this cell warns that `grammar` "was transferred to
    # model_kwargs", i.e. it is not a declared parameter of this constructor.
    # NOTE(review): the reply captured later in this notebook is free text with a
    # fenced JSON block, so the schema does not appear to be enforced — confirm
    # how this endpoint expects a grammar to be supplied.
    grammar={"type": "json", "value": SummarizeDescription.model_json_schema()},

)
# Chat wrapper around the endpoint, used for the message-based call further down.
chat = ChatHuggingFace(llm=llm, verbose=True)

                    grammar was transferred to model_kwargs.
                    Please make sure that grammar is what you intended.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [50]:
# System instruction for the extraction task. It lists the JSON keys the model is
# asked to return; they match the field names of SummarizeDescription. The job
# description itself is supplied separately as the human message.
prompt = SystemMessage("""Given the following job description, extract and summarize the key information into a structured JSON object with the following fields:
- resume
- smart_working
- required
- nice_to_have
- company
- location
- job_responsibilities
- hards_skills
- soft_skills

Job Description:
""")

In [54]:
# Wrap the first job description as the user turn; the JSON schema is also
# attached through `additional_kwargs`.
# NOTE(review): whether the endpoint actually reads a grammar from
# `additional_kwargs` is not verified here.
text = HumanMessage(df_jobs.description[0], role="user",
                    additional_kwargs={"grammar": SummarizeDescription.model_json_schema()})
# Conversation sent to the model: system instruction first, then the job text.
messages = [prompt, text]
# Calling the chat model directly (`chat(messages)`) goes through the deprecated
# `__call__` path and emits a deprecation warning; `invoke` is the supported
# equivalent and likewise returns the model's reply message.
response = chat.invoke(messages)

  response = chat(messages)


In [65]:
response.content

'Here is the structured JSON object based on the provided job description:\n\n```json\n{\n  "resume": "Accepted from US, Canada, LATAM, Europe, some African and Asian countries. The job is 100% remote.",\n  "smart_working": "Flexible schedules; no fixed working hours, expected to work 40 hours a week. Work according to the platform\'s time tracking.",\n  "required": [\n    "3+ years of experience in a software engineering/software development role.",\n    "Strong proficiency with JavaScript/React and frontend development.",\n    "Complete fluency in the English language.",\n    "Ability to articulate complex technical concepts clearly and engagingly.",\n    "Excellent attention to detail and ability to maintain consistency in writing. Solid understanding of grammar, punctuation, and style guidelines."\n  ],\n  "nice_to_have": [\n    "Bachelor\'s or Master\'s degree in Computer Science.",\n    "Experience with modern JavaScript frameworks and libraries (Next.js, Vue, Angular).",\n    "F

In [None]:
import json
import re

def extract_json_from_response(response: str) -> dict:
    """
    Extract the JSON object contained in a model response and return it as a dict.

    The object is looked up in this order:
      1. inside the first triple-backtick fence (with or without a ``json`` tag,
         any whitespace/newline style around the payload);
      2. anywhere in the raw response, starting at its first ``{`` (covers
         replies that are bare JSON, optionally surrounded by prose).

    Args:
        response: Raw text returned by the model.

    Returns:
        The parsed JSON object, or an empty dict when ``response`` is not a
        string, contains no JSON object, or the JSON is malformed. Failures are
        reported with ``print`` and never raised, so callers can keep running.
    """
    if not isinstance(response, str):
        print("Error:", f"expected a string response, got {type(response).__name__}")
        return {}

    # Tolerant fence match: optional "json" tag, any whitespace (including CRLF)
    # between the fence markers and the payload.
    fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", response, re.DOTALL | re.IGNORECASE)
    # Prefer the fenced payload; fall back to scanning the whole response.
    search_spaces = [fenced.group(1), response] if fenced else [response]

    decoder = json.JSONDecoder()
    last_error = None
    for candidate in search_spaces:
        start = candidate.find("{")
        if start == -1:
            continue
        try:
            # raw_decode parses one JSON value starting at `start` and ignores
            # any trailing text, so prose after the object is harmless.
            parsed, _ = decoder.raw_decode(candidate, start)
        except json.JSONDecodeError as exc:
            last_error = exc
            continue
        return parsed

    if last_error is not None:
        print("Failed to parse JSON:", last_error)
    else:
        print("Error:", "JSON block not found in the response.")
    return {}


# Parse the text of the model reply held in `response` into a dictionary.
parsed_data = extract_json_from_response(response.content)
    
# `parsed_data` is the parsed dictionary, or {} when extraction or parsing failed
print(parsed_data)


{'resume': 'Accepted from US, Canada, LATAM, Europe, some African and Asian countries. The job is 100% remote.', 'smart_working': "Flexible schedules; no fixed working hours, expected to work 40 hours a week. Work according to the platform's time tracking.", 'required': ['3+ years of experience in a software engineering/software development role.', 'Strong proficiency with JavaScript/React and frontend development.', 'Complete fluency in the English language.', 'Ability to articulate complex technical concepts clearly and engagingly.', 'Excellent attention to detail and ability to maintain consistency in writing. Solid understanding of grammar, punctuation, and style guidelines.'], 'nice_to_have': ["Bachelor's or Master's degree in Computer Science.", 'Experience with modern JavaScript frameworks and libraries (Next.js, Vue, Angular).', 'Familiarity with frontend testing frameworks (Jest, React Testing Library, Cypress).', 'Knowledge of state management solutions (Redux, Context API, M

In [73]:
parsed_data

{'resume': 'Accepted from US, Canada, LATAM, Europe, some African and Asian countries. The job is 100% remote.',
 'smart_working': "Flexible schedules; no fixed working hours, expected to work 40 hours a week. Work according to the platform's time tracking.",
 'required': ['3+ years of experience in a software engineering/software development role.',
  'Strong proficiency with JavaScript/React and frontend development.',
  'Complete fluency in the English language.',
  'Ability to articulate complex technical concepts clearly and engagingly.',
  'Excellent attention to detail and ability to maintain consistency in writing. Solid understanding of grammar, punctuation, and style guidelines.'],
 'nice_to_have': ["Bachelor's or Master's degree in Computer Science.",
  'Experience with modern JavaScript frameworks and libraries (Next.js, Vue, Angular).',
  'Familiarity with frontend testing frameworks (Jest, React Testing Library, Cypress).',
  'Knowledge of state management solutions (Redu

In [74]:
messages

[SystemMessage(content='Given the following job description, extract and summarize the key information into a structured JSON object with the following fields:\n- resume\n- smart_working\n- required\n- nice_to_have\n- company\n- location\n- job_responsibilities\n- hards_skills\n- soft_skills\n\nJob Description:\n', additional_kwargs={}, response_metadata={}),
 HumanMessage(content="### **Accepted Locations**\n\n\nWe accept applicants from the US, Canada, and most countries in LATAM and Europe. We are also accepting candidates from some countries in Africa and Asia. For the complete list of accepted locations, click here. This work is 100% remote.\n\n**Loom Video**\n\n\nOur Founder/CEO, Gabe Greenberg, created an in\\-depth Loom video that we highly recommend you watch! Check it out here: Loom Video\n\n**Overview**\n\n\nJoin our expert annotation team to create training data for the world's most advanced AI models. No previous AI experience is necessary. You'll get your foot in the door

In [None]:
# NOTE(review): `test` is not assigned in any cell visible above, so this cell
# appears to rely on leftover kernel state and would fail on a fresh kernel.
print(test.content)

Accepted Locations: United States, Canada, most countries in LATAM, Europe, some countries in Africa, Asia. For the complete list of locations: Click here.

From Ashby form:

Job type: Remote.

Payment structure: Performed work with hourly payment; no cap on hours but the expectation is for delivery of 15+ hours weekly; expected commitment period: 12 months.

Programming Languages: Highly experienced in JavaScript and React; comfort level with TypeScript is a Plus.

Looking for: Skilled JavaScript/React developers, experienced in modern web development frameworks and libraries; proficient use of Jest and React Testing Library. Knowledge of state management (Redux, Context API, MobX) is a plus.

On-boarding: Password for RLHF platform provided upon offer acceptance; group call with RLHF team to explain onboarding procedures; simulated RLHF task is the final step in the hiring process.


In [None]:
# NOTE(review): bare reference to a name that is not defined in any visible cell;
# evaluating it raises NameError and stops "Run All" at this point. Presumably a
# deliberate stop marker — confirm and remove.
aaaaaaaaaaaaaaa

In [None]:
from transformers import pipeline

# # Replace this with your own checkpoint
# model_checkpoint = "google/gemma-3-1b-pt"
# token_classifier = pipeline(
#     "text-generation", model=model_checkpoint
# )
# token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

: 

In [None]:
df_jobs.description[0]

"### **Accepted Locations**\n\n\nWe accept applicants from the US, Canada, and most countries in LATAM and Europe. We are also accepting candidates from some countries in Africa and Asia. For the complete list of accepted locations, click here. This work is 100% remote.\n\n**Loom Video**\n\n\nOur Founder/CEO, Gabe Greenberg, created an in\\-depth Loom video that we highly recommend you watch! Check it out here: Loom Video\n\n**Overview**\n\n\nJoin our expert annotation team to create training data for the world's most advanced AI models. No previous AI experience is necessary. You'll get your foot in the door with one of the most prominent players in the AI/LLM space today. We're primarily seeking JavaScript/React developers with 3\\+ years of experience to train large AI language models, helping cutting\\-edge generative AI models write better frontend code. Projects typically include discrete, highly variable problems that involve engaging with these models as they learn to code. We 

In [None]:
# NOTE(review): `token_classifier` is only assigned inside commented-out code in an
# earlier cell, so this call depends on leftover kernel state. The recorded output
# lists entity groups (LOC/PER/MISC), so it presumably came from a
# token-classification pipeline whose definition is no longer present — confirm.
token_classifier(df_jobs.description[0])

[{'entity_group': 'LOC',
  'score': np.float32(0.99939656),
  'word': 'US',
  'start': 59,
  'end': 61},
 {'entity_group': 'LOC',
  'score': np.float32(0.9994357),
  'word': 'Canada',
  'start': 63,
  'end': 69},
 {'entity_group': 'LOC',
  'score': np.float32(0.89667124),
  'word': 'LATAM',
  'start': 93,
  'end': 98},
 {'entity_group': 'LOC',
  'score': np.float32(0.99877053),
  'word': 'Europe',
  'start': 103,
  'end': 109},
 {'entity_group': 'LOC',
  'score': np.float32(0.99940634),
  'word': 'Africa',
  'start': 167,
  'end': 173},
 {'entity_group': 'LOC',
  'score': np.float32(0.99892527),
  'word': 'Asia',
  'start': 178,
  'end': 182},
 {'entity_group': 'PER',
  'score': np.float32(0.9992668),
  'word': 'Gabe Greenberg',
  'start': 302,
  'end': 316},
 {'entity_group': 'MISC',
  'score': np.float32(0.77809995),
  'word': 'Loom',
  'start': 339,
  'end': 343},
 {'entity_group': 'MISC',
  'score': np.float32(0.874496),
  'word': 'AI /',
  'start': 648,
  'end': 651},
 {'entity_gr