In [4]:
# Read the O*NET occupations CSV into a DataFrame.
import pandas as pd

df = pd.read_csv(filepath_or_buffer="O*NET_Skills_Tasks_Description.csv")

In [5]:
# Print the first five rows as plain text to sanity-check the loaded columns.
print(df.head(n=5))

  O*NET-SOC Code                         Element Name  \
0     11-1011.00                     Chief Executives   
1     11-1011.03        Chief Sustainability Officers   
2     11-1021.00      General and Operations Managers   
3     11-2011.00  Advertising and Promotions Managers   
4     11-2021.00                   Marketing Managers   

                                         Description  \
0  Determine and formulate policies and provide o...   
1  Communicate and coordinate with management, sh...   
2  Plan, direct, or coordinate the operations of ...   
3  Plan, direct, or coordinate advertising polici...   
4  Plan, direct, or coordinate marketing policies...   

                                              Skills  \
0  [{'skill': 'Reading Comprehension', 'importanc...   
1  [{'skill': 'Reading Comprehension', 'importanc...   
2  [{'skill': 'Reading Comprehension', 'importanc...   
3  [{'skill': 'Reading Comprehension', 'importanc...   
4  [{'skill': 'Reading Comprehension', '

In [9]:
# Show the (rows, columns) tuple of the loaded table.
df.shape

(894, 5)

### API Authentication

In the following cells we set up the API configuration. We will use Gemini to process the text data.

In [11]:
import os
import json
from openai import OpenAI
from dotenv import load_dotenv
from pydantic import BaseModel, Field, ValidationError
from typing import List, Optional, Dict
from datetime import date
import time
from tqdm import tqdm
import pandas as pd

# Pull variables from a local .env file into the process environment,
# then read the Gemini key from it (None when the variable is not set).
load_dotenv()
api_key = os.environ.get("GEMINI_API_KEY")

### Test Connection

Let's do a quick check to ensure we can connect to the API.

In [12]:
# `client` stays None unless an API key was found, so later cells can tell
# whether the connection was configured.
client = None
if api_key:
    # Google's OpenAI-compatible endpoint lets the OpenAI SDK talk to Gemini.
    client = OpenAI(
        base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
        api_key=api_key,
    )
    print("✅ OpenAI client configured for Gemini.")
else:
    print("⚠️ Set GEMINI_API_KEY in environment or a .env file")

✅ OpenAI client configured for Gemini.


In [13]:
# Smoke test: send one short chat request to confirm the key and endpoint work.
# Fails with AttributeError if `client` is still None (no API key configured).
completion = client.chat.completions.create(
    model="gemini-2.5-flash",
    timeout=10,
    messages=[
        {"role": "user", "content": "Hello, can you hear me?"},
    ],
)

In [14]:
# Display the message object of the first returned choice.
completion.choices[0].message

ChatCompletionMessage(content='Hello! Yes, I can hear you (or rather, "read" you, as I am a text-based AI). How can I help you today?', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None)

Schema definition:

In [78]:
from pydantic import BaseModel, Field

# Response schema for one generated job-activity summary.
# NOTE: documented with comments instead of a class docstring on purpose:
# pydantic copies a model's docstring into model_json_schema() as the schema
# "description", which would change the schema that is sent to the API.
class JobActivitySummary(BaseModel):
    # The generated paragraph. The description text below becomes part of the
    # JSON schema, so editing it changes the instructions the model receives.
    summary: str = Field(
        description=(
            "An 80–120 word concise paragraph describing what people in this occupation do day to day. Focus on actions, decisions, and interactions. "
            "No industries, no company types, and no brand names."
        )
    )


Prompt

In [79]:
# Prompt template, filled in later with str.format(). Placeholders:
#   {schema}      - JSON schema of JobActivitySummary
#   {occupation}  - occupation title
#   {description} - occupation description
#   {skills}      - skills text
#   {tasks}       - tasks text
# Any other literal curly brace added to this text must be doubled ({{ }}),
# otherwise str.format() will treat it as a placeholder.
extraction_prompt = """
You are generating a neutral, general job-activity description based on occupational data.

Return ONLY a JSON object that matches the schema exactly.
Do NOT include any extra text, explanations, or comments.

Task:
Using the occupation title, description, skills, and tasks provided below, generate an 80–120 word paragraph describing what people in this occupation typically do day to day.

Instructions:
- Focus on actions, decisions, interactions, and responsibilities.
- Use neutral, general language.
- Do NOT mention industries or company types.
- Do NOT include industry context in the description.
- Do NOT reference brand names.
- Do NOT mention specific products, brands, or tools (e.g., shampoo, Excel, machinery types).
- Do NOT use regulatory, institutional, or context-specific terminology.
- Emphasize general, transferable actions and responsibilities.
- DO NOT mention occupation titles in the description/summary.
- Write a smooth paragraph, not a list, not bullet points.
- Do NOT invent responsibilities not supported by the input.

SCHEMA:
{schema}

INPUT:
Occupation Title:
{occupation}

Description:
{description}

Skills:
{skills}

Tasks:
{tasks}
"""


Implementation

10 samples

In [80]:
df_few = df.head(10)   # first 10 rows; head() is deterministic, so the same rows are used on every run
print(df_few)


  O*NET-SOC Code                         Element Name  \
0     11-1011.00                     Chief Executives   
1     11-1011.03        Chief Sustainability Officers   
2     11-1021.00      General and Operations Managers   
3     11-2011.00  Advertising and Promotions Managers   
4     11-2021.00                   Marketing Managers   
5     11-2022.00                       Sales Managers   
6     11-2033.00                 Fundraising Managers   
7     11-3012.00     Administrative Services Managers   
8     11-3013.00                  Facilities Managers   
9     11-3013.01                    Security Managers   

                                         Description  \
0  Determine and formulate policies and provide o...   
1  Communicate and coordinate with management, sh...   
2  Plan, direct, or coordinate the operations of ...   
3  Plan, direct, or coordinate advertising polici...   
4  Plan, direct, or coordinate marketing policies...   
5  Plan, direct, or coordinate the a

In [81]:
import json
import pandas as pd
import time
from tqdm import tqdm

# --- Use the small test dataframe (the first 10 rows of df, not a random sample) ---
df_test = df_few.copy()

# --- Rate limit config ---
rows_per_minute = 15
delay = 60 / rows_per_minute   # 4 seconds between calls

# --- Schema ---
schema = JobActivitySummary.model_json_schema()
schema_json = json.dumps(schema, ensure_ascii=False)

# --- Add output column if missing ---
if "Summary" not in df_test.columns:
    df_test["Summary"] = None

# Rows are read by position (iloc), so results are written by position too.
# Writing with .loc[i] only hits the right row while the index happens to be
# 0..n-1; on a sampled or filtered frame it would target the wrong label or
# silently append new rows.
summary_pos = df_test.columns.get_loc("Summary")

# Prompt placeholder -> source column
prompt_columns = {
    "occupation": "Element Name",
    "description": "Description",
    "skills": "Skills",
    "tasks": "Tasks",
}

# --- Loop over only the 10 rows ---
for i in tqdm(range(len(df_test)), desc="Testing job summaries", unit="row"):
    row = df_test.iloc[i]

    # Missing cells arrive as NaN, which is truthy: str(NaN) would put the
    # literal text "nan" into the prompt. Map missing values to "" instead.
    prompt_fields = {}
    for placeholder, column in prompt_columns.items():
        value = row.get(column)
        prompt_fields[placeholder] = "" if pd.isna(value) else str(value).strip()

    occupation = prompt_fields["occupation"]

    # Nothing to summarise without an occupation title; skip the API call.
    if not occupation:
        df_test.iat[i, summary_pos] = None
        continue

    # Build prompt
    prompt = extraction_prompt.format(schema=schema_json, **prompt_fields)

    try:
        completion = client.chat.completions.create(
            model="gemini-2.5-flash-lite",
            messages=[{"role": "user", "content": prompt}],
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "JobActivitySummary",
                    "strict": True,
                    "schema": schema
                }
            },
            timeout=30
        )

        response_text = completion.choices[0].message.content

        # Validate against schema (raises if the reply is empty or malformed)
        result = JobActivitySummary.model_validate_json(response_text)

        # Pydantic field is 'summary' (lowercase)
        df_test.iat[i, summary_pos] = result.summary

    except Exception as e:
        # Best effort: report the failure, leave the cell empty, keep going.
        print(f"⚠️ Error on row {i} ({occupation}): {e}")
        df_test.iat[i, summary_pos] = None

    # Stay under the per-minute request limit.
    time.sleep(delay)

# --- SAVE RESULT ---
output_name = "test_10_job_summaries.csv"
df_test.to_csv(output_name, index=False)

print(f"✅ Done! Saved results to {output_name}")


Testing job summaries: 100%|██████████| 10/10 [00:58<00:00,  5.85s/row]

✅ Done! Saved results to test_10_job_summaries.csv





BATCH

In [None]:
import json
import pandas as pd
import time
from tqdm import tqdm

# If not already loaded:
# df = pd.read_csv("O*NET_Skills_Tasks_Description.csv")

# --- CONFIG ---
rows_per_minute = 15    # be gentle with rate limits
delay = 60 / rows_per_minute
checkpoint_every = 25   # rows between intermediate saves (the full run takes about an hour)

# Columns in your input file
occ_col = "Element Name"
desc_col = "Description"
skills_col = "Skills"
tasks_col = "Tasks"

# Column to be filled by the LLM
target_col = "Summary"
if target_col not in df.columns:
    df[target_col] = None

# Rows are read by position (iloc), so results are written by position too.
# Writing with .loc[i] is only correct while the index is exactly 0..n-1.
target_pos = df.columns.get_loc(target_col)

# Defined before the loop so intermediate checkpoints go to the final file.
output_path = "O*NET_Skills_Tasks_Description_with_Summaries.csv"

# Build schema once
job_schema = JobActivitySummary.model_json_schema()
job_schema_json = json.dumps(job_schema, indent=2, ensure_ascii=False)

# Prompt placeholder -> source column
prompt_columns = {
    "occupation": occ_col,
    "description": desc_col,
    "skills": skills_col,
    "tasks": tasks_col,
}

# tqdm progress bar
for i in tqdm(range(len(df)), desc="Generating job summaries", unit="row"):
    row = df.iloc[i]

    # Resume support: rows that already hold a summary are left untouched, so
    # re-running this cell after an interruption only fills the gaps (failed
    # rows are retried). Drop the Summary column to force a full regeneration.
    existing = row.get(target_col)
    if isinstance(existing, str) and existing.strip():
        continue

    # Missing cells arrive as NaN, which is truthy: str(NaN) would put the
    # literal text "nan" into the prompt. Map missing values to "" instead.
    prompt_fields = {}
    for placeholder, column in prompt_columns.items():
        value = row.get(column)
        prompt_fields[placeholder] = "" if pd.isna(value) else str(value).strip()

    occupation = prompt_fields["occupation"]

    # Skip empty occupations quickly
    if not occupation:
        df.iat[i, target_pos] = None
        continue

    # Build prompt
    prompt = extraction_prompt.format(schema=job_schema_json, **prompt_fields)

    try:
        completion = client.chat.completions.create(
            model="gemini-2.5-flash-lite",
            messages=[{"role": "user", "content": prompt}],
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "JobActivitySummary",
                    "strict": True,
                    "schema": job_schema
                }
            },
            timeout=30
        )

        response_text = completion.choices[0].message.content

        # Validate against schema (raises if the reply is empty or malformed)
        result = JobActivitySummary.model_validate_json(response_text)

        # Pydantic field is 'summary' (lowercase)
        df.iat[i, target_pos] = result.summary

    except Exception as e:
        # Best effort: report the failure, leave the cell empty, keep going.
        print(f"⚠️ Error on row {i} ({occupation}): {e}")
        df.iat[i, target_pos] = None

    # Stay under the per-minute request limit.
    time.sleep(delay)

    # Periodic checkpoint so an interrupted run does not lose finished rows.
    if (i + 1) % checkpoint_every == 0:
        df.to_csv(output_path, index=False)

# Save enriched dataset
df.to_csv(output_path, index=False)
print(f"✅ Extraction completed and saved to {output_path}")


## The final file (`output_path`, O*NET_Skills_Tasks_Description_with_Summaries.csv) was later renamed to ExtractedSummaries.csv. That file is included in our repository.