In [None]:
import asyncio
import time
import os
import pandas as pd
from tqdm.asyncio import tqdm
from sklearn.metrics import classification_report
from enum import Enum
from pydantic import BaseModel
from google import genai
from google.genai import types

In [None]:
# --- Configuration ---
# Read the key from the GOOGLE_API_KEY environment variable so the secret does
# not have to be pasted into this file. The empty-string fallback matches the
# previous placeholder value.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
EXAMPLE_FILE = "department-v2.csv"  # labelled examples, loaded into df_examples
NEW_CSV = "data_jobs_nolabels.csv"  # rows to classify, loaded into df_new
TEXT_COLUMN = "text"
LABEL_COLUMN = "label"
MODEL_NAME = "gemini-2.5-flash"

# Upper bound on the number of classification requests in flight at once.
MAX_CONCURRENT = 10
sem = asyncio.Semaphore(MAX_CONCURRENT)


client = genai.Client(api_key=GOOGLE_API_KEY)

In [None]:
# Load the labelled examples and the rows that still need a prediction.
df_examples = pd.read_csv(EXAMPLE_FILE)
df_new = pd.read_csv(NEW_CSV)

# Make sure the examples expose TEXT_COLUMN: fall back to a "title" column,
# then to the first column of the file.
if TEXT_COLUMN not in df_examples.columns:
    if "title" in df_examples.columns:
        df_examples[TEXT_COLUMN] = df_examples["title"]
    else:
        df_examples[TEXT_COLUMN] = df_examples.iloc[:, 0]

# Make sure the examples expose LABEL_COLUMN: fall back to "department", then
# "category"; with neither present every example is labelled "Other".
if LABEL_COLUMN not in df_examples.columns:
    if "department" in df_examples.columns:
        df_examples[LABEL_COLUMN] = df_examples["department"]
    elif "category" in df_examples.columns:
        df_examples[LABEL_COLUMN] = df_examples["category"]
    else:
        df_examples[LABEL_COLUMN] = "Other"

# The text to classify comes from the first column of the new data, renamed to
# TEXT_COLUMN. The rename is skipped when a TEXT_COLUMN column already exists:
# renaming unconditionally would produce two columns with the same name, and
# selecting that name would then yield a DataFrame instead of a single value.
if TEXT_COLUMN not in df_new.columns:
    df_new = df_new.rename(columns={df_new.columns[0]: TEXT_COLUMN})

# Placeholder label column; it is written to the output file as-is.
df_new[LABEL_COLUMN] = None



def build_prompt_with_generated_examples(examples_df, text_col, label_col):
    """Build the classification system prompt, including few-shot examples.

    Up to three titles per department are sampled from ``examples_df`` with a
    fixed seed, so the same input always yields the same prompt. They are
    inserted between a static instruction header and a static output-format
    footer. Labels are compared case-insensitively after stripping surrounding
    whitespace. Departments without any example are omitted from the examples
    section, except "Other", which falls back to three hard-coded titles.

    Args:
        examples_df: DataFrame holding the labelled examples.
        text_col: Name of the column containing the job titles.
        label_col: Name of the column containing the department labels.

    Returns:
        The complete prompt as a single string.
    """
    categories = ["Other", "Sales", "Consulting", "Information Technology", "Project Management",
                  "Marketing", "Business Development", "Human Resources", "Purchasing", "Administrative", "Customer Support"]

    # Normalise the labels once rather than once per category. ``astype(str)``
    # keeps the ``.str`` accessor from raising on non-string label columns
    # (numeric or all-NaN); such values simply match no category.
    normalized_labels = examples_df[label_col].astype(str).str.strip().str.lower()

    examples_by_category = {}
    for category in categories:
        subset = examples_df[normalized_labels == category.lower()]
        if subset.empty:
            examples_by_category[category] = []
        else:
            examples_by_category[category] = (
                subset[text_col].sample(n=min(len(subset), 3), random_state=42).tolist()
            )

    # "Other" must always be illustrated, even if the data has no such rows.
    if not examples_by_category["Other"]:
        examples_by_category["Other"] = ["Finance Manager", "Legal Counsel", "Executive Director"]

    header = """## ROLE DEFINITION
You are a domain expert in job title classification with extensive knowledge of organizational departments and business functions.

## TASK DESCRIPTION
Classify the provided job title into exactly ONE of these pre-defined department categories:
- OTHER
- SALES
- CONSULTING
- INFORMATION TECHNOLOGY
- PROJECT MANAGEMENT
- MARKETING
- BUSINESS DEVELOPMENT
- HUMAN RESOURCES
- PURCHASING
- ADMINISTRATIVE
- CUSTOMER SUPPORT

## CONSTRAINTS
1. Return ONLY the department name (no explanations, no conversation)
2. If unsure, default to OTHER
3. Return exactly one of: Other, Sales, Consulting, Information Technology, Project Management, Marketing, Business Development, Human Resources, Purchasing, Administrative, Customer Support

## CLASSIFICATION RULES
1. **INFORMATION TECHNOLOGY**: Software, hardware, network, systems, data, tech support, developers, engineers
2. **SALES**: Selling products/services, account executives, sales representatives, business development associates
3. **CONSULTING**: Advisory roles, strategy implementation, external consultants, business consultants
4. **PROJECT MANAGEMENT**: PMO, project coordinators, scrum masters, delivery managers, project leads
5. **MARKETING**: Brand management, advertising, social media, content, SEO, marketing specialists
6. **BUSINESS DEVELOPMENT**: Partnerships, growth strategy, new market identification (distinct from direct Sales)
7. **HUMAN RESOURCES**: Recruiting, people operations, talent acquisition, benefits, L&D, HR managers
8. **PURCHASING**: Procurement, buying, supply chain, vendor management, procurement specialists
9. **ADMINISTRATIVE**: Office assistants, secretaries, receptionists, clerks, administrative assistants
10. **CUSTOMER SUPPORT**: Client success, help desk, customer service agents, support specialists
11. **OTHER**: Finance, Legal, Manufacturing, Executive C-Suite, Finance, roles not fitting specific categories

## INPUT EXAMPLES (3 per category):

"""

    footer = """
## OUTPUT FORMAT
Return ONLY the department name (no period, no quotes, no explanation).
Example responses:
  ✓ Other
  ✓ Sales
  ✓ Information Technology
  ✓ Human Resources
"""

    # Collect the pieces and join once instead of growing a string in a loop.
    sections = [header]
    for category, examples in examples_by_category.items():
        if examples:
            sections.append(f"\n**{category}:**\n")
            sections.extend(f"  • {ex}\n" for ex in examples)
    sections.append(footer)

    return "".join(sections)

# Build the system prompt once from the labelled examples.
system_prompt_new = build_prompt_with_generated_examples(
    examples_df=df_examples,
    text_col=TEXT_COLUMN,
    label_col=LABEL_COLUMN,
)

# Department names accepted as classifier output. Order matters for the
# substring fallback in _normalize_prediction: the first name found wins.
_VALID_CLASSES = (
    "Other", "Sales", "Consulting", "Information Technology", "Project Management",
    "Marketing", "Business Development", "Human Resources", "Purchasing", "Administrative", "Customer Support",
)

# Punctuation removed from the raw model output before matching.
_STRIP_CHARS = str.maketrans("", "", '."*')


def _normalize_prediction(raw_text):
    """Map raw model output to one of the valid department names, else "Other"."""
    clean_pred = raw_text.strip().translate(_STRIP_CHARS).title()
    if clean_pred in _VALID_CLASSES:
        return clean_pred

    # Fall back to a case-insensitive substring search for a known name.
    lowered = clean_pred.lower()
    for valid_class in _VALID_CLASSES:
        if valid_class.lower() in lowered:
            return valid_class

    return "Other"


async def classify_new_row(index, title, client, system_prompt):
    """Classify a single job title into a department.

    Args:
        index: Row identifier, returned unchanged so results can be matched
            back to their row.
        title: The job title to classify. Missing (NaN/None) or blank titles
            are labelled "Other" without calling the API.
        client: Client object exposing ``aio.models.generate_content``.
        system_prompt: System instruction sent with the request.

    Returns:
        A ``(index, label)`` tuple. ``label`` is one of the valid department
        names, or "Error" if the request or response handling raised.
    """
    # Blank CSV cells arrive as NaN/None; str() would turn them into the
    # literal titles "nan"/"None" and send those to the model.
    if pd.isna(title):
        return index, "Other"

    title_str = str(title).strip()
    if not title_str:
        return index, "Other"

    async with sem:
        try:
            # NOTE(review): max_output_tokens is very small. If many rows come
            # back with empty text (and therefore "Other"), check whether the
            # model spends part of this budget on thinking tokens and adjust
            # the limit or thinking configuration - confirm against the API docs.
            response = await client.aio.models.generate_content(
                model=MODEL_NAME,
                contents=f"Classify this job title: {title_str}",
                config=types.GenerateContentConfig(
                    system_instruction=system_prompt,
                    temperature=0.0,
                    max_output_tokens=25,
                    safety_settings=[
                        types.SafetySetting(
                            category="HARM_CATEGORY_HATE_SPEECH",
                            threshold="BLOCK_NONE"
                        ),
                        types.SafetySetting(
                            category="HARM_CATEGORY_DANGEROUS_CONTENT",
                            threshold="BLOCK_NONE"
                        ),
                        types.SafetySetting(
                            category="HARM_CATEGORY_SEXUALLY_EXPLICIT",
                            threshold="BLOCK_NONE"
                        ),
                        types.SafetySetting(
                            category="HARM_CATEGORY_HARASSMENT",
                            threshold="BLOCK_NONE"
                        )
                    ]
                )
            )

            # An empty response is treated as "unsure".
            if not response.text:
                return index, "Other"

            return index, _normalize_prediction(response.text)

        except Exception as e:
            # Best effort: one failing row must not abort the whole batch.
            print(f"Error classifying row {index}: {e}")
            return index, "Error"

async def predict_new_data():
    """Classify every row of ``df_new`` and store the result in a new column.

    One ``classify_new_row`` coroutine is created per row and awaited as each
    one completes, with a progress bar. The labels are written to a
    ``prediction`` column on the module-level ``df_new``, which is modified in
    place.

    Returns:
        ``df_new`` with the added ``prediction`` column.
    """
    # Iterate over the text column only; iterrows() would build a full Series
    # for every row just to read one value from it.
    tasks = [
        classify_new_row(idx, title, client, system_prompt_new)
        for idx, title in df_new[TEXT_COLUMN].items()
    ]

    # as_completed yields in finish order, so results are keyed by row index.
    results_map = {}
    for future in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Classifying"):
        idx, pred = await future
        results_map[idx] = pred

    # Any row without a result is marked "Error".
    df_new["prediction"] = [results_map.get(idx, "Error") for idx in df_new.index]

    return df_new



df_predictions = await predict_new_data()


output_file = "department_predictions_gem.csv"
df_predictions.to_csv(output_file, index=False)
print(f"Predictions saved to {output_file}")