**Fake Jobs Refinement with LLM** 

This notebook uses a large language model (the Gemini API)
to rewrite the scam job postings in our dataset, simulating
***"Fake Job postings written by AI"*** in alignment
with our research objectives.

In [12]:
# import needed libraries
import pandas as pd
import google.generativeai as genai
import os
from dotenv import load_dotenv
import random
import time
import re
import html

In [2]:
# Read the scam-postings CSV. keep_default_na=False keeps empty cells as
# empty strings instead of converting them to NaN.
fake_jobs_df = pd.read_csv(
    filepath_or_buffer="../1_datasets/cleaned_data/fake_jobs.csv",
    keep_default_na=False,
)

# Report the frame's dimensions, then preview its first five rows
print(
    f"Shape of fake jobs DataFrame: {fake_jobs_df.shape}",
    "Fake Jobs Dataset (first 5 rows):",
    fake_jobs_df.head(),
    sep="\n",
)

Shape of fake jobs DataFrame: (866, 11)
Fake Jobs Dataset (first 5 rows):
                              title                            location  \
0                   IC&E Technician                   US, , Stocton, CA   
1                      Forward Cap.                                       
2  Technician Instrument & Controls                                  US   
3                   Sales Executive                     PK, SD, Karachi   
4           IC&E Technician Mt Poso  US, CA, Bakersfield, CA / Mt. Poso   

             department  salary_range  \
0          Oil & Energy  95000-115000   
1                                       
2  Power Plant & Energy                 
3                 Sales                 
4          Oil & Energy  95000-115000   

                                     company_profile  \
0                                                ...   
1                                                      
2  Edison International and Refined Resources hav...   
3   

In [3]:
# Setup for LLM refinement

# Load environment variables from the local .env file, then read the key
# stored under the name API_KEY.
load_dotenv()
api_key = os.getenv("API_KEY")

if not api_key:
    # Single-line literal: a backslash continuation inside the string would
    # embed the next line's indentation into the message.
    raise ValueError(
        "No API key found. Please set your API key in the .env file."
    )

# Configure the generative AI client and create the model handle used by
# the refinement calls below.
genai.configure(api_key=api_key)
model = genai.GenerativeModel("gemini-2.5-flash")

In [None]:
# Prompt template for the refinement call. It is filled with str.format()
# using the keyword fields company_profile, description, requirements and
# benefits, so those four placeholders MUST appear in the template;
# without them the model never receives the posting text and invents the
# content from scratch.
LLM_REFINEMENT_PROMPT = (
    "As an expert HR professional, rewrite the following job post sections.\n"
    "Your goal is to make them sound highly professional, appealing, and legitimate,\n"  # noqa: E501
    "while subtly incorporating characteristics common in sophisticated yet fraudulent postings.\n\n"  # noqa: E501
    "Key objectives:\n"
    "- Improve grammar and vocabulary.\n"
    "- Professionalize vague or generic content.\n"
    "- Eliminate obvious scam flags.\n"
    "- Add appealing but potentially exaggerated benefits/responsibilities.\n"
    "- Make it sound trustworthy but subtly deceptive.\n\n"
    "---\n"
    "Please refine the following job post fields for clarity and conciseness.\n\n"  # noqa: E501
    "Original Company Profile:\n"
    "{company_profile}\n\n"
    "Original Job Description:\n"
    "{description}\n\n"
    "Original Requirements:\n"
    "{requirements}\n\n"
    "Original Benefits:\n"
    "{benefits}\n\n"
    "---\n"
    "Output them in this exact format:\n\n"
    "Company Profile:\n"
    "[refined company profile]\n\n"
    "Job Description:\n"
    "[refined job description]\n\n"
    "Requirements:\n"
    "[refined requirements]\n\n"
    "Benefits:\n"
    "[refined benefits]\n\n"
    "Refine each section even if the input is informal or messy."
)

In [5]:
# --- Function to handle LLM calls with retries and exponential backoff ---
def call_gemini_with_retries(prompt_text, max_retries=5, base_delay=2, max_delay=60):  # noqa: E501
    """
    Call the Gemini model, retrying rate-limit/quota failures with
    exponential backoff plus random jitter.

    Args:
        prompt_text: Prompt passed to ``model.generate_content``.
        max_retries: Maximum number of attempts before giving up.
        base_delay: Base backoff in seconds; doubled on every attempt.
        max_delay: Upper bound in seconds for a single wait.

    Returns:
        The ``text`` attribute of the model response.

    Raises:
        RuntimeError: When every attempt failed with a rate-limit/quota
            error. The last API error is attached as the cause.
        Exception: Any error that is not recognised as a rate-limit/quota
            error is re-raised unchanged after being printed.
    """
    rate_limit_markers = ("429", "rate limit", "quota")
    last_error = None

    for attempt in range(max_retries):
        try:
            response = model.generate_content(
                prompt_text,
            )
            return response.text
        except Exception as e:
            error_message = str(e).lower()
            if not any(marker in error_message for marker in rate_limit_markers):  # noqa: E501
                print(f"  An unexpected error occurred during API call: {e}")
                # For other unexpected errors, re-raise to stop processing
                # this job
                raise

            last_error = e
            if attempt == max_retries - 1:
                # No attempt follows, so waiting here would only delay the
                # failure report.
                break

            delay = min(max_delay, base_delay * (2**attempt) + random.uniform(0, 1))  # noqa: E501
            # Adjacent literals instead of a backslash continuation, which
            # would embed the source indentation in the message.
            print(
                f"  Rate limit/quota error hit. Retrying in {delay:.2f}s.. "
                f"(Attempt {attempt + 1}/{max_retries})"
            )
            time.sleep(delay)

    raise RuntimeError(
        f"Max retries ({max_retries}) exceeded for LLM call after "
        "multiple failures."
    ) from last_error

In [None]:
def normalize_text(text):
    """
    Clean a raw job-post field before it is sent to the LLM or used as a
    fallback value.

    Steps, in order: decode HTML entities, replace ``|`` with ``,``,
    normalize the spacing around ``:`` and ``;`` (the punctuation mark
    itself is kept; a run of them collapses to its first character), then
    collapse all whitespace runs to a single space and trim the ends.

    Args:
        text: The field value. ``None`` and float NaN are treated as empty;
            any other non-string value is converted with ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    # Missing values: None, or float NaN (the only float unequal to itself)
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = html.unescape(str(text))  # Decode HTML entities
    text = text.replace("|", ",")  # Replace | with comma

    # Normalize spacing around : and ; while preserving which mark was used
    text = re.sub(r"\s*([:;])[:;]*\s*", r"\1 ", text)
    # Collapse multiple spaces
    return re.sub(r"\s+", " ", text).strip()


def extract_and_clean(pattern, text, col, idx, original_row):
    """
    Pull one section out of the model output, falling back to the original
    field when nothing usable is found.

    The first capture group of ``pattern`` is searched for in ``text``
    (DOTALL and MULTILINE enabled), trimmed, and stripped of asterisks. If
    there is no match, or the result is empty, the value of the matching
    original column in ``original_row`` (``col`` without ``refined_``) is
    normalized and returned instead. Each outcome other than success is
    reported with a printed message naming ``col`` and ``idx``.
    """
    found = re.search(pattern, text, flags=re.MULTILINE | re.DOTALL)

    if found is None:
        print(f"No match for {col} in row {idx}")
    else:
        captured = found.group(1).strip()
        without_stars = re.sub(r"\*+", "", captured).strip()
        if without_stars:
            return without_stars
        print(f"{col} is empty after cleaning for row {idx}")

    # Nothing usable from the model: use the normalized original field
    source_col = col.replace("refined_", "")
    raw_value = original_row.get(source_col, "")
    substitute = normalize_text(raw_value)
    print(f"Using fallback for {col} in row {idx}")
    return substitute

In [None]:
# Upper bound on the number of rows handled in a single refinement run
MAX_TO_REFINE = 50

# CSV of the original scam postings, and the CSV the refined rows are saved to
input_file_path = "../1_datasets/cleaned_data/fake_jobs.csv"
output_file_path = "../1_datasets/cleaned_data/llm_refined_fake_posts2.csv"

def _build_section_patterns():
    """Build one extraction regex per refined column for the LLM output."""
    headings = {
        "refined_company_profile": r"company\s*profile",
        "refined_description": r"job\s*description",
        "refined_requirements": r"requirements",
        "refined_benefits": r"benefits",
    }
    # Markdown decoration that may precede a heading ("## ", "**", spaces)
    decoration = r"[ \t#*]*"
    # A heading ends with a colon, or is the only thing on its line
    heading_end = r"[ \t*]*(?::[ \t*]*|\r?$)"

    patterns = {}
    for col, heading in headings.items():
        others = "|".join(h for c, h in headings.items() if c != col)
        patterns[col] = (
            # Anchor on a heading LINE. A leading greedy ".*" under DOTALL
            # would anchor on the last mention of the word anywhere in the
            # output instead.
            rf"(?im)^{decoration}{heading}{heading_end}"
            r"(.*?)"
            # Stop at the next section heading or the end of the text.
            # Using "$" here would cut the section at its first blank line
            # because MULTILINE makes "$" match at every line end.
            rf"(?=^{decoration}(?:{others}){heading_end}|\Z)"
        )
    return patterns


def refine_job_batch(input_filepath, output_filepath, max_to_refine=MAX_TO_REFINE):  # noqa: E501
    """
    Refine up to ``max_to_refine`` job posts with the LLM and save the result.

    The working DataFrame is read from ``output_filepath`` when that file
    exists (so earlier runs are resumed), otherwise from ``input_filepath``.
    Rows with at least one empty ``refined_*`` field are selected, their four
    text sections are normalized and sent to the model, and each section of
    the reply is extracted into the matching ``refined_*`` column. A row whose
    API call or parsing fails is reported and left unchanged.

    The DataFrame is written to ``output_filepath`` even when the loop is
    interrupted, so completed rows are not lost.

    Args:
        input_filepath: CSV with the original job posts.
        output_filepath: CSV that receives the updated DataFrame.
        max_to_refine: Maximum number of rows processed in this call.

    NOTE(review): a row whose original AND refined value for some section are
    both empty keeps an empty ``refined_*`` cell and is therefore selected
    again on every run; confirm whether such rows should be marked as done.
    """
    section_cols = ["company_profile", "description", "requirements", "benefits"]  # noqa: E501
    refined_cols = [f"refined_{col}" for col in section_cols]

    # Load from output if it exists, else from original input
    source_path = output_filepath if os.path.exists(output_filepath) else input_filepath  # noqa: E501
    df = pd.read_csv(source_path, keep_default_na=False)

    # Ensure refined columns exist
    for refined_col in refined_cols:
        if refined_col not in df.columns:
            df[refined_col] = ""

    # Find rows with any missing refined fields
    is_blank = df[refined_cols].apply(
        lambda column: column.astype(str).str.strip() == ""
    )
    to_refine = df[is_blank.any(axis=1)].head(max_to_refine)

    print(f"Refining {len(to_refine)} rows...")

    # The patterns do not depend on the row, so build them once
    patterns = _build_section_patterns()

    try:
        for idx, row in to_refine.iterrows():
            prompt = LLM_REFINEMENT_PROMPT.format(
                company_profile=normalize_text(row["company_profile"]),
                description=normalize_text(row["description"]),
                requirements=normalize_text(row["requirements"]),
                benefits=normalize_text(row["benefits"]),
            )
            try:
                print(f"\nRefining row {idx}...")
                refined_output = call_gemini_with_retries(prompt)
                print(f"Refined output for row {idx}:\n{refined_output}\n")

                for col, pattern in patterns.items():
                    cleaned = extract_and_clean(pattern, refined_output, col, idx, row)  # noqa: E501
                    if cleaned:
                        df.at[idx, col] = cleaned

            except Exception as e:
                # Best effort per row: report and continue with the next one
                print(f"Failed to refine row {idx}: {e}")
    finally:
        # Save updated DataFrame, including after an interrupt, so the rows
        # refined so far are kept for the next run
        df.to_csv(output_filepath, index=False, encoding="utf-8-sig")
        print(f"Updated refined data saved to: {output_filepath}")


# Kick off one refinement run using the paths configured above
refine_job_batch(
    input_filepath=input_file_path,
    output_filepath=output_file_path,
)

In [15]:
# Reload the saved results; keep_default_na=False keeps empty cells as ""
df_refined = pd.read_csv(
    filepath_or_buffer="../1_datasets/cleaned_data/llm_refined_fake_posts2.csv",  # noqa: E501
    keep_default_na=False,
)

original_section_cols = [
    "company_profile",
    "description",
    "requirements",
    "benefits",
]
refined_section_cols = [f"refined_{name}" for name in original_section_cols]

# Count non-empty cells per column: refined fields first, then the originals
for column_group in (refined_section_cols, original_section_cols):
    print(df_refined[column_group].ne("").sum())

refined_company_profile    169
refined_description        169
refined_requirements       169
refined_benefits           124
dtype: int64
company_profile    279
description        865
requirements       712
benefits           502
dtype: int64


In [None]:
# Side-by-side preview of one original section and its refined counterpart.
# Swap the column pair below to inspect "benefits", "description" or
# "requirements" instead.
print("\nFirst 5 rows of original and refined job post sections:")

preview_columns = ["company_profile", "refined_company_profile"]
print(df_refined.loc[:, preview_columns].head(5))


First 5 rows of original and refined job post sections:
                                         description  \
0  IC&amp;E Technician | Bakersfield, CA Mt. Poso...   
1  The group has raised a fund for the purchase o...   
2  Technician Instrument &amp; ControlsLocation D...   
3                                    Sales Executive   
4  IC&amp;E Technician | Bakersfield, CA Mt. Poso...   

                                 refined_description  
0  As a [Job Title - e.g., Senior Global Strategi...  
1  We are seeking a highly accomplished and visio...  
2  We are seeking an exceptionally driven and str...  
3  We are seeking a highly motivated and strategi...  
4  We are seeking an exceptionally driven and ast...  
                                        requirements  \
0  QualificationsKnowledge, Skills &amp; Abilitie...   
1                                                      
2  JOB QUALIFICATIONS-Ability to understand proce...   
3                                    Sales Executive

In [9]:
# Check for any errors recorded during the refinement process.
# NOTE(review): assumes a non-empty "refinement_error" cell holds an error
# message and that empty / NaN-like cells mean "no error" - confirm against
# the code that writes this column.
if "refinement_error" in df_refined.columns:
    error_text = df_refined["refinement_error"].astype(str).str.strip()
    # Comparing against the literal "NaN" would select the rows WITHOUT an
    # error; keep only rows that carry an actual message.
    has_error = ~error_text.str.lower().isin(["", "nan", "none"])
    errors = df_refined[has_error]
    if not errors.empty:
        print("\nErrors encountered during refinement:")
        # Show the identifier column only when the dataset has one
        report_cols = [
            col for col in ("job_id", "refinement_error") if col in errors.columns  # noqa: E501
        ]
        print(errors[report_cols])
    else:
        print("\nNo errors encountered during refinement.")
else:
    print("\nNo 'refinement_error' column found; nothing to check.")

In [11]:
# current shape of the refined DataFrame
# print(f"\nShape of refined DataFrame: {df_refined.shape}")
# print(f"Total jobs refined: {len(df_refined)}")