**Fake Jobs Refinement with LLM** 

This script calls on a large language model (Gemini API)
to transform the scam job descriptions in our dataset,
to simulate ***"Fake Job postings written by AI"*** in alignment
with our research objectives.

In [11]:
# import needed libraries
import pandas as pd
import google.generativeai as genai
import os
from dotenv import load_dotenv
import random
import time

In [12]:
# load the fake jobs dataset
fake_jobs_df = pd.read_csv("../1_datasets/cleaned_data/fake_jobs.csv")

# print the shape of the DataFrame
print(f"Shape of fake jobs DataFrame: {fake_jobs_df.shape}")

# display the first few rows of the dataset
print("Fake Jobs Dataset (first 5 rows):")
print(fake_jobs_df.head())

Shape of fake jobs DataFrame: (866, 11)
Fake Jobs Dataset (first 5 rows):
                              title                            location  \
0                   IC&E Technician                   US, , Stocton, CA   
1                      Forward Cap.                                 NaN   
2  Technician Instrument & Controls                                  US   
3                   Sales Executive                     PK, SD, Karachi   
4           IC&E Technician Mt Poso  US, CA, Bakersfield, CA / Mt. Poso   

             department  salary_range  \
0          Oil & Energy  95000-115000   
1                   NaN           NaN   
2  Power Plant & Energy           NaN   
3                 Sales           NaN   
4          Oil & Energy  95000-115000   

                                     company_profile  \
0                                                ...   
1                                                NaN   
2  Edison International and Refined Resources hav...   
3   

In [13]:
# Setup for LLM Refinement

# Load environment variables from .env file
load_dotenv()
api_key = os.getenv("API_KEY")

if not api_key:
    raise ValueError(
        "No API key found. Please set your\
            API key in the .env file."
    )

# Configure the generative AI API
genai.configure(api_key=api_key)
model = genai.GenerativeModel("gemini-2.5-flash")

In [None]:
# Define the LLM Refinement Prompt

# This prompt needs careful engineering!
# It's a balance: we make it sound legitimate, but not *too* perfect.
# we might need several iterations of prompt engineering.
# Define the LLM Refinement Prompt

LLM_REFINEMENT_PROMPT = (
    "As an expert HR professional, rewrite the following job post sections.\n"
    "Your goal is to make them sound highly professional, appealing, and legitimate,\n"
    "while subtly incorporating characteristics common in sophisticated yet fraudulent postings.\n\n"
    "Key objectives:\n"
    "- Improve grammar and vocabulary.\n"
    "- Professionalize vague or generic content.\n"
    "- Eliminate obvious scam flags.\n"
    "- Add appealing but potentially exaggerated benefits/responsibilities.\n"
    "- Make it sound trustworthy but subtly deceptive.\n\n"
    "---\n"
    "Company Profile:\n"
    "{company_profile}\n\n"
    "Job Description:\n"
    "{description}\n\n"
    "Requirements:\n"
    "{requirements}\n\n"
    "Benefits:\n"
    "{benefits}\n"
    "---\n\n"
    "Provide the refined version of each section clearly labeled as:\n"
    "Company Profile:\n"
    "...\n\n"
    "Job Description:\n"
    "...\n\n"
    "Requirements:\n"
    "...\n\n"
    "Benefits:\n"
    "..."
)

In [16]:
# --- Function to handle LLM calls with retries and exponential backoff ---


def call_gemini_with_retries(prompt_text):
    """
    Calls the Gemini API with retries and exponential backoff to handle
    rate limits.
    """
    retries = 0
    max_retries = 5
    base_delay = 2  # seconds
    max_delay = 60  # seconds

    while retries < max_retries:
        try:
            response = model.generate_content(
                prompt_text,
            )
            return response.text
        except Exception as e:
            error_message = str(e).lower()
            if (
                "429" in error_message
                or "rate limit" in error_message
                or "quota" in error_message
            ):
                delay = min(
                    max_delay, base_delay * (2**retries) + random.uniform(0, 1)
                )  # noqa: E501
                print(
                    f"  Rate limit/quota error hit. Retrying in {delay:.2f}s..\
                        (Attempt {retries + 1}/{max_retries})"
                )
                time.sleep(delay)
                retries += 1
            else:
                print(f"  An unexpected error occurred during API call: {e}")
                # For other unexpected errors, re-raise to stop processing
                # this job
                raise

    raise Exception(
        f"Max retries ({max_retries}) exceeded for LLM call after\
            multiple failures."
    )

In [None]:
input_file_path = "../1_datasets/cleaned_data/fake_jobs.csv"
output_file_path = "../1_datasets/cleaned_data/llm_refined_fake_posts.csv"


def refine_job_batch(input_filepath, output_filepath, max_to_refine=100):
    df = pd.read_csv(input_filepath)
    # Ensure refined columns exist
    for col in ["company_profile", "description", "requirements", "benefits"]:
        refined_col = f"refined_{col}"
        if refined_col not in df.columns:
            df[refined_col] = ""
    if "refinement_status" not in df.columns:
        df["refinement_status"] = "Not Processed"
    if "refinement_error" not in df.columns:
        df["refinement_error"] = ""
    print(f":wrench: Refining up to {max_to_refine} unrefined job rows...")
    refined_count = 0
    for i, row in df.iterrows():
        already_refined = all(
            str(row.get(f"refined_{col}", "")).strip() != ""
            for col in [
                "company_profile",
                "description",
                "requirements",
                "benefits",
            ]
        )

        if already_refined:
            print(f":white_check_mark: Skipping already refined row {i}")
            continue
        if refined_count >= max_to_refine:
            print(f":dart: Reached limit of {max_to_refine} refinements. Stopping.")
            break
        # Get raw fields
        company_profile = str(row.get("company_profile", "")).strip()
        description = str(row.get("description", "")).strip()
        requirements = str(row.get("requirements", "")).strip()
        benefits = str(row.get("benefits", "")).strip()
        prompt = LLM_REFINEMENT_PROMPT.format(
            company_profile=company_profile,
            description=description,
            requirements=requirements,
            benefits=benefits,
        )
        try:
            response = call_gemini_with_retries(prompt)
            # Extract each section (simple version)
            for field in ["company_profile", "description", "requirements", "benefits"]:
                marker = field.replace("_", " ").capitalize() + ":"
                split = response.split(marker)
                if len(split) > 1:
                    value = split[1].split("\n", 1)[0].strip()
                else:
                    value = ""
                df.at[i, f"refined_{field}"] = value
            df.at[i, "refinement_status"] = "Success"
            refined_count += 1
            print(f":sparkles: Refined row {i} ({refined_count}/{max_to_refine})")
        except Exception as e:
            df.at[i, "refinement_status"] = "Failed"
            df.at[i, "refinement_error"] = str(e)
            print(f":x: Failed to refine row {i}: {e}")
        if refined_count % 5 == 0:
            print(f":double_vertical_bar: Pausing after {refined_count} jobs...")
            time.sleep(5)
    df.to_csv(output_filepath, index=False)
    print(f":white_check_mark: Saved refined jobs to: {output_filepath}")


# --- Run Refinement ---
refine_job_batch(input_file_path, output_file_path, max_to_refine=100)

:wrench: Refining up to 100 unrefined job rows...
:sparkles: Refined row 0 (1/100)
:sparkles: Refined row 1 (2/100)
:sparkles: Refined row 2 (3/100)
:sparkles: Refined row 3 (4/100)
:sparkles: Refined row 4 (5/100)
:double_vertical_bar: Pausing after 5 jobs...
:sparkles: Refined row 5 (6/100)
:sparkles: Refined row 6 (7/100)
:sparkles: Refined row 7 (8/100)
:sparkles: Refined row 8 (9/100)
:sparkles: Refined row 9 (10/100)
:double_vertical_bar: Pausing after 10 jobs...
:sparkles: Refined row 10 (11/100)
:sparkles: Refined row 11 (12/100)
:sparkles: Refined row 12 (13/100)
:sparkles: Refined row 13 (14/100)
:sparkles: Refined row 14 (15/100)
:double_vertical_bar: Pausing after 15 jobs...
:sparkles: Refined row 15 (16/100)
:sparkles: Refined row 16 (17/100)
:sparkles: Refined row 17 (18/100)
:sparkles: Refined row 18 (19/100)
:sparkles: Refined row 19 (20/100)
:double_vertical_bar: Pausing after 20 jobs...
:sparkles: Refined row 20 (21/100)
:sparkles: Refined row 21 (22/100)
:sparkles: R

In [24]:
df_refined = pd.read_csv(
    "../1_datasets/cleaned_data/llm_refined_fake_posts.csv"
)  # noqa: E501

# Display the first few rows of the refined DataFrame
print("\nRefined DataFrame (first 5 rows):")
print(df_refined.head())


Refined DataFrame (first 5 rows):
                              title                            location  \
0                   IC&E Technician                   US, , Stocton, CA   
1                      Forward Cap.                                 NaN   
2  Technician Instrument & Controls                                  US   
3                   Sales Executive                     PK, SD, Karachi   
4           IC&E Technician Mt Poso  US, CA, Bakersfield, CA / Mt. Poso   

             department  salary_range  \
0          Oil & Energy  95000-115000   
1                   NaN           NaN   
2  Power Plant & Energy           NaN   
3                 Sales           NaN   
4          Oil & Energy  95000-115000   

                                     company_profile  \
0                                                ...   
1                                                NaN   
2  Edison International and Refined Resources hav...   
3                                          

In [25]:
# Display the first five rows of original and refined sections
print("\nFirst 5 rows of original and refined job post sections:")
print(
    df_refined[
        [
            "company_profile",
            "refined_company_profile",
            "description",
            "refined_description",
            "requirements",
            "refined_requirements",
            "benefits",
            "refined_benefits",
        ]
    ].head()
)


First 5 rows of original and refined job post sections:
                                     company_profile  refined_company_profile  \
0                                                ...                      NaN   
1                                                NaN                      NaN   
2  Edison International and Refined Resources hav...                      NaN   
3                                                NaN                      NaN   
4                                                ...                      NaN   

                                         description refined_description  \
0  IC&amp;E Technician | Bakersfield, CA Mt. Poso...                  **   
1  The group has raised a fund for the purchase o...                  **   
2  Technician Instrument &amp; ControlsLocation D...                  **   
3                                    Sales Executive                  **   
4  IC&amp;E Technician | Bakersfield, CA Mt. Poso...                  **   


In [26]:
# list the values in the refinement_status column
print("\nUnique values in 'refinement_status' column:")
print(df_refined["refinement_status"].unique())


Unique values in 'refinement_status' column:
['Success' 'Not Processed']


In [27]:
# check for any errors in the refinement process
if "refinement_error" in df_refined.columns:
    errors = df_refined[df_refined["refinement_error"] == "NaN"]
    if not errors.empty:
        print("\nErrors encountered during refinement:")
        print(errors[["job_id", "refinement_error"]])
    else:
        print("\nNo errors encountered during refinement.")


No errors encountered during refinement.


In [28]:
# current shape of the refined DataFrame
print(f"\nShape of refined DataFrame: {df_refined.shape}")
print(f"Total jobs refined: {len(df_refined)}")


Shape of refined DataFrame: (866, 17)
Total jobs refined: 866
