**Fake Jobs Refinement with LLM** 

This script calls on a large language model (Gemini API)
to transform the scam job descriptions in our dataset,
to simulate ***"Fake Job postings written by AI"*** in alignment
with our research objectives.

In [7]:
# import needed libraries
import pandas as pd
import google.generativeai as genai
import os
from dotenv import load_dotenv
import random
import time

In [55]:
# load the fake jobs dataset
fake_jobs_df = pd.read_csv("../1_datasets/cleaned_data/fake_jobs.csv")

# print the shape of the DataFrame
print(f"Shape of fake jobs DataFrame: {fake_jobs_df.shape}")

# display the first few rows of the dataset
print("Fake Jobs Dataset (first 5 rows):")
print(fake_jobs_df.head())

Shape of fake jobs DataFrame: (866, 11)
Fake Jobs Dataset (first 5 rows):
                              title                            location  \
0                   IC&E Technician                   US, , Stocton, CA   
1                      Forward Cap.                                 NaN   
2  Technician Instrument & Controls                                  US   
3                   Sales Executive                     PK, SD, Karachi   
4           IC&E Technician Mt Poso  US, CA, Bakersfield, CA / Mt. Poso   

             department  salary_range  \
0          Oil & Energy  95000-115000   
1                   NaN           NaN   
2  Power Plant & Energy           NaN   
3                 Sales           NaN   
4          Oil & Energy  95000-115000   

                                     company_profile  \
0                                                ...   
1                                                NaN   
2  Edison International and Refined Resources hav...   
3   

In [56]:
# Setup for LLM Refinement

# Load environment variables from .env file
load_dotenv()
api_key = os.getenv("API_KEY")

if not api_key:
    raise ValueError(
        "No API key found. Please set your\
            API key in the .env file."
    )

# Configure the generative AI API
genai.configure(api_key=api_key)
model = genai.GenerativeModel("gemini-2.5-flash")

In [57]:
# Define the LLM Refinement Prompt

# This prompt needs careful engineering!
# It's a balance: we make it sound legitimate, but not *too* perfect.
# we might need several iterations of prompt engineering.

LLM_REFINEMENT_PROMPT = """
As an expert HR professional, rewrite the following job description.
Your goal is to make it sound highly professional, appealing, and legitimate,
while subtly incorporating characteristics common in sophisticated yet
fraudulent postings.

Key objectives:
- Improve grammar and vocabulary.
- Professionalize vague tasks (e.g., "data entry" -> "information management").
- Eliminate obvious scam flags
(e.g., "send money," "no experience - huge pay").
- Add appealing but potentially exaggerated benefits/responsibilities.
- Ensure the application process sounds normal.
- Retain the original core job type (e.g., data-related role for 'data entry').

Crucially:
- DO NOT make it sound *too* perfect; aim for subtle deception.
- DO NOT use explicit scam language or mention fraud.

Provide ONLY the refined job description. Keep it concise.
---
{job_description_text}
---
"""

In [58]:
# --- Function to handle LLM calls with retries and exponential backoff ---


def call_gemini_with_retries(prompt_text):
    """
    Calls the Gemini API with retries and exponential backoff to handle
    rate limits.
    """
    retries = 0
    max_retries = 5
    base_delay = 2  # seconds
    max_delay = 60  # seconds

    while retries < max_retries:
        try:
            response = model.generate_content(
                prompt_text,
            )
            return response.text
        except Exception as e:
            error_message = str(e).lower()
            if (
                "429" in error_message
                or "rate limit" in error_message
                or "quota" in error_message
            ):
                delay = min(max_delay, base_delay * (2**retries) + random.uniform(0, 1))  # noqa: E501
                print(
                    f"  Rate limit/quota error hit. Retrying in {delay:.2f}s..\
                        (Attempt {retries + 1}/{max_retries})"
                )
                time.sleep(delay)
                retries += 1
            else:
                print(f"  An unexpected error occurred during API call: {e}")
                # For other unexpected errors, re-raise to stop processing
                # this job
                raise

    raise Exception(
        f"Max retries ({max_retries}) exceeded for LLM call after\
            multiple failures."
    )

In [None]:
input_file_path = "../1_datasets/cleaned_data/fake_jobs.csv"
output_file_path = "../1_datasets/cleaned_data/llm_refined_fake_posts.csv"


def refine_job_batch(input_filepath, output_filepath, max_to_refine=100):
    df = pd.read_csv(input_filepath)

    if "refined_description" not in df.columns:
        df["refined_description"] = ""
    if "refinement_status" not in df.columns:
        df["refinement_status"] = "Not Processed"
    if "refinement_error" not in df.columns:
        df["refinement_error"] = ""

    print(f"🔧 Refining up to {max_to_refine} unrefined job descriptions...")

    refined_count = 0

    for i, row in df.iterrows():
        if row["refined_description"]:
            print(f"✅ Skipping already refined row {i}")
            continue

        if refined_count >= max_to_refine:
            print(f"🎯 Reached limit of {max_to_refine} refinements. Stopping.")
            break

        original = row["description"]
        prompt = LLM_REFINEMENT_PROMPT.format(job_description_text=original)

        try:
            refined = call_gemini_with_retries(prompt)
            df.at[i, "refined_description"] = refined
            df.at[i, "refinement_status"] = "Success"
            refined_count += 1
            print(f"✨ Refined row {i} ({refined_count}/{max_to_refine})")
        except Exception as e:
            df.at[i, "refined_description"] = original
            df.at[i, "refinement_status"] = "Failed"
            df.at[i, "refinement_error"] = str(e)
            print(f"❌ Failed to refine row {i}: {e}")

        if (refined_count) % 5 == 0:
            print(f"⏸️ Pausing after {refined_count} jobs...")
            time.sleep(5)

    df.to_csv(output_filepath, index=False)
    print(f"✅ Saved refined jobs to: {output_filepath}")


refine_job_batch(input_file_path, output_file_path, max_to_refine=100)

In [41]:
df_refined = pd.read_csv("../1_datasets/cleaned_data/llm_refined_fake_posts.csv")  # noqa: E501

# Display the first few rows of the refined DataFrame
print("\nRefined DataFrame (first 5 rows):")
print(df_refined.head())


Refined DataFrame (first 5 rows):
                              title                            location  \
0                   IC&E Technician                   US, , Stocton, CA   
1                      Forward Cap.                                 NaN   
2  Technician Instrument & Controls                                  US   
3                   Sales Executive                     PK, SD, Karachi   
4           IC&E Technician Mt Poso  US, CA, Bakersfield, CA / Mt. Poso   

             department  salary_range  \
0          Oil & Energy  95000-115000   
1                   NaN           NaN   
2  Power Plant & Energy           NaN   
3                 Sales           NaN   
4          Oil & Energy  95000-115000   

                                     company_profile  \
0                                                ...   
1                                                NaN   
2  Edison International and Refined Resources hav...   
3                                          

In [42]:
# display the first five rows of decriptions and refined descriptions
print("\nFirst 5 rows of original and refined descriptions:")
print(df_refined[["description", "refined_description"]].head())


First 5 rows of original and refined descriptions:
                                         description  \
0  IC&amp;E Technician | Bakersfield, CA Mt. Poso...   
1  The group has raised a fund for the purchase o...   
2  Technician Instrument &amp; ControlsLocation D...   
3                                    Sales Executive   
4  IC&amp;E Technician | Bakersfield, CA Mt. Poso...   

                                 refined_description  
0  **Senior Instrumentation, Controls, and Electr...  
1  **Investment Strategy Associate**\n\nOur dynam...  
2  **Lead Instrumentation & Control Systems Speci...  
3  **Strategic Account Executive**\n\n**Overview:...  
4  **Lead Instrumentation, Controls, & Electrical...  


In [43]:
# list the values in the refinement_status column
print("\nUnique values in 'refinement_status' column:")
print(df_refined["refinement_status"].unique())


Unique values in 'refinement_status' column:
['Success' 'Not Processed']


In [44]:
# check for any errors in the refinement process
if "refinement_error" in df_refined.columns:
    errors = df_refined[df_refined["refinement_error"] == "NaN"]
    if not errors.empty:
        print("\nErrors encountered during refinement:")
        print(errors[["job_id", "refinement_error"]])
    else:
        print("\nNo errors encountered during refinement.")


No errors encountered during refinement.


In [45]:
# current shape of the refined DataFrame
print(f"\nShape of refined DataFrame: {df_refined.shape}")
print(f"Total jobs refined: {len(df_refined)}")


Shape of refined DataFrame: (866, 14)
Total jobs refined: 866
