**Fake Jobs Extraction**

There are 866 fake jobs in our raw fake_jobs dataset.
Our goal here is to inspect the dataset, drop the features that are not needed,
and randomly extract 30 fake jobs for our research purposes.

In [1]:
from pathlib import Path

import pandas as pd

# import random  # For random sampling

# Widen the pandas console display so wide frames and long text cells are
# easier to read: no cap on the number of columns shown, 1000-character
# lines, and up to 200 characters per cell.
pd.options.display.max_columns = None
pd.options.display.width = 1000
pd.options.display.max_colwidth = 200

In [None]:
# Load the raw job-postings dataset into `large_df`.
input_file_path = "../1_datasets/aegean_raw_data/all_job_postings.csv"

print("Loading large job dataset...")
try:
    large_df = pd.read_csv(input_file_path)
except FileNotFoundError:
    # Every later cell needs `large_df`, so report the path that was actually
    # tried and re-raise instead of carrying on with the variable undefined.
    print(
        f"Error: '{input_file_path}' not found. Please ensure the "
        "file is in the correct directory."
    )
    raise

print("Dataset loaded successfully.")
print(f"Initial shape: {large_df.shape}")
print("\nFirst 5 rows:")
print(large_df.head())
print("\nColumn names and data types:")
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (which would add a stray "None" line to the output).
large_df.info()

Loading large job dataset...
Dataset loaded successfully.
Initial shape: (17880, 18)

First 5 rows:
   job_id                                      title            location department salary_range                                                                                                                                                                                          company_profile                                                                                                                                                                                              description                                                                                                                                                                                             requirements                                                                                                                                                                                                 benefits

In [None]:
# Inspect the 'fraudulent' column distribution
print("\nDistribution of 'fraudulent' column:")
print(large_df["fraudulent"].value_counts())

# Fraction of rows labelled 1. Series.get falls back to 0.0 when no row
# carries that label, where plain [1] indexing would raise KeyError.
fake_fraction = large_df["fraudulent"].value_counts(normalize=True).get(1, 0.0)
print(f"Percentage of fake jobs: {fake_fraction * 100:.2f}%")


Distribution of 'fraudulent' column:
fraudulent
0    17014
1      866
Name: count, dtype: int64
Percentage of fake jobs: 4.84%


In [5]:
# Keep only the rows whose 'fraudulent' flag equals 1. Taking a .copy() gives
# an independent frame, which avoids SettingWithCopyWarning on later edits.
fake_jobs_df = large_df[large_df["fraudulent"] == 1].copy()
print(f"\nExtracted {len(fake_jobs_df)} fake job postings.")


Extracted 866 fake job postings.


In [6]:
# Report how many missing values each column of the fake jobs DataFrame has
print(
    "\nMissing values in fake jobs DataFrame:",
    fake_jobs_df.isnull().sum(),
    sep="\n",
)


Missing values in fake jobs DataFrame:
job_id                   0
title                    0
location                19
department             531
salary_range           643
company_profile        587
description              1
requirements           154
benefits               364
telecommuting            0
has_company_logo         0
has_questions            0
employment_type        241
required_experience    435
required_education     451
industry               275
function               337
fraudulent               0
dtype: int64


Question for the team: what features should our job postings have?
Should we go with job title, company name, job description, and
salary range? Answering this question will tell us which features
to drop from this dataset. (Question answered by the team.)

In [None]:
# Drop the columns that are not needed for our analysis.
columns_to_drop = [
    "department",
    "telecommuting",
    "has_company_logo",
    "has_questions",
    "required_education",
    "employment_type",
    "function",
    "industry",
]
# Reassign instead of mutating in place. errors="ignore" skips labels that are
# no longer present, so the cell can be re-run after the columns are gone.
fake_jobs_df = fake_jobs_df.drop(columns=columns_to_drop, errors="ignore")

# Print the shape of the DataFrame after dropping columns. Adjacent string
# literals are used (not a backslash continuation inside the string) so the
# source indentation does not end up in the printed text.
print(
    "\nShape of fake jobs DataFrame after dropping unnecessary columns: "
    f"{fake_jobs_df.shape}"
)

# Show the first 5 rows of the DataFrame after dropping columns.
print(
    "\nFirst 5 rows of the fake jobs DataFrame after dropping "
    "unnecessary columns:"
)
print(fake_jobs_df.head())


Shape of fake jobs DataFrame after dropping unnecessary columns: (866, 10)

First 5 rows of the fake jobs DataFrame after dropping unnecessary columns:
     job_id                             title                            location  salary_range                                                                                                                                                                                          company_profile                                                                                                                                                                                              description                                                                                                                                                                                             requirements                                                                                                                                                     

In [None]:
# Drop rows whose 'description' is NaN (the only column checked here).
fake_jobs_df = fake_jobs_df.dropna(subset=["description"])

# Print the shape of the DataFrame after dropping rows. The message names the
# actual column and is built from adjacent literals so no source indentation
# leaks into the output.
print(
    "\nShape of fake jobs DataFrame after dropping rows "
    f"with NaN in 'description': {fake_jobs_df.shape}"
)


Shape of fake jobs DataFrame after dropping rows with NaN in 'job_description': (865, 10)


In [None]:
# Randomly select `sample_size` fake jobs (or all of them if fewer exist).
sample_size = 30
# Use a fixed random state for reproducibility
random_state = 42

if len(fake_jobs_df) >= sample_size:
    selected_fake_jobs = fake_jobs_df.sample(
        n=sample_size, random_state=random_state
    )
    print(
        f"Successfully selected {sample_size} fake job "
        f"postings (random_state={random_state})."
    )
else:
    # Not enough rows to sample from: take everything. Copy so the selection
    # is an independent frame rather than a second name for fake_jobs_df.
    selected_fake_jobs = fake_jobs_df.copy()
    print(
        f"Only {len(fake_jobs_df)} fake jobs available. "
        "Taking all of them."
    )

# The preview shows the tail of the selection, so the label says "last 5".
print("\nSelected Fake Jobs (last 5):")
print(selected_fake_jobs.tail())

Successfully selected 30 fake job postings (random_state=42).

Selected Fake Jobs (first 5):
       job_id                                               title              location salary_range                                                                                                                                                                                          company_profile                                                                                                                                                                                              description                                                                                                                                                                                             requirements                                                                                                                                                                                                 benefits re

In [None]:
# Destination CSV for the selected fake jobs, written as directory + file name
output_file_path = (
    "../1_datasets/processed_fake_jobs/"
    "originally_selected_30_fake_jobs.csv"
)

In [15]:
# Save the selected fake jobs to a CSV file.
# to_csv does not create missing folders, so make sure the target directory
# exists first (no-op when it is already there).
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)
selected_fake_jobs.to_csv(output_file_path, index=False)
print(f"\nSelected fake jobs saved to {output_file_path}.")


Selected fake jobs saved to ../1_datasets/processed_fake_jobs/randomlyselected_fake_jobs.csv.
