In [3]:
import pandas as pd
import plotly.express as px


In [4]:
file_path = '../dataset/fake_job_postings.csv'
df = pd.read_csv(file_path)

df_info = df.info()
df_head = df.head()

df_info, df_head


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15184 non-null  object
 8   benefits             10668 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  object
 16  func

(None,
    job_id                                      title            location  \
 0       1                           Marketing Intern    US, NY, New York   
 1       2  Customer Service - Cloud Video Production      NZ, , Auckland   
 2       3    Commissioning Machinery Assistant (CMA)       US, IA, Wever   
 3       4          Account Executive - Washington DC  US, DC, Washington   
 4       5                        Bill Review Manager  US, FL, Fort Worth   
 
   department salary_range                                    company_profile  \
 0  Marketing          NaN  We're Food52, and we've created a groundbreaki...   
 1    Success          NaN  90 Seconds, the worlds Cloud Video Production ...   
 2        NaN          NaN  Valor Services provides Workforce Solutions th...   
 3      Sales          NaN  Our passion for improving quality of life thro...   
 4        NaN          NaN  SpotSource Solutions LLC is a Global Human Cap...   
 
                                         

# Data Cleaning

### Handling Duplciated Rows

No duplicated rows were found.

In [5]:
duplicates = df.duplicated()
num_duplicates = duplicates.sum()

print(f"Number of duplicated rows: {num_duplicates}")

Number of duplicated rows: 0


### Handling Missing Values

Upon inspecting the dataset, we found several fields with null values. Since these fields are all categorical, we decided to fill the missing values with the appropriate categorical values:

- Missing values filled with **"Unknown"**: Used for columns like `location` and `employment_type`, where missing data likely means the information is unavailable or not applicable. This preserves the understanding that the data may exist but is not provided.
  
- Missing values filled with **"Not Provided"**: Applied to columns such as `salary_range` and `company_profile`, where the missing information might indicate that it was purposely left out by the job poster.

In [6]:
df.isnull().sum()

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [7]:
df['location'].fillna('Unknown', inplace=True)
df['employment_type'].fillna('Unknown', inplace=True)
df['required_experience'].fillna('Unknown', inplace=True)
df['required_education'].fillna('Unknown', inplace=True)
df['industry'].fillna('Unknown', inplace=True)
df['function'].fillna('Unknown', inplace=True)

df['department'].fillna('Not Provided', inplace=True)
df['salary_range'].fillna('Not Provided', inplace=True)
df['company_profile'].fillna('Not Provided', inplace=True)
df['description'].fillna('Not Provided', inplace=True)
df['requirements'].fillna('Not Provided', inplace=True)
df['benefits'].fillna('Not Provided', inplace=True)

df_null_removed = df.copy()

missing_values = df_null_removed.isnull().sum()

missing_values

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['location'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['employment_type'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting

job_id                 0
title                  0
location               0
department             0
salary_range           0
company_profile        0
description            0
requirements           0
benefits               0
telecommuting          0
has_company_logo       0
has_questions          0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
fraudulent             0
dtype: int64

### Investigating and Handling Outliers

After handling missing values, we checked for potential outliers in the dataset. For this, we only focused on the numeric columns. Using the **Interquartile Range (IQR)** method, outliers were found in columns such as `telecommuting`, `has_company_logo`, and `fraudulent`. 

Upon further investigation, we plotted these columns and found that the "outliers" were a result of **class imbalance** rather than true outliers. Since these columns are binary and categorical in nature, the detected values were not problematic for analysis and thus no additional outlier removal was needed.

This step helped us realise that there is data imbalance in certain categories.

In [8]:
numeric_columns = df_null_removed.select_dtypes(include=['int64', 'float64']).columns

def detect_outliers_iqr_with_print(df, column):
    # Calculate Q1 (25th percentile) and Q3 (75th percentile)
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1

    # Defining outlier boundaries
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Identifying outliers
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]

    # Print IQR and boundaries
    print(f"Column: {column}")
    print(f"Q1: {Q1}, Q3: {Q3}")
    print(f"IQR: {IQR}")
    print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")
    print(f"Number of outliers: {len(outliers)}\n")

    return outliers

for col in numeric_columns:
    outliers = detect_outliers_iqr_with_print(df_null_removed, col)


Column: job_id
Q1: 4470.75, Q3: 13410.25
IQR: 8939.5
Lower Bound: -8938.5, Upper Bound: 26819.5
Number of outliers: 0

Column: telecommuting
Q1: 0.0, Q3: 0.0
IQR: 0.0
Lower Bound: 0.0, Upper Bound: 0.0
Number of outliers: 767

Column: has_company_logo
Q1: 1.0, Q3: 1.0
IQR: 0.0
Lower Bound: 1.0, Upper Bound: 1.0
Number of outliers: 3660

Column: has_questions
Q1: 0.0, Q3: 1.0
IQR: 1.0
Lower Bound: -1.5, Upper Bound: 2.5
Number of outliers: 0

Column: fraudulent
Q1: 0.0, Q3: 0.0
IQR: 0.0
Lower Bound: 0.0, Upper Bound: 0.0
Number of outliers: 866



In [9]:
telecommuting_plot = px.histogram(df_null_removed, x='telecommuting', title='Distribution of Telecommuting (Outliers Detected)',
                                  labels={'telecommuting': 'Telecommuting'},
                                  color='telecommuting', barmode='group')

has_company_logo_plot = px.histogram(df_null_removed, x='has_company_logo', title='Distribution of Has Company Logo (Outliers Detected)',
                                     labels={'has_company_logo': 'Has Company Logo'},
                                     color='has_company_logo', barmode='group')

fraudulent_plot = px.histogram(df_null_removed, x='fraudulent', title='Distribution of Fraudulent Job Postings (Outliers Detected)',
                               labels={'fraudulent': 'Fraudulent'},
                               color='fraudulent', barmode='group')

telecommuting_plot.show()
has_company_logo_plot.show()
fraudulent_plot.show()


### Exploding the `location` Column

The `location` column contains comma-separated values representing the country, state, and city. To make this data more usable, we exploded the `location` column into three separate fields: `country`, `state`, and `city`.

- **Before**: The `location` column was a single string in the format `country, state, city` (e.g., US, NY, New York).
- **After**: We split the column into three distinct columns: `country`, `state`, and `city`. 

For rows where either `country`, `state`, and `city` was missing, the missing value is replaced by **"Unknown"**

In [10]:
# Splitting the 'location' column into 'country', 'state', and 'city', handling missing values
df_null_removed[['country', 'state', 'city']] = df_null_removed['location'].str.split(',', expand=True, n=2)

df_null_removed[['job_id', 'location', 'country', 'state', 'city']].head()

Unnamed: 0,job_id,location,country,state,city
0,1,"US, NY, New York",US,NY,New York
1,2,"NZ, , Auckland",NZ,,Auckland
2,3,"US, IA, Wever",US,IA,Wever
3,4,"US, DC, Washington",US,DC,Washington
4,5,"US, FL, Fort Worth",US,FL,Fort Worth


In [11]:
df_location_split = df_null_removed.copy()

df_location_split['country'] = df_location_split['country'].replace([' ', None], 'Unknown')
df_location_split['state'] = df_location_split['state'].replace([' ', None], 'Unknown')
df_location_split['city'] = df_location_split['city'].replace([' ', None], 'Unknown')

df_location_split.drop(columns=['location'], inplace=True)

df_location_split[['job_id', 'country', 'state', 'city']].head()

Unnamed: 0,job_id,country,state,city
0,1,US,NY,New York
1,2,NZ,Unknown,Auckland
2,3,US,IA,Wever
3,4,US,DC,Washington
4,5,US,FL,Fort Worth


## Remove irrelevant columns

Columns such as job_id are not relevant since there is no ranking involved

In [23]:
df_cleaned = df_location_split.copy()
df_cleaned = df_cleaned.drop(['job_id'],axis=1)
df_cleaned

Unnamed: 0,title,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,country,state,city
0,Marketing Intern,Marketing,Not Provided,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,Not Provided,0,1,0,Other,Internship,Unknown,Unknown,Marketing,0,US,NY,New York
1,Customer Service - Cloud Video Production,Success,Not Provided,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,Unknown,Marketing and Advertising,Customer Service,0,NZ,Unknown,Auckland
2,Commissioning Machinery Assistant (CMA),Not Provided,Not Provided,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,Not Provided,0,1,0,Unknown,Unknown,Unknown,Unknown,Unknown,0,US,IA,Wever
3,Account Executive - Washington DC,Sales,Not Provided,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,US,DC,Washington
4,Bill Review Manager,Not Provided,Not Provided,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,US,FL,Fort Worth
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,Account Director - Distribution,Sales,Not Provided,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,1,1,Full-time,Mid-Senior level,Unknown,Computer Software,Sales,0,CA,ON,Toronto
17876,Payroll Accountant,Accounting,Not Provided,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0,US,PA,Philadelphia
17877,Project Cost Control Staff Engineer - Cost Con...,Not Provided,Not Provided,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,Not Provided,0,0,0,Full-time,Unknown,Unknown,Unknown,Unknown,0,US,TX,Houston
17878,Graphic Designer,Not Provided,Not Provided,Not Provided,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0,NG,LA,Lagos


## Sorting by fraudulent job listings

To help with recognising patterns on fraudulent cases for EDA

In [30]:
df_sorted = df_cleaned.copy()
df_sorted = df_sorted.sort_values(by='fraudulent',ascending=False)
df_sorted

Unnamed: 0,title,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,country,state,city
4577,Senior Sales Professionals,Sales/Marketing,Not Provided,Not Provided,Do YOU have the sales skills or entrepreneuria...,Not Provided,Not Provided,0,0,0,Unknown,Unknown,High School or equivalent,Unknown,Sales,1,US,IN,Indianapolis
17547,Home Based Payroll Data Entry Clerk Position -...,Not Provided,Not Provided,Not Provided,We are a full-service marketing and staffing f...,RequirementsAll you need is access to the Inte...,This is an entry level position and we offer f...,0,0,0,Unknown,Unknown,Unknown,Unknown,Unknown,1,GB,AGB,Unknown
17664,Data Entry Admin/Clerical Positions - Work Fro...,Not Provided,Not Provided,Not Provided,ACCEPTING ONLINE APPLICATIONS ONLYClick Here T...,Not Provided,Not Provided,0,0,0,Unknown,Unknown,Unknown,Unknown,Unknown,1,US,CA,Visalia
17779,J2EE Developer Required for Bahrain,Information Technology,Not Provided,Not Provided,Greetings from VAM SYSTEMS…..VAM SYSTEMS is a ...,Skillset required:2-3 years of J2EE experience...,Not Provided,0,0,1,Unknown,Unknown,Unknown,Unknown,Unknown,1,BH,13,Unknown
17663,Cash In Hand Job (Urgent Staff Required),Not Provided,Not Provided,Not Provided,Cash In Hand Job (Urgent Staff Required)No Exp...,Not Provided,Not Provided,0,0,0,Part-time,Unknown,Unknown,Unknown,Unknown,1,US,CA,Los Angeles
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16718,iOS Developer,Not Provided,Not Provided,We're Hiring!Yoyo is bringing together a game-...,Yoyo is hiring an experienced Software Develop...,you have experience building scalable iOS apps...,being apart of an early-stage venture-backed s...,0,1,1,Full-time,Mid-Senior level,Unknown,Financial Services,Engineering,0,GB,LND,London
16719,Android Developer,Not Provided,Not Provided,We're Hiring!Yoyo is bringing together a game-...,Yoyo is hiring an experienced Software Develop...,you have experience building scalable Android ...,being apart of an early-stage venture-backed s...,0,1,1,Full-time,Mid-Senior level,Unknown,Financial Services,Engineering,0,GB,LND,London
16720,Benefits Consultant,Not Provided,Not Provided,About AflacWould you buy insurance from a duck...,Aflac Benefits ConsultantWe are looking for en...,Not Provided,Top-Notch BenefitsStock bonus program allows c...,0,1,0,Unknown,Unknown,Unknown,Insurance,Unknown,0,US,OK,Tulsa
16722,Lead Information Architect,PMO,208000-270400,CARES is Alabama’s approach to the modernizati...,"Design, develop, implement, and validate innov...",Experience:10+ years of experience with system...,Not Provided,0,0,0,Unknown,Unknown,Unknown,Unknown,Unknown,0,US,AL,Montgomery


## Text Proprocessing

Clean textual columns by removing special characters, stop words, and performing tokenization.

In [37]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Clean text function
def clean_text(text):
    text = re.sub(r'\W+', ' ', text.lower())  # Remove special characters and lowercase
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(tokens)

# Apply text cleaning
df_cleaned = df_sorted.copy()
df_cleaned['description_cleaned'] = df_cleaned['description'].apply(clean_text)


KeyboardInterrupt: 