In [23]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import nltk
from nltk.corpus import stopwords
import re
from tabulate import tabulate
import plotly.express as px
import plotly.graph_objects as go

In [24]:
# Load the dataset
df = pd.read_csv('fake_job_postings.csv')
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [25]:
df.describe()

Unnamed: 0,job_id,telecommuting,has_company_logo,has_questions,fraudulent
count,17880.0,17880.0,17880.0,17880.0,17880.0
mean,8940.5,0.042897,0.795302,0.491723,0.048434
std,5161.655742,0.202631,0.403492,0.499945,0.214688
min,1.0,0.0,0.0,0.0,0.0
25%,4470.75,0.0,1.0,0.0,0.0
50%,8940.5,0.0,1.0,0.0,0.0
75%,13410.25,0.0,1.0,1.0,0.0
max,17880.0,1.0,1.0,1.0,1.0


In [26]:
print("Dataset Shape:", df.shape)
print(df.info())

Dataset Shape: (17880, 18)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15184 non-null  object
 8   benefits             10668 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977

In [27]:
print("Null Values:")
print(df.isnull().sum())

Null Values:
job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64


In [28]:
print("Fraudulent Value Counts:")
print(df['fraudulent'].value_counts())

Fraudulent Value Counts:
fraudulent
0    17014
1      866
Name: count, dtype: int64


In [29]:
# For text columns, replace missing values with an empty string
text_columns = ['title', 'company_profile', 'description', 'requirements', 'benefits']
df[text_columns] = df[text_columns].fillna(' ')

In [30]:
df['location'].fillna('Unknown', inplace=True)
df['department'].fillna('Unknown', inplace=True)
df['salary_range'].fillna('Not Specified', inplace=True)
df['employment_type'].fillna('Not Specified', inplace=True)
df['required_experience'].fillna('Not Specified', inplace=True)
df['required_education'].fillna('Not Specified', inplace=True)
df['industry'].fillna('Not Specified', inplace=True)
df['function'].fillna('Not Specified', inplace=True)

In [31]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = [word for word in text.split() if word not in stop_words]
    return ' '.join(text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jingyiliu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
for col in text_columns:
    df[col] = df[col].apply(preprocess_text)

In [33]:
# Combining text columns into a single feature
df['text'] = df[text_columns].apply(lambda x: ' '.join(x), axis=1)

In [34]:
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,text
0,1,marketing intern,"US, NY, New York",Marketing,Not Specified,food weve created groundbreaking awardwinning ...,food fastgrowing james beard awardwinning onli...,experience content management systems major pl...,,0,1,0,Other,Internship,Not Specified,Not Specified,Marketing,0,marketing intern food weve created groundbreak...
1,2,customer service cloud video production,"NZ, , Auckland",Success,Not Specified,seconds worlds cloud video production service ...,organised focused vibrant awesomedo passion cu...,expect youyour key responsibility communicate ...,get usthrough part seconds team gainexperience...,0,1,0,Full-time,Not Applicable,Not Specified,Marketing and Advertising,Customer Service,0,customer service cloud video production second...
2,3,commissioning machinery assistant cma,"US, IA, Wever",Unknown,Not Specified,valor services provides workforce solutions me...,client located houston actively seeking experi...,implement precommissioning commissioning proce...,,0,1,0,Not Specified,Not Specified,Not Specified,Not Specified,Not Specified,0,commissioning machinery assistant cma valor se...
3,4,account executive washington dc,"US, DC, Washington",Sales,Not Specified,passion improving quality life geography heart...,company esri environmental systems research in...,education bachelors masters gis business admin...,culture anything corporatewe collaborative cre...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,account executive washington dc passion improv...
4,5,bill review manager,"US, FL, Fort Worth",Unknown,Not Specified,spotsource solutions llc global human capital ...,job title itemization review managerlocation f...,qualificationsrn license state texasdiploma ba...,full benefits offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,bill review manager spotsource solutions llc g...


In [35]:
# Distribution of fraudulent vs non-fraudulent job postings
fig = px.histogram(df, x='fraudulent', title='Distribution of Fraudulent vs Non-Fraudulent Job Postings',
                   labels={'fraudulent': 'Fraudulent'}, color='fraudulent',
                   color_discrete_sequence=['#1f77b4', '#ff7f0e'])
fig.update_layout(
    template='plotly_dark',
    xaxis_title='Fraudulent',
    yaxis_title='Count',
    title_x=0.5,
    font=dict(family="Arial, sans-serif", size=14, color="white"),
    paper_bgcolor='#1e1e1e',
    plot_bgcolor='#1e1e1e',
    xaxis=dict(gridcolor='gray'),
    yaxis=dict(gridcolor='gray')
)
fig.show()

In [38]:
fraudulent_jobs = df[df['fraudulent'] == 1]['text']
non_fraudulent_jobs = df[df['fraudulent'] == 0]['text']

print(fraudulent_jobs)

98       ice technician staffing amp recruiting done ri...
144      forward cap  group raised fund purchase homes ...
173      technician instrument controls edison internat...
180      sales executive  sales executive sales executi...
215      ice technician mt poso staffing amp recruiting...
                               ...                        
17827    student positions parttime fulltime  student p...
17828    sales associate  learn earn executive level in...
17829    android developer  infullmobile sp z oo mobile...
17830    payroll clerk  job descriptionwe seeking full ...
17831    furniture mover anthony warren marketing adver...
Name: text, Length: 866, dtype: object


In [36]:
# Top words in fraudulent job postings
fraudulent_jobs = df[df['fraudulent'] == 1]['text']
non_fraudulent_jobs = df[df['fraudulent'] == 0]['text']

def plot_top_words(text, title):
    word_freq = pd.Series(' '.join(text).split()).value_counts().head(20)
    fig = px.bar(word_freq, x=word_freq.index, y=word_freq.values, title=title,
                 labels={'index': 'Words', 'y': 'Frequency'},
                 color=word_freq.values, color_continuous_scale='Blues')
    fig.update_layout(template='plotly_dark')
    fig.show()

plot_top_words(fraudulent_jobs, 'Top Words in Fraudulent Job Postings')
plot_top_words(non_fraudulent_jobs, 'Top Words in Non-Fraudulent Job Postings')

In [39]:
# Feature extraction using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['text']).toarray()
y = df['fraudulent']

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Model training using Multinomial Naive Bayes
model = MultinomialNB()
model.fit(X_train, y_train)

In [46]:
# Predictions
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

In [47]:
# Evaluation
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_prob))

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      3395
           1       1.00      0.32      0.49       181

    accuracy                           0.97      3576
   macro avg       0.98      0.66      0.73      3576
weighted avg       0.97      0.97      0.96      3576

Confusion Matrix:
[[3395    0]
 [ 123   58]]
ROC AUC Score: 0.9394576033979121


In [49]:
# Visualization of Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
fig = px.imshow(conf_matrix, text_auto=True, title='Confusion Matrix')
fig.show()

In [48]:
# ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_fig = go.Figure()
roc_fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC Curve', line=dict(color='cyan')))
roc_fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random Classifier', line=dict(dash='dash', color='red')))
roc_fig.update_layout(title='ROC Curve', xaxis_title='False Positive Rate', yaxis_title='True Positive Rate', template='plotly_dark')
roc_fig.show()

In [50]:
# Display 10 samples with actual and predicted values
samples = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
samples = samples.sample(10).reset_index(drop=True)
print("10 Sample Predictions:")
print(tabulate(samples, headers='keys', tablefmt='fancy_grid'))

10 Sample Predictions:
╒════╤══════════╤═════════════╕
│    │   Actual │   Predicted │
╞════╪══════════╪═════════════╡
│  0 │        1 │           1 │
├────┼──────────┼─────────────┤
│  1 │        0 │           0 │
├────┼──────────┼─────────────┤
│  2 │        0 │           0 │
├────┼──────────┼─────────────┤
│  3 │        0 │           0 │
├────┼──────────┼─────────────┤
│  4 │        0 │           0 │
├────┼──────────┼─────────────┤
│  5 │        0 │           0 │
├────┼──────────┼─────────────┤
│  6 │        1 │           0 │
├────┼──────────┼─────────────┤
│  7 │        0 │           0 │
├────┼──────────┼─────────────┤
│  8 │        0 │           0 │
├────┼──────────┼─────────────┤
│  9 │        0 │           0 │
╘════╧══════════╧═════════════╛
