 # Predicting Fake Jobs

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from path import Path
import pandas as pd

In [3]:
# read in data 
data = Path('fake_job_postings.csv')
job_posting_df = pd.read_csv(data)
job_posting_df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [4]:
# count of the fraudulent (1) postings and the actual (0)
job_posting_df["fraudulent"].value_counts()

0    17014
1      866
Name: fraudulent, dtype: int64

In [5]:
# Type of data in each column
job_posting_df.dtypes

job_id                  int64
title                  object
location               object
department             object
salary_range           object
company_profile        object
description            object
requirements           object
benefits               object
telecommuting           int64
has_company_logo        int64
has_questions           int64
employment_type        object
required_experience    object
required_education     object
industry               object
function               object
fraudulent              int64
dtype: object

In [6]:
# Drop the null columns where all values are null
#job_posting_df = job_posting_df.dropna(axis='columns', how='all')

# Drop the null rows
#job_posting_df = df.dropna()

In [7]:
# Dropping Columns 
# Will add a column for "has and does not have benefits"
# Feel free to adjust for the machine learning model - just add or take away column names in the "" inside of the []
job_posting_df=job_posting_df.drop(columns=["job_id","title","location","department","salary_range","company_profile","description","requirements","benefits", "function" ], axis=1)
job_posting_df.dtypes

telecommuting           int64
has_company_logo        int64
has_questions           int64
employment_type        object
required_experience    object
required_education     object
industry               object
fraudulent              int64
dtype: object

In [8]:
# count of the fraudulent (1) postings and the actual (0)
job_posting_df["fraudulent"].value_counts()

0    17014
1      866
Name: fraudulent, dtype: int64

In [9]:
# Showing the amount of unique inputs in each column, easiest to focus on the ones with the smallest. "required_education" and "required_experience"
# both had small amounts of unique inputs
job_posting_df.nunique()

telecommuting            2
has_company_logo         2
has_questions            2
employment_type          5
required_experience      7
required_education      13
industry               131
fraudulent               2
dtype: int64

In [10]:
# Determine the # of fraudulent posts based on required experience
job_posting_df.value_counts(['employment_type','fraudulent'])

employment_type  fraudulent
Full-time        0             11130
Contract         0              1480
Part-time        0               723
Full-time        1               490
Temporary        0               239
Other            0               212
Part-time        1                74
Contract         1                44
Other            1                15
Temporary        1                 2
dtype: int64

In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
job_posting_df2 = job_posting_df.copy()
job_posting_df2['employment_type'] = le.fit_transform(job_posting_df2['employment_type'])
job_posting_df2.head()

Unnamed: 0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,fraudulent
0,0,1,0,2,Internship,,,0
1,0,1,0,1,Not Applicable,,Marketing and Advertising,0
2,0,1,0,5,,,,0
3,0,1,0,1,Mid-Senior level,Bachelor's Degree,Computer Software,0
4,0,1,1,1,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,0


In [12]:
# Determine the # of fraudulent posts based on required experience
job_posting_df2.value_counts(['required_experience','fraudulent'])

required_experience  fraudulent
Mid-Senior level     0             3696
Entry level          0             2518
Associate            0             2255
Not Applicable       0             1056
Director             0              372
Internship           0              371
Entry level          1              179
Executive            0              131
Mid-Senior level     1              113
Not Applicable       1               60
Associate            1               42
Director             1               17
Executive            1               10
Internship           1               10
dtype: int64

In [13]:

le = LabelEncoder()
job_posting_df3 = job_posting_df2.copy()
job_posting_df3['required_experience'] = le.fit_transform(job_posting_df3['required_experience'])
job_posting_df3.head()

Unnamed: 0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,fraudulent
0,0,1,0,2,4,,,0
1,0,1,0,1,6,,Marketing and Advertising,0
2,0,1,0,5,7,,,0
3,0,1,0,1,5,Bachelor's Degree,Computer Software,0
4,0,1,1,1,5,Bachelor's Degree,Hospital & Health Care,0


In [14]:
# Determine the # of fraudulent posts based on required experience
job_posting_df3.value_counts(['required_education','fraudulent'])

required_education                 fraudulent
Bachelor's Degree                  0             5045
High School or equivalent          0             1910
Unspecified                        0             1336
Master's Degree                    0              385
Associate Degree                   0              268
High School or equivalent          1              170
Certification                      0              151
Bachelor's Degree                  1              100
Some College Coursework Completed  0               99
Professional                       0               70
Unspecified                        1               61
Vocational                         0               49
Master's Degree                    1               31
Doctorate                          0               25
Some High School Coursework        1               20
Certification                      1               19
Vocational - HS Diploma            0                9
Some High School Coursework        0

In [15]:
le = LabelEncoder()
job_posting_df4 = job_posting_df3.copy()
job_posting_df4['required_education'] = le.fit_transform(job_posting_df4['required_education'])
job_posting_df4.head()

Unnamed: 0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,fraudulent
0,0,1,0,2,4,13,,0
1,0,1,0,1,6,13,Marketing and Advertising,0
2,0,1,0,5,7,13,,0
3,0,1,0,1,5,1,Computer Software,0
4,0,1,1,1,5,1,Hospital & Health Care,0


In [16]:
# Determine the # of fraudulent posts based on required experience
job_posting_df4.value_counts(['industry','fraudulent'])

industry                             fraudulent
Information Technology and Services  0             1702
Computer Software                    0             1371
Internet                             0             1062
Education Management                 0              822
Marketing and Advertising            0              783
                                                   ... 
Consumer Goods                       1                1
Online Media                         1                1
Building Materials                   1                1
Medical Practice                     1                1
Broadcast Media                      1                1
Length: 192, dtype: int64

In [17]:
le = LabelEncoder()
job_posting_df5 = job_posting_df4.copy()
job_posting_df5['industry'] = le.fit_transform(job_posting_df5['industry'])
job_posting_df5.head()

Unnamed: 0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,fraudulent
0,0,1,0,2,4,13,131,0
1,0,1,0,1,6,13,75,0
2,0,1,0,5,7,13,131,0
3,0,1,0,1,5,1,22,0
4,0,1,1,1,5,1,51,0


In [18]:
job_posting_df5.to_csv('encoded_data_postings.csv')

 ## Separate the Features (X) from the Target (y)

In [19]:
y = job_posting_df5["fraudulent"]
X = job_posting_df5.drop(columns="fraudulent")
# Check the balance of our target values
y.value_counts()

0    17014
1      866
Name: fraudulent, dtype: int64

 ## Split our data into training and testing

In [20]:
from sklearn.model_selection import train_test_split
from collections import Counter
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
print(Counter(y_train))
print(Counter(y_test))

Counter({0: 12761, 1: 649})
Counter({0: 4253, 1: 217})


 ## Create a Random Forest Classifier

In [21]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
model_brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
model_brf.fit(X_train, y_train) 


BalancedRandomForestClassifier(random_state=1)

In [22]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
# Calculated the balanced accuracy score
y_pred = model_brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8544589289642117

In [23]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix, classification_report

matrix = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    matrix, index=["Actual Fraudulent", "Actual No Fraudulent"], columns=["Predicted Fraudulent", "Predicted No Fraudulent"])
cm_df

Unnamed: 0,Predicted Fraudulent,Predicted No Fraudulent
Actual Fraudulent,3603,650
Actual No Fraudulent,30,187


In [24]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced 
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.85      0.86      0.91      0.85      0.73      4253
          1       0.22      0.86      0.85      0.35      0.85      0.73       217

avg / total       0.95      0.85      0.86      0.89      0.85      0.73      4470



In [25]:
# List the features sorted in descending order by feature importance
features_rank = sorted(zip(model_brf.feature_importances_, X.columns), reverse=True)
for feature in features_rank:
    print(f"{feature[1]}: ({feature[0]})")

industry: (0.31506709165572844)
has_company_logo: (0.2776687731513956)
required_education: (0.14877786569651993)
required_experience: (0.12227655510440245)
employment_type: (0.06310034304594092)
has_questions: (0.05323488833238311)
telecommuting: (0.01987448301362959)
