 # Predicting Fake Jobs

In [26]:
from path import Path
import pandas as pd

In [27]:
# Load the raw job-postings CSV (path is relative to the working directory)
# and preview the first five rows.
data = Path('fake_job_postings.csv')
job_posting_df = pd.read_csv(filepath_or_buffer=data)
job_posting_df.head(n=5)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [28]:
# Class balance of the target column: 1 = fraudulent posting, 0 = real posting
fraud_flags = job_posting_df["fraudulent"]
fraud_flags.value_counts()

0    17014
1      866
Name: fraudulent, dtype: int64

In [29]:
# Data type of each column, as inferred by pandas when the CSV was loaded
job_posting_df.dtypes

job_id                  int64
title                  object
location               object
department             object
salary_range           object
company_profile        object
description            object
requirements           object
benefits               object
telecommuting           int64
has_company_logo        int64
has_questions           int64
employment_type        object
required_experience    object
required_education     object
industry               object
function               object
fraudulent              int64
dtype: object

In [6]:
# (Disabled) Drop columns in which every value is null
#job_posting_df = job_posting_df.dropna(axis='columns', how='all')

# (Disabled) Drop rows that contain any null value
#job_posting_df = job_posting_df.dropna()

In [30]:
# Drop the columns that are not used as model features.
# Planned: add a binary "has benefits / does not have benefits" column.
# To adjust the feature set for the machine learning model, add or remove
# column names in the list below.
# NOTE: this cell overwrites job_posting_df, so re-running it without reloading
# the data raises KeyError (the columns are already gone).
columns_to_drop = [
    "job_id",
    "title",
    "location",
    "department",
    "salary_range",
    "company_profile",
    "description",
    "requirements",
    "benefits",
    "function",
]
# `columns=` already targets the column axis, so no separate `axis` argument is needed.
job_posting_df = job_posting_df.drop(columns=columns_to_drop)
job_posting_df.dtypes

telecommuting           int64
has_company_logo        int64
has_questions           int64
employment_type        object
required_experience    object
required_education     object
industry               object
fraudulent              int64
dtype: object

In [31]:
# Re-check the class balance of the target: 1 = fraudulent posting, 0 = real posting
job_posting_df.loc[:, "fraudulent"].value_counts()

0    17014
1      866
Name: fraudulent, dtype: int64

In [32]:
# Number of distinct values in each column. The columns with the fewest distinct
# values are the easiest to focus on first; "required_education" and
# "required_experience" both have only a small number of distinct values.
job_posting_df.nunique()

telecommuting            2
has_company_logo         2
has_questions            2
employment_type          5
required_experience      7
required_education      13
industry               131
fraudulent               2
dtype: int64

In [34]:
# Count postings for each (employment_type, fraudulent) combination.
# Rows with a missing employment_type are not counted.
job_posting_df.value_counts(subset=['employment_type', 'fraudulent'])

employment_type  fraudulent
Full-time        0             11130
Contract         0              1480
Part-time        0               723
Full-time        1               490
Temporary        0               239
Other            0               212
Part-time        1                74
Contract         1                44
Other            1                15
Temporary        1                 2
dtype: int64

In [24]:
from sklearn.preprocessing import LabelEncoder

# Replace the text values of employment_type with integer codes, producing a new
# frame (job_posting_df2) and leaving job_posting_df untouched.
# NOTE(review): scikit-learn documents LabelEncoder for target labels; for input
# features OrdinalEncoder / OneHotEncoder are the intended tools -- confirm the choice.
# NOTE(review): missing values are not dropped; they receive their own integer code.
le = LabelEncoder()
employment_codes = le.fit_transform(job_posting_df['employment_type'])
job_posting_df2 = job_posting_df.assign(employment_type=employment_codes)
job_posting_df2.head()

Unnamed: 0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,fraudulent
0,0,1,0,2,Internship,,,0
1,0,1,0,1,Not Applicable,,Marketing and Advertising,0
2,0,1,0,5,,,,0
3,0,1,0,1,Mid-Senior level,Bachelor's Degree,Computer Software,0
4,0,1,1,1,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,0


In [35]:
# Count postings for each (required_experience, fraudulent) combination.
# Rows with a missing required_experience are not counted.
job_posting_df2.value_counts(subset=['required_experience', 'fraudulent'])

required_experience  fraudulent
Mid-Senior level     0             3696
Entry level          0             2518
Associate            0             2255
Not Applicable       0             1056
Director             0              372
Internship           0              371
Entry level          1              179
Executive            0              131
Mid-Senior level     1              113
Not Applicable       1               60
Associate            1               42
Director             1               17
Executive            1               10
Internship           1               10
dtype: int64

In [36]:

# Replace the text values of required_experience with integer codes, producing a
# new frame (job_posting_df3) and leaving job_posting_df2 untouched.
# NOTE(review): missing values are not dropped; they receive their own integer code.
le = LabelEncoder()
experience_codes = le.fit_transform(job_posting_df2['required_experience'])
job_posting_df3 = job_posting_df2.assign(required_experience=experience_codes)
job_posting_df3.head()

Unnamed: 0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,fraudulent
0,0,1,0,2,4,,,0
1,0,1,0,1,6,,Marketing and Advertising,0
2,0,1,0,5,7,,,0
3,0,1,0,1,5,Bachelor's Degree,Computer Software,0
4,0,1,1,1,5,Bachelor's Degree,Hospital & Health Care,0


In [37]:
# Count postings for each (required_education, fraudulent) combination.
# Rows with a missing required_education are not counted.
job_posting_df3.value_counts(subset=['required_education', 'fraudulent'])

required_education                 fraudulent
Bachelor's Degree                  0             5045
High School or equivalent          0             1910
Unspecified                        0             1336
Master's Degree                    0              385
Associate Degree                   0              268
High School or equivalent          1              170
Certification                      0              151
Bachelor's Degree                  1              100
Some College Coursework Completed  0               99
Professional                       0               70
Unspecified                        1               61
Vocational                         0               49
Master's Degree                    1               31
Doctorate                          0               25
Some High School Coursework        1               20
Certification                      1               19
Vocational - HS Diploma            0                9
Some High School Coursework        0

In [38]:
# Replace the text values of required_education with integer codes, producing a
# new frame (job_posting_df4) and leaving job_posting_df3 untouched.
# NOTE(review): missing values are not dropped; they receive their own integer code.
le = LabelEncoder()
education_codes = le.fit_transform(job_posting_df3['required_education'])
job_posting_df4 = job_posting_df3.assign(required_education=education_codes)
job_posting_df4.head()

Unnamed: 0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,fraudulent
0,0,1,0,2,4,13,,0
1,0,1,0,1,6,13,Marketing and Advertising,0
2,0,1,0,5,7,13,,0
3,0,1,0,1,5,1,Computer Software,0
4,0,1,1,1,5,1,Hospital & Health Care,0


In [41]:
# Count postings for each (industry, fraudulent) combination.
# Rows with a missing industry are not counted.
job_posting_df4.value_counts(subset=['industry', 'fraudulent'])

industry                             fraudulent
Information Technology and Services  0             1702
Computer Software                    0             1371
Internet                             0             1062
Education Management                 0              822
Marketing and Advertising            0              783
                                                   ... 
Consumer Goods                       1                1
Online Media                         1                1
Building Materials                   1                1
Medical Practice                     1                1
Broadcast Media                      1                1
Length: 192, dtype: int64

In [42]:
# Replace the text values of industry with integer codes, producing a new frame
# (job_posting_df5) and leaving job_posting_df4 untouched.
# NOTE(review): missing values are not dropped; they receive their own integer code.
# NOTE(review): the integer codes only reflect sort order of the industry names,
# yet a linear model will treat them as a numeric scale -- confirm this is intended.
le = LabelEncoder()
industry_codes = le.fit_transform(job_posting_df4['industry'])
job_posting_df5 = job_posting_df4.assign(industry=industry_codes)
job_posting_df5.head()

Unnamed: 0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,fraudulent
0,0,1,0,2,4,13,131,0
1,0,1,0,1,6,13,75,0
2,0,1,0,5,7,13,131,0
3,0,1,0,1,5,1,22,0
4,0,1,1,1,5,1,51,0


In [44]:
# Save the fully encoded frame to CSV.
# NOTE(review): to_csv also writes the row index as an unnamed first column by
# default; pass index=False if consumers of this file should not see it.
encoded_csv_path = 'encoded_data_postings.csv'
job_posting_df5.to_csv(encoded_csv_path)

 ## Separate the Features (X) from the Target (y)

In [47]:
# Features (X): every column except the label. Target (y): the 0/1 "fraudulent" flag.
target_column = "fraudulent"
X = job_posting_df5.drop(columns=[target_column])
y = job_posting_df5[target_column]

 ## Split our data into training and testing

In [48]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing (scikit-learn's default, spelled out here).
# stratify=y keeps the fraudulent/real ratio the same in both splits, and the
# fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1, stratify=y
)
X_train.shape

(13410, 7)

 ## Create a Logistic Regression Model

In [49]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier; hyper-parameters are gathered in one dict so
# they are easy to find and tune.
# NOTE(review): the target is heavily imbalanced (866 fraudulent vs 17,014 real
# postings); consider class_weight="balanced" if fraud recall matters.
logreg_params = {"solver": "lbfgs", "max_iter": 200, "random_state": 1}
classifier = LogisticRegression(**logreg_params)

 ## Fit (train) our model using the training data

In [50]:
# Train the classifier on the training split only; the test split is kept for evaluation
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

 ## Make predictions

In [51]:
# Predict labels for the held-out rows and line them up with the true labels.
# Converting y_test to a plain array gives the comparison table a fresh 0..n-1 index.
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test.to_numpy()})
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [52]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Overall accuracy. Only 866 of the 17,880 postings (~4.8%) are fraudulent, so a
# model that always predicts 0 would already score about 0.95 here -- accuracy on
# its own cannot show whether any fraudulent posting is actually detected.
print(accuracy_score(y_test, y_pred))

# Confusion matrix: rows are the actual class, columns the predicted class,
# both in the order [0 (real), 1 (fraudulent)].
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))

# Per-class precision / recall / F1. zero_division=0 reports 0 instead of emitting
# a warning when a class is never predicted.
print(classification_report(y_test, y_pred, labels=[0, 1], zero_division=0))

0.9514541387024609
