In [None]:
pip install torch torchvision

In [None]:
pip install pytorch-pretrained-bert

In [9]:
# Dependencies
import pandas as pd 
import numpy as np 
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel
import torch
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides pandas warnings
# (e.g. deprecation or copy-related notices) raised by the cells below —
# consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [60]:
# Load the cleaned job-postings CSV.
# Raise the column display limit to 30 first so the wide frame is shown
# without pandas eliding columns in the middle.
pd.options.display.max_columns = 30
fake_jobs_df = pd.read_csv("fake_jobs_clean.csv")
fake_jobs_df

Unnamed: 0,job_id,city,state/province,country,title,department,industry,function,salary_range,salary_provided,company_profile,description,requirements,benefits,benefits_provided,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,fraudulent
0,1,New York,NY,US,Marketing Intern,Marketing,,Marketing,,0,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,0,1,0,Other,Internship,,0
1,2,Auckland,,NZ,Customer Service - Cloud Video Production,Success,Marketing and Advertising,Customer Service,,0,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,1,0,1,0,Full-time,Not Applicable,,0
2,3,Wever,IA,US,Commissioning Machinery Assistant (CMA),,,,,0,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,0,1,0,,,,0
3,4,Washington,DC,US,Account Executive - Washington DC,Sales,Computer Software,Sales,,0,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,1,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,0
4,5,Fort Worth,FL,US,Bill Review Manager,,Hospital & Health Care,Health Care Provider,,0,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,1,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17876,Toronto,ON,CA,Account Director - Distribution,Sales,Computer Software,Sales,,0,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,1,0,1,1,Full-time,Mid-Senior level,,0
17876,17877,Philadelphia,PA,US,Payroll Accountant,Accounting,Internet,Accounting/Auditing,,0,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,1,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,0
17877,17878,Houston,TX,US,Project Cost Control Staff Engineer - Cost Con...,,,,,0,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,0,0,0,0,Full-time,,,0
17878,17879,Lagos,LA,NG,Graphic Designer,,Graphic Design,Design,,0,,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,1,0,0,1,Contract,Not Applicable,Professional,0


In [61]:
# Keep only the job description text and the fraudulent label.
# .copy() makes `df` an independent frame rather than a selection of
# fake_jobs_df, so later modifications of `df` are unambiguous and cannot
# raise pandas' SettingWithCopyWarning.
df = fake_jobs_df[['description', 'fraudulent']].copy()
df

Unnamed: 0,description,fraudulent
0,"Food52, a fast-growing, James Beard Award-winn...",0
1,Organised - Focused - Vibrant - Awesome!Do you...,0
2,"Our client, located in Houston, is actively se...",0
3,THE COMPANY: ESRI – Environmental Systems Rese...,0
4,JOB TITLE: Itemization Review ManagerLOCATION:...,0
...,...,...
17875,Just in case this is the first time you’ve vis...,0
17876,The Payroll Accountant will focus primarily on...,0
17877,Experienced Project Cost Control Staff Enginee...,0
17878,Nemsia Studios is looking for an experienced v...,0


In [62]:
# Tally the label column: how many postings are real (0) vs fraudulent (1)
from collections import Counter

label_counts = Counter(df['fraudulent'].values)
print(label_counts)

Counter({0: 17014, 1: 866})


In [63]:
# Drop rows with a missing description or label.
# Reassign instead of using inplace=True: `df` was selected out of
# fake_jobs_df, and mutating such a selection in place can raise pandas'
# SettingWithCopyWarning (hidden here by the global warnings filter).
# Reassignment yields the same rows.
df = df.dropna()
df

Unnamed: 0,description,fraudulent
0,"Food52, a fast-growing, James Beard Award-winn...",0
1,Organised - Focused - Vibrant - Awesome!Do you...,0
2,"Our client, located in Houston, is actively se...",0
3,THE COMPANY: ESRI – Environmental Systems Rese...,0
4,JOB TITLE: Itemization Review ManagerLOCATION:...,0
...,...,...
17875,Just in case this is the first time you’ve vis...,0
17876,The Payroll Accountant will focus primarily on...,0
17877,Experienced Project Cost Control Staff Enginee...,0
17878,Nemsia Studios is looking for an experienced v...,0


In [64]:
# Reduce the class imbalance by down-sampling the real postings.
# All fraudulent rows are kept; the real rows are randomly sampled down to
# 1700 with a fixed seed so the subset is reproducible.
df_fraudulent = df[df['fraudulent'] == 1]
df_normal = df[df['fraudulent'] == 0]

df_normal = df_normal.sample(n=1700, random_state=22)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the same rows in the same order (sampled real postings
# first, then the fraudulent ones) and preserves the original index labels.
df = pd.concat([df_normal, df_fraudulent])

df

Unnamed: 0,description,fraudulent
9024,Training OfficerDistribution Channels Training...,0
1645,We are looking for a talented and experienced ...,0
2537,We are an established and progressive chemical...,0
9242,The key areas of responsibility in this positi...,0
12967,WDM Group is seeking a top level RELATIONSHIP ...,0
...,...,...
17827,Student Positions Part-Time and Full-Time.You ...,1
17828,LEARN TO EARN AN EXECUTIVE LEVEL INCOMEFULL TR...,1
17829,inFullMobile Sp. z o.o. is a mobile software d...,1
17830,JOB DESCRIPTIONWe are seeking a full time payr...,1


In [65]:
# Randomly reorder the rows (fixed seed) so the fraudulent postings are spread
# through the frame rather than grouped at the end, then renumber the index
# from 0 and discard the old index labels.
from sklearn.utils import shuffle

df = shuffle(df, random_state=22).reset_index(drop=True)
df

# Note: a pandas-only variant of this step was previously sketched here using
# df.sample(frac=1, random_state=22) followed by reset_index(drop=True).

Unnamed: 0,description,fraudulent
0,The ERP Team takes care of back-office applica...,0
1,Requirements:Candidate must possess at least a...,0
2,6* Ultra Luxury American Cruise Company is urg...,1
3,Independent Optometrist office looking for a f...,1
4,We have several openings available in this are...,1
...,...,...
2560,Now hiring CSR / Advertising representatives t...,1
2561,(We have more than 1500+ Job openings in our w...,0
2562,About Us:QB Medical Inc. is a medical supply d...,0
2563,Communicator. Writer. People-person.German-nat...,0


In [66]:
# Re-tally the labels to confirm the down-sampling produced the intended mix.
# Counter is imported in the earlier class-count cell.
balanced_counts = Counter(df['fraudulent'].values)
print(balanced_counts)

Counter({0: 1700, 1: 865})


In [None]:
# Create training and testing data 