# Load Data

In [0]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 30)
from time import time

import warnings
warnings.filterwarnings("ignore")

In [0]:
# Load the job-postings CSV from its hosted location into a DataFrame.
url = "https://job-postings-dataviz.s3.amazonaws.com/fake_jobs_clean.csv"
df = pd.read_csv(
    url,
    sep=",",
    encoding="UTF-8",
)
# Alternatively, read the file from the local data folder:
# df = pd.read_csv("../assets/data/fake_jobs_clean.csv")
# df.head(60)

In [3]:
# Show every column name of the raw frame as a plain Python list.
df.columns.tolist()

['job_id',
 'city',
 'state/province',
 'country',
 'title',
 'department',
 'industry',
 'function',
 'salary_range',
 'salary_provided',
 'company_profile',
 'description',
 'requirements',
 'benefits',
 'benefits_provided',
 'telecommuting',
 'has_company_logo',
 'has_questions',
 'employment_type',
 'required_experience',
 'required_education',
 'fraudulent']

# Feature Selection

In [4]:
# Feature Selection
# Columns left out of the feature matrix; everything else in `df` is kept.
non_feature_columns = [
    'job_id',
    'city',
    'state/province',
    'country',
    'title',
    'department',
    'salary_range',
    'company_profile',
    'description',
    'requirements',
    'benefits',
    'fraudulent',
]
selected_features = df.drop(columns=non_feature_columns)
selected_features

Unnamed: 0,industry,function,salary_provided,benefits_provided,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education
0,,Marketing,0,0,0,1,0,Other,Internship,
1,Marketing and Advertising,Customer Service,0,1,0,1,0,Full-time,Not Applicable,
2,,,0,0,0,1,0,,,
3,Computer Software,Sales,0,1,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree
4,Hospital & Health Care,Health Care Provider,0,1,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree
...,...,...,...,...,...,...,...,...,...,...
17875,Computer Software,Sales,0,1,0,1,1,Full-time,Mid-Senior level,
17876,Internet,Accounting/Auditing,0,1,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree
17877,,,0,0,0,0,0,Full-time,,
17878,Graphic Design,Design,0,1,0,0,1,Contract,Not Applicable,Professional


In [6]:
# One-hot encode the categorical columns of the selected features;
# columns that are already numeric are passed through unchanged.
dummy_features = pd.get_dummies(data=selected_features)
dummy_features

Unnamed: 0,salary_provided,benefits_provided,telecommuting,has_company_logo,has_questions,industry_Accounting,industry_Airlines/Aviation,industry_Alternative Dispute Resolution,industry_Animation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,...,required_experience_Mid-Senior level,required_experience_Not Applicable,required_education_Associate Degree,required_education_Bachelor's Degree,required_education_Certification,required_education_Doctorate,required_education_High School or equivalent,required_education_Master's Degree,required_education_Professional,required_education_Some College Coursework Completed,required_education_Some High School Coursework,required_education_Unspecified,required_education_Vocational,required_education_Vocational - Degree,required_education_Vocational - HS Diploma
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17876,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
17877,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17878,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0


In [7]:
# Print how many encoded feature columns there are, then display their names.
dummy_feature_names = dummy_features.columns.tolist()
print(len(dummy_feature_names))
dummy_feature_names

198


['salary_provided',
 'benefits_provided',
 'telecommuting',
 'has_company_logo',
 'has_questions',
 'industry_Accounting',
 'industry_Airlines/Aviation',
 'industry_Alternative Dispute Resolution',
 'industry_Animation',
 'industry_Apparel & Fashion',
 'industry_Architecture & Planning',
 'industry_Automotive',
 'industry_Aviation & Aerospace',
 'industry_Banking',
 'industry_Biotechnology',
 'industry_Broadcast Media',
 'industry_Building Materials',
 'industry_Business Supplies and Equipment',
 'industry_Capital Markets',
 'industry_Chemicals',
 'industry_Civic & Social Organization',
 'industry_Civil Engineering',
 'industry_Commercial Real Estate',
 'industry_Computer & Network Security',
 'industry_Computer Games',
 'industry_Computer Hardware',
 'industry_Computer Networking',
 'industry_Computer Software',
 'industry_Construction',
 'industry_Consumer Electronics',
 'industry_Consumer Goods',
 'industry_Consumer Services',
 'industry_Cosmetics',
 'industry_Defense & Space',
 'in

In [8]:
# Assign the data to X and y
# X is the encoded feature matrix; y is the 'fraudulent' label column.
# The label is selected by name rather than by column position, so this cell
# keeps working if columns in the CSV are added, removed or reordered.
X = dummy_features.values
y = df["fraudulent"].values

print(X.shape, y.shape)

(17880, 198) (17880,)


In [0]:
from sklearn.model_selection import train_test_split

# Hold out a test partition using scikit-learn's default split size;
# the fixed random_state makes the partition repeatable across runs.
split_parts = train_test_split(X, y, random_state=22)
X_train, X_test, y_train, y_test = split_parts

# Pre-Processing

Scale Data using the MinMaxScaler

In [0]:
# Scale the data using the MinMaxScaler
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [0]:
# Step 1: Label-encode data set
# The encoder learns its classes from the training labels and is then used
# to transform both the training and the test labels.
label_encoder = LabelEncoder().fit(y_train)
encoded_y_train, encoded_y_test = (
    label_encoder.transform(labels) for labels in (y_train, y_test)
)

In [24]:
# Step 2: Convert encoded labels to one-hot-encoding
# Pin the number of classes to what the encoder learned from the training labels.
# Without it, to_categorical infers the width from the largest label present in
# each array, so the train and test matrices could end up with different
# column counts if one partition lacks the highest class.
n_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=n_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=n_classes)
y_train_categorical.shape

(13410, 2)

# Train the Model

In [0]:
# Create a deep learning model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Take the input width from the scaled training matrix instead of hard-coding it,
# so the network still matches the data when the set of dummy columns changes.
n_input_features = X_train_scaled.shape[1]

# Create model and add layers: two 100-unit ReLU hidden layers followed by a
# 2-unit softmax output (one unit per class).
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=n_input_features))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=2, activation='softmax'))

In [0]:
# Compile the model (training happens in a later cell).
# The sparse variant of categorical cross-entropy is used because the model is
# fit on integer class labels (y_train), not on the one-hot label matrix.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [27]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 100)               19900     
_________________________________________________________________
dense_4 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 202       
Total params: 30,202
Trainable params: 30,202
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Fit the model on the scaled training features and integer labels,
# recording the timestamps before and after so the duration can be reported.
t1_start = time()
model.fit(x=X_train_scaled, y=y_train, epochs=100, shuffle=True, verbose=2)
t1_stop = time()

# Report the raw timestamps and the elapsed training time in seconds.
print("Stop/Start Time:", t1_stop, t1_start)
print('Duration of Model in seconds:', t1_stop - t1_start)

Epoch 1/100
420/420 - 1s - loss: 0.1562 - accuracy: 0.9531
Epoch 2/100
420/420 - 1s - loss: 0.1012 - accuracy: 0.9670
Epoch 3/100
420/420 - 1s - loss: 0.0831 - accuracy: 0.9735
Epoch 4/100
420/420 - 1s - loss: 0.0726 - accuracy: 0.9761
Epoch 5/100
420/420 - 1s - loss: 0.0647 - accuracy: 0.9786
Epoch 6/100
420/420 - 1s - loss: 0.0624 - accuracy: 0.9793
Epoch 7/100
420/420 - 1s - loss: 0.0585 - accuracy: 0.9808
Epoch 8/100
420/420 - 1s - loss: 0.0552 - accuracy: 0.9820
Epoch 9/100
420/420 - 1s - loss: 0.0537 - accuracy: 0.9823
Epoch 10/100
420/420 - 1s - loss: 0.0523 - accuracy: 0.9817
Epoch 11/100
420/420 - 1s - loss: 0.0513 - accuracy: 0.9829
Epoch 12/100
420/420 - 1s - loss: 0.0514 - accuracy: 0.9826
Epoch 13/100
420/420 - 1s - loss: 0.0496 - accuracy: 0.9826
Epoch 14/100
420/420 - 1s - loss: 0.0479 - accuracy: 0.9835
Epoch 15/100
420/420 - 1s - loss: 0.0486 - accuracy: 0.9837
Epoch 16/100
420/420 - 1s - loss: 0.0478 - accuracy: 0.9835
Epoch 17/100
420/420 - 1s - loss: 0.0472 - accura

In [29]:
# Quantify the model
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

140/140 - 0s - loss: 0.2154 - accuracy: 0.9734
Loss: 0.21536073088645935, Accuracy: 0.97337806224823


In [0]:
# Predict class probabilities, then turn every row into a hard one-hot label.
# argmax is used instead of rounding: np.around rounds halves to the nearest
# even value, so a softmax row of exactly [0.5, 0.5] would become [0., 0.]
# (no class selected). argmax always picks exactly one class per row.
class_probabilities = model.predict(X_test_scaled)
predicted_classes = np.argmax(class_probabilities, axis=1)
# Index an identity matrix to rebuild a float32 one-hot array with the same
# shape as the probability matrix.
predictions = np.eye(class_probabilities.shape[1], dtype=np.float32)[predicted_classes]

In [31]:
# Display the hard (one-hot) predictions for the test partition.
predictions

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [32]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test_categorical, predictions,
                            target_names= ["real", "fake"]))

              precision    recall  f1-score   support

        real       0.98      1.00      0.99      4251
        fake       0.86      0.55      0.67       219

   micro avg       0.97      0.97      0.97      4470
   macro avg       0.92      0.77      0.83      4470
weighted avg       0.97      0.97      0.97      4470
 samples avg       0.97      0.97      0.97      4470



# Save the Model

In [0]:
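# NOTE: joblib writes a pickle regardless of the file extension, so the call
# below would not produce a real HDF5 file. A Keras model is normally
# persisted with its own API instead, e.g. model.save('nontext_deeplearning_model.h5').
# import joblib
# joblib.dump(model, 'nontext_deeplearning_model.h5')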