In [1]:
# Dependencies
import pandas as pd 
import numpy as np 
import torch.nn as nn

import torch
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.


In [0]:
# Read in data from S3 Buckets

url = "https://job-postings-dataviz.s3.amazonaws.com/fake_jobs_clean.csv"
df = pd.read_csv(url, sep=",",  encoding = "UTF-8" )

# Show DataFrame
# df.head()

In [4]:
# Focus only on the job description. Create a dataFrame with label "fraudulent" and "description"
descrip_df =df [['fraudulent','description']]
descrip_df.head()

Unnamed: 0,fraudulent,description
0,0,"Food52, a fast-growing, James Beard Award-winn..."
1,0,Organised - Focused - Vibrant - Awesome!Do you...
2,0,"Our client, located in Houston, is actively se..."
3,0,THE COMPANY: ESRI – Environmental Systems Rese...
4,0,JOB TITLE: Itemization Review ManagerLOCATION:...


In [5]:
# Drop NaN values but dropping duplicates is qustionable
# these duplicates are real data and contribute to both classes
print(len(descrip_df))
descrip_df= descrip_df.dropna()
print(len(descrip_df))
# descrip_df = descrip_df.drop_duplicates()
# print(descrip_df.count())

17880
17879


In [7]:
# Make sure number of real vs fraudulent are more balanced (1:5)
df_fake= descrip_df[descrip_df['fraudulent'] == 1] 
df_real = descrip_df[descrip_df['fraudulent'] == 0] 
# Count how many Real vs Fraudulent postings
n_f=len(df_fake)
n_n=len(df_real)
print(n_f)
print(n_n)

865
17014


Create a more balanced data set (1:5 fake to real) 

In [8]:
df_new= df_real.sample  ( 5*n_f  , random_state=580)
df_unders = df_new.append(df_fake)
print(len(df_new))
print(len(df_unders))

4325
5190


In [0]:
# Shuffle the dataframe so "fraudulent" postings are more evenly distributed 
df_underst = df_unders.sample(len(df_unders), random_state=580) 
# df_underst.head(20)
X= df_underst.iloc[:,1].values
y=df_underst.iloc[:,0].values

### Feature Transformations (Term Frequency times inverse document frequency)


In [11]:
# feature extraction using "bag of words" model. Create feature vectors. 
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

count_vectorizer = CountVectorizer( stop_words=stopwords.words('english'))
tfidfconverter = TfidfTransformer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
X_c = count_vectorizer.fit_transform(X)
X_tfidf = tfidfconverter.fit_transform(X_c).toarray()


Create a Naive Bayes Model  (ref : https://scikit-learn.org/stable/modules/naive_bayes.html)

In [0]:
# Break the data in trainand test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, random_state=580, stratify=y)

Multinomial Naive Bayes

In [33]:
# Create a Naive Bayes model and fit training data
from time import time
from sklearn.naive_bayes import ComplementNB

c_nb = ComplementNB(alpha= 0.8)
t1_start = time()
predictor = c_nb.fit(X_train,y_train)
t1_stop = time() 
print("Elapsed time:", t1_stop, t1_start)   
print(f'Elapsed time during the whole program in nanoseconds:',t1_stop-t1_start)

Elapsed time: 1587013068.7796566 1587013068.2961626
Elapsed time during the whole program in nanoseconds: 0.4834940433502197


In [0]:
# Tranform the model with the testing data
test_results = predictor.predict(X_test)

In [35]:
from sklearn.metrics import classification_report

print(classification_report(y_test, test_results,
                            target_names= ["real", "fake"]))

              precision    recall  f1-score   support

        real       0.87      0.99      0.93      1082
        fake       0.84      0.26      0.40       216

    accuracy                           0.87      1298
   macro avg       0.85      0.63      0.66      1298
weighted avg       0.87      0.87      0.84      1298



Save the model 

In [0]:
# save the model
from google.colab import drive
drive.mount('/content/drive')


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Save to the model to the local drive

from sklearn.externals import joblib

filename = '/content/drive/My Drive/NB_under.h5'
joblib.dump(predictor, filename)

['/content/drive/My Drive/NB_under.h5']

In [0]:
# Load the model
loaded_model= joblib.load(filename)
predictionsl = loaded_model.predict(X_test)


In [0]:
from sklearn.metrics import classification_report

print(classification_report(y_test, test_results,
                            target_names= ["real", "fake"]))

              precision    recall  f1-score   support

        real       0.86      0.80      0.83       217
        fake       0.81      0.88      0.84       216

    accuracy                           0.84       433
   macro avg       0.84      0.84      0.84       433
weighted avg       0.84      0.84      0.84       433

