In [0]:
# Dependencies
import pandas as pd 
import numpy as np

from time import time
# import torch.nn as nn
# import torch
from sklearn.metrics import classification_report


import warnings
warnings.filterwarnings("ignore")

In [0]:
# Read in data from S3 Buckets

url = "https://job-postings-dataviz.s3.amazonaws.com/fake_jobs_clean.csv"

df = pd.read_csv(url, sep=",",  encoding = "UTF-8" )

# Show DataFrame
# df.head()

In [0]:
# Focus only on the job description. Create a dataFrame with label "fraudulent" and "description"
descrip_df =df [['fraudulent','description']]
descrip_df.head(1)


In [6]:
# Drop NaN values but dropping duplicates is qustionable
# these duplicates are real data and contribute to both classes
print(len(descrip_df))
descrip_df= descrip_df.dropna()
print(len(descrip_df))
# descrip_df = descrip_df.drop_duplicates()
# print(descrip_df.count())

17880
17879


In [7]:
# Make sure number of real vs fraudulent are more balanced
df_fake= descrip_df[descrip_df['fraudulent'] == 1] 
df_real = descrip_df[descrip_df['fraudulent'] == 0] 
# Count how many Real vs Fraudulent postings
n_f=len(df_fake)
n_n=len(df_real)
print(n_f)
print(n_n)


865
17014


Create a data set with same instances of fake and real : UNDERSTIMATING the real posts

In [8]:
df_new= df_real.sample  ( n_f  , random_state=580)
df_unders = df_new.append(df_fake)
print(len(df_new))
print(len(df_unders))

865
1730


In [0]:
# Shuffle the UNDERSTIMATE dataframe so fraudulent postings more evenly distributed 
df_underst = df_unders.sample(len(df_unders), random_state=580) 
# df_underst.head(20)
X_u = df_underst.iloc[:,1].values
y_u = df_underst.iloc[:,0].values

### Feature Transformations (Term Frequency times inverse document frequency)


In [10]:
# feature extraction using "bag of words" model. Create feature vectors. and remove stop words
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))


count_vectorizer = CountVectorizer( stop_words=stopwords.words('english'))
tfidfconverter = TfidfTransformer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
#  apply the transformatins
X_c = count_vectorizer.fit_transform(X_u)
X_tfidf = tfidfconverter.fit_transform(X_c).toarray()

# y.shape

Create a Naive Bayes Model 

In [0]:
# Break the data in trainand test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_u , random_state=580)

Multinomial Naive Bayes

In [13]:
# Create a Naive Bayes model and fit training data

from sklearn.naive_bayes import MultinomialNB
t1_start = time()
m_nb = MultinomialNB()
predictor = m_nb.fit(X_train,y_train)
t1_stop = time() 
print("Elapsed time:", t1_stop, t1_start)   
print(f'Elapsed time during the whole program in seconds:', 
                                            t1_stop-t1_start)

Elapsed time: 1586834614.7692697 1586834614.7118258
Elapsed time during the whole program in nanoseconds: 0.057443857192993164


In [0]:
# Tranform the model with the testing data
test_results = predictor.predict(X_test)

In [15]:
from sklearn.metrics import classification_report

print(classification_report(y_test, test_results,
                            target_names= ["real", "fake"]))

              precision    recall  f1-score   support

        real       0.87      0.75      0.81       224
        fake       0.77      0.88      0.82       209

    accuracy                           0.81       433
   macro avg       0.82      0.82      0.81       433
weighted avg       0.82      0.81      0.81       433



Save the model 

In [16]:
# save the model
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
# Save to the model to the local drive

from sklearn.externals import joblib

filename = '/content/drive/My Drive/MultinomialNB_under.h5'
joblib.dump(predictor, filename)

['/content/drive/My Drive/MultinomialNB_under.h5']

In [0]:
# Load the model
# loaded_model= joblib.load(filename)
# predictionsl = loaded_model.predict(X_test)


In [0]:


print(classification_report(y_test, test_results,
                            target_names= ["real", "fake"]))

              precision    recall  f1-score   support

        real       0.86      0.80      0.83       217
        fake       0.81      0.88      0.84       216

    accuracy                           0.84       433
   macro avg       0.84      0.84      0.84       433
weighted avg       0.84      0.84      0.84       433



Complement Naive Bayes : suited for imbalanced data sets

In [0]:
# Use all data set with both models
X= descrip_df.iloc[:,1].values
y=descrip_df.iloc[:,0].values
# apply the transformations TF-IDF
X_c = count_vectorizer.fit_transform(X)
X_tfidf = tfidfconverter.fit_transform(X_c).toarray()

In [0]:
# Break the data in trainand test datasets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, random_state=580, stratify=y)


In [21]:
# Create a Naive Bayes model and fit training data
from sklearn.naive_bayes import ComplementNB

t1_start = time() 
c_nb = ComplementNB()
predictor = c_nb.fit(X_train,y_train)
t1_stop = time() 
print("Elapsed time:", t1_stop, t1_start)   
print(f'Elapsed time during the whole program in nanoseconds:', 
                                            t1_stop-t1_start)

Elapsed time: 1586834934.675624 1586834932.5519407
Elapsed time during the whole program in nanoseconds: 2.123683214187622


In [22]:
# save model to local drive
filename = '/content/drive/My Drive/ComplementNB_all.h5'
joblib.dump(predictor, filename)

['/content/drive/My Drive/ComplementNB_all.h5']

In [23]:
# Tranform the model with the testing data
test_results = predictor.predict(X_test)
print(classification_report(y_test, test_results,
                            target_names= ["real", "fake"]))

              precision    recall  f1-score   support

        real       0.95      0.99      0.97      4254
        fake       0.11      0.01      0.02       216

    accuracy                           0.95      4470
   macro avg       0.53      0.50      0.50      4470
weighted avg       0.91      0.95      0.93      4470



In [24]:
t1_start = time() 
m_nb = MultinomialNB()
predictor = m_nb.fit(X_train,y_train)
t1_stop = time() 
print("Elapsed time:", t1_stop, t1_start)   
print(f'Elapsed time during the whole program in seconds:', 
                                            t1_stop-t1_start)

Elapsed time: 1586834959.4117935 1586834957.1055934
Elapsed time during the whole program in nanoseconds: 2.3062000274658203


In [25]:
# save model to local drive
filename = '/content/drive/My Drive/Multiomial_all.h5'
joblib.dump(predictor, filename)

['/content/drive/My Drive/Multiomial_all.h5']

In [34]:
# Tranform the model with the testing data
test_results = predictor.predict(X_test)
print(classification_report(y_test, test_results,
                            target_names= ["real", "fake"]))

              precision    recall  f1-score   support

        real       0.95      1.00      0.97      4254
        fake       0.00      0.00      0.00       216

    accuracy                           0.95      4470
   macro avg       0.48      0.50      0.49      4470
weighted avg       0.91      0.95      0.93      4470



Deep Learning Model 

In [0]:
# Create a StandardScater model and fit it to the training data
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train)

# Transform the training and testing data using the X_scaler

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)



In [0]:
# first, create a normal neural network with 2 inputs, 6 hidden nodes, and 2 outputs

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
