In this model, I will combine all the text from company_profile, description, requirements, and benefits and try different approaches on it


In [139]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection, naive_bayes
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer


In [140]:
filename = 'fake_job_postings.csv'
df = pd.read_csv(filename)
df.columns

Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')

In [141]:
df = df[['company_profile', 'description', 'requirements', 'benefits', 'fraudulent']]
df.head()

Unnamed: 0,company_profile,description,requirements,benefits,fraudulent
0,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0
1,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0
2,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0
3,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0
4,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0


In [142]:
df.isna().sum()

company_profile    3308
description           1
requirements       2695
benefits           7210
fraudulent            0
dtype: int64

In [143]:
df =df.fillna("unspecified")
df["all_text"] = df["company_profile"] + df['description'] + df['requirements'] + df['benefits']

In [144]:
df.drop(['company_profile', 'description', 'requirements', 'benefits'], axis =1 ,inplace =True)


In [145]:
df

Unnamed: 0,fraudulent,all_text
0,0,"We're Food52, and we've created a groundbreaki..."
1,0,"90 Seconds, the worlds Cloud Video Production ..."
2,0,Valor Services provides Workforce Solutions th...
3,0,Our passion for improving quality of life thro...
4,0,SpotSource Solutions LLC is a Global Human Cap...
...,...,...
17875,0,Vend is looking for some awesome new talent to...
17876,0,WebLinc is the e-commerce platform and service...
17877,0,We Provide Full Time Permanent Positions for m...
17878,0,unspecifiedNemsia Studios is looking for an ex...


In [146]:
my_stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [147]:
df['all_text'] = df['all_text'].str.lower()
df['all_text'] = df['all_text'].str.replace('[^\w\s]','')

In [148]:
df['clean_text'] = df['all_text'].apply(lambda x: ' '.join([item for item in x.split(' ') if item not in my_stopwords]))


In [149]:
df.head()

Unnamed: 0,fraudulent,all_text,clean_text
0,0,were food52 and weve created a groundbreaking ...,food52 weve created groundbreaking awardwinnin...
1,0,90 seconds the worlds cloud video production s...,90 seconds worlds cloud video production servi...
2,0,valor services provides workforce solutions th...,valor services provides workforce solutions me...
3,0,our passion for improving quality of life thro...,passion improving quality life geography heart...
4,0,spotsource solutions llc is a global human cap...,spotsource solutions llc global human capital ...


In [150]:
x = df['clean_text']
y = df['fraudulent']

#split it into training and test sets
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=0.2, random_state=42)

#encoding
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

#vectorization
MAX = 5000
vectorizer = TfidfVectorizer(max_features = MAX)
vectorizer.fit(x_train)

x_trainvec = vectorizer.transform(x_train)
x_testvec = vectorizer.transform(x_test)

In [151]:


from sklearn.linear_model import LogisticRegression

# fit 
logreg = LogisticRegression()
logreg.fit(x_trainvec, y_train)

# predict
y_pred_lr = logreg.predict(x_testvec)

# accuracy
print("Accuracy Score of LogReg :", accuracy_score(y_pred_lr, y_test), "\n") #96.3%

# confusion matrix 
print("Confusion Matrix of LogReg:\n", confusion_matrix(y_test, y_pred_lr), "\n") # [[3324, 0], [128, 55]]

#classifcation report
print("Classification Report of LogReg:\n", classification_report(y_test, y_pred_lr), "\n")



  return f(*args, **kwds)


Accuracy Score of LogReg : 0.9728747203579419 

Confusion Matrix of LogReg:
 [[3395    0]
 [  97   84]] 

Classification Report of LogReg:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99      3395
           1       1.00      0.46      0.63       181

    accuracy                           0.97      3576
   macro avg       0.99      0.73      0.81      3576
weighted avg       0.97      0.97      0.97      3576
 

