In [179]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam

In [161]:
df = pd.read_csv('fake_job_postings.csv')

In [162]:
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [163]:
del df['job_id']
del df['salary_range']
del df['telecommuting']
del df['has_company_logo']
del df['has_questions']

In [164]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   title                17880 non-null  object
 1   location             17534 non-null  object
 2   department           6333 non-null   object
 3   company_profile      14572 non-null  object
 4   description          17879 non-null  object
 5   requirements         15184 non-null  object
 6   benefits             10668 non-null  object
 7   employment_type      14409 non-null  object
 8   required_experience  10830 non-null  object
 9   required_education   9775 non-null   object
 10  industry             12977 non-null  object
 11  function             11425 non-null  object
 12  fraudulent           17880 non-null  int64 
dtypes: int64(1), object(12)
memory usage: 1.8+ MB


In [165]:
df.isna().sum()

title                      0
location                 346
department             11547
company_profile         3308
description                1
requirements            2696
benefits                7212
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [166]:
df.fillna(" ",inplace = True)

In [167]:
df['combined_text'] = ""
col_names_ct = ['title', 'location', 'department', 'company_profile', 'description', 'requirements', 'benefits', 'employment_type', 'required_experience', 'required_education', 'industry', 'function']

for i in range(len(col_names_ct)):
    df['combined_text'] = df['combined_text'] + df[col_names_ct[i]] + " "

In [169]:
df = df[['fraudulent', 'combined_text']]

In [170]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]+", " ", text)
    stop_words = stopwords.words("english")
    text = " ".join([word for word in text.split() if word not in stop_words])
    stemmer = PorterStemmer()
    text = " ".join([stemmer.stem(word) for word in text.split()])
    return text

df['combined_text'] = df['combined_text'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['combined_text'] = df['combined_text'].apply(preprocess_text)


In [171]:
df.head()

Unnamed: 0,fraudulent,combined_text
0,0,market intern us ny new york market food52 cre...
1,0,custom servic cloud video product nz auckland ...
2,0,commiss machineri assist cma us ia wever valor...
3,0,account execut washington dc us dc washington ...
4,0,bill review manag us fl fort worth spotsourc s...


In [172]:
X_train, X_test, y_train, y_test = train_test_split(df["combined_text"], df["fraudulent"], test_size=0.2)

In [177]:
max_len = 100
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_text = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_test_text = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_len)

In [180]:
model = Sequential()

model.add(Embedding(tokenizer.num_words, 128, input_length=max_len))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dense(32, activation="relu"))
model.add(Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=["accuracy"])

In [181]:
model.fit(X_train_text, y_train, epochs=10, validation_data=(X_test_text, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1af48035c50>

In [182]:
loss, accuracy = model.evaluate(X_test_text, y_test, verbose=0)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.9774969816207886
