In [7]:
import pandas as pd

# Load the raw job-postings dataset.
# The path is relative to the notebook's working directory, in the same style
# as the "../models" and "../data/processed" paths used by the save cells, so
# the notebook is not tied to one machine's drive letter.
# NOTE(review): assumes the notebook runs from a folder that is a sibling of
# the project's data/ directory - confirm before relying on it.
RAW_CSV_PATH = "../data/raw/fake_real_job_postings_3000x25.csv"

df = pd.read_csv(RAW_CSV_PATH)

# Preview the first rows to sanity-check the parse.
df.head(3)

Unnamed: 0,job_id,job_title,job_description,requirements,benefits,company_name,company_profile,industry,employment_type,location,...,application_deadline,contact_email,company_website,has_logo,num_open_positions,job_function,telecommuting,fraud_reason,text_length,is_fake
0,1,Software Engineer,We are looking for responsibilities fast-paced...,Candidates should have dynamic team skills fas...,We offer required skills fast-paced skills req...,Company_543,Our company growth fast-paced responsibilities...,Marketing,Contract,"Toronto, Canada",...,2024-09-16,hr312@company.com,https://www.company.com,0,3,Management,0,,89,0
1,2,Content Writer,We are looking for required support experience...,Candidates should have required team fast-pace...,We offer fast-paced dynamic dynamic strategy g...,Company_192,Our company fast-paced opportunity innovation ...,Finance,Full-time,"Toronto, Canada",...,2024-10-18,hr127@company.com,https://www.company.com,0,10,Development,1,,89,0
2,3,Customer Support Specialist,We are looking for dynamic required fast-paced...,Candidates should have preferred knowledge opp...,We offer skills experience required growth res...,,We are global innovation growth skills knowled...,Healthcare,Internship,Remote,...,2024-01-13,job92@gmail.com,,0,6,Support,0,Suspicious email,69,1


In [8]:
# List every column name in the loaded frame to see which fields are available.
df.columns

Index(['job_id', 'job_title', 'job_description', 'requirements', 'benefits',
       'company_name', 'company_profile', 'industry', 'employment_type',
       'location', 'salary_range', 'required_experience_years',
       'education_level', 'department', 'posting_date', 'application_deadline',
       'contact_email', 'company_website', 'has_logo', 'num_open_positions',
       'job_function', 'telecommuting', 'fraud_reason', 'text_length',
       'is_fake'],
      dtype='object')

In [10]:
# Free-text fields that are concatenated into one document per posting.
text_columns = ["job_title", "job_description", "requirements", "benefits"]

# Missing fields become empty strings so the concatenation never yields NaN.
df[text_columns] = df[text_columns].fillna("")

# Space-join the fields in the order listed above.
lead_column, *other_columns = text_columns
df["merged_text"] = df[lead_column].str.cat(
    [df[column] for column in other_columns],
    sep=" ",
)

In [15]:
# Show the first five merged documents.
df["merged_text"].iloc[:5]

0    Software Engineer We are looking for responsib...
1    Content Writer We are looking for required sup...
2    Customer Support Specialist We are looking for...
3    Data Analyst We are looking for collaboration ...
4    Graphic Designer We are looking for team growt...
Name: merged_text, dtype: object

In [13]:
import re

def clean_text(text):
    """Normalize one raw posting string into lowercase, letters-only text.

    Steps, in order: lowercase, replace HTML-like ``<...>`` tags with a
    space, replace URLs (``http...`` and scheme-less ``www.`` links) with a
    space, replace every character that is not ``a-z`` or whitespace with a
    space, then collapse whitespace runs and strip the ends.

    Parameters
    ----------
    text : str or any
        Raw text. Any non-string value (for example ``None`` or a NaN left
        by a missing cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``""`` for empty or non-string input.
    """
    # Guard: missing values reach here as None/float NaN and have no .lower().
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)                 # remove HTML
    # Tags go first so a URL inside a tag attribute disappears with the tag.
    text = re.sub(r"http\S+|\bwww\.\S+", " ", text)    # remove URLs
    text = re.sub(r"[^a-z\s]", " ", text)              # remove special chars
    return re.sub(r"\s+", " ", text).strip()

# Clean every merged document element-wise and keep the result in a new column.
df["clean_text"] = df["merged_text"].map(clean_text)


In [14]:
# Show the first five cleaned documents.
df["clean_text"].iloc[:5]

0    software engineer we are looking for responsib...
1    content writer we are looking for required sup...
2    customer support specialist we are looking for...
3    data analyst we are looking for collaboration ...
4    graphic designer we are looking for team growt...
Name: clean_text, dtype: object

In [19]:
# Target labels taken from the "is_fake" column.
# ``to_numpy()`` is the accessor pandas recommends over ``.values``; it always
# returns a NumPy ndarray, which is what the split and ``np.save`` cells consume.
y = df["is_fake"].to_numpy()

In [22]:
import numpy as np
# Number of postings whose label is non-zero.
int((y != 0).sum())

1472

In [23]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary cap for the tokenizer and the fixed length of every padded sequence.
MAX_WORDS, MAX_LEN = 20000, 300

# Words outside the kept vocabulary are mapped to the "<OOV>" token.
# NOTE(review): the tokenizer is fitted on the full dataset, before the
# train/validation/test split made in a later cell - consider fitting on the
# training texts only so held-out vocabulary does not influence the index.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=MAX_WORDS)
tokenizer.fit_on_texts(df["clean_text"])

# Convert each cleaned document to a list of integer word indices.
sequences = tokenizer.texts_to_sequences(df["clean_text"])

# Pad and truncate at the end of each sequence so all rows have MAX_LEN entries.
X_text = pad_sequences(sequences, maxlen=MAX_LEN, padding="post", truncating="post")


In [35]:
from sklearn.model_selection import train_test_split

# Two-stage stratified split sharing one seed: 30% is held out first, then that
# hold-out is halved into validation and test (roughly 70 / 15 / 15 overall).
SPLIT_SEED = 42

X_train, X_temp, y_train, y_temp = train_test_split(
    X_text,
    y,
    test_size=0.30,
    stratify=y,
    random_state=SPLIT_SEED,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=SPLIT_SEED,
)


In [34]:
import pickle

import os

# Save the fitted tokenizer so the same word index can be reloaded later.
# The target directory is created first; otherwise open() raises
# FileNotFoundError on a fresh checkout where "../models" does not exist yet.
TOKENIZER_PATH = "../models/tokenizer.pkl"
os.makedirs(os.path.dirname(TOKENIZER_PATH), exist_ok=True)

with open(TOKENIZER_PATH, "wb") as f:
    pickle.dump(tokenizer, f)


In [38]:
import numpy as np

import os

# Persist every split as "<name>.npy" under the processed-data directory.
# The directory is created first so np.save does not fail on a fresh checkout.
PROCESSED_DIR = "../data/processed"
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Insertion order matches the original save order (features first, then labels).
_splits_to_save = {
    "X_train": X_train,
    "X_val": X_val,
    "X_test": X_test,
    "y_train": y_train,
    "y_val": y_val,
    "y_test": y_test,
}
for _split_name, _split_array in _splits_to_save.items():
    np.save(f"{PROCESSED_DIR}/{_split_name}.npy", _split_array)
