# **FraudSpotter: Job Posting Detection Using NLP & ML Models – DistilBERT Portion**

Created by: Maureen Ekwebelem & YaeJin (Sally) Kang

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import re
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from tqdm import tqdm

import os
# Report the directory the kernel started in.
cwd = os.getcwd()
print(cwd)

# Directory to work from. The dataset is later read through a relative path,
# so the working directory decides where it is looked up.
# The CAPSTONE_DIR environment variable overrides the original author's local
# path so the notebook can run on other machines without editing this cell.
new_directory = os.environ.get(
    "CAPSTONE_DIR",
    "C:\\Users\\SAMSUNG\\OneDrive\\Desktop\\capstone",
)

# Only change directory when the target exists; otherwise keep the current one
# instead of aborting the notebook with FileNotFoundError.
if os.path.isdir(new_directory):
    os.chdir(new_directory)
else:
    print(f"Directory not found, keeping current working directory: {new_directory}")

print("New working directory:", os.getcwd())


c:\Users\SAMSUNG\OneDrive\Desktop
New working directory: C:\Users\SAMSUNG\OneDrive\Desktop\capstone


In [None]:
# Read the job-postings CSV; the relative path is resolved against the
# current working directory.
df = pd.read_csv("fake_job_postings.csv")

# Preview the first five rows as the cell's rich output.
df.head(n=5)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [None]:
# Drop every column in which more than 80% of the values are missing.
null_counts = df.isna().sum()
missing_percent = null_counts.div(len(df)).mul(100)
high_missing = missing_percent.loc[missing_percent.gt(80)]
df = df.drop(columns=high_missing.index)

# Remove rows that are exact duplicates across all remaining columns.
# NOTE(review): the preview above shows `Unnamed: 0` and `job_id` columns that
# look like per-row identifiers; if they are unique, no two rows can match and
# this step removes nothing. Consider deduplicating on the content columns
# only - confirm against the data.
df = df.drop_duplicates()

In [None]:
# Fill NaN and combine text
text_columns = [
    'title', 'location', 'department', 'company_profile',
    'description', 'requirements', 'benefits',
    'employment_type', 'required_experience',
    'required_education', 'industry', 'function'
]

# Keep only the columns that are actually present (an earlier step may have
# dropped some). Filtering once here keeps the fill step and the join step
# below in agreement; previously only the fill step was guarded, so a missing
# column raised KeyError on the join.
text_columns = [col for col in text_columns if col in df.columns]

# Replace missing values with empty strings so the join never sees NaN.
for col in text_columns:
    df[col] = df[col].fillna('')

# Combine relevant text fields into one space-separated string per posting.
df['text'] = df[text_columns].astype(str).agg(' '.join, axis=1)


In [None]:
# NOTE(review): this repeats the assignment that ends the previous cell; it
# rebuilds the combined 'text' column from `text_columns` (defined in that
# cell) and can likely be removed - confirm nothing modifies df in between.
df['text'] = df[text_columns].astype(str).agg(' '.join, axis=1)

# Clean Text - standard preprocessing
def clean_text(text):
    """Strip HTML tags and URLs from a string and normalise its whitespace.

    Parameters
    ----------
    text : object
        Raw text. Anything that is not a ``str`` is treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    # Replace tags with a space (not the empty string) so that words separated
    # only by markup, e.g. "Hello<br>World", are not fused into one token.
    # The extra spaces are collapsed by the whitespace step below.
    text = re.sub(r'<.*?>', ' ', text)               # remove HTML tags
    text = re.sub(r'http\S+|www\S+', '', text)      # remove URLs
    text = re.sub(r'\s+', ' ', text).strip()        # collapse whitespace
    return text

# Run every combined posting through clean_text and keep the result in its
# own column, leaving the raw 'text' column untouched.
df['text_clean'] = df['text'].map(clean_text)

# Report the frame's dimensions after the cleaning steps.
print("Final shape after cleaning:", df.shape)

Final shape after cleaning: (17880, 20)


In [None]:
# Target label for every posting.
y = df['fraudulent']

# Name of the column holding the cleaned, combined posting text.
text_col = 'text_clean'

# Structured features (numeric flags + categoricals), restricted to the
# columns that are still present in df.
structured_cols = [
    c
    for c in (
        'telecommuting',
        'has_company_logo',
        'has_questions',
        'employment_type',
        'required_experience',
        'required_education',
        'industry',
        'function',
    )
    if c in df.columns
]

X_struct, X_text = df[structured_cols], df[text_col]

# One stratified 80/20 split applied to the structured features, the text and
# the labels together, so the three stay row-aligned.
(
    X_train_struct, X_test_struct,
    X_train_text, X_test_text,
    y_train, y_test,
) = train_test_split(X_struct, X_text, y, test_size=0.2, stratify=y, random_state=42)

print("Train size:", len(X_train_text))
print("Test size:", len(X_test_text))

Train size: 14304
Test size: 3576


In [None]:
#distilbert model
print("\n=== Building DistilBERT embeddings ===")

# Categorical columns (subset of structured_cols)
categorical_cols = [
    'employment_type',
    'required_experience',
    'required_education',
    'industry',
    'function'
]
categorical_cols = [c for c in categorical_cols if c in X_train_struct.columns]

# Pin every categorical column to the levels observed in the TRAIN split.
# Calling get_dummies(drop_first=True) independently on train and test lets
# each frame drop its *own* first level; if the two differ, a test row's real
# category is dropped and the row is silently encoded as the baseline.
# Sharing one categorical dtype guarantees identical columns and the same
# dropped baseline. Levels seen only in test become NaN and therefore encode
# as all zeros.
train_struct_cat = X_train_struct.copy()
test_struct_cat = X_test_struct.copy()
for col in categorical_cols:
    train_struct_cat[col] = train_struct_cat[col].astype('category')
    test_struct_cat[col] = test_struct_cat[col].astype(train_struct_cat[col].dtype)

# One-hot encode categorical features
X_train_encoded = pd.get_dummies(train_struct_cat, columns=categorical_cols, drop_first=True)
X_test_encoded  = pd.get_dummies(test_struct_cat,  columns=categorical_cols, drop_first=True)

# Safety net: force the test frame onto exactly the train columns, in order.
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

print("Structured train shape (after one-hot):", X_train_encoded.shape)
print("Structured test shape (after one-hot):",  X_test_encoded.shape)

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and encoder come from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Load the encoder straight onto the chosen device, then put it in eval mode
# (it is only used for inference below).
bert_model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)
bert_model.eval()

def get_bert_embeddings(text_series, tokenizer, model, device, batch_size=16, max_length=128):
    """Embed each text as the attention-masked mean of the model's last hidden states.

    Parameters
    ----------
    text_series : pandas.Series or other iterable of str
        Texts to embed, processed in order.
    tokenizer : callable
        Called with the batch of texts plus ``padding``, ``truncation``,
        ``max_length`` and ``return_tensors="pt"``. Its result must provide
        ``.to(device)``, an ``"attention_mask"`` entry, and be unpackable
        into ``model(**encoded)``.
    model : callable
        Returns an object exposing ``last_hidden_state``.
    device : torch.device
        Device the encoded batch is moved to.
    batch_size : int
        Number of texts per forward pass.
    max_length : int
        Truncation length handed to the tokenizer.

    Returns
    -------
    numpy.ndarray
        One row per input text. For empty input, an array with zero rows whose
        width is ``model.config.hidden_size`` when available, else 0.
    """
    texts = list(text_series)
    if not texts:
        # np.concatenate raises on an empty list, so return an empty matrix.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[start:start + batch_size]
        encoded = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model(**encoded)
            hidden = outputs.last_hidden_state
            # Mean-pool over real tokens only. Batches are padded to their
            # longest member, so an unmasked mean would mix padding positions
            # into the embedding and make it depend on batch composition.
            mask = encoded["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)  # avoid division by zero
            pooled = (hidden * mask).sum(dim=1) / token_counts
            embeddings.append(pooled.cpu().numpy())

    return np.concatenate(embeddings, axis=0)

print("\nComputing DistilBERT embeddings for train...")
X_train_bert = get_bert_embeddings(X_train_text, tokenizer, bert_model, device)

print("Computing DistilBERT embeddings for test...")
X_test_bert = get_bert_embeddings(X_test_text, tokenizer, bert_model, device)

print("BERT embeddings train shape:", X_train_bert.shape)
print("BERT embeddings test shape:",  X_test_bert.shape)

# Combine BERT embeddings with structured numeric/categorical features.
# The encoded frame can mix integer flag columns with boolean dummy columns
# (depending on the pandas version); `.values` on such a frame yields an
# object-dtype array, which would turn the whole stacked matrix into Python
# objects. Request a float array explicitly instead.
X_train_final_bert = np.hstack([X_train_bert, X_train_encoded.to_numpy(dtype=np.float64)])
X_test_final_bert  = np.hstack([X_test_bert,  X_test_encoded.to_numpy(dtype=np.float64)])

print("Final BERT+structured train shape:", X_train_final_bert.shape)
print("Final BERT+structured test shape:",  X_test_final_bert.shape)

# Scale features for LR: statistics are fitted on train only and reused on test.
scaler = StandardScaler()
X_train_bert_scaled = scaler.fit_transform(X_train_final_bert)
X_test_bert_scaled  = scaler.transform(X_test_final_bert)




=== Building DistilBERT embeddings ===
Structured train shape (after one-hot): (14304, 189)
Structured test shape (after one-hot): (3576, 189)

Computing DistilBERT embeddings for train...


100%|██████████| 894/894 [9:25:34<00:00, 37.96s/it]       


Computing DistilBERT embeddings for test...


100%|██████████| 224/224 [16:53<00:00,  4.53s/it]


BERT embeddings train shape: (14304, 768)
BERT embeddings test shape: (3576, 768)
Final BERT+structured train shape: (14304, 957)
Final BERT+structured test shape: (3576, 957)


In [None]:
def evaluate_model(name, model, X_train, X_test, y_train, y_test, is_prob_model=True):
    """Fit ``model`` on the training data and print its test-set metrics.

    Prints a header containing ``name``, the confusion matrix, the
    classification report (3 digits) and, when a score source is available,
    the ROC-AUC. Nothing is returned; ``model`` is fitted in place.

    The ROC-AUC uses ``predict_proba`` (positive-class column) when
    ``is_prob_model`` is true and the model provides it, otherwise
    ``decision_function`` when present; with neither, no ROC-AUC is printed.
    """
    print(f"\n=== {name} ===")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, digits=3))

    # Choose the continuous score to rank test samples with.
    if is_prob_model and hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(X_test)
    else:
        return

    print("ROC-AUC:", roc_auc_score(y_test, scores))


In [None]:
#distilbert results

# Class weighting passed to the classifier below.
class_weight_option = 'balanced'

# evaluate_model is defined once, in the preceding cell. The verbatim copy
# that used to live here silently shadowed that definition and has been
# removed so there is a single source of truth.

# Logistic Regression on DistilBERT embeddings + structured features
lr_bert = LogisticRegression(
    max_iter=1000,
    class_weight=class_weight_option,
    n_jobs=-1
)

evaluate_model(
    "Logistic Regression (DistilBERT + structured)",
    lr_bert,
    X_train_bert_scaled,
    X_test_bert_scaled,
    y_train,
    y_test
)


=== Logistic Regression (DistilBERT + structured) ===
Confusion Matrix:
[[3332   71]
 [  28  145]]

Classification Report:
              precision    recall  f1-score   support

           0      0.992     0.979     0.985      3403
           1      0.671     0.838     0.746       173

    accuracy                          0.972      3576
   macro avg      0.831     0.909     0.865      3576
weighted avg      0.976     0.972     0.974      3576

ROC-AUC: 0.9723246574341918
