In [2]:
%pip install -U pandas torch transformers kagglehub wandb scikit-learn matplotlib seaborn fastai google-colab

Collecting torch
  Using cached torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting nvidia-cusparselt-cu12==0.6.2 (from torch)
  Using cached nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting triton==3.2.0 (from torch)
  Using cached triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting pandas
  Downloading pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting jedi>=0.16 (from ipython==7.34.0->google-colab)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m112.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m18.7 MB/s[0

## Data Loading

In [3]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split


def load_data(path: Path) -> pd.DataFrame:
    data = pd.read_csv(path)
    data = pd.DataFrame(
        {
            "job_description": data[
                ["title", "location", "department", "salary_range", "company_profile", "description", "requirements", "benefits"]
            ]
            .fillna("")
            .agg(" ".join, axis=1),
            "fraudulent": data["fraudulent"],
        }
    )
    data = data.drop_duplicates(subset=["job_description"], keep="first")
    return data


def split_data(data: pd.DataFrame):
    X_train, X_test, y_train, y_test = train_test_split(
        data["job_description"],
        data["fraudulent"],
        test_size=0.2,
        random_state=42,
        stratify=data["fraudulent"],
    )

    # Convert to DataFrame for easy manipulation
    train_df = pd.DataFrame({'job_description': X_train, 'fraudulent': y_train})

    # Separate fraudulent (y=1) and non-fraudulent (y=0) samples
    fraudulent_df = train_df[train_df['fraudulent'] == 1]
    non_fraudulent_df = train_df[train_df['fraudulent'] == 0]
    print(fraudulent_df.shape, non_fraudulent_df.shape)

    # Oversample each fraudulent job description exactly 21 times
    fraudulent_df_oversampled = pd.concat([fraudulent_df] * 21, ignore_index=True)

    # Combine the oversampled fraudulent data with the original non-fraudulent data
    train_df_oversampled = pd.concat([non_fraudulent_df, fraudulent_df_oversampled], ignore_index=True)

    # Shuffle the data
    train_df_oversampled = train_df_oversampled.sample(frac=1, random_state=42).reset_index(drop=True)

    # Extract the oversampled X_train and y_train
    X_train: pd.Series[str] = train_df_oversampled['job_description']
    y_train: pd.Series[int] = train_df_oversampled['fraudulent']

    return X_train, X_test, y_train, y_test


In [4]:
import kagglehub
from pathlib import Path

path = kagglehub.dataset_download("shivamb/real-or-fake-fake-jobposting-prediction")

print("Path to dataset files:", path)
data = load_data(Path(path + "/fake_job_postings.csv"))
X_train, X_test, y_train, y_test = split_data(data)
print(X_train.head())
print(y_train.head())


Downloading from https://www.kaggle.com/api/v1/datasets/download/shivamb/real-or-fake-fake-jobposting-prediction?dataset_version_number=1...


100%|██████████| 16.1M/16.1M [00:01<00:00, 8.59MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/shivamb/real-or-fake-fake-jobposting-prediction/versions/1
(682, 2) (13350, 2)
0    Data Entry Admin/Clerical Positions - Work Fro...
1    Home Based Payroll Typist/Data Entry Clerks Po...
2    Backend Developer PHP/Symfony2 FR, J, Paris   ...
3    UI / UX / Graphic Designer IN, TN, Chennai   V...
4    Picker/Packers $10.50 Plus Overtime US, GA, Br...
Name: job_description, dtype: object
0    1
1    1
2    0
3    0
4    0
Name: fraudulent, dtype: int64


In [5]:
# Convert Series to sets for intersection operation
train_set = set(X_train.values)
test_set = set(X_test.values)

# Find overlap between train and test sets
overlap = train_set.intersection(test_set)

# Print results
print(f"Number of overlapping descriptions: {len(overlap)}")
if len(overlap) > 0:
  print("Warning: Train and test sets have overlapping data!")
else:
  print("Train and test sets are properly separated with no overlap.")

Number of overlapping descriptions: 0
Train and test sets are properly separated with no overlap.


In [6]:
from torch.utils.data import Dataset
import torch

class JobDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        """
        Args:
            texts (pd.Series or list): The job descriptions.
            labels (pd.Series or list): The corresponding labels (0 or 1).
            tokenizer (PreTrainedTokenizer): Tokenizer for BERT.
            max_length (int): Maximum token length for each text.
        """
        # Convert pandas Series to list if needed
        self.texts = texts.tolist() if hasattr(texts, 'tolist') else texts
        self.labels = labels.tolist() if hasattr(labels, 'tolist') else labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        # Tokenize the text with truncation and padding
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Squeeze to remove the batch dimension
        encoding = {key: tensor.squeeze(0) for key, tensor in encoding.items()}
        encoding['labels'] = torch.tensor(label, dtype=torch.long)
        return encoding


In [None]:
base_model_name = "bert-base-uncased"

In [7]:
import torch
from transformers import AutoModel, AutoTokenizer

# Load BERT (without classification head)
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert = AutoModel.from_pretrained(model_name)

# Freeze BERT weights (no fine-tuning)
for param in bert.parameters():
    param.requires_grad = False


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [8]:
train_dataset = JobDataset(X_train, y_train, tokenizer, max_length=512)
test_dataset = JobDataset(X_test, y_test, tokenizer, max_length=512)

In [10]:
import numpy as np
from torch.utils.data import DataLoader, Dataset

class JobDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts.tolist() if hasattr(texts, 'tolist') else texts
        self.labels = labels.tolist() if hasattr(labels, 'tolist') else labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt"
        )
        return {key: tensor.squeeze(0) for key, tensor in encoding.items()}, torch.tensor(label)

# Convert dataset into PyTorch format
train_dataset = JobDataset(X_train, y_train, tokenizer)
test_dataset = JobDataset(X_test, y_test, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [12]:
from tqdm import tqdm

def extract_embeddings(dataloader, model, device):
    model.to(device)
    model.eval()
    all_embeddings, all_labels = [], []

    with torch.no_grad():
        for batch, labels in tqdm(dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # Extract [CLS] token embedding

            all_embeddings.append(cls_embeddings.cpu().numpy())
            all_labels.append(labels.numpy())

    return np.vstack(all_embeddings), np.hstack(all_labels)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Extract embeddings
X_train_emb, y_train_emb = extract_embeddings(train_loader, bert, device)
X_test_emb, y_test_emb = extract_embeddings(test_loader, bert, device)

print(f"Train embeddings shape: {X_train_emb.shape}")  # Expected: (num_samples, 1024)
print(f"Test embeddings shape: {X_test_emb.shape}")   # Expected: (num_samples, 1024)


100%|██████████| 1730/1730 [03:47<00:00,  7.60it/s]
100%|██████████| 220/220 [00:29<00:00,  7.54it/s]

Train embeddings shape: (27672, 768)
Test embeddings shape: (3508, 768)





In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train a Logistic Regression classifier
clf = LogisticRegression(random_state=42)
clf.fit(X_train_emb, y_train_emb)

# Evaluate the classifier
y_pred = clf.predict(X_test_emb)
accuracy = accuracy_score(y_test_emb, y_pred)
print(f"🔹 Linear Probing Accuracy: {accuracy:.4f}")
print(classification_report(y_test_emb, y_pred))


🔹 Linear Probing Accuracy: 0.9179
              precision    recall  f1-score   support

           0       0.99      0.93      0.96      3337
           1       0.35      0.78      0.48       171

    accuracy                           0.92      3508
   macro avg       0.67      0.85      0.72      3508
weighted avg       0.96      0.92      0.93      3508



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
# prompt: Use Linear SVM for the same classification

from sklearn.linear_model import SGDClassifier
model = SGDClassifier()
model.fit(X_train_emb, y_train_emb)

# Evaluate the classifier
y_pred_svm = model.predict(X_test_emb)
accuracy_svm = accuracy_score(y_test_emb, y_pred_svm)
print(f"🔹 Linear SVM Accuracy: {accuracy_svm:.4f}")
print(classification_report(y_test_emb, y_pred_svm))


🔹 Linear SVM Accuracy: 0.9424
              precision    recall  f1-score   support

           0       0.98      0.95      0.97      3337
           1       0.44      0.71      0.55       171

    accuracy                           0.94      3508
   macro avg       0.71      0.83      0.76      3508
weighted avg       0.96      0.94      0.95      3508

