In [None]:
from pathlib import Path
import re
import string
import pandas as pd
from sklearn.model_selection import train_test_split


def load_data(path: Path) -> pd.DataFrame:
    """Read the job-postings CSV and reduce it to one text column plus the label.

    The four free-text columns (``company_profile``, ``description``,
    ``requirements``, ``benefits``) are joined with single spaces into
    ``job_description``; missing values are treated as empty strings. Rows
    whose combined text repeats an earlier row are dropped (the first
    occurrence is kept). Surviving rows keep their original index labels.

    Args:
        path: Location of the CSV file. It must contain the four text columns
            listed above and a ``fraudulent`` column.

    Returns:
        A DataFrame with the columns ``job_description`` and ``fraudulent``.
    """
    raw = pd.read_csv(path)

    text_columns = ["company_profile", "description", "requirements", "benefits"]
    # Missing sections become "" so the row-wise join never sees NaN.
    combined_text = raw[text_columns].fillna("").agg(" ".join, axis=1)

    postings = pd.DataFrame(
        {"job_description": combined_text, "fraudulent": raw["fraudulent"]}
    )
    return postings.drop_duplicates(subset=["job_description"], keep="first")

def clean_text(text):
    """Normalize a job-posting string before tokenization.

    The following steps are applied in this order:

    1. lowercase the text;
    2. remove ``[...]`` spans;
    3. remove ``http://`` / ``https://`` / ``www.`` links;
    4. remove ``<...>`` tag-like spans;
    5. remove every character in ``string.punctuation``;
    6. remove newline characters;
    7. remove every word that contains a digit.

    Removed spans are replaced by the empty string, so the whitespace that
    surrounded them is left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    # Patterns are raw strings: "\[", "\S", "\w" and "\d" are not valid Python
    # string escapes and trigger a SyntaxWarning on recent interpreters when
    # written in ordinary string literals.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # NOTE(review): newlines are deleted rather than replaced by a space, so
    # the last word of one line is fused with the first word of the next.
    # Kept as is to preserve the existing preprocessing output.
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


def split_data(
    data: pd.DataFrame,
    test_size: float = 0.2,
    random_state: int = 42,
    oversample_factor: int = 21,
):
    """Split the postings into train/test sets, oversample fraud, and clean the text.

    The frame is split with stratification on ``fraudulent``. In the training
    part only, every fraudulent row (``fraudulent == 1``) is repeated
    ``oversample_factor`` times, combined with the untouched non-fraudulent
    rows (``fraudulent == 0``), and shuffled. The test part is not resampled.
    ``clean_text`` is applied to the text of both parts.

    Args:
        data: Frame with ``job_description`` and ``fraudulent`` columns.
        test_size: Fraction of rows held out for the test set.
        random_state: Seed used for both the split and the shuffle.
        oversample_factor: How many copies of each fraudulent training row end
            up in the training set. Must be at least 1.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The training series carry a
        fresh 0..n-1 index; the test series keep the index labels of ``data``.

    Raises:
        ValueError: If ``oversample_factor`` is smaller than 1.
    """
    if oversample_factor < 1:
        raise ValueError(
            f"oversample_factor must be >= 1, got {oversample_factor}"
        )

    X_train, X_test, y_train, y_test = train_test_split(
        data["job_description"],
        data["fraudulent"],
        test_size=test_size,
        random_state=random_state,
        stratify=data["fraudulent"],
    )

    # Clean every posting once, before duplication. Cleaning is element-wise,
    # so the result equals cleaning after oversampling, but each fraudulent
    # text is processed once instead of `oversample_factor` times.
    train_df = pd.DataFrame(
        {"job_description": X_train.apply(clean_text), "fraudulent": y_train}
    )
    X_test = X_test.apply(clean_text)

    # Separate fraudulent (y=1) and non-fraudulent (y=0) samples
    fraudulent_df = train_df[train_df["fraudulent"] == 1]
    non_fraudulent_df = train_df[train_df["fraudulent"] == 0]
    print(fraudulent_df.shape, non_fraudulent_df.shape)

    # Repeat each fraudulent job description `oversample_factor` times
    fraudulent_df_oversampled = pd.concat(
        [fraudulent_df] * oversample_factor, ignore_index=True
    )

    # Combine with the original non-fraudulent rows, then shuffle so the
    # duplicated rows are not grouped together at the end.
    train_df_oversampled = pd.concat(
        [non_fraudulent_df, fraudulent_df_oversampled], ignore_index=True
    )
    train_df_oversampled = train_df_oversampled.sample(
        frac=1, random_state=random_state
    ).reset_index(drop=True)

    X_train = train_df_oversampled["job_description"]
    y_train = train_df_oversampled["fraudulent"]

    return X_train, X_test, y_train, y_test


In [None]:
import kagglehub
from pathlib import Path

# Fetch the dataset through kagglehub; the returned value is printed and then
# used as the directory that contains the CSV file.
path = kagglehub.dataset_download("shivamb/real-or-fake-fake-jobposting-prediction")
print("Path to dataset files:", path)

# Build the file path with pathlib's "/" operator instead of concatenating
# strings, so the separator is handled by Path.
data = load_data(Path(path) / "fake_job_postings.csv")

X_train, X_test, y_train, y_test = split_data(data)

print(X_train.head())
print(y_train.head())

In [None]:
from transformers import AutoTokenizer

# Load BERT-large tokenizer
# `model_name` is also the checkpoint id used later in the notebook when the
# classification model is loaded, so tokenizer and model come from the same
# checkpoint.
model_name = "bert-large-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the dataset
def preprocess_data(texts, labels=None, max_length=512):
    """Tokenize a batch of texts with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly ``max_length`` tokens.

    Args:
        texts: The strings to tokenize (the notebook passes a list of str).
        labels: Unused. Kept, and made optional, so existing two-argument
            calls keep working.
        max_length: Fixed length every sequence is padded/truncated to.

    Returns:
        Whatever ``tokenizer(...)`` returns for the batch.
    """
    return tokenizer(
        texts, truncation=True, padding="max_length", max_length=max_length
    )

# Tokenize training & test sets
# The series are converted to plain lists before being handed to the
# tokenizer. The label lists are passed along as well, but preprocess_data
# does not use them.
train_encodings = preprocess_data(X_train.tolist(), y_train.tolist())
test_encodings = preprocess_data(X_test.tolist(), y_test.tolist())


In [None]:
import torch

class FakeJobDataset(torch.utils.data.Dataset):
    """Pairs tokenizer output with labels as an indexable torch dataset.

    ``encodings`` is a mapping from field name to a per-example sequence, and
    ``labels`` is an indexable sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the label sequence as given."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus a ``"labels"`` entry.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Create PyTorch datasets
# Labels are converted to plain lists so they are indexed by position,
# independent of the index labels the pandas series carry.
train_dataset = FakeJobDataset(train_encodings, y_train.tolist())
test_dataset = FakeJobDataset(test_encodings, y_test.tolist())


In [None]:
from transformers import AutoModelForSequenceClassification

# Load BERT-large with classification head
# num_labels=2 matches the two values of `fraudulent` that split_data
# separates on (0 = non-fraudulent, 1 = fraudulent).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


In [None]:
from transformers import TrainingArguments

# `inspect` is only needed to probe the installed TrainingArguments signature.
import inspect

# The per-epoch evaluation option is spelled `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Use whichever
# keyword the installed TrainingArguments accepts, so the cell runs on both.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",      # Directory for model checkpoints
    save_strategy="epoch",       # Save model at each epoch
    per_device_train_batch_size=4,  # Reduce batch size due to memory limits
    per_device_eval_batch_size=4,
    num_train_epochs=1,          # Adjust based on dataset size
    learning_rate=2e-5,          # Recommended LR for fine-tuning BERT
    weight_decay=0.01,           # Regularization
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2,          # Keep only the 2 latest models
    # NOTE(review): mixed precision presumably requires a GPU runtime (the
    # original note mentions an A100) - confirm before running elsewhere.
    fp16=True,                   # Enable mixed precision (A100 supports it)
    **{_eval_strategy_key: "epoch"},  # Evaluate at the end of each epoch
)


In [None]:
from transformers import Trainer

# Wire the model, the training configuration and both datasets together.
# The held-out test split is used as the evaluation set.
# NOTE(review): no `compute_metrics` callback is passed, so evaluation
# presumably reports only the loss - confirm, and consider adding
# precision/recall/F1 given how imbalanced the un-resampled test split is.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train model
trainer.train()


In [None]:
# Save model locally
# Model and tokenizer are written to the same directory.
model.save_pretrained("bert-large-fake-job-classifier")
tokenizer.save_pretrained("bert-large-fake-job-classifier")

# Upload to Hugging Face Hub
from huggingface_hub import notebook_login

# Authentication happens interactively here; no token is stored in the notebook.
notebook_login()  # Logs into Hugging Face

# Model and tokenizer are pushed to the same Hub repository id.
model.push_to_hub("pcloud/job_catcher-bert-large-uncased")
tokenizer.push_to_hub("pcloud/job_catcher-bert-large-uncased")
