In [None]:
# Task 4 is run on Google Colab, so the dataset is read from Google Drive.
from google.colab import drive
# Mount Google Drive at /content/drive/.
drive.mount('/content/drive/')

# List the Drive root to check that fake_job_postings.csv is available there.
!ls /content/drive/MyDrive/



Mounted at /content/drive/
'Colab Notebooks'   fake_job_postings.csv   sample_data.csv


In [None]:
!pip install transformers datasets torch scikit-learn

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:0

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
# Task 4-A: Fine-tuning DistilBERT

In [None]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import evaluate
from sklearn.model_selection import train_test_split

# Read the dataset.
# On Colab the CSV lives on the mounted Google Drive. For a local run, point
# DATA_PATH at the local copy instead, e.g. "data/fake_job_postings.csv".
DATA_PATH = '/content/drive/MyDrive/fake_job_postings.csv'
df = pd.read_csv(DATA_PATH)

# Fixed seed so the train/validation splits are identical on every run.
SPLIT_SEED = 42

# Task 1: Predicting Education Level
# Only non-fraudulent postings whose required education is one of the three
# target levels are used; the list order defines the integer class labels.
non_fraud_df = df[df['fraudulent'] == 0]
education_levels = ['Master\'s Degree', 'Bachelor\'s Degree', 'High School or equivalent']
education_df = non_fraud_df[non_fraud_df['required_education'].isin(education_levels)]

# Split the dataset into training and validation sets.
# Stratifying keeps the class proportions the same in both splits.
train_dataset_task1, val_dataset_task1 = train_test_split(
    education_df,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=education_df['required_education'],
)

# Task 2: Predict whether it is a fake job
fraud_df = df[['description', 'fraudulent']]
train_dataset_task2, val_dataset_task2 = train_test_split(
    fraud_df,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=fraud_df['fraudulent'],
)

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Prepare preprocessing functions for Task 1: Education Level Prediction
def preprocess_data(examples):
    """Tokenize one posting's description and attach its education-level label.

    `examples` is a single row read through the keys 'description' and
    'required_education'. A description that is not a string is replaced by an
    empty string. The label is the position of the education level in
    `education_levels`, or -1 when the value is missing or not in that list.
    """
    text = examples['description']
    if not isinstance(text, str):
        text = ''

    level = examples['required_education']
    if not pd.notna(level):
        level = 'Unknown'

    # Every example is padded/truncated to exactly 512 tokens
    encoded = tokenizer(text, padding='max_length', truncation=True, max_length=512)
    if level in education_levels:
        encoded['labels'] = education_levels.index(level)
    else:
        encoded['labels'] = -1
    return encoded

# Prepare preprocessing functions for Task 2: Fake job detection
def preprocess_data_fraud(examples):
    """Tokenize one posting's description and attach its 'fraudulent' value as the label.

    `examples` is a single row read through the keys 'description' and
    'fraudulent'. A description that is not a string is replaced by an empty
    string before tokenization.
    """
    text = examples['description']
    if not isinstance(text, str):
        text = ''

    # Every example is padded/truncated to exactly 512 tokens
    encoded = tokenizer(text, padding='max_length', truncation=True, max_length=512)
    encoded['labels'] = examples['fraudulent']
    return encoded

# Tokenize every split row by row; each split is replaced by a plain list of
# encoded examples (one per DataFrame row).
train_dataset_task1, val_dataset_task1 = [
    split.apply(preprocess_data, axis=1).tolist()
    for split in (train_dataset_task1, val_dataset_task1)
]
train_dataset_task2, val_dataset_task2 = [
    split.apply(preprocess_data_fraud, axis=1).tolist()
    for split in (train_dataset_task2, val_dataset_task2)
]

# Accuracy metric used by compute_metrics during evaluation
accuracy_metric = evaluate.load("accuracy")

# Function to calculate the indicator
def compute_metrics(p):
    """Compute accuracy from a (predictions, labels) pair.

    The predicted class of each example is the argmax of its prediction scores
    along axis 1. Returns whatever `accuracy_metric.compute` returns for those
    predicted classes against the reference labels.
    """
    scores, references = p
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy_metric.compute(predictions=predicted_classes, references=references)

# Task 1: Education level prediction
# Pretrained DistilBERT with a classification head sized to the number of
# education levels.
model_education = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(education_levels))
training_args_task1 = TrainingArguments(
    output_dir='./results_education',
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in recent transformers releases.
    eval_strategy="steps",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01
)
trainer_education = Trainer(
    model=model_education,
    args=training_args_task1,
    train_dataset=train_dataset_task1,
    eval_dataset=val_dataset_task1,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Task 2: Fake job detection
# Pretrained DistilBERT with a two-class head (the 'fraudulent' column is the label).
model_fraud = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
training_args_task2 = TrainingArguments(
    output_dir='./results_fraud',
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in recent transformers releases.
    eval_strategy="steps",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01
)
trainer_fraud = Trainer(
    model=model_fraud,
    args=training_args_task2,
    train_dataset=train_dataset_task2,
    eval_dataset=val_dataset_task2,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train and evaluate Task 1 (education level prediction).
# The final evaluation result is captured and printed; a bare evaluate() call
# in the middle of a cell is not displayed.
trainer_education.train()
eval_results_education = trainer_education.evaluate()
print("Education Level Prediction Evaluation Results:", eval_results_education)

# Train and evaluate Task 2 (Fake Job Detection)
trainer_fraud.train()
eval_results_fraud = trainer_fraud.evaluate()
print("Fraudulent Job Detection Evaluation Results:", eval_results_fraud)

# Save the fine-tuned models
trainer_education.save_model("./model_education")
trainer_fraud.save_model("./model_fraud")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
500,0.3454,0.361247,0.905995
1000,0.1976,0.363812,0.91485


Step,Training Loss,Validation Loss,Accuracy
500,0.1531,0.110242,0.973993
1000,0.1019,0.075821,0.980145
1500,0.0712,0.069862,0.979586
2000,0.0432,0.078459,0.983501
2500,0.0217,0.082315,0.984899


In [None]:
# Task 4-A: Original DistilBERT

In [None]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import evaluate
from sklearn.model_selection import train_test_split

# Read the dataset (change DATA_PATH if the CSV is stored elsewhere).
DATA_PATH = '/content/drive/MyDrive/fake_job_postings.csv'
df = pd.read_csv(DATA_PATH)

# Fixed seed so the train/validation splits are identical on every run.
SPLIT_SEED = 42

# Task 1: Predicting Education Level
# Only non-fraudulent postings whose required education is one of the three
# target levels are used; the list order defines the integer class labels.
non_fraud_df = df[df['fraudulent'] == 0]
education_levels = ['Master\'s Degree', 'Bachelor\'s Degree', 'High School or equivalent']
education_df = non_fraud_df[non_fraud_df['required_education'].isin(education_levels)]

# Split the dataset into training and validation sets.
# Stratifying keeps the class proportions the same in both splits.
train_dataset_task1, val_dataset_task1 = train_test_split(
    education_df,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=education_df['required_education'],
)

# Task 2: Predict whether it is a fake job
fraud_df = df[['description', 'fraudulent']]
train_dataset_task2, val_dataset_task2 = train_test_split(
    fraud_df,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=fraud_df['fraudulent'],
)

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Prepare preprocessing functions for Task 1: Education Level Prediction
def preprocess_data(examples):
    """Tokenize one posting's description and attach its education-level label.

    `examples` is a single row read through the keys 'description' and
    'required_education'. A description that is not a string is replaced by an
    empty string. The label is the position of the education level in
    `education_levels`, or -1 when the value is missing or not in that list.
    """
    text = examples['description']
    if not isinstance(text, str):
        text = ''

    level = examples['required_education']
    if not pd.notna(level):
        level = 'Unknown'

    # Every example is padded/truncated to exactly 512 tokens
    encoded = tokenizer(text, padding='max_length', truncation=True, max_length=512)
    if level in education_levels:
        encoded['labels'] = education_levels.index(level)
    else:
        encoded['labels'] = -1
    return encoded

# Prepare preprocessing functions for Task 2: Fake job detection
def preprocess_data_fraud(examples):
    """Tokenize one posting's description and attach its 'fraudulent' value as the label.

    `examples` is a single row read through the keys 'description' and
    'fraudulent'. A description that is not a string is replaced by an empty
    string before tokenization.
    """
    text = examples['description']
    if not isinstance(text, str):
        text = ''

    # Every example is padded/truncated to exactly 512 tokens
    encoded = tokenizer(text, padding='max_length', truncation=True, max_length=512)
    encoded['labels'] = examples['fraudulent']
    return encoded

# Tokenize every split row by row; each split is replaced by a plain list of
# encoded examples (one per DataFrame row).
train_dataset_task1, val_dataset_task1 = [
    split.apply(preprocess_data, axis=1).tolist()
    for split in (train_dataset_task1, val_dataset_task1)
]
train_dataset_task2, val_dataset_task2 = [
    split.apply(preprocess_data_fraud, axis=1).tolist()
    for split in (train_dataset_task2, val_dataset_task2)
]

# Accuracy metric used by compute_metrics during evaluation
accuracy_metric = evaluate.load("accuracy")

# Function to calculate the indicator
def compute_metrics(p):
    """Compute accuracy from a (predictions, labels) pair.

    The predicted class of each example is the argmax of its prediction scores
    along axis 1. Returns whatever `accuracy_metric.compute` returns for those
    predicted classes against the reference labels.
    """
    scores, references = p
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy_metric.compute(predictions=predicted_classes, references=references)

# Task 1: Education level prediction (untrained model)
# Pretrained DistilBERT with a classification head sized to the number of
# education levels; it is only evaluated here, never trained.
model_education = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(education_levels))
training_args_task1 = TrainingArguments(
    output_dir='./results_education',
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in recent transformers releases.
    eval_strategy="steps",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=0,  # No training
    weight_decay=0.01
)
trainer_education = Trainer(
    model=model_education,
    args=training_args_task1,
    train_dataset=train_dataset_task1,
    eval_dataset=val_dataset_task1,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Task 2: Fake Job Detection (Untrained Model)
# Pretrained DistilBERT with a two-class head; it is only evaluated here,
# never trained.
model_fraud = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
training_args_task2 = TrainingArguments(
    output_dir='./results_fraud',
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in recent transformers releases.
    eval_strategy="steps",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=0, # No training
    weight_decay=0.01
)
trainer_fraud = Trainer(
    model=model_fraud,
    args=training_args_task2,
    train_dataset=train_dataset_task2,
    eval_dataset=val_dataset_task2,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Evaluate Task 1 (Education Level Prediction)
eval_results_education = trainer_education.evaluate()
print("Education Level Prediction Evaluation Results:", eval_results_education)

# Evaluate Task 2 (Fraudulent Job Detection)
eval_results_fraud = trainer_fraud.evaluate()
print("Fraudulent Job Detection Evaluation Results:", eval_results_fraud)

# Save the model (although not trained, it can still be saved).
# The baseline goes to its own directories: "./model_education" and
# "./model_fraud" are where the fine-tuning cell saves its trained models, and
# writing there would overwrite them with untrained weights.
trainer_education.save_model("./model_education_baseline")
trainer_fraud.save_model("./model_fraud_baseline")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Education Level Prediction Evaluation Results: {'eval_loss': 1.1107063293457031, 'eval_model_preparation_time': 0.0028, 'eval_accuracy': 0.138283378746594, 'eval_runtime': 24.0182, 'eval_samples_per_second': 61.12, 'eval_steps_per_second': 3.83}


Fraudulent Job Detection Evaluation Results: {'eval_loss': 0.6959530115127563, 'eval_model_preparation_time': 0.0038, 'eval_accuracy': 0.48126398210290827, 'eval_runtime': 54.9838, 'eval_samples_per_second': 65.037, 'eval_steps_per_second': 4.074}


In [None]:
#Task4-B: GPT-4o-Mini

In [None]:
!pip install langchain-openai

Collecting langchain-openai
  Downloading langchain_openai-0.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-core<0.4.0,>=0.3.9 (from langchain-openai)
  Downloading langchain_core-0.3.9-py3-none-any.whl.metadata (6.3 kB)
Collecting openai<2.0.0,>=1.40.0 (from langchain-openai)
  Downloading openai-1.51.2-py3-none-any.whl.metadata (24 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.4.0,>=0.3.9->langchain-openai)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting langsmith<0.2.0,>=0.1.125 (from langchain-core<0.4.0,>=0.3.9->langchain-openai)
  Downloading langsmith-0.1.132-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain-core<0.4.0,>=0.3.9->langchain-openai)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting httpx<1

In [None]:
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Set up the OpenAI API key.
# The key is read from the OPENAI_API_KEY environment variable, or prompted for
# when that is unset, so the secret is never stored in the notebook itself.
import os
from getpass import getpass

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

llm = ChatOpenAI(api_key=OPENAI_API_KEY, model="gpt-4o-mini")


# Example: Categorize a single job description
def classify_job_description(job_description):
    """Ask the chat model which education level a job description requires.

    The prompt asks for the numeric label only (Master's Degree==2,
    Bachelor's Degree==1, High School or equivalent==0).

    Args:
        job_description: Text of the job description to classify.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    # Same wording as the prompt used for the full-dataset evaluation, without
    # the unbalanced closing quote the earlier draft of this prompt contained.
    input_text = (
        "Predict the required education level for the following job description "
        "(only show the degree label, it may be: Master's Degree==2, "
        "Bachelor's Degree==1, or High School or equivalent==0): "
        f"{job_description}"
    )
    response = llm.invoke(input_text)
    # A chat model returns a message object carrying the text in `.content`;
    # fall back to the raw value when there is no such attribute.
    reply = response.content if hasattr(response, 'content') else response
    return reply.strip()

# Test classification function
# Smoke test: classify one hand-written description and print the raw reply
# returned by classify_job_description.
job_description = "As part of the sales team, you will work with clients to develop and execute strategies to help them leverage GIS technology for business goals."
predicted_education = classify_job_description(job_description)
print(f"The predicted education requirement is: {predicted_education}")

The predicted education requirement is: 1


In [7]:
import pandas as pd
import numpy as np
import time


# Set OpenAI API key.
# The key is read from the OPENAI_API_KEY environment variable, or prompted for
# when that is unset, so the secret is never stored in the notebook itself.
import os
from getpass import getpass

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

llm = ChatOpenAI(api_key=OPENAI_API_KEY, model="gpt-4o-mini")

# Read dataset (change the path if the CSV is stored elsewhere)
df = pd.read_csv('/content/drive/MyDrive/fake_job_postings.csv')

# Education levels under study and their integer labels:
# Master's Degree==2, Bachelor's Degree==1, High School or equivalent==0
education_levels = ["Bachelor's Degree", "Master's Degree", 'High School or equivalent']
education_mapping = {
    "Master's Degree": 2,
    "Bachelor's Degree": 1,
    "High School or equivalent": 0
}

# Keep only non-fraudulent postings whose required education is one of the
# levels above.
df = df[(df['fraudulent'] == 0) & df['required_education'].isin(education_levels)]

y = df['required_education'].map(education_mapping)
# Every kept row must have a label; a NaN here means the list and the mapping disagree.
assert y.notna().all(), "education_mapping is missing one of the education_levels"
print(f"Length of y: {len(y)}")

# A missing description becomes an empty string, so the prompt never contains
# the literal text "nan".
Xr = df['description'].fillna('')
print(f"Length of Xr: {len(Xr)}")

# Define function to classify job descriptions
def classify_job_description(job_description):
    """Ask the chat model which education level a job description requires.

    The prompt asks for the numeric label only (Master's Degree==2,
    Bachelor's Degree==1, High School or equivalent==0).

    Args:
        job_description: Text of the job description to classify.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    input_text = f"Predict the required education level for the following job description (only show the degree label, it may be: Master's Degree==2, Bachelor's Degree==1, or High School or equivalent==0): {job_description}"
    # `predict` is deprecated on LangChain chat models; `invoke` returns a
    # message object whose text is in `.content`.
    response = llm.invoke(input_text)
    return response.content.strip()

# Define batch processing function
def classify_in_batches(Xr, batch_size=200, classify_fn=None, delay=1):
    """Classify job descriptions batch by batch and return integer labels.

    Args:
        Xr: Job descriptions, as a list or a pandas Series.
        batch_size: Number of descriptions per batch; must be positive.
        classify_fn: Callable that maps one description to the model's reply.
            Defaults to `classify_job_description`.
        delay: Seconds to pause between consecutive batches (to stay under
            API rate limits). Use 0 to disable.

    Returns:
        A list with one int per description, in input order. -1 marks a reply
        that could not be converted to an integer.

    Raises:
        ValueError: If `batch_size` is not positive.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if classify_fn is None:
        classify_fn = classify_job_description

    total = len(Xr)
    # Ceiling division, so an exact multiple of batch_size is not over-counted.
    num_batches = -(-total // batch_size)
    predictions = []

    # Loop through the dataset in batches
    for batch_number, start in enumerate(range(0, total, batch_size), start=1):
        stop = start + batch_size
        # Slice by position; .iloc makes that explicit for a pandas Series,
        # whatever its index labels are.
        batch_descriptions = Xr.iloc[start:stop] if hasattr(Xr, "iloc") else Xr[start:stop]
        print(f"Processing batch {batch_number}/{num_batches}")

        # Classify each job description in the batch
        for job_description in batch_descriptions:
            predicted_label = classify_fn(job_description)
            try:
                predictions.append(int(predicted_label))  # Convert the prediction to an integer
            except (ValueError, TypeError):
                predictions.append(-1)  # If the result can't be classified, assign -1

        # Pause between batches only; there is nothing to wait for after the last one.
        if delay and batch_number < num_batches:
            time.sleep(delay)

    return predictions

# Process the job descriptions in batches
# Note: classify_in_batches calls classify_job_description once per description,
# i.e. one chat-model request per row of Xr.
batch_size = 200  # You can adjust the batch size as needed
predictions = classify_in_batches(Xr, batch_size=batch_size)

# Define function to calculate accuracy
def calculate_accuracy(predictions, true_labels):
    """Return the fraction of classified predictions that equal their true label.

    Predictions and labels are paired by position. A prediction of -1
    (a reply that could not be classified) is left out of the calculation.

    Args:
        predictions: Iterable of predicted integer labels, -1 for unclassified.
        true_labels: Iterable of true labels, aligned with `predictions`.

    Returns:
        The accuracy over the classified predictions, or NaN when there are
        none (instead of taking the mean of an empty list).
    """
    # One pass: keep the match flag of every pair whose prediction is usable.
    matches = [
        pred == true_label
        for pred, true_label in zip(predictions, true_labels)
        if pred != -1
    ]
    if not matches:
        return float("nan")
    return np.mean(matches)

# Calculate accuracy.
# Predictions are paired with labels by position, so a length mismatch would
# silently compare the wrong rows; fail loudly instead.
assert len(predictions) == len(y), "predictions and labels must have the same length"
accuracy = calculate_accuracy(predictions, y)
print(f"Classification accuracy: {accuracy * 100:.2f}%")

# Replies marked -1 are excluded from the accuracy above; report how many there
# were so the figure can be read in context.
num_unclassified = sum(1 for pred in predictions if pred == -1)
print(f"Unclassified replies excluded from accuracy: {num_unclassified}/{len(predictions)}")


Length of y: 7340
Length of Xr: 7340
Processing batch 1/37
Processing batch 2/37
Processing batch 3/37
Processing batch 4/37
Processing batch 5/37
Processing batch 6/37
Processing batch 7/37
Processing batch 8/37
Processing batch 9/37
Processing batch 10/37
Processing batch 11/37
Processing batch 12/37
Processing batch 13/37
Processing batch 14/37
Processing batch 15/37
Processing batch 16/37
Processing batch 17/37
Processing batch 18/37
Processing batch 19/37
Processing batch 20/37


KeyboardInterrupt: 