In [50]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, f1_score


In [None]:
cd ..

In [27]:
from src.ScamSniffer.utils.common import read_yaml
from pathlib import Path
content = read_yaml(Path("config.yaml"))

In [28]:
data_folder = content.data_ingestion.root_dir

In [29]:
# loading data
dataset_1 = pd.read_csv(os.path.join(data_folder, "fake_job_postings.csv"))
dataset_2 = pd.read_csv(os.path.join(data_folder, "simulated_fake_job_posts.csv"))

In [30]:
dataset_1.describe()

Unnamed: 0,job_id,telecommuting,has_company_logo,has_questions,fraudulent
count,17880.0,17880.0,17880.0,17880.0,17880.0
mean,8940.5,0.042897,0.795302,0.491723,0.048434
std,5161.655742,0.202631,0.403492,0.499945,0.214688
min,1.0,0.0,0.0,0.0,0.0
25%,4470.75,0.0,1.0,0.0,0.0
50%,8940.5,0.0,1.0,0.0,0.0
75%,13410.25,0.0,1.0,1.0,0.0
max,17880.0,1.0,1.0,1.0,1.0


In [31]:
dataset_1

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17876,Account Director - Distribution,"CA, ON, Toronto",Sales,,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,1,1,Full-time,Mid-Senior level,,Computer Software,Sales,0
17876,17877,Payroll Accountant,"US, PA, Philadelphia",Accounting,,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0
17877,17878,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",,,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,0,0,0,Full-time,,,,,0
17878,17879,Graphic Designer,"NG, LA, Lagos",,,,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0


In [32]:
dataset_1["description"].isnull().sum()

np.int64(1)

In [33]:
dataset_1["fraudulent"].isnull().sum()

np.int64(0)

In [34]:
raw_dataset_1 = dataset_1.where((pd.notnull(dataset_1)), '')

In [61]:
description_main = (raw_dataset_1["company_profile"].astype(str) + " " + raw_dataset_1["description"].astype(str) + " "+raw_dataset_1["requirements"].astype(str) + " " + raw_dataset_1["benefits"].astype(str))

In [62]:
description_main

0        We're Food52, and we've created a groundbreaki...
1        90 Seconds, the worlds Cloud Video Production ...
2        Valor Services provides Workforce Solutions th...
3        Our passion for improving quality of life thro...
4        SpotSource Solutions LLC is a Global Human Cap...
                               ...                        
17875    Vend is looking for some awesome new talent to...
17876    WebLinc is the e-commerce platform and service...
17877    We Provide Full Time Permanent Positions for m...
17878     Nemsia Studios is looking for an experienced ...
17879    Vend is looking for some awesome new talent to...
Length: 17880, dtype: object

In [64]:
raw_dataset_1["description"] = description_main
dataset_1_subset = raw_dataset_1[["description", "fraudulent"]]
dataset_1_subset.head()

Unnamed: 0,description,fraudulent
0,"We're Food52, and we've created a groundbreaki...",0
1,"90 Seconds, the worlds Cloud Video Production ...",0
2,Valor Services provides Workforce Solutions th...,0
3,Our passion for improving quality of life thro...,0
4,SpotSource Solutions LLC is a Global Human Cap...,0


In [36]:
dataset_2.head()

Unnamed: 0,post_id,description,fraudulent
0,1,🎯 We're Hiring at Netflix!\n\nVirtual Intervie...,1
1,2,🚨 #Razorpay is Hiring – Marketing Intern!\n\nL...,1
2,3,🚨 #Axis Bank is Hiring – AI Engineer!\n\nLooki...,1
3,4,"Hello Connections,\n\nWe are #hiring for multi...",1
4,5,🚨 #Capgemini is Hiring – Marketing Intern!\n\n...,1


In [65]:
raw_job_posts = pd.concat([dataset_1_subset, dataset_2[["description", "fraudulent"]]])
raw_job_posts.head()

Unnamed: 0,description,fraudulent
0,"We're Food52, and we've created a groundbreaki...",0
1,"90 Seconds, the worlds Cloud Video Production ...",0
2,Valor Services provides Workforce Solutions th...,0
3,Our passion for improving quality of life thro...,0
4,SpotSource Solutions LLC is a Global Human Cap...,0


In [66]:
raw_job_posts.tail()

Unnamed: 0,description,fraudulent
995,🚨 #HCL is Hiring – Data Analyst!\n\nLooking to...,1
996,🎯 We're Hiring at Google!\n\nVirtual Interview...,1
997,"Hii Everyone,\n\nWe are #hiring for multiple p...",1
998,🎯 We're Hiring at KPMG!\n\nVirtual Interview o...,1
999,🚀 Hiring Alert!! Great Opportunity!\n\n#Infosy...,1


In [67]:
raw_job_posts.shape

(18880, 2)

### Fake job post - 1, legit job post - 0

In [68]:
X = raw_job_posts['description']
Y = raw_job_posts['fraudulent']

### Train test split

In [69]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 3)

In [70]:
X_train.shape

(13216,)

In [71]:
X_test.shape

(5664,)

feature extraction


In [72]:
feature_extraction = TfidfVectorizer(min_df=1, stop_words = 'english', lowercase= True)

In [73]:
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [74]:
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

Training the model

In [75]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}


In [77]:
for name, model in models.items():
    print(f"Training {name}")
    model.fit(X_train_features, Y_train)
    y_pred = model.predict(X_test_features)
    y_proba = model.predict_proba(X_test_features)[:, 1] if hasattr(model, "predict_proba") else None

    print("Accuracy:", accuracy_score(Y_test, y_pred))
    if y_proba is not None:
        print("ROC AUC:", roc_auc_score(Y_test, y_proba))
    
    print("F1 Score:", f1_score(Y_test, y_pred))
    print(classification_report(Y_test, y_pred))
    

Training Logistic Regression
Accuracy: 0.972457627118644
ROC AUC: 0.9881440053108874
F1 Score: 0.832258064516129
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      5121
           1       1.00      0.71      0.83       543

    accuracy                           0.97      5664
   macro avg       0.99      0.86      0.91      5664
weighted avg       0.97      0.97      0.97      5664

Training Random Forest
Accuracy: 0.9800494350282486
ROC AUC: 0.9928933438774296
F1 Score: 0.8845760980592441
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5121
           1       0.99      0.80      0.88       543

    accuracy                           0.98      5664
   macro avg       0.99      0.90      0.94      5664
weighted avg       0.98      0.98      0.98      5664

Training XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.9821680790960452
ROC AUC: 0.9905712692078226
F1 Score: 0.8982880161127895
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5121
           1       0.99      0.82      0.90       543

    accuracy                           0.98      5664
   macro avg       0.99      0.91      0.94      5664
weighted avg       0.98      0.98      0.98      5664



In [None]:
input_post = ["🔹 We're Hiring at HCL TECH! Join one of the most dynamic tech teams in the industry.Virtual Interview on 28th and 29th July 2025 | Timing:- 07:00 PM Comment your Email Address for Apply 🔗 Link  🎯 Who Can Apply: Freshers & Final-Year College Students Interns & Full-Time Job Seekers 💼 Work Mode: Remote / Hybrid 💰 Salary: ₹4.5 – ₹6.5 LPA 🚀 Open Roles: 💻 Full Stack Engineer ☕ Java Developer 📊 Data Scientist 🎨 UI/UX Designer 📩 Want to apply? 👉 Please hit like & comment your Email address so that I can review your profile. We will look at your LinkedIn profile and will let you know of any possible requirements that match your profile.Thank youBe a part of something big.  Build the future with HCL Tech! 🌐"]


input_post_features = feature_extraction.transform(input_post)
prediction = model.predict(input_post_features)

print(prediction)

[1]


In [80]:
input_post_2 = ["Looking to kickstart your career in tech? Abstrabit Technologies is hiring! We're offering remote positions for: Full Stack Interns: Dive into real-world projects and build your portfolio. Full-Time Software Developers: Elevate your skills with AI/ML and full-stack development. We're seeking driven individuals, especially freshers with internship experience, who are passionate about learning and innovation. You'll work on exciting end-to-end projects, gaining invaluable experience with the latest technologies. This is more than just a job; it's an opportunity to grow and make a real impact. Interested? Send your resume to hr@abstrabit.com.#careerdevelopment #techcareers #remotework #softwareengineering #artificialintelligence"]

In [81]:
input_post_features_2 = feature_extraction.transform(input_post_2)

In [83]:
prediction_2 = model.predict(input_post_features_2)

In [84]:
if(prediction_2 == 1): print("Fraud")
else: print("Legit")

Legit
