In [1]:
import pandas as pd
import numpy as np
import pickle
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [25]:
# Load the merged job-postings CSV, then replace missing values with empty
# strings in every object-dtype column (numeric columns are left untouched).
df = pd.read_csv("data/merged_fake_job_postings.csv")
object_columns = df.select_dtypes(include='object').columns
df = df.fillna({name: "" for name in object_columns})

In [26]:
# Shared text-processing resources used by clean(): a Porter stemmer and the
# NLTK English stop-word list held as a set for fast membership checks.
stemmer = PorterStemmer()
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# Display the first rows of the loaded frame as a quick sanity check.
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1.0,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0.0,1.0,0.0,Other,Internship,,,Marketing,0
1,2.0,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0.0,1.0,0.0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3.0,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0.0,1.0,0.0,,,,,,0
3,4.0,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0.0,1.0,0.0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5.0,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0.0,1.0,1.0,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [28]:
# Print the number of rows for each value of the `fraudulent` label.
print("\nClass distribution:\n", df['fraudulent'].value_counts())


Class distribution:
 fraudulent
0    17014
1    11016
Name: count, dtype: int64


In [46]:
def clean(text):
    """Normalise a posting's text into space-separated, stemmed tokens.

    The text is lower-cased, HTML-like tags are replaced by a space, tokens
    starting with "http" are removed, every character that is not a word
    character, whitespace, or one of ``₹ $ @`` is deleted, and whitespace
    runs are collapsed. Tokens found in the module-level ``stop_words`` are
    dropped and the remaining ones are stemmed with the module-level
    ``stemmer``.

    Args:
        text: The string to normalise.

    Returns:
        The surviving stemmed tokens joined by single spaces.
    """
    normalised = text.lower()
    # Applied in order; each pass feeds the next one.
    substitutions = (
        (r'<[^>]+>', ' '),
        (r'http\S+', ''),
        (r'[^\w\s₹$@]', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        normalised = re.sub(pattern, replacement, normalised)

    kept_tokens = []
    for token in normalised.strip().split():
        if token in stop_words:
            continue
        kept_tokens.append(stemmer.stem(token))
    return ' '.join(kept_tokens)

In [47]:
def risk_score(text):
    """Count how many scam heuristics match ``text`` (case-insensitive).

    One point is awarded for each of the following:
      * the text mentions "telegram" or "whatsapp";
      * the text contains "₹" together with any of "per day", "per week",
        "guaranteed" or "earn";
      * the text contains "no interview" or "no experience";
      * the text contains both "visa" and "processing fee".

    Args:
        text: The string to inspect.

    Returns:
        An integer between 0 and 4: the number of heuristics that matched.
    """
    lowered = text.lower()
    payout_terms = ("per day", "per week", "guaranteed", "earn")
    signals = (
        "telegram" in lowered or "whatsapp" in lowered,
        "₹" in lowered and any(term in lowered for term in payout_terms),
        "no interview" in lowered or "no experience" in lowered,
        "visa" in lowered and "processing fee" in lowered,
    )
    return sum(1 for matched in signals if matched)

In [48]:
# Columns whose values are joined (in this order) into one text document per
# posting before cleaning and vectorising.
text_columns = [
    'title',
    'location',
    'department',
    'company_profile',
    'description',
    'requirements',
    'benefits',
    'employment_type',
    'required_experience',
    'required_education',
    'industry',
    'function',
]

In [49]:
# Join the selected columns of each row into one space-separated string,
# then normalise that string with clean().
joined_text = df[text_columns].astype(str).agg(" ".join, axis=1)
df["combined_text"] = joined_text.map(clean)

In [50]:
# Score the RAW concatenated text rather than df["combined_text"].
# clean() removes stop words (including "no") and stems every token
# ("processing" -> "process", "guaranteed" -> "guarante"), so phrases such as
# "no interview", "no experience" and "processing fee" can never match once
# the text has been cleaned. risk_score() lower-cases its input itself, so no
# other preprocessing is needed here.
# NOTE(review): any inference code that feeds these models must compute
# risk_score on the same raw, uncleaned text — confirm against the caller.
raw_text = df[text_columns].apply(lambda row: " ".join(row.astype(str)), axis=1)
df["risk_score"] = raw_text.apply(risk_score)

In [51]:
# Model inputs: the cleaned text plus the heuristic score; target: the
# `fraudulent` label column.
feature_columns = ["combined_text", "risk_score"]
X = df[feature_columns]
y = df["fraudulent"]

In [52]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [53]:
# TF-IDF encoding of the cleaned text, capped at 10,000 features.
vectorizer = TfidfVectorizer(max_features=10000)
# Fit on the training text only so the test split does not influence the
# learned vocabulary/weights...
X_train_tfidf = vectorizer.fit_transform(X_train["combined_text"])
# ...then encode the test text with the already-fitted vectorizer.
X_test_tfidf = vectorizer.transform(X_test["combined_text"])

In [54]:
from scipy.sparse import hstack

# Append the heuristic risk score as one extra column to the right of the
# sparse TF-IDF matrix, for both the training and the test split.
train_risk_column = X_train[["risk_score"]].to_numpy()
test_risk_column = X_test[["risk_score"]].to_numpy()

X_train_combined = hstack([X_train_tfidf, train_risk_column])
X_test_combined = hstack([X_test_tfidf, test_risk_column])

In [55]:
# Logistic regression on the combined TF-IDF + risk-score features.
# class_weight='balanced' reweights classes by inverse frequency; max_iter is
# raised to 1000 to give the solver more iterations to converge.
lr_model = LogisticRegression(max_iter=1000, class_weight='balanced')
lr_model.fit(X_train_combined, y_train)

In [56]:
# Random forest (100 trees, balanced class weights) on the same combined
# features; random_state is fixed so the fitted forest is reproducible.
rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf_model.fit(X_train_combined, y_train)

In [57]:
# Print a per-class precision/recall/F1 report on the held-out split for each
# fitted model, logistic regression first.
# NOTE(review): if the merged dataset contains duplicated postings, copies may
# land in both train and test and inflate these scores — worth verifying.
evaluated_models = (
    ("\n Logistic Regression:", lr_model),
    ("\n Random Forest:", rf_model),
)
for heading, fitted_model in evaluated_models:
    predictions = fitted_model.predict(X_test_combined)
    print(heading)
    print(classification_report(y_test, predictions))




 Logistic Regression:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3403
           1       1.00      0.97      0.98      2203

    accuracy                           0.99      5606
   macro avg       0.99      0.98      0.99      5606
weighted avg       0.99      0.99      0.99      5606


 Random Forest:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3403
           1       1.00      0.97      0.98      2203

    accuracy                           0.99      5606
   macro avg       0.99      0.98      0.99      5606
weighted avg       0.99      0.99      0.99      5606



In [58]:
# Persist both classifiers and the fitted vectorizer, one pickle file each,
# written relative to the current working directory.
artifacts = {
    "lrmodel.pkl": lr_model,
    "rfmodel.pkl": rf_model,
    "vectorizer.pkl": vectorizer,
}
for filename, artifact in artifacts.items():
    with open(filename, "wb") as handle:
        pickle.dump(artifact, handle)

print("\n Models and vectorizer saved successfully.")


 Models and vectorizer saved successfully.
