In [5]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib


In [6]:
df = pd.read_csv("fake_job_postings.csv")
df.shape


(17880, 18)

In [7]:
df.isnull().sum()


job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [8]:
df = df[['title', 'company_profile', 'description', 'requirements', 'benefits', 'fraudulent']]


In [9]:
def combine_text(row):
    return str(row['title']) + ' ' + str(row['company_profile']) + ' ' + str(row['description']) + ' ' + str(row['requirements']) + ' ' + str(row['benefits'])

df['combined_text'] = df.apply(combine_text, axis=1)


In [10]:
def clean_text(text):
    text = text.lower()                     # Convert to lowercase
    text = re.sub(r'\d+', '', text)         # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)     # Remove punctuation
    text = re.sub(r'\s+', ' ', text)        # Remove extra spaces
    return text.strip()

df['clean_text'] = df['combined_text'].apply(clean_text)


In [11]:
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(df['clean_text'])


In [12]:
y = df['fraudulent']


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


In [14]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [15]:
y_pred = model.predict(X_test)


In [16]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9804250559284117
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3403
           1       1.00      0.60      0.75       173

    accuracy                           0.98      3576
   macro avg       0.99      0.80      0.87      3576
weighted avg       0.98      0.98      0.98      3576



In [18]:
import os
os.makedirs("models", exist_ok=True)


In [19]:
joblib.dump(model, 'models/fake_job_model.pkl')
joblib.dump(vectorizer, 'models/tfidf_vectorizer.pkl')


['models/tfidf_vectorizer.pkl']