# 🧠 Fake Job Posting Detection using NLP + ML
Detect fraudulent job postings using text classification with TF-IDF and ML algorithms.

## 📦 Load Dataset & Import Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

# Read the job-postings CSV from the working directory into a DataFrame,
# then preview its first five rows as the cell output.
df = pd.read_csv("fake_job_postings.csv")
df.head(n=5)

## 🔍 Explore Dataset

In [None]:
# Print the table dimensions, the per-column summary, and the label counts.
print("Shape:", df.shape)
df.info()
target_counts = df['fraudulent'].value_counts()
print("\nTarget Value Counts:\n", target_counts)

# Bar chart with one bar per value of the 'fraudulent' label.
ax = sns.countplot(data=df, x='fraudulent')
ax.set_title("Fake vs Real Job Postings")
plt.show()

## 🧹 Clean the Data


In [None]:
# Columns that are not used to build the text feature below.
# errors='ignore' lets the drop succeed even if some of them are absent.
unused_cols = [
    "job_id", "salary_range", "telecommuting", "has_company_logo",
    "has_questions", "employment_type", "required_experience",
    "required_education", "industry", "function",
]
df = df.drop(columns=unused_cols, errors='ignore')

# Postings with no description are discarded.
df = df.dropna(subset=['description'])

# Replace missing values in the free-text fields with empty strings so the
# concatenation below never produces NaN.
text_cols = ['title', 'company_profile', 'description', 'requirements', 'benefits']
df[text_cols] = df[text_cols].fillna('')

# One space-separated document per posting, fields joined in text_cols order.
df['text'] = df['title'].str.cat(df[text_cols[1:]], sep=" ")

## 🧼 Clean the Text

In [None]:
# Fetch the NLTK stopword corpus and prepare the shared helpers used by
# clean_text: an English stopword set and a Porter stemmer.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


def clean_text(text):
    """Normalize one posting's text into a string of stemmed tokens.

    The text is lowercased, every character other than a-z and whitespace is
    deleted (nothing is inserted in its place), the result is split on
    whitespace, tokens found in ``stop_words`` are dropped, and the remaining
    tokens are Porter-stemmed and re-joined with single spaces.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    stems = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)


# Apply the normalization to every combined posting text.
df['clean_text'] = df['text'].map(clean_text)

## ✨ Vectorize Text using TF-IDF

In [None]:
# TF-IDF features over the cleaned text, with the vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(max_features=5000)

# Keep the matrix in the sparse form returned by fit_transform. Densifying it
# with .toarray() allocates a full rows x 5000 float64 array even though most
# entries are zero. train_test_split and the scikit-learn / XGBoost estimators
# used in this notebook accept SciPy sparse input directly.
# NOTE(review): XGBoost treats entries absent from a sparse matrix as missing
# rather than as 0.0, so its scores may differ slightly from a dense run.
# NOTE(review): the vectorizer is fit on every row before the train/test
# split, so IDF statistics include test documents; consider fitting on the
# training split only.
X = tfidf.fit_transform(df['clean_text'])
y = df['fraudulent']

## 🔀 Train-Test Split

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both splits, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## 🤖 Train Machine Learning Models


In [None]:
# Candidate classifiers, keyed by the display name printed above each report.
# The forest gets a fixed seed so its metrics (and the model saved later) are
# repeatable between runs, in line with the seeded train/test split.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
}

# Fit every candidate on the training split and print its classification
# report on the held-out test split.
for name, model in models.items():
    print(f"\n{name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

## 📈 Evaluate the Best Model (ROC & AUC)


In [None]:
# The Random Forest is selected by name as the model to evaluate.
best_model = models['Random Forest']

# Column 1 of predict_proba is the predicted probability of the positive class.
y_proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)

# ROC curve, with the diagonal drawn as a dashed reference line.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, label='ROC Curve')
ax.plot([0, 1], [0, 1], linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()

auc_score = roc_auc_score(y_test, y_proba)
print("AUC Score:", auc_score)

## 💾 Save the Trained Model and Vectorizer using joblib


In [None]:
import joblib

# Save the best model
joblib.dump(best_model, "fake_job_model.pkl")

# Save the vectorizer: the model can only score new text that has been
# transformed by the same fitted TF-IDF vocabulary, so it must be persisted
# alongside the model.
# NOTE(review): confirm this filename matches what app.py loads.
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

In [None]:
# Notebook shell escape: start the Streamlit app defined in app.py.
# NOTE(review): app.py is not part of this notebook; presumably it loads the
# artifacts saved above -- confirm the expected filenames there.
!streamlit run app.py