<a href="https://colab.research.google.com/github/Kusumapriya58/Fake-job-detection/blob/main/Fake_job_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import random
import re
import string
from collections import Counter

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             precision_score, recall_score, f1_score)
from sklearn.model_selection import train_test_split

# NLP utilities
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure required NLTK data is available. These calls download inside Python (not shell).
# 'punkt_tab' is requested alongside 'punkt': recent NLTK releases load the
# tokenizer models behind nltk.word_tokenize from 'punkt_tab', and without it
# the tokenization step fails with a LookupError on a fresh environment.
# NOTE(review): on older NLTK releases the extra request is expected to be a
# harmless no-op — confirm against the pinned NLTK version.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

# Configuration
RANDOM_STATE = 42  # seed reused for the train/test split, the classifier and sampling
TEST_SIZE = 0.20  # fraction of labeled rows held out for evaluation
MAX_FEATURES = 5000  # cap on the TF-IDF vocabulary size
MODEL_OUTPUT = 'fake_job_model.pkl'  # where the trained classifier is written
VECTORIZER_OUTPUT = 'tfidf_vectorizer.pkl'  # where the fitted vectorizer is written

# Helper functions

# Built once at import time instead of on every call.
# Each ASCII punctuation character is mapped to a space (not deleted) so that
# words separated only by punctuation, e.g. "skills,experience", stay separate.
_PUNCT_TO_SPACE = str.maketrans({ch: ' ' for ch in string.punctuation})
_HTML_TAG_RE = re.compile(r'<[^>]+>')
_URL_RE = re.compile(r'http\S+|www\.\S+')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')


def basic_text_clean(text: str) -> str:
    """Lowercase *text* and strip HTML tags, URLs, digits and punctuation.

    Every removed span is replaced by a single space so neighbouring words are
    never glued together; runs of whitespace are then collapsed and the result
    is trimmed.

    Args:
        text: Raw text. Any non-string value (e.g. NaN or None) is accepted.

    Returns:
        The cleaned text, or an empty string when *text* is not a string.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Tags go first so URLs inside attributes disappear together with the tag.
    text = _HTML_TAG_RE.sub(' ', text)
    text = _URL_RE.sub(' ', text)
    text = _DIGITS_RE.sub(' ', text)
    text = text.translate(_PUNCT_TO_SPACE)
    return _WHITESPACE_RE.sub(' ', text).strip()


def tokenize_and_lemmatize(text: str, lemmatizer: WordNetLemmatizer, stop_words: set) -> str:
    """Return *text* with stopwords dropped and remaining tokens lemmatized.

    The text is split with ``nltk.word_tokenize``; each token is stripped of
    surrounding whitespace, discarded when empty or present in *stop_words*,
    and otherwise passed through ``lemmatizer.lemmatize``. The surviving
    lemmas are joined with single spaces. A falsy *text* yields ``''``.
    """
    if not text:
        return ''
    stripped_tokens = (raw.strip() for raw in nltk.word_tokenize(text))
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in stripped_tokens
        if word and word not in stop_words
    )

# Part 1 — Data Understanding

# 1. Load dataset (fail early with a helpful message when the CSV is absent)
DATA_PATH = 'fake_job_postings.csv'
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset file not found at {DATA_PATH}. Place the CSV in the same folder as this script.")

df = pd.read_csv(DATA_PATH)

# Report the dataset's dimensions and column names
print('\n=== Dataset overview ===')
print('Shape:', df.shape)
print('Columns:', list(df.columns))

# Number of missing cells in each column
missing_per_col = df.isna().sum()
print('\nMissing values per column:')
print(missing_per_col)

# Locate the label column: the first well-known name present in the frame wins.
# The Kaggle dataset typically has a column named `fraudulent` with values '0'/'1' or 0/1.
label_candidates = ['fraudulent', 'fraud', 'is_fake', 'fraudulent?']
label_col = next((name for name in label_candidates if name in df.columns), None)

# Fallback heuristic when no known name matched: take the first column that
# has at most three distinct values and an int, float or object dtype.
if label_col is None:
    label_col = next(
        (
            name
            for name in df.columns
            if df[name].nunique() <= 3 and df[name].dtype in [int, float, object]
        ),
        None,
    )

if label_col is None:
    raise ValueError('Cannot find a label column in dataset. Please ensure the dataset matches Kaggle fake_job_postings format.')

# The values of this column are normalized to 0 (real) / 1 (fake) further below
print('\nLabel column detected:', label_col)

# Textual spellings accepted for each class, compared case-insensitively.
_FALSE_LABEL_TOKENS = frozenset({'0', '0.0', 'false', 'no', 'f', 'n'})
_TRUE_LABEL_TOKENS = frozenset({'1', '1.0', 'true', 'yes', 't', 'y'})


def normalize_label(v):
    """Map a raw label value to 0 (real) or 1 (fake).

    Args:
        v: A single cell from the label column (string, number, bool or missing).

    Returns:
        ``np.nan`` for missing values; 0 or 1 for recognised textual spellings
        (matched after stripping whitespace and ignoring case); otherwise the
        value converted via ``int(float(v))``. When that conversion fails, any
        truthy value becomes 1 and any falsy value becomes 0.
    """
    if pd.isna(v):
        return np.nan
    if isinstance(v, str):
        v = v.strip()
        # casefold() so 'FALSE' / 'True' / 'F' are recognised instead of
        # dropping into the "non-empty means fake" fallback below.
        token = v.casefold()
        if token in _FALSE_LABEL_TOKENS:
            return 0
        if token in _TRUE_LABEL_TOKENS:
            return 1
    try:
        return int(float(v))
    except (TypeError, ValueError, OverflowError):
        # fallback: treat anything non-empty as 1
        return 1 if v else 0

# Store the normalized label in a dedicated `fraud` column
df['fraud'] = df[label_col].apply(normalize_label)

# How many rows fall into each label value (missing labels counted too)
distribution = df['fraud'].value_counts(dropna=False)
print('\nDistribution (fraud label counts):')
print(distribution)

# Three short observations about the dataset, printed as a numbered list
print('\n=== Three dataset observations / insights ===')

# Observation 1: columns where more than 10% of the rows are missing
missing_threshold = len(df) * 0.1
cols_with_many_missing = missing_per_col[missing_per_col > missing_threshold].index.tolist()
insights = [
    f"Columns with >10% missing values: {cols_with_many_missing}"
    if cols_with_many_missing
    else "No column has more than 10% missing values."
]


def _share_missing(values):
    """Return the fraction of missing entries in *values*."""
    return values.isna().mean()


# Observation 2: share of missing company_profile within each label group
if 'company_profile' not in df.columns:
    insights.append("No 'company_profile' column in dataset to check missing patterns.")
else:
    cp_missing_by_fraud = df.groupby('fraud')['company_profile'].apply(_share_missing)
    insights.append(f"Proportion of missing company_profile by label: {cp_missing_by_fraud.to_dict()}")

# Observation 3: any column whose name mentions "salary"
salary_cols = [col for col in df.columns if 'salary' in col.lower()]
if not salary_cols:
    insights.append("No explicit salary columns detected; salary may be embedded in text descriptions.")
else:
    insights.append(f"Salary-related columns detected: {salary_cols} — inspect ranges manually as needed.")

for position, observation in enumerate(insights, start=1):
    print(f"{position}. {observation}")

# Part 2 — Text Cleaning & Preprocessing
print('\n=== Part 2: Text cleaning and preprocessing ===')

# The 'description' column is the only text field processed here
if 'description' not in df.columns:
    raise ValueError("The dataset does not contain a 'description' column.")

# Missing descriptions become empty strings so every row can be processed
df['description'] = df['description'].fillna('')


def _raw_word_count(value) -> int:
    """Count whitespace-separated words in *value* after coercing it to str."""
    return len(str(value).split())


# Average word count before cleaning
df['raw_word_count'] = df['description'].apply(_raw_word_count)
avg_before = df['raw_word_count'].mean()
print('Average word count (raw description):', round(avg_before, 2))

# NLP tools shared by every row
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def _lemmatize_row(cleaned_text):
    """Apply tokenize_and_lemmatize with the shared lemmatizer and stopword set."""
    return tokenize_and_lemmatize(cleaned_text, lemmatizer, stop_words)


print('Cleaning descriptions (this may take a little while depending on dataset size)...')
# Step 1: character-level cleanup; step 2: tokenize, drop stopwords, lemmatize
df['description_basic_clean'] = df['description'].apply(basic_text_clean)
df['clean_description'] = df['description_basic_clean'].apply(_lemmatize_row)

# Average word count after cleaning
df['clean_word_count'] = df['clean_description'].apply(lambda cleaned: len(cleaned.split()))
avg_after = df['clean_word_count'].mean()
print('Average word count (cleaned):', round(avg_after, 2))

# Show the first row whose description is not blank, raw next to cleaned
has_text = df['description'].str.strip() != ''
sample_idx = df.index[has_text.to_numpy()].tolist()
if not sample_idx:
    print('No non-empty descriptions to display samples for.')
else:
    i = sample_idx[0]
    print('\nExample - raw vs cleaned description (row index =', i, ')')
    print('\nRAW:')
    print(df.at[i, 'description'][:1000])
    print('\nCLEANED:')
    print(df.at[i, 'clean_description'][:1000])

# Part 3 — Feature Extraction (TF-IDF)
print('\n=== Part 3: TF-IDF feature extraction ===')

# Learn the vocabulary and build the document-term matrix from the cleaned text
corpus = df['clean_description'].fillna('')
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
X_tfidf = vectorizer.fit_transform(corpus)
print('TF-IDF matrix shape:', X_tfidf.shape)

feature_names = vectorizer.get_feature_names_out()
print('\n10 sample feature names:')
print(list(feature_names[:10]))

# Rank terms by their TF-IDF weight summed over all documents and show the
# fifteen highest
column_totals = X_tfidf.sum(axis=0)
global_tfidf_sum = np.asarray(column_totals).ravel()
top15_idx = np.argsort(global_tfidf_sum)[::-1][:15]
print('\nTop 15 words by global TF-IDF score:')
for rank, term_idx in enumerate(top15_idx, start=1):
    term = feature_names[term_idx]
    score = global_tfidf_sum[term_idx]
    print(f"{rank}. {term} (score={score:.4f})")

# Part 4 — Model Building
print('\n=== Part 4: Model building & evaluation ===')

# Keep only rows that have a usable label
mask = df['fraud'].notna()
labeled_text = df.loc[mask, 'clean_description'].fillna('')
y = df.loc[mask, 'fraud'].astype(int).values

# Split the cleaned TEXT rather than the Part 3 matrix. `X_tfidf` was fit on
# every row, so its vocabulary and IDF weights already contain information
# from the rows that end up in the test set; evaluating on features derived
# from it would overstate performance.
text_train, text_test, y_train, y_test = train_test_split(
    labeled_text, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# Fit the vectorizer on the training portion only, then apply it to the test
# portion. `vectorizer` is rebound on purpose: everything downstream (analysis
# and persistence) must use the same feature space the classifier was trained
# on. `X_tfidf` from Part 3 remains an exploratory, full-corpus matrix and
# must not be fed to `clf`.
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)
feature_names = vectorizer.get_feature_names_out()  # keep aligned with clf.coef_
X = vectorizer.transform(labeled_text)  # all labeled rows in the model's feature space
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

# Train logistic regression
clf = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
clf.fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = clf.predict(X_test)

# Metrics (precision/recall/F1 are reported for the positive class, label 1;
# zero_division=0 avoids warnings when that class is never predicted)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print('\nAccuracy:', round(acc, 4))
print('Precision:', round(prec, 4))
print('Recall:', round(rec, 4))
print('F1-score:', round(f1, 4))

# Confusion matrix and classification report
cm = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix (rows=true, cols=predicted):')
print(cm)

print('\nClassification report:')
print(classification_report(y_test, y_pred, zero_division=0))

# Brief interpretation (2-3 sentences) — printed as plain text lines
print('\nModel interpretation:')
print('The model uses TF-IDF features and Logistic Regression. Accuracy and F1 indicate overall performance; inspect precision/recall for the fake class to understand false positives vs false negatives.')

# Part 5 — Model Analysis
print('\n=== Part 5: Model analysis ===')

# Vectorize and score every labeled row exactly once; the sample printout and
# the manual inspection below both read from these arrays instead of
# re-transforming the same text.
# NOTE: these rows include the ones the classifier was trained on, so the
# probabilities shown are not purely out-of-sample.
labeled_index = df.index[mask.values]
X_labeled = vectorizer.transform(df.loc[mask, 'clean_description'])
preds_all = clf.predict(X_labeled)
proba_all = clf.predict_proba(X_labeled)
# Maps a DataFrame index label to its row position in preds_all / proba_all
row_position = {label: pos for pos, label in enumerate(labeled_index)}

# Probability of being fake for 5 reproducibly sampled labeled rows
all_indices = labeled_index.tolist()
random.seed(RANDOM_STATE)
sample_five = random.sample(all_indices, min(5, len(all_indices)))
print('\nFive random examples with predicted probability of being fake:')
probs = proba_all[[row_position[idx] for idx in sample_five]]
for idx, p in zip(sample_five, probs):
    prob_fake = p[1]  # second column, read as the probability of label 1 (fake)
    print(f"Index {idx} — prob_fake: {prob_fake:.4f} — true label: {int(df.at[idx, 'fraud'])}")

# Pick the first labeled row predicted fake and the first predicted real
# (no preference is given to misclassified or high-confidence rows)
predicted_fake_indices = [i for i, p in zip(labeled_index, preds_all) if p == 1]
predicted_real_indices = [i for i, p in zip(labeled_index, preds_all) if p == 0]

inspect_examples = []
if predicted_fake_indices:
    inspect_examples.append(('predicted_fake', predicted_fake_indices[0]))
if predicted_real_indices:
    inspect_examples.append(('predicted_real', predicted_real_indices[0]))

print('\nManual inspection of one predicted fake and one predicted real:')
for labelname, idx in inspect_examples:
    true_label = int(df.at[idx, 'fraud'])
    prob_fake = float(proba_all[row_position[idx], 1])
    raw_desc = df.at[idx, 'description']
    clean_desc = df.at[idx, 'clean_description']
    print('\n---')
    print(f"Index {idx} — {labelname} — prob_fake={prob_fake:.4f} — true_label={true_label}")
    print('\nRaw description (first 700 chars):')
    print(raw_desc[:700])
    print('\nCleaned description (first 700 chars):')
    print(clean_desc[:700])
    # "Consistent" when thresholding prob_fake at 0.5 reproduces the true label
    predicted_is_fake = prob_fake > 0.5
    reasoning = 'Consistent' if predicted_is_fake == (true_label == 1) else 'Potential mismatch'
    print('\nQuick assessment:', reasoning)

# Write the trained classifier and the fitted vectorizer to disk, model first
for artifact, destination in ((clf, MODEL_OUTPUT), (vectorizer, VECTORIZER_OUTPUT)):
    joblib.dump(artifact, destination)
print(f"\nSaved model to {MODEL_OUTPUT} and vectorizer to {VECTORIZER_OUTPUT}.")

# Final status line of the script
print('\nPipeline complete.')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



=== Dataset overview ===
Shape: (17880, 18)
Columns: ['job_id', 'title', 'location', 'department', 'salary_range', 'company_profile', 'description', 'requirements', 'benefits', 'telecommuting', 'has_company_logo', 'has_questions', 'employment_type', 'required_experience', 'required_education', 'industry', 'function', 'fraudulent']

Missing values per column:
job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

Label column detected: fraudulent

Distribution (fraud label counts):
fraud
0    17014
1      866
Name