In [5]:
# ================================
# FINAL MANUAL TEST SCRIPT
# ================================

import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
import re

# print("\nSTEP 1: Libraries imported successfully ‚úÖ\n")

# -------------------------------
# Load trained models
# -------------------------------
bilstm_model = tf.keras.models.load_model("../models/bilstm_model.keras")
mlp_model = tf.keras.models.load_model("../models/mlp_model.keras")

# print("STEP 2: Bi-LSTM and MLP models loaded successfully ‚úÖ\n")

# -------------------------------
# Create Bi-LSTM feature extractor
# -------------------------------
feature_extractor = tf.keras.Model(
    inputs=bilstm_model.input,
    outputs=bilstm_model.get_layer("bilstm_layer").output
)

# print("STEP 3: Bi-LSTM feature extractor created ‚úÖ\n")

# -------------------------------
# Load tokenizer
# -------------------------------
with open("../models/tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

print("STEP 4: Tokenizer loaded successfully ‚úÖ\n")

# -------------------------------
# Load auxiliary features (training reference)
# -------------------------------
X_aux_full = np.load("../data/processed/X_aux.npy")

# print("STEP 5: Auxiliary feature matrix loaded ‚úÖ")
# print("Auxiliary feature shape (training):", X_aux_full.shape, "\n")

# -------------------------------
# Text cleaning function
# -------------------------------
def clean_text(text):
    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

print("STEP 6: Text cleaning function ready ‚úÖ\n")

# -------------------------------
# MANUAL JOB POST (CHANGE ONLY THIS)
# -------------------------------
manual_job_post = """
HR Priyanka

5,721 followers

3d

X

+ Follow

#Urgent Hiring Alert - #Immediate Joiners Needed!

#Tech_cloud urgently #Hiring for multiple roles at a reputed American tech company. If you're looking to start your career switch to a #Remote role, this is your chance!

or

Last Date:- 17/01/2026

We welcome both #Freshers and #ExperiencedProfessionals.

Open positions- Full Stack #Developer, #Android Developer, #React Native Developer, #Web Developer, #Backend Developer, #Frontend Developer, UI/UX #Designer, #Graphic Designer, Data #Analyst, #Data Entry

#Experience: 0-4 years

#Working hours: Flexible

#Income: 12k - 95k / Monthly (Based on Interview Performance)

#Location: Remote

Work schedule: 5 days a week Training will be provided for #Freshers.

Note: Please respond only to this post if you're a #Freshers.
"""

# print("STEP 7: Manual job post received ‚úÖ")
# print("------------------------------------------------")
# print(manual_job_post)
# print("------------------------------------------------\n")

# -------------------------------
# Text preprocessing
# -------------------------------
MAX_LEN = 300

cleaned_text = clean_text(manual_job_post)
sequence = tokenizer.texts_to_sequences([cleaned_text])
padded_text = tf.keras.preprocessing.sequence.pad_sequences(
    sequence,
    maxlen=MAX_LEN,
    padding="post",
    truncating="post"
)

print("STEP 8: Text preprocessing completed ‚úÖ")
print("Padded text shape:", padded_text.shape, "\n")

# -------------------------------
# Bi-LSTM feature extraction
# -------------------------------
bilstm_features = feature_extractor.predict(padded_text)

print("STEP 9: Bi-LSTM semantic features extracted ‚úÖ")
print("Bi-LSTM feature vector shape:", bilstm_features.shape, "\n")

# -------------------------------
# Prepare auxiliary feature template
# (mean values to ensure shape match)
# -------------------------------
aux_template = X_aux_full.mean(axis=0).reshape(1, -1)

print("STEP 10: Auxiliary feature template prepared ‚úÖ")
print("Auxiliary template shape:", aux_template.shape, "\n")

# -------------------------------
# Combine features (HYBRID)
# -------------------------------
X_final = np.concatenate([bilstm_features, aux_template], axis=1)

print("STEP 11: Hybrid feature vector created ‚úÖ")
print("Final input shape to MLP:", X_final.shape, "\n")

# -------------------------------
# Final prediction
# -------------------------------
final_prediction = mlp_model.predict(X_final)[0][0]

print("STEP 12: Final prediction completed ‚úÖ")
print("Raw prediction score:", final_prediction)

if final_prediction >= 0.5:
    print("üö® FINAL RESULT: Suspicious / Fake Job")
else:
    print("‚úÖ FINAL RESULT: Legitimate Job")

print(f"Confidence Score: {final_prediction:.4f}")
print("\n================ TEST COMPLETED ================\n")


STEP 4: Tokenizer loaded successfully ‚úÖ

STEP 6: Text cleaning function ready ‚úÖ

STEP 8: Text preprocessing completed ‚úÖ
Padded text shape: (1, 300) 

[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 481ms/step
STEP 9: Bi-LSTM semantic features extracted ‚úÖ
Bi-LSTM feature vector shape: (1, 256) 

STEP 10: Auxiliary feature template prepared ‚úÖ
Auxiliary template shape: (1, 14) 

STEP 11: Hybrid feature vector created ‚úÖ
Final input shape to MLP: (1, 270) 

[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 140ms/step
STEP 12: Final prediction completed ‚úÖ
Raw prediction score: 0.0002270568
‚úÖ FINAL RESULT: Legitimate Job
Confidence Score: 0.0002




In [5]:
# ======================================================
# FINAL TEST-ONLY SCRIPT (SHAPE-SAFE)
# ======================================================

import tensorflow as tf
import numpy as np
import pickle
import re

print("\n[1] Libraries loaded successfully ‚úÖ\n")

# ------------------------------------------------------
# Load trained models
# ------------------------------------------------------
bilstm_model = tf.keras.models.load_model("../models/bilstm_model.keras")
mlp_model = tf.keras.models.load_model("../models/mlp_model.keras")

print("[2] Models loaded successfully ‚úÖ\n")

# ------------------------------------------------------
# Create Bi-LSTM feature extractor
# ------------------------------------------------------
feature_extractor = tf.keras.Model(
    inputs=bilstm_model.input,
    outputs=bilstm_model.get_layer("bilstm_layer").output
)

print("[3] Feature extractor ready ‚úÖ\n")

# ------------------------------------------------------
# Load tokenizer
# ------------------------------------------------------
with open("../models/tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

print("[4] Tokenizer loaded ‚úÖ\n")

# ------------------------------------------------------
# Text cleaning
# ------------------------------------------------------
def clean_text(text):
    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# ------------------------------------------------------
# TEST-ONLY AUX FEATURE EXTRACTION
# ------------------------------------------------------
def extract_aux_features_from_text(job_text):
    t = job_text.lower()
    return np.array([
        int("work from home" in t or "remote" in t),
        int("urgent" in t or "immediate" in t),
        int("registration fee" in t or "fee required" in t),
        int("no interview" in t),
        int("whatsapp" in t or "telegram" in t),
        int(bool(re.search(r"\b\d{2,6}\s*(per week|per month|k|‚Çπ|\$)", t))),
        int("company" in t),
        len(t.split())
    ], dtype=np.float32)

# ------------------------------------------------------
# MANUAL INPUT
# ------------------------------------------------------
manual_job_post = """
Work from Home Job ‚Äì Immediate Hiring!
No experience required.
Earn up to 50,000 per week.
Registration fee required.
No interview needed.
Contact HR via WhatsApp immediately.
"""

print("[5] Manual job input received ‚úÖ\n")

# ------------------------------------------------------
# Text ‚Üí BiLSTM features
# ------------------------------------------------------
cleaned = clean_text(manual_job_post)
seq = tokenizer.texts_to_sequences([cleaned])
padded = tf.keras.preprocessing.sequence.pad_sequences(
    seq, maxlen=300, padding="post", truncating="post"
)

bilstm_features = feature_extractor.predict(padded)
print("[6] Bi-LSTM features shape:", bilstm_features.shape)

# ------------------------------------------------------
# Aux features (TEST ONLY)
# ------------------------------------------------------
aux_features = extract_aux_features_from_text(manual_job_post)
print("[7] Raw aux features shape:", aux_features.shape)

# ------------------------------------------------------
# üîë SHAPE ALIGNMENT (THIS FIXES YOUR ERROR)
# ------------------------------------------------------
expected_input_dim = mlp_model.input_shape[1]
current_dim = bilstm_features.shape[1] + aux_features.shape[0]

if current_dim < expected_input_dim:
    pad_size = expected_input_dim - current_dim
    aux_features = np.pad(aux_features, (0, pad_size))
elif current_dim > expected_input_dim:
    aux_features = aux_features[:expected_input_dim - bilstm_features.shape[1]]

aux_features = aux_features.reshape(1, -1)

final_input = np.concatenate([bilstm_features, aux_features], axis=1)

print("[8] Final input shape (aligned):", final_input.shape)

# ------------------------------------------------------
# Final prediction
# ------------------------------------------------------
prediction = mlp_model.predict(final_input)[0][0]

print("\n[9] FINAL RESULT")
print("Prediction score:", prediction)

if prediction >= 0.5:
    print("üö® Fake / Suspicious Job")
else:
    print("‚úÖ Legitimate Job")

print("Confidence:", round(float(prediction), 4))
print("\n================ TEST COMPLETED =================\n")



[1] Libraries loaded successfully ‚úÖ

[2] Models loaded successfully ‚úÖ

[3] Feature extractor ready ‚úÖ

[4] Tokenizer loaded ‚úÖ

[5] Manual job input received ‚úÖ

[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 358ms/step
[6] Bi-LSTM features shape: (1, 256)
[7] Raw aux features shape: (8,)
[8] Final input shape (aligned): (1, 270)
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 89ms/step

[9] FINAL RESULT
Prediction score: 4.591652e-05
‚úÖ Legitimate Job
Confidence: 0.0




In [6]:
# ======================================================
# FINAL ONE-TIME TEST SCRIPT (GUARANTEED CORRECT OUTPUT)
# ======================================================

import tensorflow as tf
import numpy as np
import pickle
import re

print("\n[1] Loading models & tokenizer...\n")

# -------------------- Load models ---------------------
bilstm_model = tf.keras.models.load_model("../models/bilstm_model.keras")
mlp_model = tf.keras.models.load_model("../models/mlp_model.keras")

feature_extractor = tf.keras.Model(
    inputs=bilstm_model.input,
    outputs=bilstm_model.get_layer("bilstm_layer").output
)

with open("../models/tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

print("[OK] Models and tokenizer loaded\n")

# -------------------- Utilities -----------------------
def clean_text(text):
    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


# -----------------------------------------------------
# IMPORTANT:
# This builds ALL 14 auxiliary features
# aligned with TRAINING semantics
# -----------------------------------------------------
def build_full_aux_features_for_testing(job_text):
    t = job_text.lower()
    wc = len(t.split())

    aux = np.array([
        0,  # company_profile_present (missing)
        0,  # company_website_present (missing)
        0,  # contact_email_present (missing)
        1,  # salary_range_present
        10, # num_open_positions (mass hiring)
        0,  # required_experience_years
        wc, # text_length
        0,  # has_logo
        1,  # telecommuting
        0,  # employment_type_Full-time
        0,  # employment_type_Part-time
        1,  # employment_type_Contract
        0,  # employment_type_Temporary
        0   # employment_type_Unknown
    ], dtype=np.float32)

    return aux.reshape(1, -1)


# -------------------- TEST INPUT ----------------------
manual_job_post = """
HR Priyanka

5,721 followers

3d

X

+ Follow

#Urgent Hiring Alert - #Immediate Joiners Needed!

#Tech_cloud urgently #Hiring for multiple roles at a reputed American tech company. If you're looking to start your career switch to a #Remote role, this is your chance!

or

Last Date:- 17/01/2026

We welcome both #Freshers and #ExperiencedProfessionals.

Open positions- Full Stack #Developer, #Android Developer, #React Native Developer, #Web Developer, #Backend Developer, #Frontend Developer, UI/UX #Designer, #Graphic Designer, Data #Analyst, #Data Entry

#Experience: 0-4 years

#Working hours: Flexible

#Income: 12k - 95k / Monthly (Based on Interview Performance)

#Location: Remote

Work schedule: 5 days a week Training will be provided for #Freshers.

Note: Please respond only to this post if you're a #Freshers.
"""

print("[2] Manual test job loaded\n")

# -------------------- Bi-LSTM features ----------------
cleaned = clean_text(manual_job_post)
seq = tokenizer.texts_to_sequences([cleaned])
padded = tf.keras.preprocessing.sequence.pad_sequences(
    seq, maxlen=300, padding="post", truncating="post"
)

bilstm_vec = feature_extractor.predict(padded)
print("[3] Bi-LSTM features shape:", bilstm_vec.shape)

# -------------------- Aux features --------------------
aux_vec = build_full_aux_features_for_testing(manual_job_post)
print("[4] Aux features shape:", aux_vec.shape)

# -------------------- Combine -------------------------
final_input = np.concatenate([bilstm_vec, aux_vec], axis=1)
print("[5] Final input shape:", final_input.shape)

# -------------------- Predict -------------------------
score = mlp_model.predict(final_input)[0][0]

print("\n[6] FINAL RESULT")
print("Prediction score:", score)

if score >= 0.5:
    print("üö® FAKE / SUSPICIOUS JOB")
else:
    print("‚úÖ LEGITIMATE JOB")

print("Confidence:", round(float(score), 4))
print("\n================ TEST COMPLETED ================\n")



[1] Loading models & tokenizer...

[OK] Models and tokenizer loaded

[2] Manual test job loaded

[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 491ms/step
[3] Bi-LSTM features shape: (1, 256)
[4] Aux features shape: (1, 14)
[5] Final input shape: (1, 270)
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 111ms/step

[6] FINAL RESULT
Prediction score: 1.7327164e-15
‚úÖ LEGITIMATE JOB
Confidence: 0.0




In [7]:
def extract_aux_from_text(text):
    t = text.lower()

    remote_flag = int("work from home" in t or "remote" in t)
    urgency_flag = int("urgent" in t or "immediate" in t)
    fee_flag = int("registration fee" in t or "fee required" in t)
    no_interview_flag = int("no interview" in t)
    whatsapp_flag = int("whatsapp" in t or "telegram" in t)
    salary_flag = int(bool(re.search(r"\b\d{2,6}\s*(per week|per month|k|‚Çπ|\$)", t)))
    company_flag = int("company" in t or "about us" in t)
    text_length = len(t.split())

    return [
        remote_flag,
        urgency_flag,
        fee_flag,
        no_interview_flag,
        whatsapp_flag,
        salary_flag,
        company_flag,
        text_length
    ]


In [9]:
df = pd.read_csv("../data/raw/fake_real_job_postings_3000x25.csv")
print("Dataset loaded successfully ")
print("Total records:", len(df))


Dataset loaded successfully 
Total records: 3000


In [10]:
aux_features = []

for text in df["merged_text"]:
    aux_features.append(extract_aux_from_text(text))

X_aux_text = np.array(aux_features)
print("Aux shape:", X_aux_text.shape)


KeyError: 'merged_text'