In [None]:
import pandas as pd
import numpy as np

# Build the auxiliary (non-text) feature table `X_aux` from the raw postings CSV.
df = pd.read_csv("../data/raw/fake_real_job_postings_3000x25.csv")

# Presence flags: 1 when the optional field is filled in, 0 when it is missing.
optional_fields = ["company_profile", "company_website", "contact_email", "salary_range"]
presence_cols = [f"{field}_present" for field in optional_fields]
for field, flag in zip(optional_fields, presence_cols):
    df[flag] = df[field].notna().astype(int)

# Columns taken from the CSV as-is; a missing value is replaced by 0.
binary_features = df[["has_logo", "telecommuting"]].fillna(0)

# Numeric columns: a missing value is replaced by 0.
numeric_cols = ["num_open_positions", "required_experience_years", "text_length"]
df[numeric_cols] = df[numeric_cols].fillna(0)

# One-hot encode the employment type; missing values get their own "Unknown" level.
df["employment_type"] = df["employment_type"].fillna("Unknown")
employment_type_encoded = pd.get_dummies(
    df["employment_type"],
    prefix="employment_type",
)

# Column order: presence flags, numeric columns, binary columns, one-hot columns.
X_aux = pd.concat(
    [df[presence_cols + numeric_cols], binary_features, employment_type_encoded],
    axis=1,
)

# Every source column was either filled or derived from notna()/get_dummies,
# so a remaining NaN means the construction above went wrong.
assert not X_aux.isna().any().any(), "X_aux still contains missing values"

# Last expression of the cell, so the preview is actually rendered.
X_aux.head()


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every column of X_aux with a default StandardScaler and
# persist the scaled matrix as a .npy file.
# NOTE(review): the scaler is fitted on the full table here, while the
# train/validation/test split only happens in a later cell — confirm this
# is intended (test-set statistics influence the scaling).
scaler = StandardScaler().fit(X_aux)
X_aux_scaled = scaler.transform(X_aux)

np.save("../data/processed/X_aux.npy", X_aux_scaled)

import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
import re

    
# Restore the saved Bi-LSTM model from disk.
bilstm_model = tf.keras.models.load_model("../models/bilstm_model.keras")
print("Bi-LSTM model loaded successfully ✅")

# Sub-model sharing the loaded model's input, but whose output is taken at
# the layer named "bilstm_layer".
bilstm_output = bilstm_model.get_layer("bilstm_layer").output
feature_extractor = tf.keras.Model(
    inputs=bilstm_model.input,
    outputs=bilstm_output,
)

print("Feature extractor created from Bi-LSTM layer ")

# NOTE(review): unpickling can execute arbitrary code — only load a
# tokenizer file that was produced by this project.
with open("../models/tokenizer.pkl", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

print("Tokenizer loaded successfully ")

# Re-read the raw postings; this rebinds `df` to a fresh frame.
df = pd.read_csv("../data/raw/fake_real_job_postings_3000x25.csv")
print("Dataset loaded successfully ")
print(f"Total records: {len(df)}")

# Missing text fields become empty strings so the concatenation below works.
text_columns = ["job_title", "job_description", "requirements", "benefits"]
df[text_columns] = df[text_columns].fillna("")

# Join the text columns left to right, separated by single spaces.
merged_text = df[text_columns[0]]
for column in text_columns[1:]:
    merged_text = merged_text + " " + df[column]
df["merged_text"] = merged_text

print("Text columns merged successfully ")
print("Sample merged text:\n")
merged_preview = df["merged_text"].iloc[0][:300]
print(f"{merged_preview} ...")

def clean_text(text):
    """Normalize a raw text string for tokenization.

    The text is lower-cased, then three substitution passes each replace
    their matches with a single space: ``<...>`` tag-like spans, tokens
    starting with ``http``, and every character that is neither a
    lowercase ASCII letter nor whitespace. Finally whitespace runs are
    collapsed to one space and the result is stripped.

    Args:
        text: The string to clean (must support ``.lower()``).

    Returns:
        The cleaned string.
    """
    # Order matters: tags and URLs must go before the letters-only pass,
    # which would otherwise destroy the "<", ">" and ":" they rely on.
    substitution_passes = (
        (r"<.*?>", " "),
        (r"http\S+", " "),
        (r"[^a-z\s]", " "),
    )
    cleaned = text.lower()
    for pattern, filler in substitution_passes:
        cleaned = re.sub(pattern, filler, cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()

# Clean every merged posting text with clean_text.
df["clean_text"] = df["merged_text"].map(clean_text)

print("Text cleaned successfully")
print("Sample cleaned text:\n")
cleaned_preview = df["clean_text"].iloc[0][:300]
print(f"{cleaned_preview} ...")

# Fixed sequence length fed to the feature extractor.
# NOTE(review): presumably has to match the length the Bi-LSTM was trained
# with — confirm against the training notebook.
MAX_LEN = 300

# Texts -> integer id sequences -> fixed-length matrix; both padding and
# truncation are applied at the end of each sequence.
sequences = tokenizer.texts_to_sequences(df["clean_text"])
pad_options = dict(maxlen=MAX_LEN, padding="post", truncating="post")
X_text = tf.keras.preprocessing.sequence.pad_sequences(sequences, **pad_options)

print("Text tokenized and padded successfully ")
print(f"Text tensor shape: {X_text.shape}")

# Run the padded sequences through the feature extractor in batches of 32.
X_bilstm_features = feature_extractor.predict(X_text, batch_size=32)

print("Bi-LSTM feature extraction completed ✅")
print(f"Feature vector shape: {X_bilstm_features.shape}")

# Persist the extracted features as a .npy file.
np.save("../data/processed/X_bilstm_features.npy", X_bilstm_features)

print("Bi-LSTM feature vectors saved successfully ")


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

# pandas is needed below to read the labels. Import it here so this cell
# does not rely on an earlier cell having been executed.
import pandas as pd

print("Libraries imported successfully ✅")

# Load the two feature matrices from their .npy files.
X_bilstm = np.load("../data/processed/X_bilstm_features.npy")
print("Bi-LSTM feature vectors loaded ")
print("Bi-LSTM feature shape:", X_bilstm.shape)
X_aux = np.load("../data/processed/X_aux.npy")
print("Auxiliary features loaded ")
print("Auxiliary feature shape:", X_aux.shape)

# The two files are loaded independently of each other; fail with a clear
# message if their row counts differ.
if X_bilstm.shape[0] != X_aux.shape[0]:
    raise ValueError(
        f"Row count mismatch: X_bilstm has {X_bilstm.shape[0]} rows but "
        f"X_aux has {X_aux.shape[0]}; regenerate both feature files."
    )

# Column-wise concatenation: Bi-LSTM features first, auxiliary features after.
X_final = np.concatenate([X_bilstm, X_aux], axis=1)

print("Bi-LSTM + Auxiliary features combined successfully ")
print("Final feature vector shape:", X_final.shape)

# Only the label column is needed, so read just that column from the CSV.
y = pd.read_csv(
    "../data/raw/fake_real_job_postings_3000x25.csv",
    usecols=["is_fake"],
)["is_fake"].values

# Labels and features are matched purely by row position.
if len(y) != X_final.shape[0]:
    raise ValueError(
        f"Row count mismatch: {len(y)} labels for {X_final.shape[0]} feature rows."
    )

print("Labels loaded successfully ")
print("Total labels:", len(y))

# Two-stage stratified split with a fixed random_state:
# first hold out 30% of the rows, then cut that hold-out in half to get
# the validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_final,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    random_state=42,
    stratify=y_temp,
)

print("Dataset split completed ")
for _split_name, _split_X in (
    ("Training", X_train),
    ("Validation", X_val),
    ("Testing", X_test),
):
    print(f"{_split_name} samples: {_split_X.shape[0]}")

# Seed the Python, NumPy and TensorFlow RNGs before any layer is created so
# that weight initialisation, dropout masks and batch shuffling are
# repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# Binary classifier on the combined feature vector:
# Dense(128) -> Dropout(0.4) -> Dense(64) -> Dropout(0.3) -> sigmoid output.
# The input width is declared with an explicit Input object rather than the
# deprecated `input_shape=` argument on the first Dense layer.
mlp_model = Sequential([
    tf.keras.Input(shape=(X_final.shape[1],)),
    Dense(128, activation="relu"),
    Dropout(0.4),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(1, activation="sigmoid"),
])

print("MLP model architecture created ")

mlp_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

mlp_model.summary()

# Stop once validation loss has not improved for 3 consecutive epochs and
# roll the model back to its best weights.
early_stop = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
print("MLP training started...............! ")

# Train for at most 20 epochs, validating on the validation split each epoch.
history = mlp_model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=32,
    callbacks=[early_stop],
)

print("MLP training completed successfully ")

# evaluate() returns the loss followed by the compiled metrics.
test_loss, test_accuracy = mlp_model.evaluate(X_test, y_test)

print(f"Final MLP Test Accuracy: {test_accuracy}")
print(f"Final MLP Test Loss: {test_loss}")

# Persist the trained model.
mlp_model.save("../models/mlp_model.keras")
print("MLP model saved successfully ✅")


Libraries imported successfully ✅
