<a href="https://colab.research.google.com/github/Harikageddapu/FakeJobPrediction/blob/main/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
import os
os.listdir()

['.config',
 'app.py',
 'PassiveAggressive_model.pkl',
 'KNN_model.pkl',
 'DataSet.csv',
 'tfidf.pkl',
 'MLP_model.pkl',
 'eda.py',
 'GradientBoosting_model.pkl',
 'sample_data']

In [36]:
!ls /content

app.py	     GradientBoosting_model.pkl  PassiveAggressive_model.pkl
DataSet.csv  KNN_model.pkl		 sample_data
eda.py	     MLP_model.pkl		 tfidf.pkl


In [37]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score
import pickle


In [38]:
df = pd.read_csv("/content/DataSet.csv")
df.columns = df.columns.str.strip()

print(df.shape)
df.head()


(17880, 18)


Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,in_balanced_dataset
0,Marketing Intern,"US, NY, New York",Marketing,,"<h3>We're Food52, and we've created a groundbr...","<p>Food52, a fast-growing, James Beard Award-w...",<ul>\r\n<li>Experience with content management...,,f,t,f,Other,Internship,,,Marketing,f,f
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"<h3>90 Seconds, the worlds Cloud Video Product...",<p>Organised - Focused - Vibrant - Awesome!<br...,<p><b>What we expect from you:</b></p>\r\n<p>Y...,<h3><b>What you will get from us</b></h3>\r\n<...,f,t,f,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,f,f
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,<h3></h3>\r\n<p>Valor Services provides Workfo...,"<p>Our client, located in Houston, is actively...",<ul>\r\n<li>Implement pre-commissioning and co...,,f,t,f,,,,,,f,f
3,Account Executive - Washington DC,"US, DC, Washington",Sales,,<p>Our passion for improving quality of life t...,<p><b>THE COMPANY: ESRI – Environmental System...,<ul>\r\n<li>\r\n<b>EDUCATION: </b>Bachelor’s o...,<p>Our culture is anything but corporate—we ha...,f,t,f,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,f,f
4,Bill Review Manager,"US, FL, Fort Worth",,,<p>SpotSource Solutions LLC is a Global Human ...,<p><b>JOB TITLE:</b> Itemization Review Manage...,<p><b>QUALIFICATIONS:</b></p>\r\n<ul>\r\n<li>R...,<p>Full Benefits Offered</p>,f,t,t,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,f,f


In [39]:
text_cols = ['title', 'company_profile', 'description', 'requirements', 'benefits']

for col in text_cols:
    df[col] = df[col].fillna('')

df['text'] = (
    df['title'] + ' ' +
    df['company_profile'] + ' ' +
    df['description'] + ' ' +
    df['requirements'] + ' ' +
    df['benefits']
)

X = df['text']
y = df['fraudulent']


In [40]:
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000
)

X_vectorized = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.2, random_state=42
)


In [42]:
models = {
    "MLP": MLPClassifier(hidden_layer_sizes=(100,), max_iter=300),
    "PassiveAggressive": PassiveAggressiveClassifier(),
    "GradientBoosting": GradientBoostingClassifier(),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

trained_models = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f"{name} Accuracy: {acc:.4f}")
    trained_models[name] = model


MLP Accuracy: 0.9846
PassiveAggressive Accuracy: 0.9846
GradientBoosting Accuracy: 0.9771
KNN Accuracy: 0.9765


In [43]:
# Save vectorizer
pickle.dump(vectorizer, open("tfidf.pkl", "wb"))

# Save all models
for name, model in trained_models.items():
    pickle.dump(model, open(f"{name}_model.pkl", "wb"))

print("All models saved successfully!")


All models saved successfully!


In [32]:
%%writefile app.py
import streamlit as st
import pickle

# Load models
lr_model = pickle.load(open("fake_job_model.pkl", "rb"))
rf_model = pickle.load(open("random_forest.pkl", "rb"))
gb_model = pickle.load(open("gradient_boost.pkl", "rb"))
knn_model = pickle.load(open("knn_model.pkl", "rb"))
vectorizer = pickle.load(open("tfidf.pkl", "rb"))

st.title("Fake Job Prediction")

text = st.text_area("Enter job description:")

model_choice = st.selectbox(
    "Choose ML Model",
    ["Logistic Regression", "Random Forest", "Gradient Boosting", "KNN"]
)

if st.button("Predict"):
    text_vec = vectorizer.transform([text])
    if model_choice == "Logistic Regression":
        pred = lr_model.predict(text_vec)
    elif model_choice == "Random Forest":
        pred = rf_model.predict(text_vec)
    elif model_choice == "Gradient Boosting":
        pred = gb_model.predict(text_vec)
    else:
        pred = knn_model.predict(text_vec)

    if pred[0] == 1:
        st.error("⚠️ Fake Job")
    else:
        st.success("✅ Real Job")


Writing app.py
