In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import nltk
nltk.download('vader_lexicon')
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score,roc_auc_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textstat import flesch_reading_ease
import numpy as np
from imblearn.over_sampling import SMOTE
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
from nltk.corpus import stopwords
from scipy import stats
nltk.download('punkt')
stemmer = PorterStemmer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load and preprocess data
df = pd.read_csv('C:/Users/lenovo/Downloads/fake_job_postings.csv')

In [31]:
df

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,...,required_education,industry,function,fraudulent,company_profile_word_count,job_requirements_word_count,description_word_count,requirements_word_count,description_sentiment,description_complexity
0,1,Marketing Intern,"US, NY, New York",Marketing,,food52 creat groundbreak cook site support con...,"Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,...,,,Marketing,0,141,115,124,115,0.6486,-16.84
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,90 second world cloud video product second wor...,Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,...,,Marketing and Advertising,Customer Service,0,153,200,315,200,0.9951,55.74
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,valor servic provid workforc solut meet need c...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,...,,,,0,141,164,50,164,0.9509,20.68
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,passion improv qualiti life geographi heart ev...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,...,Bachelor's Degree,Computer Software,Sales,0,85,176,346,176,0.9957,-6.12
4,5,Bill Review Manager,"US, FL, Fort Worth",,,spotsourc solut llc global human capit manag c...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,...,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,207,89,168,89,0.9426,-44.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17876,Account Director - Distribution,"CA, ON, Toronto",Sales,,vend look awesom new talent come join us work ...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,...,,Computer Software,Sales,0,290,180,226,180,0.9920,51.41
17876,17877,Payroll Accountant,"US, PA, Philadelphia",Accounting,,weblinc platform servic provid fastest grow on...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,0,...,Bachelor's Degree,Internet,Accounting/Auditing,0,330,111,161,111,0.9652,4.95
17877,17878,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",,,provid full time perman posit mani medium larg...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,0,...,,,,0,32,159,171,159,0.9750,8.91
17878,17879,Graphic Designer,"NG, LA, Lagos",,,,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,...,Professional,Graphic Design,Design,0,1,86,77,86,0.9278,47.38


In [3]:
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
sia = SentimentIntensityAnalyzer()

In [4]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [ps.stem(word) for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(tokens)

In [None]:
# Apply preprocessing
df['company_profile'] = df['company_profile'].fillna('').apply(preprocess_text)
df['job_requirements'] = df['requirements'].fillna('').apply(preprocess_text)
df['description'] = df['description'].fillna('').apply(preprocess_text)

In [None]:
df['company_profile_word_count'] = df['company_profile'].apply(lambda x: len(word_tokenize(x)))
df['job_requirements_word_count'] = df['job_requirements'].apply(lambda x: len(word_tokenize(x)))
df['description_sentiment'] = df['description'].apply(lambda x: sia.polarity_scores(x)['compound'])
df['job_requirements_sentiment'] = df['job_requirements'].apply(lambda x: sia.polarity_scores(x)['compound'])


In [None]:
features = df[['company_profile_word_count', 'job_requirements_word_count', 
                 'description_sentiment', 'job_requirements_sentiment']]
labels = df['fraudulent'] 

In [None]:
# TF-IDF for Text Columns
tfidf = TfidfVectorizer(max_features=200)
tfidf_features = tfidf.fit_transform(df['description']).toarray()
tfidf_df = pd.DataFrame(tfidf_features, columns=[f'tfidf_{i}' for i in range(tfidf_features.shape[1])])
features = pd.concat([features.reset_index(drop=True), tfidf_df], axis=1)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Oversample with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Define model and parameters
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_params = {'n_estimators': [100, 200], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5]}


from sklearn.model_selection import RandomizedSearchCV
rf_grid = RandomizedSearchCV(rf, rf_params, cv=3, scoring='f1', n_iter=5)
# Grid search on Random Forest
#rf_grid = GridSearchCV(rf, rf_params, cv=5, scoring='f1')
rf_grid.fit(X_resampled, y_resampled)

# Best model and evaluation
best_rf = rf_grid.best_estimator_
print("Random Forest Best Parameters:", rf_grid.best_params_)

# Evaluate on the test set
y_pred_rf = best_rf.predict(X_test)
y_pred_proba_rf = best_rf.predict_proba(X_test)[:, 1]  # for ROC-AUC

Random Forest Best Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'max_depth': None}


In [None]:
# Final Model Selection based on Validation Performance
final_rf = rf_grid.best_estimator_
#final_lr = lr_grid.best_estimator_

for model in [final_rf]:
    y_pred = model.predict(X_test)
    print(f"Model: {model}")
    print(classification_report(y_test, y_pred))
    print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba_rf))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    #y_pred_rf = rf.predict(X_test)



Model: RandomForestClassifier(class_weight='balanced', random_state=42)
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3395
           1       0.91      0.56      0.69       181

    accuracy                           0.97      3576
   macro avg       0.94      0.78      0.84      3576
weighted avg       0.97      0.97      0.97      3576

ROC-AUC Score: 0.9483323704830795
Accuracy: 0.9748322147651006


In [None]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

In [18]:
import os
import pickle
os.makedirs('E:/Assignments/0.1_web_dev/model', exist_ok=True)
with open('E:/Assignments/0.1_web_dev/model/model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Also save TF-IDF Vectorizer if you’ll need it in the app
with open('E:/Assignments/0.1_web_dev/model/tfidf.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)