# IMPORTING REQUIRED LIBRARIES

In [32]:
import pandas as pd 
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder , LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

ModuleNotFoundError: No module named 'xgboost'

# IMPORTING DATA

In [2]:
df = pd.read_csv(r"C:/Users/asus/Desktop/Prachi _1/fakejob/fake_job_postings.csv" ,  encoding = "utf-8")

In [3]:
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [4]:
df.info

<bound method DataFrame.info of        job_id                                              title  \
0           1                                   Marketing Intern   
1           2          Customer Service - Cloud Video Production   
2           3            Commissioning Machinery Assistant (CMA)   
3           4                  Account Executive - Washington DC   
4           5                                Bill Review Manager   
...       ...                                                ...   
17875   17876                   Account Director - Distribution    
17876   17877                                 Payroll Accountant   
17877   17878  Project Cost Control Staff Engineer - Cost Con...   
17878   17879                                   Graphic Designer   
17879   17880                         Web Application Developers   

                   location   department salary_range  \
0          US, NY, New York    Marketing          NaN   
1            NZ, , Auckland      Succ

In [5]:
df.describe()

Unnamed: 0,job_id,telecommuting,has_company_logo,has_questions,fraudulent
count,17880.0,17880.0,17880.0,17880.0,17880.0
mean,8940.5,0.042897,0.795302,0.491723,0.048434
std,5161.655742,0.202631,0.403492,0.499945,0.214688
min,1.0,0.0,0.0,0.0,0.0
25%,4470.75,0.0,1.0,0.0,0.0
50%,8940.5,0.0,1.0,0.0,0.0
75%,13410.25,0.0,1.0,1.0,0.0
max,17880.0,1.0,1.0,1.0,1.0


In [6]:
df['fraudulent'].value_counts(normalize=True)

0    0.951566
1    0.048434
Name: fraudulent, dtype: float64

# DATA CLEANING

In [7]:
df.isnull().sum()


job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2695
benefits                7210
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [8]:
#droping unnecessary values
df.drop(columns=['job_id','department','salary_range'],inplace = True , errors = 'ignore')

In [9]:
# Fill text-based missing values
text_cols = ['company_profile', 'description', 'requirements', 'benefits', 
             'employment_type', 'required_experience', 'required_education', 
             'industry', 'function','location']

df[text_cols] = df[text_cols].fillna("Unknown")


In [10]:
df.isnull().sum()

title                  0
location               0
company_profile        0
description            0
requirements           0
benefits               0
telecommuting          0
has_company_logo       0
has_questions          0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
fraudulent             0
dtype: int64

# DATA PREPROCESSING

In [11]:
df['title'] = df['title'].str.lower()
df['company_profile'] = df['company_profile'].str.lower()
df['description'] = df['description'].str.lower()
df['requirements'] = df['requirements'].str.lower()


In [12]:
#removing special characters and numbers
def clean_text(text):
    if isinstance(text , str):#check if it is a string
        text = re.sub(r'[^a-zA-Z\s]','',text)
        text = re.sub(r'\s+',' ',text)
        return text.strip()
    return text

In [13]:
df['title'] = df['title'].apply(clean_text)
df['company_profile']=df['company_profile'].apply(clean_text)
df['requirements']=df['requirements'].apply(clean_text)



In [14]:
#removing stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    if isinstance(text, str):
        return ' '.join([word for word in text.split() if word not in stop_words])
    return text


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
df['description'] = df['description'].apply(remove_stopwords)
df['requirements'] = df['requirements'].apply(remove_stopwords)
df['company_profile'] = df['company_profile'].apply(remove_stopwords)
df['benefits'] = df['benefits'].apply(remove_stopwords)


In [16]:
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
#Lemmatization 
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    tokens = word_tokenize(text) # tokenize text into words
    lemmatized_text = " ".join([lemmatizer.lemmatize(word)for word in tokens])
    return lemmatized_text


In [18]:
df['description'] = df['description'].apply(lemmatize_text)
df['requirements'] = df['requirements'].apply(lemmatize_text)
df['company_profile'] = df['company_profile'].apply(lemmatize_text)
df['benefits'] = df['benefits'].apply(lemmatize_text)

In [20]:
df['description'] = df['description'].fillna("")
df['requirements'] = df['requirements'].fillna("")

# Vectorization
col = ['description', 'requirements']
tfidf = TfidfVectorizer(max_features=5000)
tfidf_dfs = []

for c in col:
    tfidf_matrix = tfidf.fit_transform(df[c])  # Process one column at a time
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names())  # Convert to DataFrame
    tfidf_df.columns = [f"{c}_{word}" for word in tfidf_df.columns]  # Rename columns for clarity
    tfidf_dfs.append(tfidf_df)

# Concatenate TF-IDF results
df_tfidf = pd.concat(tfidf_dfs, axis=1)

# Merge with original DataFrame (if needed)
df = pd.concat([df, df_tfidf], axis=1)


In [21]:
# Encode categorical columns
one_hot_cols = ['employment_type', 'industry', 'function']
label_encode_cols = ['required_experience', 'required_education']

# Ensure only existing columns are encoded
existing_one_hot_cols = [col for col in one_hot_cols if col in df.columns]
existing_label_encode_cols = [col for col in label_encode_cols if col in df.columns]



In [22]:
# One-Hot Encoding
if existing_one_hot_cols:
    df = pd.get_dummies(df, columns=existing_one_hot_cols)

In [23]:
# Label Encoding
le = LabelEncoder()
for col in existing_label_encode_cols:
    df[col] = le.fit_transform(df[col])

In [24]:
#  Remove remaining non-numeric columns (if any)
df = df.select_dtypes(include=[np.number])

# Check for missing values and fill them
df.fillna(df.median(), inplace=True)


In [25]:
print(df.select_dtypes(include=['uint8']).head())

   employment_type_Contract  employment_type_Full-time  employment_type_Other  \
0                         0                          0                      1   
1                         0                          1                      0   
2                         0                          0                      0   
3                         0                          1                      0   
4                         0                          1                      0   

   employment_type_Part-time  employment_type_Temporary  \
0                          0                          0   
1                          0                          0   
2                          0                          0   
3                          0                          0   
4                          0                          0   

   employment_type_Unknown  industry_Accounting  industry_Airlines/Aviation  \
0                        0                    0                           0   


# MODEL TRAINING

In [26]:
#spliting the data
X = df.drop(columns='fraudulent')
y = df['fraudulent']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2 ,random_state=42,stratify=y)

In [27]:
print(X_train.dtypes.value_counts())


float64    10000
uint8        176
int64          3
int32          2
dtype: int64


In [28]:
#Handling Nans and infinite values to resolve any errors
print(X_train.isna().sum())  # Check for missing values in each column
X_train = X_train.fillna(0)  # Replace NaNs with 0 (or use mean/median)
X_test = X_test.fillna(0)

print(np.isinf(X_train).sum())  # Count infinite values
X_train = X_train.replace([np.inf, -np.inf], np.nan)  # Convert inf to NaN
X_train = X_train.fillna(0)  # Fill NaNs with 0

X_test = X_test.replace([np.inf, -np.inf], np.nan)
X_test = X_test.fillna(0)


telecommuting                 0
has_company_logo              0
has_questions                 0
required_experience           0
required_education            0
                             ..
function_Strategy/Planning    0
function_Supply Chain         0
function_Training             0
function_Unknown              0
function_Writing/Editing      0
Length: 10181, dtype: int64
telecommuting                 0
has_company_logo              0
has_questions                 0
required_experience           0
required_education            0
                             ..
function_Strategy/Planning    0
function_Supply Chain         0
function_Training             0
function_Unknown              0
function_Writing/Editing      0
Length: 10181, dtype: int64


In [33]:
model = RandomForestClassifier(class_weight='balanced',random_state=42,n_estimators=500)

model.fit(X_train , y_train)
y_pred = model.predict(X_test)


# MODEL EVALUATION

In [34]:
#evaluation
print("Confusion Matrix:",confusion_matrix(y_test , y_pred))
print("Classification report:",classification_report(y_test,y_pred))
print("Accuracy score:",accuracy_score(y_test,y_pred))


Confusion Matrix: [[3402    1]
 [  72  101]]
Classification report:               precision    recall  f1-score   support

           0       0.98      1.00      0.99      3403
           1       0.99      0.58      0.73       173

    accuracy                           0.98      3576
   macro avg       0.98      0.79      0.86      3576
weighted avg       0.98      0.98      0.98      3576

Accuracy score: 0.979586129753915
