In [3]:
import pandas as pd

df = pd.read_csv("fake_job_postings.csv")
df.head()


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [4]:
df.shape        # rows, columns
df.columns      # column names
df.info()       # data types + missing values


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15184 non-null  object
 8   benefits             10668 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  object
 16  func

In [5]:
df['fraudulent'].value_counts()


fraudulent
0    17014
1      866
Name: count, dtype: int64

In [6]:
df['fraudulent'].unique()


array([0, 1])

In [7]:
# Fill missing values
df = df.fillna("")

# Combine important text columns
df['text'] = (
    df['title'] + " " +
    df['company_profile'] + " " +
    df['description'] + " " +
    df['requirements'] + " " +
    df['benefits']
)

df[['text', 'fraudulent']].head()


Unnamed: 0,text,fraudulent
0,"Marketing Intern We're Food52, and we've creat...",0
1,Customer Service - Cloud Video Production 90 S...,0
2,Commissioning Machinery Assistant (CMA) Valor ...,0
3,Account Executive - Washington DC Our passion ...,0
4,Bill Review Manager SpotSource Solutions LLC i...,0


In [9]:
import re

def clean_text(text):
    text = text.lower()                      # lowercase
    text = re.sub(r'http\S+', ' ', text)     # remove URLs
    text = re.sub(r'[^a-zA-Z ]', ' ', text)  # remove numbers & symbols
    text = re.sub(r'\s+', ' ', text)         # remove extra spaces
    return text.strip()


In [10]:
df['clean_text'] = df['text'].apply(clean_text)

df[['clean_text', 'fraudulent']].head()


Unnamed: 0,clean_text,fraudulent
0,marketing intern we re food and we ve created ...,0
1,customer service cloud video production second...,0
2,commissioning machinery assistant cma valor se...,0
3,account executive washington dc our passion fo...,0
4,bill review manager spotsource solutions llc i...,0


In [11]:
df['clean_text'].str.len().describe()


count    17880.000000
mean      2562.341443
std       1411.624985
min         14.000000
25%       1532.000000
50%       2430.000000
75%       3347.000000
max      14443.000000
Name: clean_text, dtype: float64

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english'
)

X = tfidf.fit_transform(df['clean_text'])
y = df['fraudulent']


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [14]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced'  # important for imbalanced data
)

model.fit(X_train, y_train)


In [15]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]


In [16]:
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[3321   82]
 [  18  155]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      3403
           1       0.65      0.90      0.76       173

    accuracy                           0.97      3576
   macro avg       0.82      0.94      0.87      3576
weighted avg       0.98      0.97      0.97      3576



In [17]:
model.score(X_test, y_test)


0.9720357941834452

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)



In [19]:
results = pd.DataFrame({
    'clean_text': df.loc[y_test.index, 'clean_text'],
    'actual_label': y_test.values,
    'predicted_label': y_pred,
    'fake_probability': y_prob
})


In [20]:
results['actual_status'] = results['actual_label'].map({0: 'Real', 1: 'Fake'})
results['predicted_status'] = results['predicted_label'].map({0: 'Real', 1: 'Fake'})


In [21]:
results.head()


Unnamed: 0,clean_text,actual_label,predicted_label,fake_probability,actual_status,predicted_status
16995,excellent er rn opportunity available now our ...,0,1,0.582291,Real,Fake
9357,scrum master website development project manag...,0,0,0.037536,Real,Real
11561,hr assistant contract squiz is one of the worl...,0,0,0.059402,Real,Real
1105,regional sales director south africa upstream ...,0,0,0.036142,Real,Real
1980,petrophysicist valor services provides workfor...,0,0,0.067553,Real,Real


In [22]:
results['actual_status'].value_counts()
results['predicted_status'].value_counts()


predicted_status
Real    3339
Fake     237
Name: count, dtype: int64

In [23]:
results.sort_values('fake_probability', ascending=False).head(5)


Unnamed: 0,clean_text,actual_label,predicted_label,fake_probability,actual_status,predicted_status
4832,senior engineering product manager aptitude st...,1,1,0.998049,Fake,Fake
1821,principal senior mechanical engineer package e...,1,1,0.994382,Fake,Fake
5505,home based payroll data entry clerk position e...,1,1,0.993066,Fake,Fake
5506,home based payroll data entry clerk position e...,1,1,0.993066,Fake,Fake
2396,senior qa engineer aptitude staffing solutions...,1,1,0.988941,Fake,Fake


In [24]:
results.to_csv("fake_job_predictions.csv", index=False)


In [25]:
import sqlite3
import pandas as pd

# load csv
df = pd.read_csv("fake_job_predictions.csv")

# create database
conn = sqlite3.connect("fake_jobs.db")

# write to sql table
df.to_sql("job_predictions", conn, if_exists="replace", index=False)

conn.close()


In [26]:
conn = sqlite3.connect("fake_jobs.db")
pd.read_sql("SELECT * FROM job_predictions LIMIT 5", conn)


Unnamed: 0,clean_text,actual_label,predicted_label,fake_probability,actual_status,predicted_status
0,excellent er rn opportunity available now our ...,0,1,0.582291,Real,Fake
1,scrum master website development project manag...,0,0,0.037536,Real,Real
2,hr assistant contract squiz is one of the worl...,0,0,0.059402,Real,Real
3,regional sales director south africa upstream ...,0,0,0.036142,Real,Real
4,petrophysicist valor services provides workfor...,0,0,0.067553,Real,Real
