In [40]:
from google.colab import files
uploaded = files.upload()



MessageError: Error: credential propagation was unsuccessful

In [6]:
# Step 1: Load & Explore the Dataset
import pandas as pd
df = pd.read_csv('fake_job_postings.csv')
df.head

# basic info
df.shape
df.info()
df['fraudulent'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15184 non-null  object
 8   benefits             10668 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  object
 16  func

Unnamed: 0_level_0,count
fraudulent,Unnamed: 1_level_1
0,17014
1,866


In [7]:
# Step 2: Clean the Dataset

# Selecting only important columns
df = df[['title' , 'location' , 'department' , 'company_profile' , 'description' , 'requirements' , 'benefits' , 'fraudulent']]

# Drop rows with the missing job description (required for the classification)
df.dropna(subset = ['description'] , inplace = True)

# Combine multiple text fields into one
df['text'] = df[['title', 'company_profile', 'description', 'requirements', 'benefits']].fillna('').agg(' '.join , axis=1)

# Check again
df[['text' , 'fraudulent']].head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset = ['description'] , inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df[['title', 'company_profile', 'description', 'requirements', 'benefits']].fillna('').agg(' '.join , axis=1)


Unnamed: 0,text,fraudulent
0,"Marketing Intern We're Food52, and we've creat...",0
1,Customer Service - Cloud Video Production 90 S...,0
2,Commissioning Machinery Assistant (CMA) Valor ...,0
3,Account Executive - Washington DC Our passion ...,0
4,Bill Review Manager SpotSource Solutions LLC i...,0


In [19]:
# Step 3: Text Preprocessing
# We'll clean and preprocess the text (lowercase , remove punctuation , lemmatize)

import nltk # Natural Language Toolkit - a library for NLP in Python
import re # Python regular expression module - used for pattern-based text cleaning
from nltk.corpus import stopwords # set of common , unimportant words (like "the" , "is" , "and") that we'll remove later

nltk.download('stopwords') # list of words to remvove from the text
nltk.download('punkt') # Pre-trained Tokenizer (used to split text into words)
nltk.download('wordnet') # A database used for lemmatization (finding root words)
# nltk.download('punkt_tab')

stop_words = set(stopwords.words('english')) # Loads English Stopwords like "and" , "the" , "a" into python set we'll filter out these words cuz they don't help in ML models
lemmatizer = nltk.WordNetLemmatizer() # creates lemmatizer object , Lemmatization reduces words to their base/root from ("running" - "run")


# main function
def clean_text(text):
  text = text.lower()  # converts the entire text to lowercase i.e Job = job
  text = re.sub(r'[^a-zA-Z]' , ' ' , text) # removes all non-alphabetic characters and replaces everything else (numbers , punctuation , symbols) with space E.g. "Salary is $3000/month" -> "Salary is month"
  tokens = nltk.word_tokenize(text) # tokenize the snetence into individual words (token) E.g. "this is a job" -> ["this" , "is" , "a" , "job"]
  tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words] # for each token if it's not in the stopword list, then apply lemmatization to get the root form E.g. ["working" , "on" , "data" , "analysis"] -> ["work" , "data" , "analysis"]
  return ' '.join(tokens) # joins all processed words back into a cleaned string E.g. ["work" , "data" , "analysis"] -> "work data analysis"

# Applies clean_text() function to every row in the 'text' column and stores the result in a new column called 'cleaned_text'.
df['cleaned_text'] = df['text'].apply(clean_text)
# Shows the first 5 rows of processed text alongside fraudulent label (0 or 1)
df[['cleaned_text' , 'fraudulent']].head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,cleaned_text,fraudulent
0,marketing intern food created groundbreaking a...,0
1,customer service cloud video production second...,0
2,commissioning machinery assistant cma valor se...,0
3,account executive washington dc passion improv...,0
4,bill review manager spotsource solution llc gl...,0


In [36]:
#  Step 4: Feature Extraction with TF-IDF
# This block is about converting cleaned text into numerical features using TF-IDF (Term Frequency–Inverse Document Frequency) — a key step in making text usable for machine learning.

# TF_IDF converts text into numbers based on:
# 1.Term Frequency (TF) : How often a word appears in a document
# 2.Inverse Document Frequency (IDF) : How unique/important that word is across all documents

# So, common but less useful words like "the" or "job" are weighted lower, while rarer but important words like "fraud" or "internship" are weighted higher.

# tool that converts cleaned text into a matrix of TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer

# creates a TF-IDF vectorizer object , and limits it to use only the top 5000 words (most frequent and important ones) , max_features = 5000 keeps performance
vectorizer = TfidfVectorizer(max_features=5000)

#fit and transform the cleaned text into TF-IDF features
# fit_transform does two things
# 1.Fits the vectorize on your text corpus - learns the vocabulary and IDF weights
# 2. Transforms each text document into a vector of TF-IDF features
# Output: X becomes a sparse matrix where:
# ~Rows = job posts
# ~Columns = words(features)
# ~Values = TF-IDF scores for each word in each job post
X = vectorizer.fit_transform(df['cleaned_text'])

#  Sets the target label (0 = real, 1 = fake) from the fraudulent column.
# X: Features (TF-IDF vectors)
# y: Labels (0 or 1)
y = df['fraudulent']

# print("TF-IDF matrix shape:", X.shape)

# # Get the top 5000 words (feature names) used by TF-IDF
# feature_names = vectorizer.get_feature_names_out()

# # View the first 20 words
# print("First 20 words:", feature_names[:20])

# # Choose the first row (job posting)
# row = X[0]

# # Convert to a dense array
# row_dense = row.toarray().flatten()

# # Get indices where TF-IDF > 0
# nonzero_indices = row_dense.nonzero()[0]

# # Display words with their TF-IDF scores
# for idx in nonzero_indices:
#     print(f"{feature_names[idx]}: {row_dense[idx]:.4f}")







Accuarcy: 0.9670022371364653

 Confusion Matrix:
 [[3302   92]
 [  26  156]]

 Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98      3394
           1       0.63      0.86      0.73       182

    accuracy                           0.97      3576
   macro avg       0.81      0.92      0.85      3576
weighted avg       0.97      0.97      0.97      3576



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [31]:
# Step 5: Split Data into Train and Test Sets
from sklearn.model_selection import train_test_split

# 80% training and 20% testing
X_train , X_test , y_train , y_test = train_test_split(X, y , test_size = 0.2 , random_state=42)   # random_state --> 	This ensures you get the same split every time (useful for reproducibility)

In [32]:
# Step 6: Train a Machine Learning Model(Logistic Regression)
# using logistic regression cuz its simple and works well on for binary classification (fake vs real job)

from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000 , class_weight='balanced') # maximum number of iterations allowed for the model to find the best weights , by default its 100 , but text data with many features (like after TF-IDF) can require more

model.fit(X_train , y_train) # model learns which words (features) are associated with real or fake jobs , finds the best weights (coefficients) for the logistic regression

In [33]:
# Step 7: Evaluate the model
from sklearn.metrics import accuracy_score , classification_report , confusion_matrix

y_pred = model.predict(X_test)

print("Accuarcy:" , accuracy_score(y_test , y_pred))
print("\n Confusion Matrix:\n" , confusion_matrix(y_test , y_pred))
print("\n Classification Report:\n" , classification_report(y_test , y_pred))

Accuarcy: 0.9670022371364653

 Confusion Matrix:
 [[3302   92]
 [  26  156]]

 Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98      3394
           1       0.63      0.86      0.73       182

    accuracy                           0.97      3576
   macro avg       0.81      0.92      0.85      3576
weighted avg       0.97      0.97      0.97      3576



In [34]:
# Save the model
import pickle

# Save the model to a file
with open('model.pkl' , 'wb') as f:
  pickle.dump(model , f)

# Save the vetorizer
with open('vectorizer.pkl' , 'wb') as f:
  pickle.dump(vectorizer , f)


In [35]:
# download both files
from google.colab import files
files.download('model.pkl')
files.download('vectorizer.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>