In [3]:
import os
import random
import pandas as pd
import nltk
import numpy as np
import matplotlib.pyplot as plt
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [None]:
# Download Resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

In [5]:
# Load the dataset
df = pd.read_csv('fake_job_postings.csv')

In [6]:
# Process the doc text
def preprocess_text(text):
    # Tokenize
    tokens = word_tokenize(text)
    
    # Tag each token
    pos_tags = pos_tag(tokens)
    
    # Converts each token to lowercase and lemmatizes it (according to its POS tag)
    lemmatizer = WordNetLemmatizer()
    # Perform lemmatization; call get_wordnet_pos function: change to corresponding WordNet POS
    lemmatized_tokens = [lemmatizer.lemmatize(token.lower(), pos=get_wordnet_pos(tag)) for token, tag in pos_tags]
    
    # Filtering: Get rid of stopwords and non-alphabetic token
    stop_words = set(stopwords.words('english')) # Load stopwords
    filtered_tokens = [token for token in lemmatized_tokens if token.isalpha() and token not in stop_words]
    
    return ' '.join(filtered_tokens)


def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    elif tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    elif tag.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    elif tag.startswith('R'):
        return nltk.corpus.wordnet.ADV
    else:
        return nltk.corpus.wordnet.NOUN

In [7]:
# Apply text processing
df['processed_description'] = df['description'].fillna('').apply(preprocess_text)

In [8]:
print(df[['description', 'processed_description']].head())

                                         description  \
0  Food52, a fast-growing, James Beard Award-winn...   
1  Organised - Focused - Vibrant - Awesome!Do you...   
2  Our client, located in Houston, is actively se...   
3  THE COMPANY: ESRI – Environmental Systems Rese...   
4  JOB TITLE: Itemization Review ManagerLOCATION:...   

                               processed_description  
0  james beard online food community curated reci...  
1  organise focus vibrant awesome passion custome...  
2  client locate houston actively seek experience...  
3  company esri environmental system research ins...  
4  job title itemization review managerlocation f...  


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
# TF-IDF analysis

# Load raal and fake job description
real_descriptions = df[df['fraudulent'] == 0]['processed_description'].tolist()
fake_descriptions = df[df['fraudulent'] == 1]['processed_description'].tolist()

descriptions = real_descriptions + fake_descriptions
labels = [0] * len(real_descriptions) + [1] * len(fake_descriptions)

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(descriptions)

# Get feature names
feature_names = tfidf_vectorizer.get_feature_names()

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names) # Feature names as column names
tfidf_df['fraudulent'] = labels  # Add label as a new column

real_tfidf_mean = tfidf_df[tfidf_df['fraudulent'] == 0].mean(axis=0).sort_values(ascending=False)
fake_tfidf_mean = tfidf_df[tfidf_df['fraudulent'] == 1].mean(axis=0).sort_values(ascending=False)

In [18]:
print("Top words in real postings:")
print(real_tfidf_mean.head(10))

print("\nTop words in fake postings:")
print(fake_tfidf_mean.head(10))

Top words in real postings:
work          0.035460
team          0.035180
customer      0.033374
sale          0.027176
product       0.027086
client        0.026883
service       0.026470
business      0.025076
experience    0.024584
company       0.024304
dtype: float64

Top words in fake postings:
fraudulent    1.000000
work          0.048301
position      0.037100
customer      0.035093
amp           0.033624
service       0.033064
home          0.031776
skill         0.027830
time          0.026968
project       0.025612
dtype: float64


In [11]:
# Fields analysis (e.g. titile, company profile, ...)

def compare_field_statistics(df, field_name):
    real_jobs = df[df['fraudulent'] == 0][field_name].dropna()
    fake_jobs = df[df['fraudulent'] == 1][field_name].dropna()

    print(f"\nAnalysis for {field_name}:")
    print(f"Real job postings - missing values: {df[df['fraudulent'] == 0][field_name].isnull().sum()}")
    print(f"Fake job postings - missing values: {df[df['fraudulent'] == 1][field_name].isnull().sum()}")
    
    print(f"Average length of {field_name} in real jobs: {real_jobs.apply(len).mean()}")
    print(f"Average length of {field_name} in fake jobs: {fake_jobs.apply(len).mean()}")

compare_field_statistics(df, 'title')
compare_field_statistics(df, 'company_profile')
compare_field_statistics(df, 'location')
compare_field_statistics(df, 'requirements')


Analysis for title:
Real job postings - missing values: 0
Fake job postings - missing values: 0
Average length of title in real jobs: 28.42159398142706
Average length of title in fake jobs: 30.666281755196305

Analysis for company_profile:
Real job postings - missing values: 2721
Fake job postings - missing values: 587
Average length of company_profile in real jobs: 762.734625341076
Average length of company_profile in fake jobs: 716.673835125448

Analysis for location:
Real job postings - missing values: 327
Fake job postings - missing values: 19
Average length of location in real jobs: 15.636843051477198
Average length of location in fake jobs: 15.206611570247935

Analysis for requirements:
Real job postings - missing values: 2541
Fake job postings - missing values: 154
Average length of requirements in real jobs: 702.3622607614178
Average length of requirements in fake jobs: 542.5266853932584


In [15]:
# N-gram analysis: find common word pairs
def ngram_analysis(descriptions, n=2):
    vectorizer = CountVectorizer(ngram_range=(n, n), stop_words='english')
    # Row: job description; column: specific n-gram phrase; element: number of occurrences of an n-gram in the corresponding job description
    ngram_matrix = vectorizer.fit_transform(descriptions)
    
    # Sum the columns: get the total number of occurrences of each n-gram
    ngram_counts = ngram_matrix.sum(axis=0).A1
    ngram_features = vectorizer.get_feature_names()
    
    ngrams = pd.DataFrame({'ngram': ngram_features, 'count': ngram_counts})
    return ngrams.sort_values(by='count', ascending=False).head(10)

# show top 10 bigrams and trigrams
print("Top 10 bigrams in real job postings:")
real_bigrams = ngram_analysis(real_descriptions, n=2)
print(real_bigrams)

print("\nTop 10 bigrams in fake job postings:")
fake_bigrams = ngram_analysis(fake_descriptions, n=2)
print(fake_bigrams)

print("\nTop 10 trigrams in real job postings:")
real_trigrams = ngram_analysis(real_descriptions, n=3)
print(real_trigrams)

print("\nTop 10 trigrams in fake job postings:")
fake_trigrams = ngram_analysis(fake_descriptions, n=3)
print(fake_trigrams)

Top 10 bigrams in real job postings:
                      ngram  count
104134     customer service   2594
404848        social medium   1422
434273          team member   1168
230152            join team   1130
476317         work closely   1085
203036      ideal candidate   1067
197426           high level   1048
77197   communication skill    864
481530      year experience    793
122462     development team    770

Top 10 bigrams in fake job postings:
                     ngram  count
7071      customer service    166
20180              oil gas    144
7232            data entry    121
12586         gas industry    117
33623            work home    109
1128         aker solution    104
5149   communication skill     90
23728      product service     83
3590       business people     74
145              able work     72

Top 10 trigrams in real job postings:
                              ngram  count
551140                 play kid pay    715
797975             usd monthly cost    68