This notebook counts the words used in the raw fraudulent (fake) and legitimate (real) job postings, saves the word frequencies of each group to a CSV file, and plots the 15 most frequent words of each group so the two can be compared.

In [None]:
import pandas as pd
from matplotlib import pyplot as plt

# load the raw job postings into a DataFrame
df = pd.read_csv("../1_datasets/raw_fake_jobs/fake_job_postings.csv")

# split the postings on the "fraudulent" column (1 -> fake, 0 -> real);
# .copy() makes each subset an independent DataFrame rather than a slice of df
fake_jobs = df.loc[df["fraudulent"].eq(1)].copy()
real_jobs = df.loc[df["fraudulent"].eq(0)].copy()

# report the number of rows in each subset
print("Total fake jobs detected:", fake_jobs.shape[0])
print("Total real jobs detected:", real_jobs.shape[0])

In [None]:
# very frequent words (plus a few domain words such as "job" or "company")
# that carry no weight for the comparison, so they are excluded from the counts
stopwords = {
    "the",
    "and",
    "to",
    "for",
    "a",
    "in",
    "is",
    "on",
    "with",
    "company",
    "role",
    "job",
    "position",
    "bachelors",
    "you",
    "are",
    "our",
    "your",
    "am",
    "from",
    "all",
    "that",
    "have",
    "been",
    "will",
    "this",
    "their",
    "not",
    "per",
    "can",
    "who",
    "into",
}


def clean_text(text):
    """Normalize one text cell into a space-separated string of kept words.

    The text is lowercased, every character that is neither a letter nor
    whitespace is dropped, the result is split on whitespace, and words
    that are stopwords or have one or two characters are discarded.

    Args:
        text: Cell value from the dataset. May be a string or a missing
            value (None / NaN / NaT); other values are passed through str().

    Returns:
        The kept words joined by single spaces, or "" when the cell is
        missing or no word survives the filtering.
    """
    # empty CSV cells arrive as missing values; treat them as empty text
    if pd.isna(text):
        return ""

    lowered = str(text).lower()

    # keep letters and *every* whitespace character, not only " ":
    # deleting newlines or tabs would glue the neighbouring words together
    letters_only = "".join(
        char for char in lowered if char.isalpha() or char.isspace()
    )

    # split() breaks on any run of whitespace; drop short words and stopwords
    words = [
        word
        for word in letters_only.split()
        if len(word) > 2 and word not in stopwords
    ]

    # a single string per cell; callers split it again when counting
    return " ".join(words)


# collect every cleaned word from the free-text columns, one list per class
all_fake_words = []
all_real_words = []

for col in ("company_profile", "description", "requirements", "benefits"):
    # clean each cell of the column; dropna() discards any missing results
    fake_text = fake_jobs[col].apply(clean_text).dropna()
    real_text = real_jobs[col].apply(clean_text).dropna()

    # each cleaned cell is a space-separated string: flatten to single words
    all_fake_words += [word for cell in fake_text for word in cell.split()]
    all_real_words += [word for cell in real_text for word in cell.split()]

# occurrences of each word in the fake posts and in the real posts
fake_words_count = pd.Series(all_fake_words).value_counts()
real_words_count = pd.Series(all_real_words).value_counts()

# turn each count Series into a two-column table: word, count
fake_freq_df = fake_words_count.reset_index()
fake_freq_df.columns = ["word", "count"]

real_freq_df = real_words_count.reset_index()
real_freq_df.columns = ["word", "count"]

# write the fake-post table to CSV (row index left out)
fake_file_path = (
    "../3_data_exploration/cleaned_most_frequent_words/fake_words_count.csv"  # noqa: E501
)
fake_freq_df.to_csv(fake_file_path, index=False)

# write the real-post table to CSV (row index left out)
real_file_path = (
    "../3_data_exploration/cleaned_most_frequent_words/real_words_count.csv"  # noqa: E501
)
real_freq_df.to_csv(real_file_path, index=False)

In [None]:
# reading the file of fake words count.
# keep_default_na=False: the "word" column holds literal words, so strings
# such as "null" or "nan" must not be parsed as missing values
df = pd.read_csv(
    "../3_data_exploration/cleaned_most_frequent_words/fake_words_count.csv",
    keep_default_na=False,
)

# first 15 rows of the file (the variable holds 15 rows despite its name)
top10 = df.head(15)

# plot a horizontal bar chart: one bar per word, bar length = count
plt.figure(figsize=(8, 4))
plt.barh(top10["word"], top10["count"], color="skyblue")
plt.title("Top 15 Most Frequent Words in Fraudulent Posts")
plt.tight_layout()

# save diagram in a png file (before show(), while the figure is current)
fake_dia_file = (
    "../3_data_exploration/cleaned_most_frequent_words/fake_visual_words_count.png"  # noqa: E501
)

plt.savefig(fake_dia_file)

plt.show()

In [None]:
# reading the file of real words count.
# keep_default_na=False: the "word" column holds literal words, so strings
# such as "null" or "nan" must not be parsed as missing values
df = pd.read_csv(
    "../3_data_exploration/cleaned_most_frequent_words/real_words_count.csv",
    keep_default_na=False,
)

# first 15 rows of the file (the variable holds 15 rows despite its name)
top10 = df.head(15)

# plot a horizontal bar chart: one bar per word, bar length = count
plt.figure(figsize=(8, 4))
plt.barh(top10["word"], top10["count"], color="skyblue")
plt.title("Top 15 Most Frequent Words in Real Posts")
plt.tight_layout()

# save diagram in a png file (before show(), while the figure is current)
real_dia_file = (
    "../3_data_exploration/cleaned_most_frequent_words/real_visual_words_count.png"  # noqa: E501
)

plt.savefig(real_dia_file)

plt.show()