The main purpose of this notebook is to count the words in the raw fake and raw real job postings, as well as in the LLM-refined fake postings, in order to document and visualize the 15 most frequently used words in each set for comparison.

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Load the raw postings; the "fraudulent" column separates the two classes
# (1 is treated as fake, 0 as real).
df = pd.read_csv("../1_datasets/raw_fake_jobs/fake_job_postings.csv")

# Build one boolean mask per class, then take independent copies so that
# later changes to either subset never write back into df.
is_fraudulent = df["fraudulent"].eq(1)
is_legitimate = df["fraudulent"].eq(0)
fake_jobs = df.loc[is_fraudulent].copy()
real_jobs = df.loc[is_legitimate].copy()

# Report the size of each class
print("Total fake jobs detected:", fake_jobs.shape[0])
print("Total real jobs detected:", real_jobs.shape[0])

In [None]:
# Words that occur very often in job postings but carry no
# distinguishing weight, so they are excluded from the counts.
stopwords = {
    "the",
    "and",
    "for",
    "with",
    "working",
    "people",
    "work",
    "experience",
    "company",
    "skills",
    "role",
    "job",
    "position",
    "bachelors",
    "you",
    "are",
    "our",
    "amp",
    "your",
    "within",
    "new",
    "from",
    "all",
    "other",
    "ability",
    "more",
    "that",
    "including",
    "time",
    "years",
    "have",
    "looking",
    "been",
    "will",
    "this",
    "their",
    "not",
    "per",
    "can",
    "who",
    "into",
}


def clean_text(text):
    """Normalize one text cell into a space-separated string of kept words.

    The text is lower-cased, every character that is neither a letter nor
    whitespace is deleted, and the result is split into words. Words found
    in ``stopwords`` and words of two characters or fewer are discarded.

    Args:
        text: The cell value. Missing values (NaN, None, NaT), e.g. from
            empty CSV cells, are accepted.

    Returns:
        The remaining words joined by single spaces, or ``""`` when the
        input is missing or no word survives the filtering.
    """
    # Empty CSV cells arrive as NaN/None/NaT
    if pd.isna(text):
        return ""

    lowered = str(text).lower()

    # Keep letters and *all* whitespace. Newlines, tabs and non-breaking
    # spaces separate words just like " " does; deleting them would fuse the
    # neighbouring words into a single bogus token. Punctuation is still
    # deleted (not replaced), so "bachelor's" becomes "bachelors".
    letters_and_spaces = "".join(
        char for char in lowered if char.isalpha() or char.isspace()
    )

    # str.split() with no argument splits on any run of whitespace
    words = [
        word
        for word in letters_and_spaces.split()
        if len(word) > 2 and word not in stopwords
    ]

    # A single string per cell is cheaper to hold in a Series than a list
    return " ".join(words)


# Free-text columns whose words are counted
text_columns = ["company_profile", "description", "requirements", "benefits"]


def _collect_words(posts, columns):
    """Return every cleaned word from ``columns`` of ``posts``, column by column."""
    collected = []
    for column in columns:
        cleaned_rows = posts[column].apply(clean_text).dropna()
        for cleaned_row in cleaned_rows:
            collected.extend(cleaned_row.split())
    return collected


# Gather the words of each class across all text columns
all_words_in_fake = _collect_words(fake_jobs, text_columns)
all_words_in_real = _collect_words(real_jobs, text_columns)

# Frequency of each word, most frequent first
fake_words_count = pd.Series(all_words_in_fake).value_counts()
real_words_count = pd.Series(all_words_in_real).value_counts()

# Two-column ("word", "count") tables for fake and real posts
fake_freq_df = fake_words_count.rename_axis("word").reset_index(name="count")
real_freq_df = real_words_count.rename_axis("word").reset_index(name="count")

# Persist both tables as CSV files
fake_file_path = (
    "../4_data_analysis/cleaned_most_frequent_words/fake_words_count.csv"
)
real_file_path = (
    "../4_data_analysis/cleaned_most_frequent_words/real_words_count.csv"
)

fake_freq_df.to_csv(fake_file_path, index=False)
real_freq_df.to_csv(real_file_path, index=False)

In [None]:
# Reload the saved word counts for fraudulent posts
fake_df = pd.read_csv(
    "../4_data_analysis/cleaned_most_frequent_words/fake_words_count.csv"
)

# The first 15 rows of the file are taken as the top 15 words
fake_top15 = fake_df.iloc[:15]

# Horizontal bar chart, one bar per word
fake_fig, fake_ax = plt.subplots(figsize=(8, 4))
fake_ax.barh(fake_top15["word"], fake_top15["count"], color="skyblue")
fake_ax.set_title("Top 15 Most Frequent Words in Fraudulent Posts")
fake_fig.tight_layout()

# Write the chart to a PNG before displaying it
fake_dia_file = (
    "../4_data_analysis/cleaned_most_frequent_words/fake_visual_words_count.png"
)
fake_fig.savefig(fake_dia_file)

plt.show()

In [None]:
# Reload the saved word counts for real posts
real_df = pd.read_csv(
    "../4_data_analysis/cleaned_most_frequent_words/real_words_count.csv"
)

# The first 15 rows of the file are taken as the top 15 words
real_top15 = real_df.iloc[:15]

# Horizontal bar chart, one bar per word
real_fig, real_ax = plt.subplots(figsize=(8, 4))
real_ax.barh(real_top15["word"], real_top15["count"], color="skyblue")
real_ax.set_title("Top 15 Most Frequent Words in Real Posts")
real_fig.tight_layout()

# Write the chart to a PNG before displaying it
real_dia_file = (
    "../4_data_analysis/cleaned_most_frequent_words/real_visual_words_count.png"
)
real_fig.savefig(real_dia_file)

plt.show()

In [61]:
# Count word frequencies in the LLM-refined job descriptions.
# NOTE(review): assumes the "description" field of this JSON-lines file holds
# the refined text - confirm against the file's schema.
llm_df = pd.read_json(
    "../1_datasets/processed_fake_jobs/original_vs_refined_fakeJobs_descriptions.json",  # noqa: E501
    lines=True,
)

# Clean the whole column exactly once. Doing this inside a per-row loop
# would re-count every description once per row, multiplying each word
# count by len(llm_df).
AI_text = llm_df["description"].apply(clean_text)

all_words_in_refined_llm = []
for cleaned_description in AI_text:
    all_words_in_refined_llm.extend(cleaned_description.split())

# check frequency of words, most frequent first
refined_llm_words_count = pd.Series(all_words_in_refined_llm).value_counts()

# convert it to a DataFrame of two columns
refined_llm_freq_df = pd.DataFrame(refined_llm_words_count).reset_index()
refined_llm_freq_df.columns = ["word", "count"]

# saving the file
refined_llm_file_path = (
    "../4_data_analysis/cleaned_most_frequent_words/LLM_refined_words_count.csv"
)

refined_llm_freq_df.to_csv(refined_llm_file_path, index=False)


In [None]:
# visualizing the top 15 words used by LLM refined posts
refined_llm_df = pd.read_csv(
    "../4_data_analysis/cleaned_most_frequent_words/LLM_refined_words_count.csv"
)

# The first 15 rows of the file are taken as the top 15 words
refined_llm_top15 = refined_llm_df.head(15)

# Horizontal bar chart, one bar per word
plt.figure(figsize=(8, 4))
plt.barh(
    refined_llm_top15["word"],
    refined_llm_top15["count"],
    color="skyblue",
)
plt.title("Top 15 Most Frequent Words in Fake Refined LLM Posts")
# tight_layout must be *called*; a bare attribute reference does nothing
plt.tight_layout()

# save diagram in a png file (before show, while the figure still exists)
refined_llm_dia_file = (
    "../4_data_analysis/cleaned_most_frequent_words/LLM_visual_words_count.png"
)

plt.savefig(refined_llm_dia_file)

plt.show()


In [None]:
# visualize all 3 sections together
fake_top10 = fake_top15.head(10).rename(columns={"count": "fake"})
real_top10 = real_top15.head(10).rename(columns={"count": "real"})
refined_llm_top10 = refined_llm_top15.head(10).rename(
    columns={"count": "LLM refined"}
)

# union of the top-10 words of the three sections, without duplicates
all_top_words = pd.concat(
    [fake_top10["word"], real_top10["word"], refined_llm_top10["word"]]
).drop_duplicates()

# look the counts up in the full frequency tables, so a word that is in the
# top 10 of only one section still gets its real count in the other two
top_fake = fake_freq_df[fake_freq_df["word"].isin(all_top_words)]
top_real = real_freq_df[real_freq_df["word"].isin(all_top_words)]
top_llm_refined = refined_llm_freq_df[
    refined_llm_freq_df["word"].isin(all_top_words)
]

# Name each count column *before* merging so the final column names do not
# depend on the order of the suffixes that merge() generates.
combined_words = (
    top_fake.rename(columns={"count": "Fake"})
    .merge(top_real.rename(columns={"count": "Real"}), on="word", how="outer")
    .merge(
        top_llm_refined.rename(columns={"count": "LLM-Refined"}),
        on="word",
        how="outer",
    )
    .rename(columns={"word": "Word"})
    # a word missing from a section has a count of 0 there
    .fillna(0)
)

print("Shape of the data:", combined_words.shape)
# a bare expression in the middle of a cell displays nothing, so print it
print(combined_words.head(23))

# creating a numeric position for each word in the y-axis
y = np.arange(len(combined_words))
height = 0.30

plt.figure(figsize=(10, 8))

# three bars per word, offset by one bar height around the tick position
plt.barh(y + height, combined_words["Fake"], height, label="Fake")
plt.barh(y, combined_words["Real"], height, label="Real")
plt.barh(y - height, combined_words["LLM-Refined"], height, label="LLM-Refined")

plt.yticks(y, combined_words["Word"])
plt.legend()
plt.tight_layout()

# saving the file first: once show() has displayed the figure, a later
# savefig() can write an empty image
all_sections_dia_file = (
    "../4_data_analysis/cleaned_most_frequent_words/all_sections_words_count.png"
)
plt.savefig(all_sections_dia_file)

plt.show()
