In [None]:
import pandas as pd


# Load the raw job-postings dataset from disk.
df = pd.read_csv("../1_datasets/raw_fake_jobs/fake_job_postings.csv")

# Keep only the rows whose 'fraudulent' value equals 1, as an
# independent copy of that slice of df.
fake_jobs = df.loc[df['fraudulent'].eq(1)].copy()

print("Total fake jobs detected:", fake_jobs.shape[0])

In [None]:
# Words that occur very frequently but carry little meaning for this
# analysis, so they are excluded from the word counts.
stopwords = {
    'the', 'and', 'to', 'for', 'a', 'in', 'is', 'on',
    'with', 'company', 'role', 'job',
    'position', 'bachelors', 'you', 'are',
    'our', 'your', 'am', 'from', 'all',
    'that', 'have', 'been', 'will',
    'this', 'their', 'not', 'per',
}


def clean_text(text):
    """Normalise one cell of text into a string of space-separated keywords.

    The text is lower-cased, every character that is neither a letter nor
    whitespace is removed, and the remaining words are filtered: words in
    ``stopwords`` and words of one or two characters are dropped.

    Args:
        text: The cell value. Missing values (NaN, None, NaT) are accepted;
            any other value is converted with ``str()``.

    Returns:
        The surviving words joined by single spaces, or ``""`` when the
        input is missing or no word survives.
    """
    # Empty CSV cells arrive as NaN/None/NaT; treat them as "no text".
    if pd.isna(text):
        return ""

    lowered = str(text).lower()

    # Keep letters and every kind of whitespace. Newlines, tabs and
    # non-breaking spaces must survive this step so that split() below
    # still sees them as word boundaries; deleting them would glue the
    # neighbouring words together (e.g. "data\nentry" -> "dataentry").
    letters_only = "".join(
        char for char in lowered if char.isalpha() or char.isspace()
    )

    # Drop stopwords and very short words.
    kept = [
        word for word in letters_only.split()
        if len(word) > 2 and word not in stopwords
    ]

    return " ".join(kept)

# Clean every selected column of the fake job postings and pool the
# resulting words into one flat list.
# NOTE(review): 'telecommuting' looks like a flag rather than free text —
# confirm it is meant to be part of this list.
all_words = []

for col in ['company_profile', 'description',
            'requirements', 'benefits',
            'telecommuting', 'required_experience']:
    # clean_text() always returns a string (empty for missing cells), so
    # there are no NaN values to drop after applying it.
    cleaned_text = fake_jobs[col].apply(clean_text)
    for row in cleaned_text:
        all_words.extend(row.split())

# Count how often each word occurs and show the 20 most frequent ones.
word_counts = pd.Series(all_words).value_counts()
print(word_counts.head(20))