In [1]:
import pandas as pd
import plotly.express as px
import numpy as np

import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /Users/lienahtan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lienahtan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lienahtan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/lienahtan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
file_path = '../dataset/fake_job_postings.csv'
df = pd.read_csv(file_path)

df_info = df.info()
df_head = df.head()

print(df_info)
print(df_head)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15184 non-null  object
 8   benefits             10668 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  object
 16  func

# Data Cleaning

### Handling Duplciated Rows

No duplicated rows were found.

In [3]:
duplicates = df.duplicated()
num_duplicates = duplicates.sum()

print(f"Number of duplicated rows: {num_duplicates}")

Number of duplicated rows: 0


### Handling Missing Values

Upon inspecting the dataset, we found several fields with null values. Since these fields are all categorical, we decided to fill the missing values with the appropriate categorical values:

- Missing values filled with **"Unknown"**: Used for columns like `location` and `employment_type`, where missing data likely means the information is unavailable or not applicable. This preserves the understanding that the data may exist but is not provided.
  
- Missing values filled with **"Not Provided"**: Applied to columns such as `salary_range` and `company_profile`, where the missing information might indicate that it was purposely left out by the job poster.

In [4]:
df.isnull().sum()

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [5]:
df['location'].fillna('Unknown', inplace=True)
df['employment_type'].fillna('Unknown', inplace=True)
df['required_experience'].fillna('Unknown', inplace=True)
df['required_education'].fillna('Unknown', inplace=True)
df['industry'].fillna('Unknown', inplace=True)
df['function'].fillna('Unknown', inplace=True)

df['department'].fillna('Not Provided', inplace=True)
df['salary_range'].fillna('Not Provided', inplace=True)
df['company_profile'].fillna('Not Provided', inplace=True)
df['description'].fillna('Not Provided', inplace=True)
df['requirements'].fillna('Not Provided', inplace=True)
df['benefits'].fillna('Not Provided', inplace=True)

df_null_removed = df.copy()

missing_values = df_null_removed.isnull().sum()

missing_values

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['location'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['employment_type'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting

job_id                 0
title                  0
location               0
department             0
salary_range           0
company_profile        0
description            0
requirements           0
benefits               0
telecommuting          0
has_company_logo       0
has_questions          0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
fraudulent             0
dtype: int64

### Feature engineering

In [6]:
columns_to_check = ['employment_type', 'required_experience', 'required_education', 'industry', 
                    'function', 'department', 'salary_range']

value_counts_dict = {col: df[col].value_counts() for col in columns_to_check}

for key, value in value_counts_dict.items():
    print(f"Column: {key}")
    print(value)
    print("-" * 50)

Column: employment_type
employment_type
Full-time    11620
Unknown       3471
Contract      1524
Part-time      797
Temporary      241
Other          227
Name: count, dtype: int64
--------------------------------------------------
Column: required_experience
required_experience
Unknown             7050
Mid-Senior level    3809
Entry level         2697
Associate           2297
Not Applicable      1116
Director             389
Internship           381
Executive            141
Name: count, dtype: int64
--------------------------------------------------
Column: required_education
required_education
Unknown                              8105
Bachelor's Degree                    5145
High School or equivalent            2080
Unspecified                          1397
Master's Degree                       416
Associate Degree                      274
Certification                         170
Some College Coursework Completed     102
Professional                           74
Vocational          

All categorical columns have a good split for each individual category except salary_range. We can improve the salary_range column by splitting by a delimiter and engineer 2 additional columns.

In [7]:
# Function to extract the number of digits in a salary value
def extract_salary_digits(salary_str):
    if pd.isnull(salary_str) or salary_str == 'Not Provided':
        return 0, 0  # For missing values, return 0 for both min and max salary digits
    try:
        # Split the salary range
        min_salary, max_salary = salary_str.split('-')
        min_digits = len(min_salary.strip())
        max_digits = len(max_salary.strip())
        return min_digits, max_digits
    except ValueError:
        return 0, 0  # In case the salary string is not in the expected format

# Apply the function to split min and max salary digits
df_null_removed['min_salary_digits'], df_null_removed['max_salary_digits'] = zip(*df_null_removed['salary_range'].apply(extract_salary_digits))

# Create binary column to indicate whether salary range was provided
df_null_removed['salary_range_provided'] = np.where(df_null_removed['salary_range'] == 'Not Provided', 0, 1)
df_null_removed.drop(columns=['salary_range'], inplace=True)

###  Additional features

In [8]:
pd.set_option('display.max_rows', None)

In [9]:
print(df_null_removed[df_null_removed['fraudulent'] == 1]['description'])

98       IC&amp;E Technician | Bakersfield, CA Mt. Poso...
144      The group has raised a fund for the purchase o...
173      Technician Instrument &amp; ControlsLocation D...
180                                        Sales Executive
215      IC&amp;E Technician | Bakersfield, CA Mt. Poso...
357      If you have experience in financing for auto s...
493      A Newly established company seeks outgoing, fr...
537      Apply below using link#URL_00962c0bdac3ecf40b2...
573      Senior Engineering Product ManagerAs a member ...
584      Apply using below link#URL_8b28dea5804b323e29d...
603      Corporate overviewAker Solutions is a global p...
606      Director of Strategy | San Mateo, CA Intereste...
628      Qualified candidates are encouraged to apply d...
661      Director of Strategy | San Mateo, CA Intereste...
740      Health &amp; Safety Professional | Bakersfield...
812         UST Testing Technician II Bakersfield, CA L...
814      We are a full-service engineering, project man.

In [10]:
# Length of the job description
df_null_removed['description_length'] = df_null_removed['description'].apply(lambda x: 0 if x == 'Not Provided' else len(x))

# Whether the description contains external links
def contains_external_link(text):
    if pd.isnull(text):
        return 0
    return 1 if re.search(r'http[s]?://', text) else 0

df_null_removed['has_external_link'] = df_null_removed['description'].apply(contains_external_link)

# Whether job description contains common scam phrases as per research
# Source: https://bgc-group.com/common-job-scams-in-singapore-and-how-to-avoid-them/#:~:text=If%20it's%20too%20good%20to%20be%20true%2C%20it's%20not%20true&text=Job%20scammers%20often%20try%20to,away%20before%20it's%20too%20late.

scam_phrases = ["quick cash", "flexible hours", "work from home", "no experience needed", 
                "too good to be true", "application fee", "deposit", "earn", "no experience", 
                "no exp" "fast cash", "easy cash"]

df_null_removed['contains_scam_phrases'] = df_null_removed['description'].apply(
    lambda x: any(phrase in x.lower() for phrase in scam_phrases)
)

### Investigating and Handling Outliers

After handling missing values, we checked for potential outliers in the dataset. For this, we only focused on the numeric columns. Using the **Interquartile Range (IQR)** method, outliers were found in columns such as `telecommuting`, `has_company_logo`, and `fraudulent`. 

Upon further investigation, we plotted these columns and found that the "outliers" were a result of **class imbalance** rather than true outliers. Since these columns are binary and categorical in nature, the detected values were not problematic for analysis and thus no additional outlier removal was needed.

This step helped us realise that there is data imbalance in certain categories.

In [11]:
numeric_columns = df_null_removed.select_dtypes(include=['int64', 'float64']).columns

def detect_outliers_iqr_with_print(df, column):
    # Calculate Q1 (25th percentile) and Q3 (75th percentile)
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1

    # Defining outlier boundaries
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Identifying outliers
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]

    # Print IQR and boundaries
    print(f"Column: {column}")
    print(f"Q1: {Q1}, Q3: {Q3}")
    print(f"IQR: {IQR}")
    print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")
    print(f"Number of outliers: {len(outliers)}\n")

    return outliers

for col in numeric_columns:
    outliers = detect_outliers_iqr_with_print(df_null_removed, col)


Column: job_id
Q1: 4470.75, Q3: 13410.25
IQR: 8939.5
Lower Bound: -8938.5, Upper Bound: 26819.5
Number of outliers: 0

Column: telecommuting
Q1: 0.0, Q3: 0.0
IQR: 0.0
Lower Bound: 0.0, Upper Bound: 0.0
Number of outliers: 767

Column: has_company_logo
Q1: 1.0, Q3: 1.0
IQR: 0.0
Lower Bound: 1.0, Upper Bound: 1.0
Number of outliers: 3660

Column: has_questions
Q1: 0.0, Q3: 1.0
IQR: 1.0
Lower Bound: -1.5, Upper Bound: 2.5
Number of outliers: 0

Column: fraudulent
Q1: 0.0, Q3: 0.0
IQR: 0.0
Lower Bound: 0.0, Upper Bound: 0.0
Number of outliers: 866

Column: min_salary_digits
Q1: 0.0, Q3: 0.0
IQR: 0.0
Lower Bound: 0.0, Upper Bound: 0.0
Number of outliers: 2867

Column: max_salary_digits
Q1: 0.0, Q3: 0.0
IQR: 0.0
Lower Bound: 0.0, Upper Bound: 0.0
Number of outliers: 2867

Column: salary_range_provided
Q1: 0.0, Q3: 0.0
IQR: 0.0
Lower Bound: 0.0, Upper Bound: 0.0
Number of outliers: 2868

Column: description_length
Q1: 607.0, Q3: 1586.0
IQR: 979.0
Lower Bound: -861.5, Upper Bound: 3054.5
Numbe

In [12]:
telecommuting_plot = px.histogram(df_null_removed, x='telecommuting', title='Distribution of Telecommuting (Outliers Detected)',
                                  labels={'telecommuting': 'Telecommuting'},
                                  color='telecommuting', barmode='group')

has_company_logo_plot = px.histogram(df_null_removed, x='has_company_logo', title='Distribution of Has Company Logo (Outliers Detected)',
                                     labels={'has_company_logo': 'Has Company Logo'},
                                     color='has_company_logo', barmode='group')

fraudulent_plot = px.histogram(df_null_removed, x='fraudulent', title='Distribution of Fraudulent Job Postings (Outliers Detected)',
                               labels={'fraudulent': 'Fraudulent'},
                               color='fraudulent', barmode='group')

telecommuting_plot.show()
has_company_logo_plot.show()
fraudulent_plot.show()


  sf: grouped.get_group(s if len(s) > 1 else s[0])






### Exploding the `location` Column

The `location` column contains comma-separated values representing the country, state, and city. To make this data more usable, we exploded the `location` column into three separate fields: `country`, `state`, and `city`.

- **Before**: The `location` column was a single string in the format `country, state, city` (e.g., US, NY, New York).
- **After**: We split the column into three distinct columns: `country`, `state`, and `city`. 

For rows where either `country`, `state`, and `city` was missing, the missing value is replaced by **"Unknown"**

In [13]:
# Splitting the 'location' column into 'country', 'state', and 'city', handling missing values
df_null_removed[['country', 'state', 'city']] = df_null_removed['location'].str.split(',', expand=True, n=2)

df_null_removed[['job_id', 'location', 'country', 'state', 'city']].head()

Unnamed: 0,job_id,location,country,state,city
0,1,"US, NY, New York",US,NY,New York
1,2,"NZ, , Auckland",NZ,,Auckland
2,3,"US, IA, Wever",US,IA,Wever
3,4,"US, DC, Washington",US,DC,Washington
4,5,"US, FL, Fort Worth",US,FL,Fort Worth


In [14]:
df_location_split = df_null_removed.copy()

df_location_split['country'] = df_location_split['country'].replace([' ', None], 'Unknown')
df_location_split['state'] = df_location_split['state'].replace([' ', None], 'Unknown')
df_location_split['city'] = df_location_split['city'].replace([' ', None], 'Unknown')

df_location_split.drop(columns=['location'], inplace=True)

df_location_split[['job_id', 'country', 'state', 'city']].head()

df_location_split.to_csv("../dataset/data_cleaned_preTF.csv",index =False)

## Text Preprocessing

Clean textual columns by
- removing special characters
- removing stopwords
- performing tokenization
- performing lemmatization

This will allow for easier identification of key words/phrases that might be related to fraudulent job postings.

In [15]:
abbreviation_mapping = {
    "dept": "department",
    "mgr": "manager",
    "hr": "human resources",
    "Nov": "November",
    "ASO": "Application Service Outsourcing",
    "PMR": "Performance Management Report",
    "SVN": "Subversion",
    "APA": "American Psychological Association",
    "fax": "facsimile",
    "ADA": "Americans with Disabilities Act",
    "UST": "United States Treasury",
    "JCI": "Johnson Controls International",
    "DC": "District of Columbia",
    "API": "Application Programming Interface",
    "COO": "Chief Operating Officer",
    "ERP": "Enterprise Resource Planning",
    "TV": "Television",
    "Dr": "Doctor",
    "HQ": "Headquarters",
    "USD": "United States Dollar",
    "UPS": "Uninterruptible Power Supply",
    "USA": "United States of America",
    "IT": "Information Technology",
    "AWS": "Amazon Web Services",
    "GUI": "Graphical User Interface",
    "CEO": "Chief Executive Officer",
    "HR": "Human Resources",
    "CFO": "Chief Financial Officer",
    "PDF": "Portable Document Format",
    "GMT": "Greenwich Mean Time",
    "KYC": "Know Your Customer",
    "CRM": "Customer Relationship Management",
    "URL": "Uniform Resource Locator",
    "SDK": "Software Development Kit",
    "SQL": "Structured Query Language",
    "DNS": "Domain Name System",
    "VPN": "Virtual Private Network",
    "SaaS": "Software as a Service",
    "HRL": "Human Resources Leader",
    "FTP": "File Transfer Protocol",
    "HTTP": "Hypertext Transfer Protocol",
    "HTML": "Hypertext Markup Language",
    "NPI": "New Product Introduction",
    "MIS": "Management Information Systems",
    "PPM": "Parts Per Million",
    "AOD": "Advanced Optical Disc",
    "QC": "Quality Control",
    "SS": "Stainless Steel",
    "PTA": "Parent-Teacher Association",
    "UI": "User Interface",
    "VM": "Virtual Machine",
    "IPG": "Internet Protocol Gateway",
    "BBC": "British Broadcasting Corporation",
    "VAT": "Value-Added Tax",
    "CSV": "Comma-Separated Values",
    "AI": "Artificial Intelligence",
    "OCR": "Optical Character Recognition",
    "SSN": "Social Security Number",
    "UX": "User Experience",
    "BPO": "Business Process Outsourcing",
    "RF": "Radio Frequency",
    "IoT": "Internet of Things",
    "IP": "Internet Protocol",
    "PMO": "Project Management Office",
    "BI": "Business Intelligence",
    "POS": "Point of Sale",
    "CAD": "Computer-Aided Design",
    "HRM": "Human Resource Management",
    "UAT": "User Acceptance Testing",
    "TBA": "To Be Announced",
    "CTO": "Chief Technology Officer",
}


In [17]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from textblob import TextBlob
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
from tqdm import tqdm
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
from textatistic import Textatistic


nlp = spacy.load("en_core_web_sm")

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

def handle_repeated_chars(text):
    # Reduce repeated characters to just two (e.g., cooool -> cool)
    return re.sub(r'(.)\1+', r'\1\1', text)

def expand_abbreviations(text):
    words = text.split()
    return ' '.join([abbreviation_mapping.get(word, word) for word in words])

# Entity Recognition (NER) using NLTK POS tagging
def named_entity_recognition(text):
    words = word_tokenize(text)
    pos_tagged = nltk.pos_tag(words)
    entities = [word for word, pos in pos_tagged if pos in ['NNP', 'NNPS']]
    return entities  # Return as a list for more flexible feature handling

# Function to extract NER features
def extract_ner_features(text):
    # Parse the text using SpaCy
    doc = nlp(text)
    
    # Entity counts
    org_count = sum(1 for ent in doc.ents if ent.label_ == "ORG")
    person_count = sum(1 for ent in doc.ents if ent.label_ == "PERSON")
    location_count = sum(1 for ent in doc.ents if ent.label_ in ["GPE", "LOC"])
    norp_count = sum(1 for ent in doc.ents if ent.label_ == "NORP")
    fac_count = sum(1 for ent in doc.ents if ent.label_ == "FAC")
    product_count = sum(1 for ent in doc.ents if ent.label_ == "PRODUCT")
    event_count = sum(1 for ent in doc.ents if ent.label_ == "EVENT")
    work_of_art_count = sum(1 for ent in doc.ents if ent.label_ == "WORK_OF_ART")
    percent_count = sum(1 for ent in doc.ents if ent.label_ == "PERCENT")
    money_count = sum(1 for ent in doc.ents if ent.label_ == "MONEY")
    quantity_count = sum(1 for ent in doc.ents if ent.label_ == "QUANTITY")
    ordinal_count = sum(1 for ent in doc.ents if ent.label_ == "ORDINAL")
    cardinal_count = sum(1 for ent in doc.ents if ent.label_ == "CARDINAL")
    
    org_present = int(org_count > 0)
    person_present = int(person_count > 0)
    location_present = int(location_count > 0)
    work_of_art_present = int(work_of_art_count > 0)
    percent_present = int(percent_count > 0)
    money_present = int(money_count > 0)
    quantity_present = int(quantity_count > 0)
    
    return (
        org_present, person_present, location_present, org_count, person_count, location_count,
        norp_count, fac_count, product_count, event_count, 
        work_of_art_present, work_of_art_count,
        percent_present, percent_count, money_present, money_count, 
        quantity_present, quantity_count, ordinal_count, cardinal_count
    )

# Function to extract POS features
def extract_pos_features(text):
    doc = nlp(text)
    
    # Count features
    noun_count = sum(1 for token in doc if token.pos_ == "NOUN")
    verb_count = sum(1 for token in doc if token.pos_ == "VERB")
    pronoun_count = sum(1 for token in doc if token.pos_ == "PRON")
    adj_count = sum(1 for token in doc if token.pos_ == "ADJ")
    adv_count = sum(1 for token in doc if token.pos_ == "ADV")
    prep_count = sum(1 for token in doc if token.pos_ == "ADP")
    conj_count = sum(1 for token in doc if token.pos_ in ["CCONJ", "SCONJ"])
    det_count = sum(1 for token in doc if token.pos_ == "DET")
    interj_count = sum(1 for token in doc if token.pos_ == "INTJ")
    sym_count = sum(1 for token in doc if token.pos_ == "SYM")
    punct_count = sum(1 for token in doc if token.pos_ == "PUNCT")
    
    noun_present = int(noun_count > 0)
    verb_present = int(verb_count > 0)
    pronoun_present = int(pronoun_count > 0)
    adj_present = int(adj_count > 0)
    prep_present = int(prep_count > 0)
    conj_present = int(conj_count > 0)
    det_present = int(det_count > 0)
    sym_present = int(sym_count > 0)
    punct_present = int(punct_count > 0)
    
    return (noun_present, verb_present, pronoun_present, noun_count, verb_count, pronoun_count,
            adj_present, adj_count, adv_count, prep_present, prep_count,
            conj_present, conj_count, det_present, det_count, interj_count,
            sym_present, sym_count, punct_present, punct_count)

def preprocess_text(text):
    text = text.lower()
    text = handle_repeated_chars(text)
    text = expand_abbreviations(text)

    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    tokens = word_tokenize(text)

    return tokens

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    negations = {"no", "nor", "not"}
    stop_words = stop_words - negations
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens

def perform_lemmatization(tokens):
    lemmer = nltk.WordNetLemmatizer()
    lem_tokens = [lemmer.lemmatize(token) for token in tokens]
    return lem_tokens

def clean_text(text):
    tokens = preprocess_text(text)
    tokens = remove_stopwords(tokens)
    tokens = perform_lemmatization(tokens)
    clean_text = ' '.join(tokens)
    return clean_text

def extract_entities(text):
    return named_entity_recognition(text)

#correlated with one other variable so i commented out
#tool = language_tool_python.LanguageTool('en-US')
def features_from_desc(input_df):
    '''
    Extracting features from company job description and profile

    input: dataframe

    output : extracted features from raw description

    '''
    df = input_df.copy()
    desc = df["description"].tolist()
    num_grammatical_errors_desc = []
    desc_readability = []
    count_special_chars = []
    for sentence in desc:
        #num_grammatical_errors_desc.append(len(tool.check(sentence))) #has checked previously but correlated with another variable
        count_special_chars.append(len(re.sub('[^\^&*$]+' ,'', sentence)))
        try:
            desc_readability.append(Textatistic(sentence).scores["flesch_score"])
        except:
            desc_readability.append(0)
    #df["num_grammatical_errors_desc"] = num_grammatical_errors_desc
    df["desc_readability"] = desc_readability
    df["count_special_chars"] = count_special_chars
    return df

# Function to apply one-hot encoding for entities
def one_hot_encode_entities(df, entity_columns, top_n=50):
    """Applies one-hot encoding for the most common entities in the specified columns."""
    mlb = MultiLabelBinarizer()
    
    for col in entity_columns:
        # Flatten and get the most common entities across the column
        all_entities = [entity for sublist in df[col] if isinstance(sublist, list) for entity in sublist]
        common_entities = pd.Series(all_entities).value_counts().head(top_n).index
        
        # Filter entities and one-hot encode only those that are common
        df[f'{col}_filtered'] = df[col].apply(lambda x: [e for e in x if e in common_entities] if isinstance(x, list) else [])
        one_hot_df = pd.DataFrame(mlb.fit_transform(df[f'{col}_filtered']),
                                  columns=[f"{col}_{entity}" for entity in mlb.classes_],
                                  index=df.index)
        
        # Concatenate the one-hot encoded features to the original DataFrame
        df = pd.concat([df, one_hot_df], axis=1)
        df.drop(columns=[f'{col}_filtered'], inplace=True)  # Drop the filtered column if not needed
    
    return df

[nltk_data] Downloading package punkt to /Users/lienahtan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lienahtan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/lienahtan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [18]:
df_cleaned = df_location_split.copy()

tqdm.pandas()

# List of columns to clean and process for both cleaned text and entities
text_columns = ['description', 'requirements', 'benefits', 'title', 'company_profile']
entity_columns = [f"{col}_entities" for col in text_columns]

# Apply the clean_text and extract_entities functions to each respective column
for col in text_columns:
    df_cleaned[f'{col}_cleaned'] = df_cleaned[col].progress_apply(clean_text)
    df_cleaned[f'{col}_entities'] = df_cleaned[col].progress_apply(extract_entities)

# Apply the NER feature extraction function
# Iterate over each column in text_columns
for col in text_columns:
    
    # Apply the NER feature extraction function
    ner_features = df_cleaned[col].apply(extract_ner_features)
    
    # Define the column names specific to the current text column
    ner_columns = [
        f'{col}_has_org', f'{col}_has_person', f'{col}_has_location', 
        f'{col}_org_count', f'{col}_person_count', f'{col}_location_count',
        f'{col}_norp_count', f'{col}_fac_count', f'{col}_product_count', 
        f'{col}_event_count', f'{col}_has_work_of_art', f'{col}_work_of_art_count',
        f'{col}_has_percent', f'{col}_percent_count', f'{col}_has_money', 
        f'{col}_money_count', f'{col}_has_quantity', f'{col}_quantity_count', 
        f'{col}_ordinal_count', f'{col}_cardinal_count'
    ]
    
    # Convert the results into a DataFrame with appropriate column names
    ner_features_df = pd.DataFrame(ner_features.tolist(), columns=ner_columns)
    
    # Concatenate the new NER features DataFrame with the original DataFrame
    df_cleaned = pd.concat([df_cleaned, ner_features_df], axis=1)

# Iterate over each column in text_columns
for col in text_columns:
    
    # Apply the POS feature extraction function
    pos_features = df_cleaned[col].apply(extract_pos_features)
    
    # Define the column names specific to the current text column
    pos_columns = [
        f'{col}_has_noun', f'{col}_has_verb', f'{col}_has_pronoun',
        f'{col}_noun_count', f'{col}_verb_count', f'{col}_pronoun_count',
        f'{col}_has_adj', f'{col}_adj_count', f'{col}_adv_count', 
        f'{col}_has_prep', f'{col}_prep_count', f'{col}_has_conj', 
        f'{col}_conj_count', f'{col}_has_det', f'{col}_det_count', 
        f'{col}_interj_count', f'{col}_has_sym', f'{col}_sym_count', 
        f'{col}_has_punct', f'{col}_punct_count'
    ]
    
    # Convert the results into a DataFrame with appropriate column names
    pos_features_df = pd.DataFrame(pos_features.tolist(), columns=pos_columns)
    
    # Concatenate the new POS features DataFrame with the original DataFrame
    df_cleaned = pd.concat([df_cleaned, pos_features_df], axis=1)

# Create simple entity count features
def create_entity_count_features(df, entity_columns):
    """Creates count features based on the number of entities in each column."""
    for col in entity_columns:
        df[f'{col}_count'] = df[col].apply(lambda x: len(x) if isinstance(x, list) else 0)
    return df

# Apply the entity count feature creation
df_cleaned = create_entity_count_features(df_cleaned, entity_columns)

# Apply one-hot encoding for entities, with a limit on the top N entities
df_cleaned = one_hot_encode_entities(df_cleaned, entity_columns, top_n=50)

100%|██████████| 17880/17880 [00:15<00:00, 1164.75it/s]
100%|██████████| 17880/17880 [01:12<00:00, 246.74it/s]
100%|██████████| 17880/17880 [00:07<00:00, 2548.36it/s]
100%|██████████| 17880/17880 [00:36<00:00, 484.02it/s]
100%|██████████| 17880/17880 [00:03<00:00, 4775.24it/s]
100%|██████████| 17880/17880 [00:14<00:00, 1207.73it/s]
100%|██████████| 17880/17880 [00:01<00:00, 9017.94it/s]
100%|██████████| 17880/17880 [00:03<00:00, 5145.45it/s]
100%|██████████| 17880/17880 [00:07<00:00, 2289.07it/s]
100%|██████████| 17880/17880 [00:42<00:00, 415.97it/s] 


## Bag of Words

- Separate BoW for each column: For each column (`description`, `requirements`, etc.), we apply the BoW model individually. This will give us a word frequency matrix for each column.

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Define textual columns to process
text_columns = ['description', 'requirements', 'benefits', 'title', 'company_profile']

# Dictionary to store BoW features
bow_features = {}

for col in text_columns:
    vectorizer = CountVectorizer(max_features=1000)  # Limit features for dimensionality control
    bow_matrix = vectorizer.fit_transform(df_cleaned[col].fillna(''))
    bow_features[col] = pd.DataFrame(bow_matrix.toarray(), columns=[f"{col}_bow_{word}" for word in vectorizer.get_feature_names_out()])

# Append BoW features to the dataset
for col, bow_df in bow_features.items():
    df_cleaned = pd.concat([df.reset_index(drop=True), bow_df.reset_index(drop=True)], axis=1)



## Topic Modelling 
- By discovering any hidden semantic patterns within the textual data, we are able to identify topics that lie within it
- We can utilise the BoW matrices to check for any usful topics using LatentDirichletAllocation

In [20]:
from sklearn.decomposition import LatentDirichletAllocation

lda_features = {}

for col in text_columns:
    vectorizer = CountVectorizer(max_features=1000)
    bow_matrix = vectorizer.fit_transform(df_cleaned[col].fillna(''))

    lda = LatentDirichletAllocation(n_components=5, random_state=42)  # 5 topics
    lda_matrix = lda.fit_transform(bow_matrix)
    
    # Create a DataFrame for topics
    lda_df = pd.DataFrame(lda_matrix, columns=[f"{col}_topic_{i}" for i in range(5)])
    lda_features[col] = lda_df

# Append LDA features to the dataset
for col, lda_df in lda_features.items():
    df_cleaned = pd.concat([df_cleaned.reset_index(drop=True), lda_df.reset_index(drop=True)], axis=1)


In [21]:
df_cleaned.to_csv("../dataset/k-means_dataset.csv", index=False)