In [1]:
import pandas as pd 
import os
import pyspark
import docx
from docx import Document
import re
import csv 
import uuid

myuuid = uuid.uuid4()

print('Your UUID is: ' + str(myuuid))

Your UUID is: 5adc5d61-6d95-4fe7-a4e6-ee5d9ba6ed79


In [2]:
raw_files = r"C:\Users\jclaros\Downloads\Python Folder\Personal\Indeed Scraper\raw_files"

In [3]:
def read_word(path):
    '''Creates a string object from a word document of job descriptions div containers'''
    doc = Document(path)
    full_text = []

    # Iterate document object and extract text
    for para in doc.paragraphs:
        full_text.append(para.text)

    # Join and create string object 
    return '\n'.join(full_text)
    

# Each word document collected contains about 75 job description div containers, below is how one looks

In [4]:
list_of_files = os.listdir(raw_files)
document_to_string = read_word(raw_files+ "//" + list_of_files[0])
separate_job_objects = document_to_string.split('\n\n--------------------')
test = separate_job_objects[49]
#separate_job_objects[0]

# We want to extract the job description portion and clean up all remaining html tags and grammar

In [5]:
def clean_job_description_paragraph(job_desc_list):
    
    job_description_string = ''
    
    phone_number_pattern = r'\(?\b\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'
    date_pattern = r'\(?\b\d{4}\)?[-.\s]?\d{2}[-.\s]?\d{2}\b'
    remove_top_html_pattern = r'</div><div id="jobDescriptionText"'
    remove_bottom_html_pattern = r'</div></div>'
    merged_tag_pattern = r'<(\w{1,2})(\w+)'
    replacement = r'\2'
    first_line_pattern = re.compile(r' class="[^"]*">\s')
    
    tags_to_remove = ['<div>', '</div>', '<p>', '</p>', '<br>', '</br>', '<ul>', '</ul>', '<i>', '</i>', '<b>', '</b>', '<li>', '</li>', '\n', '\n+', '<i>', "'", '<h4>', 
                        '</h4>', '</h3>', '<h3>', '<h2>', '</h2>', "’", r'/', r'\.00\b','  +']
    
    
    for job_desc_html in job_desc_list:
        try:
            job_desc_html_v2 = job_desc_html.split(remove_top_html_pattern)[1]
            job_desc_html_v3 = job_desc_html_v2.split(remove_bottom_html_pattern)[0]
        except:
            continue
        


        for items in tags_to_remove: 
            job_desc_html_v3 = re.sub(items, ' ', job_desc_html_v3)

        
        for regex_fliters in [first_line_pattern, phone_number_pattern, date_pattern]:
            job_desc_html_v3 = re.sub(regex_fliters, '', job_desc_html_v3)

        
        refined_job_desc = re.sub(merged_tag_pattern, replacement, job_desc_html_v3)

        job_description_string += "\u0332".join("Job Description:  ") + refined_job_desc +  "\n\n -------------------------------------------------------------------------------------- \n\n "
        
    return job_description_string.lower()

input_folder = r'C:\Users\jclaros\Downloads\Python Folder\Personal\Indeed Scraper\raw_files' 
output_folder = r'C:\Users\jclaros\Downloads\Python Folder\Personal\Indeed Scraper\cleansed_files'

list_of_files = os.listdir(input_folder)

list_of_files
for word_docx in list_of_files: 
    
    document_to_string = read_word(raw_files + r"\\" +  word_docx)
    separate_job_objects = document_to_string.split('\n\n--------------------') # Separate the job from a divider marker my bot set
    
    repaired_doc = clean_job_description_paragraph(separate_job_objects)
    
    cleansed_page = Document()
    cleansed_page.add_paragraph(repaired_doc)
    
    
    new_file_path = os.path.join(output_folder, word_docx)
    cleansed_page.save(new_file_path)
    


# This Cleansed Layer will not take the cleaned up job description paragraphs and use dictionaries to extract word frequencies

In [59]:
def create_dictionary_from_csv(path_to_csv: str) -> dict:
    ''' Creates environment folders and partitioning folders when new job title is created 
    
        Args: 
        path_to_csv (str): string path to csv file needed to make a frequency dictionary
        
        Returns:
        freq_dictionary (dict): blank frequency dictionary '''
    
    freq_dictionary = {} 

    with open(path_to_csv, mode='r', newline='') as file: 
        reader = csv.reader(file)
        next(reader, None) # Skips header
        for row in reader: 
            if row:
                freq_dictionary[row[0]] = 0
    
    return freq_dictionary

def create_cleansed_enviornment(list_of_files_in_raw: list, output_folder: str) -> list:
    ''' Creates environment folders and partitioning folders when new job title is created 
    
        Args: 
        list_of_files_in_raw (list): list of all files collected using os.listdir
        
        Returns:
        all_job_names (list): list of all unique names collected from raw files listed'''
        
    partition_folders = ['education', 'programming_languages', 'personality_traits', 'skillset', 'software', 'security_clearance', 'experience', 'salary']
    all_job_names = []
    
    for word_docx_filename in list_of_files_in_raw:

        position_title = word_docx_filename.split('_')[0]
    
        if position_title not in all_job_names: all_job_names.append(position_title)
            

    for items in partition_folders:
        if not os.path.exists(output_folder + "//" + items):
            os.makedirs(output_folder + "//" + items)
        

        
    return all_job_names

def clean_salary(job_id: str, output_folder: str, word_docx_filename: str, salary_tuple: tuple):
    ''' Cleans salary tuple in order to convert it into two dataframe columns after
        
        Args: 
        salary_tuple: tuple of low end, high end, and hourly/salary pay grades 
        
        Returns: 
        Tuple: a tuple of job low/high end pay ranges '''
    
    pattern = r'[^a-zA-Z0-9\s]' # remove special characters
    low_end_pay = [] 
    high_end_pay = [] 

    
    for matches in salary_tuple: 
        low = re.sub(pattern, '', matches[0]) # remove special characters
        high = re.sub(pattern, '', matches[1]) # remove special characters
        
        low = re.sub('k', '000', low) # replace k with 000
        high = re.sub('k', '000', high) 
        
        
        if int(low) < 20000 or int(high) < 20000:
            
            if matches[2] == 'year' or matches[2] == 'annually':
                
                if len(low) < 5: low = int(low) * 1000
                if len(high) < 5: high = int(high) * 1000
                
                if int(low) < 20000 or int(high) < 20000:
                    continue
                    
            elif matches[2] == 'hour' or matches[2] == 'hourly':
                low = int(low) * 40 * 52 # Convert to salary
                high = int(high) * 40 * 52 # Convert to salary
                
            else: 
                continue
        
        low_end_pay.append(int(low))
        high_end_pay.append(int(high))
        
    salary_dataframe = pd.DataFrame()
    salary_dataframe['pay_low_end'] = low_end_pay
    salary_dataframe['pay_high_end'] = high_end_pay
    salary_dataframe['job_id'] = job_id
    
    salary_dataframe.to_csv(output_folder + "\\" + 'salary' + "\\"  + word_docx_filename + '.csv', index=False)

def repair_experience(job_id: str, output_folder: str, word_docx_filename: str, list_of_experience: list):
    print("JOB ID" + str(job_id))
    """Retrieve or assign a unique job ID based on the job title.
    
    Args:
    list_of_experience: a list of all experience years collected from job descriptions
    
    Returns:
    years_experience: a cleaned list of professional experience
    """
    years_experience = []
    
    for items in list_of_experience:
        years = re.sub(r'[^a-zA-Z0-9-]', '', items[0])
        
        if '-' in years: 
            years_array = years.split('-') 
            ranged_item = list(range(int(years_array[0]), int(years_array[1]) + 1))
            years_experience.extend(ranged_item)
        elif int(years) > 15: 
            continue
        else: years_experience.append(int(years))
    
    experience_db = pd.DataFrame()
    experience_db['years_experience_recorded'] = years_experience
    experience_db['job_id'] = job_id
    
    experience_db.to_csv(output_folder + "\\" + 'experience' + "\\"  + word_docx_filename + '.csv', index=False)

def get_job_id(job_name: str, file_path: str) -> int:
    """Retrieve or assign a unique job ID based on the job title.
    
    Args:
    job_name (str): The name of the job to retrieve or create an ID for.
    filepath (str): Path to the CSV file containing job categories.
    
    Returns:
    int: The job ID.
    """
    
    job_id_db = pd.read_csv(file_path)
    
    if not (job_id_db['job_title'].eq(job_name)).any(): # If doesnt exists in job_categories.csv
        
        # Set new id value
        new_job_id = len(job_id_db)
        job_id_db = job_id_db.append({'job_id': new_job_id, 'job_title': job_name}, ignore_index=True)
        
        # Order them numerically
        job_id_db = job_id_db.sort_values(by=["job_id"], ascending=True, ignore_index=True) 
        
        # Save file back as updated csv file 
        job_id_db.to_csv(filepath, index=False)
        
        return len(job_id_db)
    
    else: 
        
        job_id_col = job_id_db["job_id"]
        id = job_id_db.loc[job_id_db["job_title"] == job_name]["job_id"].iloc[0]
        return id
       
def write_dictionary_to_cleansed_layer(measurement_dictionary: dict, job_id: str, output_folder: str, folder: str): 
    
    for name, dictionary in measurement_dictionary.items():
        
        dataframe = pd.DataFrame(list(measurement_dictionary.items()), columns = [folder, 'frequency'])
        dataframe['job_id'] = job_id  # Adding job_id column
        dataframe.to_csv(output_folder + "\\" + folder + "\\"  + word_docx_filename + '.csv', index=False)
        
def mark_phrases(list_of_measurement_dictionaries: list, word_doc_text: str) -> str:
    ''' Function takes multiple worded phrases from dictionaries and replaced space with '-' in order to mark them before counting 
        
        Args: 
        list_of_measurement_dictionaries (list): list of dictionaries from csvs 
        word_doc_text (str): word documented converted into a string 
        
        Returns: 
        word_doc_text (str): word document with marked text '''
    
    word_list = [] 
    for dictionaries in list_of_measurement_dictionaries: 
        temp_list = []
        keys_list = list(dictionaries.keys())
        for word in keys_list: 
            if '-' in word: 
                repaired_word = word.replace('-', ' ') 
                temp_list.append(repaired_word)
        word_list += temp_list
        
        word_list_desc = sorted(word_list, key=len, reverse=True)
        
    for phrase in word_list_desc:
        
        word_doc_text = re.sub(phrase, phrase.replace(' ', '-'), word_doc_text)
        
    return word_doc_text

def find_special_characters(s):
    # This pattern matches any character that is not a letter or a number
    pattern = re.compile('[^a-zA-Z0-9]')
    # Find all non-alphanumeric characters in the string
    special_chars = pattern.findall(s)
    unique_special_chars = set(special_chars)
    
    for special_character in unique_special_chars: 
        s = s.replace(special_character, "\\" + special_character)
    return s

In [60]:
input_folder = r'C:\Users\jclaros\Downloads\Python Folder\Personal\Indeed Scraper\cleansed_files'
output_folder = r'C:\Users\jclaros\Downloads\Python Folder\Personal\Indeed Scraper\curated_files'
filepath = r'C:\Users\jclaros\Downloads\Python Folder\Personal\Indeed Scraper\artifacts\dictionaries_skeletons\job_categories.csv'
artifacts_folder = r'C:\Users\jclaros\Downloads\Python Folder\Personal\Indeed Scraper\artifacts'

    
list_of_files = os.listdir(input_folder)

unique_job_names = create_cleansed_enviornment(list_of_files, output_folder)

for job_titles in unique_job_names: 

    
    education_dictionary = create_dictionary_from_csv(artifacts_folder + r"\dictionaries_skeletons\education.csv")
    prog_lang_dictionary = create_dictionary_from_csv(artifacts_folder + r"\dictionaries_skeletons\programming_languages.csv")
    personality_traits_dict = create_dictionary_from_csv(artifacts_folder + r"\dictionaries_skeletons\personality_traits.csv")
    skills_dict = create_dictionary_from_csv(artifacts_folder + r"\dictionaries_skeletons\skills.csv")
    software_dict = create_dictionary_from_csv(artifacts_folder + r"\dictionaries_skeletons\software.csv")
    security_clearance = create_dictionary_from_csv(artifacts_folder + r"\dictionaries_skeletons\security_clearance.csv")
    
    job_id = get_job_id(job_titles, filepath)
    
    job_specific_files = [x for x in list_of_files if job_titles in x]
    
    for word_docx_filename in job_specific_files:
        print(word_docx_filename)
        counter = 0
        
        document_to_string = read_word(input_folder + r"\\" +  word_docx_filename)
        
        experience_regex = re.findall(re.compile(r'(\d+\+?|\d+\s*[-–to]\s*\d+)\s*(years?)'), document_to_string)
        repair_experience(job_id, output_folder, word_docx_filename, experience_regex)

        salary_regex = re.findall(r'(\$?\d{1,3}(?:k|,\d{1,3}|\d{1,3}))\s*?(?:to|-)\s*?(\$?\d{1,3}(?:k|,\d{1,3}|\d{1,3}))(?:\s*(?:per\s+|a\s+)?(hour|annually|year|yearly))?', document_to_string)
        clean_salary(job_id, output_folder, word_docx_filename, salary_regex)
       
        document_to_string_v2 = mark_phrases([education_dictionary, prog_lang_dictionary, personality_traits_dict, skills_dict, software_dict, security_clearance], document_to_string)
        

        separated_list_of_words = document_to_string_v2.split(" ") # Separate the job from a divider marker my bot set
        
        
        for dictionaries in [education_dictionary, prog_lang_dictionary, personality_traits_dict, skills_dict, software_dict, security_clearance]: 
            for keys in dictionaries:
                regex_key =  find_special_characters(keys)
                value = len(re.findall(r"\.?\s\,?\(?" + regex_key + r"\,?\)?\.?\s", document_to_string_v2))
                dictionaries[keys] = value
                
            
            partition_folders = ['education', 'programming_languages', 'personality_traits', 'skillset', 'software', 'security_clearance']
            write_dictionary_to_cleansed_layer(dictionaries, job_id, output_folder, partition_folders[counter])
            counter += 1


data analyst_dc_2024-03-22_page1.docx
JOB ID0
data analyst_dc_2024-03-22_page10.docx
JOB ID0
data analyst_dc_2024-03-22_page11.docx
JOB ID0
data analyst_dc_2024-03-22_page12.docx
JOB ID0
data analyst_dc_2024-03-22_page2.docx
JOB ID0
data analyst_dc_2024-03-22_page3.docx
JOB ID0
data analyst_dc_2024-03-22_page4.docx
JOB ID0
data analyst_dc_2024-03-22_page5.docx
JOB ID0
data analyst_dc_2024-03-22_page6.docx
JOB ID0
data analyst_dc_2024-03-22_page7.docx
JOB ID0
data analyst_dc_2024-03-22_page8.docx
JOB ID0
data analyst_dc_2024-03-22_page9.docx
JOB ID0
data engineer_dc_2024-03-22_page1.docx
JOB ID1
data engineer_dc_2024-03-22_page2.docx
JOB ID1
devops_dc_2024-03-22_page1.docx
JOB ID2
devops_dc_2024-03-22_page2.docx
JOB ID2


[('000', '002', ''),
 ('$1000', '$5000', ''),
 ('$90k', '150k', 'year'),
 ('$90k', '150k', 'year'),
 ('$140k', '200k', 'year'),
 ('$1000', '$5000', ''),
 ('$140k', '200k', 'year'),
 ('800', '53', ''),
 ('800', '53', ''),
 ('800', '53', ''),
 ('$57,737', '$98,153', '')]

In [116]:
def get_job_id(job_name: str, filepath: str) -> int:ase
    job_id_db = pd.read_csv(filepath)
    
    # Check if job_name exists in the DataFrame
    if not (job_id_db['job_title'] == job_name).any():
        # Assign new job ID and append new row
        new_job_id = len(job_id_db)
        job_id_db = job_id_db.append({'job_id': new_job_id, 'job_title': job_name}, ignore_index=True)
        
        # Save the updated DataFrame back to CSV
        job_id_db.to_csv(filepath, index=False)
        
        return new_job_id
    else:
        # Return the existing job ID
        return job_id_db.loc[job_id_db['job_title'] == job_name, 'job_id'].iloc[0]

In [117]:
salary_regex

[('000', '002', ''),
 ('$1000', '$5000', ''),
 ('$90k', '150k', 'year'),
 ('$90k', '150k', 'year'),
 ('$140k', '200k', 'year'),
 ('$1000', '$5000', ''),
 ('$140k', '200k', 'year'),
 ('800', '53', ''),
 ('800', '53', ''),
 ('800', '53', ''),
 ('$57,737', '$98,153', '')]

In [118]:
clean_salary(salary_regex)

([90000, 90000, 140000, 140000, 57737],
 [150000, 150000, 200000, 200000, 98153])

In [165]:
import re
from collections import Counter

# Example text
text = "Hello world! Hello everyone. This is an example to demonstrate how to count word frequencies, including words like co-op and O'Reilly. Hello again."

# Regex pattern to include words, apostrophes in words, and hyphens
pattern = r'\b[\w\'-]+\b'

# Use re.findall to extract words based on the pattern
words = re.findall(pattern, text)

# Use Counter to create a dictionary of word frequencies
word_frequencies = Counter(words)

# Output the frequency dictionary
print(word_frequencies)

Counter({'Hello': 3, 'to': 2, 'world': 1, 'everyone': 1, 'This': 1, 'is': 1, 'an': 1, 'example': 1, 'demonstrate': 1, 'how': 1, 'count': 1, 'word': 1, 'frequencies': 1, 'including': 1, 'words': 1, 'like': 1, 'co-op': 1, 'and': 1, "O'Reilly": 1, 'again': 1})


In [168]:
dict(word_frequencies)

{'Hello': 3,
 'world': 1,
 'everyone': 1,
 'This': 1,
 'is': 1,
 'an': 1,
 'example': 1,
 'to': 2,
 'demonstrate': 1,
 'how': 1,
 'count': 1,
 'word': 1,
 'frequencies': 1,
 'including': 1,
 'words': 1,
 'like': 1,
 'co-op': 1,
 'and': 1,
 "O'Reilly": 1,
 'again': 1}

In [13]:
from collections import Counter
import re

# Example dictionary and text
words_phrases_dict = {
    "data science": 0,
    "machine learning": 0,
    "python": 0,
    "learning": 0
}
text = "Data science and machine learning are popular fields. Many use python for machine learning."
text = text.lower()
sorted_keys = sorted(words_phrases_dict.keys(), key=len, reverse=True)

for key in sorted_keys:
    pattern = re.escape(key)
    matches = re.findall(pattern, text)
    words_phrases_dict[key] += len(matches)
    text = re.sub(pattern, "", text)

# Optional: count remaining words
remaining_words = re.findall(r'\b\w+\b', text)
remaining_count = Counter(remaining_words)

# Output the results
print("Specific words/phrases counts:", words_phrases_dict)
print("Remaining words count:", remaining_count)

Specific words/phrases counts: {'data science': 1, 'machine learning': 2, 'python': 1, 'learning': 0}
Remaining words count: Counter({'and': 1, 'are': 1, 'popular': 1, 'fields': 1, 'many': 1, 'use': 1, 'for': 1})
