In [68]:
import pandas as pd 
import os
import pyspark
from pyspark.sql import SparkSession
import docx
from docx import Document
import re
import csv 
import uuid

# Generate a random (version 4) UUID and echo it.
myuuid = uuid.uuid4()
print('Your UUID is: ', myuuid, sep='')

Your UUID is: 41e424a0-9334-49fb-b227-d8f31d475a16


In [69]:
# All data folders are resolved relative to the directory the notebook is started from.
current_dir = os.getcwd()
# os.path.join uses the platform's separator instead of a hard-coded Windows backslash.
raw_files = os.path.join(current_dir, "raw_files")


In [70]:
def read_word(path):
    '''Returns the text of a word document as a single newline-joined string

    Args:
    path: location of the word document, handed directly to Document

    Returns:
    str: the text of each paragraph, in the order Document exposes them, joined with newlines
    '''
    paragraphs = Document(path).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)
    

# Each collected Word document contains about 75 job-description div containers; below is how one looks.

In [63]:
# Preview one raw document: load the first file os.listdir returns and split it into
# one string per job posting, using the divider the scraper wrote between postings.
list_of_files = os.listdir(raw_files)
# os.path.join instead of a hand-written "//" so the path is valid on every platform.
document_to_string = read_word(os.path.join(raw_files, list_of_files[0]))
separate_job_objects = document_to_string.split('\n\n--------------------')
#separate_job_objects[0]

# We want to extract the job description portion and clean up all remaining html tags and grammar

In [64]:
def clean_job_description_paragraph(job_desc_list):
    '''Strips html residue from scraped job postings and merges them into one lower-cased string

    For every posting only the text between the jobDescriptionText div marker and the next
    "</div></div>" is kept; postings that do not contain the marker are skipped. Tags,
    newlines, apostrophes, slashes and ".00" endings are replaced with a space, the leading
    class attribute, phone numbers and dates are removed, and leftover "<tag" fragments are
    rewritten by merged_tag_pattern.

    Args:
    job_desc_list (list): raw posting strings, one per job

    Returns:
    str: the cleaned postings, each prefixed with an underlined "Job Description:" header and
         followed by a dashed divider, concatenated and lower-cased ('' when nothing qualifies)
    '''
    phone_number_pattern = r'\(?\b\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'
    date_pattern = r'\(?\b\d{4}\)?[-.\s]?\d{2}[-.\s]?\d{2}\b'
    remove_top_html_pattern = r'</div><div id="jobDescriptionText"'
    remove_bottom_html_pattern = r'</div></div>'
    merged_tag_pattern = r'<(\w{1,2})(\w+)'
    replacement = r'\2'
    first_line_pattern = re.compile(r' class="[^"]*">\s')

    # Regex patterns applied in this order, each match replaced by one space.
    # The final '  +' collapses the runs of spaces the earlier replacements leave behind.
    tags_to_remove = ['<div>', '</div>', '<p>', '</p>', '<br>', '</br>', '<ul>', '</ul>', '<i>', '</i>', '<b>', '</b>', '<li>', '</li>', '\n', "'", '<h4>',
                        '</h4>', '</h3>', '<h3>', '<h2>', '</h2>', "’", r'/', r'\.00\b', '  +']

    # Loop-invariant pieces: U+0332 (combining low line) between the characters underlines the header.
    header = "\u0332".join("Job Description:  ")
    posting_divider = "\n\n -------------------------------------------------------------------------------------- \n\n "

    cleaned_postings = []
    for job_desc_html in job_desc_list:
        # A posting without the description marker has nothing to extract; skip it
        # explicitly rather than hiding every possible error behind a blanket except.
        if remove_top_html_pattern not in job_desc_html:
            continue

        description = job_desc_html.split(remove_top_html_pattern)[1]
        description = description.split(remove_bottom_html_pattern)[0]

        for tag_pattern in tags_to_remove:
            description = re.sub(tag_pattern, ' ', description)

        for noise_pattern in (first_line_pattern, phone_number_pattern, date_pattern):
            description = re.sub(noise_pattern, '', description)

        description = re.sub(merged_tag_pattern, replacement, description)

        cleaned_postings.append(header + description + posting_divider)

    # Join once at the end instead of growing a string inside the loop.
    return ''.join(cleaned_postings).lower()

# Cleansing run: read every raw document, clean its postings and save the result
# under the same file name in the cleansed folder.
input_folder = os.path.join(current_dir, "raw_files")
output_folder = os.path.join(current_dir, "cleansed_files")

# Make sure the target folder exists so saving does not depend on manual setup.
os.makedirs(output_folder, exist_ok=True)

list_of_files = os.listdir(input_folder)

for word_docx in list_of_files:

    document_to_string = read_word(os.path.join(input_folder, word_docx))
    separate_job_objects = document_to_string.split('\n\n--------------------') # Separate the job from a divider marker my bot set

    repaired_doc = clean_job_description_paragraph(separate_job_objects)

    # The whole cleaned text is stored as a single paragraph of a new document.
    cleansed_page = Document()
    cleansed_page.add_paragraph(repaired_doc)

    new_file_path = os.path.join(output_folder, word_docx)
    cleansed_page.save(new_file_path)
    


# This Cleansed Layer will now take the cleaned-up job description paragraphs and use dictionaries to extract word frequencies

In [65]:
def create_dictionary_from_csv(path_to_csv: str) -> dict:
    ''' Builds a zeroed frequency dictionary from the first column of a csv file

        The first row is treated as a header and skipped; blank rows are ignored.

        Args:
        path_to_csv (str): string path to csv file needed to make a frequency dictionary

        Returns:
        freq_dictionary (dict): first-column values mapped to 0 '''

    with open(path_to_csv, mode='r', newline='') as csv_handle:
        records = csv.reader(csv_handle)
        next(records, None)  # header row is not a key
        return {record[0]: 0 for record in records if record}

def create_cleansed_enviornment(list_of_files_in_raw: list, output_folder: str) -> list:
    ''' Collects the distinct job titles from a file listing and creates the partition folders

        Args:
        list_of_files_in_raw (list): file names; the text before the first '_' is used as the job title
        output_folder (str): folder in which one sub-folder per partition is created when missing

        Returns:
        all_job_names (list): unique job titles, in order of first appearance'''

    # dict.fromkeys keeps only the first occurrence of each title and preserves that order
    all_job_names = list(dict.fromkeys(file_name.split('_')[0] for file_name in list_of_files_in_raw))

    for partition in ('education', 'programming_languages', 'personality_traits', 'skillset',
                      'software', 'security_clearance', 'experience', 'salary'):
        partition_path = output_folder + "//" + partition
        if not os.path.exists(partition_path):
            os.makedirs(partition_path)

    return all_job_names

def extract_salary_from_paragraph(job_id: str, output_folder: str, word_docx_filename: str, document_to_string: str):
    ''' Finds pay ranges in a job description text and writes them to the salary partition as a csv

        Ranges written like "$90k - 150k a year", "$57,737 - $98,153" or "$25 - $35 per hour"
        are recognised. When either end of a range is below 20,000:
          * yearly/annual ranges have ends shorter than 5 characters multiplied by 1000 and
            are dropped if an end is still below 20,000,
          * hourly ranges are converted to a yearly figure (40 hours x 52 weeks),
          * ranges without a unit are dropped.

        Args:
        job_id (str): id stored in the job_id column of every row
        output_folder (str): folder that contains the 'salary' sub-folder
        word_docx_filename (str): name of the output file; '.csv' is appended
        document_to_string (str): text to search for pay ranges

        Returns:
        None: rows (pay_low_end, pay_high_end, job_id) are written to
              <output_folder>/salary/<word_docx_filename>.csv '''

    salary_matches = re.findall(r'(\$?\d{1,3}(?:k|,\d{1,3}|\d{1,3}))\s*?(?:to|-)\s*?(\$?\d{1,3}(?:k|,\d{1,3}|\d{1,3}))(?:\s*(?:per\s+|a\s+)?(hour|annually|year|yearly))?', document_to_string)
    special_characters = r'[^a-zA-Z0-9\s]'  # strips "$" and "," from the captured amounts
    low_end_pay = []
    high_end_pay = []

    for low_raw, high_raw, pay_period in salary_matches:
        # "90k" -> "90000"; "$57,737" -> "57737"
        low = re.sub(special_characters, '', low_raw).replace('k', '000')
        high = re.sub(special_characters, '', high_raw).replace('k', '000')

        if int(low) < 20000 or int(high) < 20000:

            if pay_period in ('year', 'annually'):
                # Short yearly figures are read as thousands
                if len(low) < 5: low = int(low) * 1000
                if len(high) < 5: high = int(high) * 1000

                if int(low) < 20000 or int(high) < 20000:
                    continue

            elif pay_period in ('hour', 'hourly'):
                low = int(low) * 40 * 52  # Convert to salary
                high = int(high) * 40 * 52  # Convert to salary

            else:
                # Small numbers without a unit cannot be interpreted as pay
                continue

        low_end_pay.append(int(low))
        high_end_pay.append(int(high))

    salary_dataframe = pd.DataFrame({'pay_low_end': low_end_pay, 'pay_high_end': high_end_pay})
    salary_dataframe['job_id'] = job_id

    # os.path.join keeps the file inside the 'salary' folder on every platform
    salary_dataframe.to_csv(os.path.join(output_folder, 'salary', word_docx_filename + '.csv'), index=False)

def extract_experience_from_paragraph(job_id: str, output_folder: str, word_docx_filename: str, document_to_string: str):
    """Find "N years" mentions in a job description and write them to the experience partition.

    Single values ("5 years", "7+ years") are recorded once and skipped when above 15.
    Ranges written with "-", "–" or "to" ("3-5 years", "1 to 2 years") are expanded into
    every whole year from the first to the last number, inclusive.

    Args:
    job_id (str): id stored in the job_id column of every row
    output_folder (str): folder that contains the 'experience' sub-folder
    word_docx_filename (str): name of the output file; '.csv' is appended
    document_to_string (str): text to search

    Returns:
    None: rows (years_experience_recorded, job_id) are written to
          <output_folder>/experience/<word_docx_filename>.csv
    """
    # A real alternation: a character class such as [-–to] would also accept a lone "t" or "o"
    # and would never match the word "to".
    range_separator = r'\s*(?:-|–|to)\s*'
    # The range alternative comes first so "3-5 years" is captured as a range, not as "5 years".
    experience_pattern = re.compile(r'(\d+' + range_separator + r'\d+|\d+\+?)\s*(years?)')

    years_experience = []
    for amount, _unit in experience_pattern.findall(document_to_string):
        bounds = re.split(range_separator, amount)

        if len(bounds) == 2:
            # NOTE(review): the 15-year cap below is not applied to ranges - confirm this is intended.
            years_experience.extend(range(int(bounds[0]), int(bounds[1]) + 1))
        else:
            years = int(amount.rstrip('+'))
            if years > 15:
                continue
            years_experience.append(years)

    experience_db = pd.DataFrame({'years_experience_recorded': years_experience})
    experience_db['job_id'] = job_id

    # os.path.join keeps the file inside the 'experience' folder on every platform
    experience_db.to_csv(os.path.join(output_folder, 'experience', word_docx_filename + '.csv'), index=False)

def get_job_id(job_name: str, file_path: str) -> int:
    """Retrieve or assign a unique job ID based on the job title.

    When the title is not yet listed, a new row is appended with the next free id, the
    table is sorted by job_id and written back to the same csv file.

    Args:
    job_name (str): The name of the job to retrieve or create an ID for.
    file_path (str): Path to the CSV file containing job categories
                     (columns 'job_id' and 'job_title').

    Returns:
    int: The job ID.
    """

    job_id_db = pd.read_csv(file_path)

    known_ids = job_id_db.loc[job_id_db['job_title'].eq(job_name), 'job_id']
    if not known_ids.empty:
        return int(known_ids.iloc[0])

    # Next free id: one above the current maximum, so gaps in the file cannot cause a clash
    new_job_id = 0 if job_id_db.empty else int(job_id_db['job_id'].max()) + 1

    # Values are wrapped in lists: a DataFrame cannot be built from scalars alone
    new_data = pd.DataFrame({'job_id': [new_job_id], 'job_title': [job_name]})
    job_id_db = pd.concat([job_id_db, new_data], ignore_index=True)

    # Order them numerically
    job_id_db = job_id_db.sort_values(by=["job_id"], ascending=True, ignore_index=True)

    # Save file back as updated csv file
    job_id_db.to_csv(file_path, index=False)

    # Return the id that was stored, so a later lookup of the same title agrees with it
    return new_job_id
       
def write_dictionary_to_cleansed_layer(measurement_dictionary: dict, job_id: str, output_folder: str, folder: str, word_docx_filename: str):
    """Creates a dataframe object from the dictionary passed and writes it into a csv with the same file name

    The state and report year are taken from the file name, which is expected to look like
    '<title>_<state>_<yyyy-mm-dd>_...'. Nothing is written when the dictionary is empty.

    Args:
    measurement_dictionary (dict): frequency dictionary.
    job_id (int): unique job id.
    output_folder (str): path to the layer the csv is written to.
    folder (str): partition folder inside output_folder; also used as the name of the key column.
    word_docx_filename (str): name of file being used; '.csv' is appended

    """

    file_name_split = word_docx_filename.split('_')
    state = file_name_split[1]
    report_year = file_name_split[2].split('-')[0]

    if not measurement_dictionary:
        return

    # One write is enough: the frame already holds every key of the dictionary
    dataframe = pd.DataFrame(list(measurement_dictionary.items()), columns = [folder, 'frequency'])
    dataframe['job_id'] = job_id  # Adding job_id column
    dataframe['state'] = state  # Adding state column
    dataframe['report_year'] = report_year  # Adding report year column

    # os.path.join keeps the file inside the partition folder on every platform
    dataframe.to_csv(os.path.join(output_folder, folder, word_docx_filename + '.csv'), index=False)
        
def mark_phrases(word_doc_text: str, words_for_marking_desc: list) -> str:
    ''' Function takes multi-word phrases from dictionaries and replaces their spaces with '-' in order to mark them before counting

        Phrases are matched literally and applied in the order given, so callers that need
        longer phrases to win over shorter ones should pass them longest first.

        Args:
        word_doc_text (str): word document converted into a string
        words_for_marking_desc (list): list of phrases that will be marked to count accurately

        Returns:
        word_doc_text (str): word document with marked text '''
    for phrase in words_for_marking_desc:
        # Plain text replacement: characters such as "+" or "." in a phrase must not be
        # interpreted as regular-expression syntax.
        word_doc_text = word_doc_text.replace(phrase, phrase.replace(' ', '-'))

    return word_doc_text

def find_special_characters(s):
    '''Returns s with every character that is not a letter or a number prefixed by a backslash

    The result can be embedded in a regular expression to match s literally.

    Args:
    s (str): text to escape

    Returns:
    str: the escaped text
    '''
    # A single pass escapes each character exactly once; replacing character by character
    # would escape the backslashes added for other characters a second time.
    return re.sub(r'[^a-zA-Z0-9]', lambda match: '\\' + match.group(0), s)

def create_key_glossary_from_dict_shells(dictionary_skeletons_folder: str):
    ''' Loads (or builds and saves) the glossary of dictionary keys and returns the multi-word phrases in it

        The glossary is the file 'dict_key_glossary.csv' inside the folder. When it does not
        exist it is built from the first column of every csv file in the folder and saved.

        Args:
        dictionary_skeletons_folder (str): folder holding the dictionary shell csv files

        Returns:
        words_without_hyphens_desc (list): every glossary key containing '-', with the hyphens
        replaced by spaces, longest first '''
    glossary_path = os.path.join(dictionary_skeletons_folder, 'dict_key_glossary.csv')

    if os.path.exists(glossary_path):
        glossary_db = pd.read_csv(glossary_path)
    else:
        # Collect all keys first and build the frame once instead of concatenating per file
        glossary_keys = []
        for file in sorted(os.listdir(dictionary_skeletons_folder)):
            if file.endswith('.csv'):
                shell_db = pd.read_csv(os.path.join(dictionary_skeletons_folder, file))
                glossary_keys.extend(shell_db.iloc[:, 0].tolist())

        glossary_db = pd.DataFrame({'keys': glossary_keys})
        glossary_db.to_csv(glossary_path, index=False)

    # Non-string cells (e.g. blank rows read as NaN) cannot contain a hyphen; skip them
    list_of_hypend_words = [x for x in glossary_db.iloc[:, 0].tolist() if isinstance(x, str) and "-" in x]
    words_without_hyphens = [word.replace('-', ' ') for word in list_of_hypend_words]
    words_without_hyphens_desc = sorted(words_without_hyphens, key=len, reverse=True)
    return words_without_hyphens_desc



In [66]:
# Curated layer: for every cleansed document, extract experience and salary figures and
# count dictionary keywords, writing one csv per document into each partition folder.
input_folder = os.path.join(current_dir, "cleansed_files")
output_folder = os.path.join(current_dir, "curated_files")
job_category_folder = os.path.join(current_dir, "artifacts", "dependencies", "job_categories.csv")
dictionary_skeletons_folder = os.path.join(current_dir, "artifacts", "dictionary_shells")

# Partition folder for each keyword dictionary, in the same order as the dictionaries below
# (the skills dictionary is written to the 'skillset' partition).
partition_folders = ['education', 'programming_languages', 'personality_traits', 'skillset', 'software', 'security_clearance']

words_for_marking_desc = create_key_glossary_from_dict_shells(dictionary_skeletons_folder)
# List the cleansed folder first so the job titles are derived from the files processed below
list_of_files = os.listdir(input_folder)
unique_job_names = create_cleansed_enviornment(list_of_files, output_folder)


for job_titles in unique_job_names:

    education_dictionary = create_dictionary_from_csv(os.path.join(dictionary_skeletons_folder, "education.csv"))
    prog_lang_dictionary = create_dictionary_from_csv(os.path.join(dictionary_skeletons_folder, "programming_languages.csv"))
    personality_traits_dict = create_dictionary_from_csv(os.path.join(dictionary_skeletons_folder, "personality_traits.csv"))
    skills_dict = create_dictionary_from_csv(os.path.join(dictionary_skeletons_folder, "skills.csv"))
    software_dict = create_dictionary_from_csv(os.path.join(dictionary_skeletons_folder, "software.csv"))
    security_clearance = create_dictionary_from_csv(os.path.join(dictionary_skeletons_folder, "security_clearance.csv"))

    job_id = get_job_id(job_titles, job_category_folder)
    # Compare the title part of the file name exactly; a substring test would also pick up
    # files of any other title that merely contains this one.
    job_specific_files = [x for x in list_of_files if x.split('_')[0] == job_titles]

    for word_docx_filename in job_specific_files:
        print(word_docx_filename)

        # Output files are named after the document without its extension
        # (slicing off four characters would leave the dot of ".docx" behind).
        output_name = os.path.splitext(word_docx_filename)[0]

        document_to_string = read_word(os.path.join(input_folder, word_docx_filename))

        extract_experience_from_paragraph(job_id, output_folder, output_name, document_to_string)

        extract_salary_from_paragraph(job_id, output_folder, output_name, document_to_string)

        # Multi-word phrases are hyphenated in the text before the keys are counted
        document_to_string_v2 = mark_phrases(document_to_string, words_for_marking_desc)

        keyword_dictionaries = [education_dictionary, prog_lang_dictionary, personality_traits_dict, skills_dict, software_dict, security_clearance]
        for folder, dictionaries in zip(partition_folders, keyword_dictionaries):
            for keys in dictionaries:
                regex_key = find_special_characters(keys)
                # A key counts only when surrounded by whitespace, optionally wrapped in punctuation
                value = len(re.findall(r"\.?\s\,?\(?" + regex_key + r"\,?\)?\.?\s", document_to_string_v2))
                dictionaries[keys] = value

            write_dictionary_to_cleansed_layer(dictionaries, job_id, output_folder, folder, output_name)


data analyst_dmv_2024-05-05_page1.docx
data analyst_dmv_2024-05-05_page10.docx
data analyst_dmv_2024-05-05_page11.docx
data analyst_dmv_2024-05-05_page12.docx
data analyst_dmv_2024-05-05_page2.docx
data analyst_dmv_2024-05-05_page3.docx
data analyst_dmv_2024-05-05_page4.docx
data analyst_dmv_2024-05-05_page5.docx
data analyst_dmv_2024-05-05_page6.docx
data analyst_dmv_2024-05-05_page7.docx
data analyst_dmv_2024-05-05_page8.docx
data analyst_dmv_2024-05-05_page9.docx
software engineer_dmv_2024-05-05_page1.docx
software engineer_dmv_2024-05-05_page10.docx
software engineer_dmv_2024-05-05_page11.docx
software engineer_dmv_2024-05-05_page12.docx
software engineer_dmv_2024-05-05_page13.docx
software engineer_dmv_2024-05-05_page14.docx
software engineer_dmv_2024-05-05_page15.docx
software engineer_dmv_2024-05-05_page16.docx
software engineer_dmv_2024-05-05_page17.docx
software engineer_dmv_2024-05-05_page18.docx
software engineer_dmv_2024-05-05_page19.docx
software engineer_dmv_2024-05-05_pa

['cloud infrastructure management',
 'linux system administration',
 'personal finance management',
 'health and safety awareness',
 'online collaboration tools',
 'secret security clearance',
 'quality assurance testing',
 'virtual meeting etiquette',
 'cloud storage and sharing',
 'editing and proofreading',
 'artificial intelligence',
 'social media management',
 'diversity and inclusion',
 'environmental awareness',
 'digital nomadism skills',
 'public trust clearance',
 'mobile app development',
 'computational thinking',
 'performance management',
 'emotional intelligence',
 'stakeholder management',
 'data privacy awareness',
 'azure virtual machines',
 'google cloud functions',
 'microsoft office suite',
 'blockchain technology',
 'e commerce management',
 'regulatory compliance',
 'operations management',
 'written communication',
 'time zone sensitivity',
 'google compute engine',
 'software development',
 'network architecture',
 'statistical analysis',
 'business developmen

# Reporting Layer

In [None]:
# Create (or reuse) a Spark session named 'headstart_pipeline_code' with master "local[1]".
spark = SparkSession.builder.master("local[1]") \
                    .appName('headstart_pipeline_code') \
                    .getOrCreate()

# Session settings: LEGACY time parser policy and the Arrow execution flag.
# NOTE(review): newer Spark releases name the Arrow flag
# spark.sql.execution.arrow.pyspark.enabled - confirm against the installed version.
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
spark.conf.set("spark.sql.execution.arrow.enabled","true")

# Education Reporting Layer

In [61]:
# Folder paths for the reporting layer.
# NOTE(review): s3_input and s3_output are assigned but not used in this cell; the education
# data is read from the hard-coded file 'education_test.csv' instead - confirm this is intended.
s3_input = str(current_dir) + r"\curated_files"
s3_output = str(current_dir) + r"\reporting_layer"
skeletons_mapping_folder = str(current_dir) + r"\artifacts\category_mapping"


# Load the education frequencies and the degree -> category mapping (both with a header row).
education = spark.read.option("header", True).csv('education_test.csv')
education_map = spark.read.option("header", True).csv(skeletons_mapping_folder + r"\education_category_mapping.csv")

# Register both frames as temporary views so they can be joined in SQL.
education.createOrReplaceTempView("education_database") 
education_map.createOrReplaceTempView("edu_map") 

# Sum the frequencies per job and degree category. The LEFT JOIN keeps education rows
# that have no matching degree in the mapping.
mapped_database = spark.sql('''
        SELECT ed.job_id, em.degree_category_id, SUM(ed.frequency) AS frequency
        FROM education_database ed
        LEFT JOIN edu_map em 
            ON ed.education = em.degree 
        GROUP BY em.degree_category_id, ed.job_id
        ORDER BY ed.job_id ASC, em.degree_category_id
    ''')


mapped_database.show()

+------+------------------+---------+
|job_id|degree_category_id|frequency|
+------+------------------+---------+
|     0|  bachelors degree|     53.0|
|     0|        highschool|      0.0|
|     0|    masters degree|     16.0|
|     0|               phd|      0.0|
|     4|  bachelors degree|     30.0|
|     4|        highschool|      1.0|
|     4|    masters degree|     16.0|
|     4|               phd|     12.0|
+------+------------------+---------+



# Job Description Experience In Years For Positions Reporting Layer

In [73]:
# NOTE(review): s3_input and s3_output are assigned but not used in this cell; the data is
# read from the hard-coded file 'experience_test.csv' - confirm this is intended.
s3_input = str(current_dir) + r"\curated_files"
s3_output = str(current_dir) + r"\reporting_layer"

# Load the recorded years of experience (with a header row) and expose them to SQL.
experience_db = spark.read.option("header", True).csv('experience_test.csv')

experience_db.createOrReplaceTempView('experience_database') 

# Count how often each number of years was recorded per job, ordered by job and years.
# NOTE(review): result_db is never displayed in this cell although the saved notebook shows
# a table beneath it - confirm a .show() call was not lost.
result_db = spark.sql(''' SELECT job_id, CAST(years_experience_recorded AS INT) AS years_experience, COUNT(years_experience_recorded) AS frequency
                          FROM experience_database 
                          GROUP BY years_experience_recorded, job_id
                          ORDER BY job_id ASC, years_experience ASC
    ''')


+------+----------------+---------+
|job_id|years_experience|frequency|
+------+----------------+---------+
|     0|               1|        4|
|     0|               2|       11|
|     0|               3|       17|
|     0|               4|        9|
|     0|               5|       15|
|     0|               6|        7|
|     0|               7|        3|
|     0|               8|        8|
|     0|               9|        4|
|     0|              10|        9|
|     0|              11|        3|
|     0|              12|        3|
|     0|              13|        2|
|     0|              14|        3|
|     0|              15|        2|
|     0|              16|        1|
|     1|               3|        1|
|     1|               4|        2|
|     1|               5|        3|
|     1|               6|        1|
|     1|               8|        2|
|     1|               9|        1|
|     1|              10|        2|
|     1|              14|        1|
|     2|               1|   

# Job Description Personality Trait Frequencies For Positions Reporting Layer

In [117]:
# NOTE(review): salary_regex is not assigned in any visible cell; this cell only displays it,
# so it relies on leftover kernel state and fails on a fresh run.
salary_regex

[('000', '002', ''),
 ('$1000', '$5000', ''),
 ('$90k', '150k', 'year'),
 ('$90k', '150k', 'year'),
 ('$140k', '200k', 'year'),
 ('$1000', '$5000', ''),
 ('$140k', '200k', 'year'),
 ('800', '53', ''),
 ('800', '53', ''),
 ('800', '53', ''),
 ('$57,737', '$98,153', '')]

In [118]:
# NOTE(review): neither clean_salary nor salary_regex is defined in any visible cell; this
# call relies on leftover kernel state and fails on a fresh run.
clean_salary(salary_regex)

([90000, 90000, 140000, 140000, 57737],
 [150000, 150000, 200000, 200000, 98153])

In [165]:
import re
from collections import Counter

# Demo: word-frequency count that keeps hyphenated words and words with apostrophes whole.
text = "Hello world! Hello everyone. This is an example to demonstrate how to count word frequencies, including words like co-op and O'Reilly. Hello again."

# Word characters, apostrophes and hyphens all belong to a word
pattern = r'\b[\w\'-]+\b'

# Collect every match in order of appearance
words = [found.group(0) for found in re.finditer(pattern, text)]

# Tally the words
word_frequencies = Counter()
word_frequencies.update(words)

print(word_frequencies)

Counter({'Hello': 3, 'to': 2, 'world': 1, 'everyone': 1, 'This': 1, 'is': 1, 'an': 1, 'example': 1, 'demonstrate': 1, 'how': 1, 'count': 1, 'word': 1, 'frequencies': 1, 'including': 1, 'words': 1, 'like': 1, 'co-op': 1, 'and': 1, "O'Reilly": 1, 'again': 1})


In [168]:
# Display the Counter as a plain dict.
dict(word_frequencies)

{'Hello': 3,
 'world': 1,
 'everyone': 1,
 'This': 1,
 'is': 1,
 'an': 1,
 'example': 1,
 'to': 2,
 'demonstrate': 1,
 'how': 1,
 'count': 1,
 'word': 1,
 'frequencies': 1,
 'including': 1,
 'words': 1,
 'like': 1,
 'co-op': 1,
 'and': 1,
 "O'Reilly": 1,
 'again': 1}

In [13]:
from collections import Counter
import re

# Demo: count phrases longest-first and remove each one from the text once counted, so a
# shorter key ("learning") is not counted again inside a longer one ("machine learning").
words_phrases_dict = dict.fromkeys(["data science", "machine learning", "python", "learning"], 0)
text = "Data science and machine learning are popular fields. Many use python for machine learning.".lower()
sorted_keys = sorted(words_phrases_dict, key=len, reverse=True)

for key in sorted_keys:
    pattern = re.escape(key)
    # subn removes the matches and reports how many there were in one step
    text, hit_count = re.subn(pattern, "", text)
    words_phrases_dict[key] += hit_count

# Optional: whatever is left over is counted word by word
remaining_words = re.findall(r'\b\w+\b', text)
remaining_count = Counter(remaining_words)

print("Specific words/phrases counts:", words_phrases_dict)
print("Remaining words count:", remaining_count)

Specific words/phrases counts: {'data science': 1, 'machine learning': 2, 'python': 1, 'learning': 0}
Remaining words count: Counter({'and': 1, 'are': 1, 'popular': 1, 'fields': 1, 'many': 1, 'use': 1, 'for': 1})
