## Declare the global variable

In [81]:
JOB_LIST = ["Software Architect", "Business Analyst", "Data Scientist", "Game Development",
            "Database Administrator", "Data Engineer", "Data Analysts","Software Engineer", "Web Development",
            "Devops Engineer", "Network Engineer", "UI Designer", "Tester", "Mobile Developer", "Backend Developer", "Frontend Developer"]
DEFAULT_PATH = "/Users/nguyenvanviet/Work/Courses/DataGen/MyPaperDataSet/JobData/"
COMPETENCIES_LIST = ["Knowledge", "Platform", "Framework", "ProgrammingLanguage", "Tool"]
WEB_LIST = ["Indeed", "CareerBuilder", "Glint"]

## Import libs

In [82]:
import os
import shutil
import pandas as pd
from neo4j import GraphDatabase
import sparknlp as sp
import csv
from datetime import datetime


## pre processing the folder

In [83]:

def process_folders(main_folder, year):

    for fold in WEB_LIST:
        main_folder_path = main_folder + "Job" + year + "Data/" + fold + "/"
        stack = [main_folder_path]

        i = 0
        while stack:
            current_folder = stack.pop()

            for item in os.listdir(current_folder):
                item_path = os.path.join(current_folder, item)

                if os.path.isdir(item_path):
                    stack.append(item_path)
                else:
                    if item.endswith(".csv"):
                        # Rename the CSV file with its folder name
                        new_name = os.path.basename(current_folder) + ".csv"

                        current_path = os.path.join(current_folder, new_name)
                        # Move the renamed CSV file out of the folder
                        new_location = main_folder_path  + new_name
                        shutil.move(item_path, new_location)
            if (current_folder != main_folder_path):
                shutil.rmtree(current_folder)

#process_folders(DEFAULT_PATH, "2024")


## Get data from the csv file, merge them into a single file

In [84]:
spark = sp.start()



In [85]:
DataList = {}

for job in JOB_LIST:
    for web in WEB_LIST:
        df = spark.read.csv(DEFAULT_PATH + "Job2024Data/" + web + "/" + job + ".csv", header=True, inferSchema=True)
        DataList[web + "+" + job] = df
len(DataList)


                                                                                

48

## Format the date

In [86]:
from datetime import datetime

def convert_date_format(date_str):
    # Try to parse the date string
    try:
        date_obj = datetime.strptime(date_str, '%d-%m-%Y')
    except ValueError:
        # If parsing fails, try the alternative format
        date_obj = datetime.strptime(date_str, '%Y-%m-%d')
    
    # Format the date object into 'dd-mm-yyyy'
    formatted_date = date_obj.strftime('%d-%m-%Y')
    return formatted_date

## Detect the entities

In [87]:
def toRow(data, web, jobType):
    res = []
    res.append(data["companyName"])
    res.append(data["jobName"])
    res.append(data["location"])
    res.append(data["require"])
    res.append(data["salary"])
    res.append(convert_date_format(str(data["timePost"])))
    res.append(web)
    res.append(jobType)
    
    return res

def checkRelatedWord(keyword, name):
    words_a = keyword.split()  
    for word in words_a:
        if word in name:
            return True
    
    return False



def dataToCsv(fileName, header):
    with open(fileName, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(header)

        for job in JOB_LIST:
            for web in WEB_LIST:
                jobPost = DataList[web + "+" + job].toPandas()
                for _, row in jobPost.iterrows():
                    if checkRelatedWord(job.upper(), row['jobName'].upper()) is False:
                        continue  # skip none related row
                    writedRow = toRow(row, web, job)
                    csv_writer.writerow(writedRow)
                print (f"{job}  -  {web}  -   Done", end='\r')
          

In [88]:
# Specify the file path
fileName = DEFAULT_PATH + 'jobPost2024.csv'
header = ["companyName", "jobName", "location", "requirement", "salary", "timePost", "website", "jobType"]

# Writing to CSV file line by line
dataToCsv(fileName, header)
spark.stop()

Frontend Developer  -  Glint  -   Done-   DoneDone

In [89]:
def detectCompetency(df, search_string: str):
    search_string = search_string.upper()
    matching_values_dict = {}

    for column in COMPETENCIES_LIST:
        matching_values_dict[column] = []
        if len(df[column]) < 1:
            continue

        for value in df[column]:
            if value == '':
                continue

            idx1 = search_string.find(" " + value + " ")
            idx2 = search_string.find(" " + value + ",")
            idx3 = search_string.find("," + value + ",")
            idx4 = search_string.find("," + value + " ")
            idx5 = search_string.find("." + value + " ")
            idx6 = search_string.find("." + value + ",")
            idx7 = search_string.find("." + value + ".")
            idx8 = search_string.find("," + value + ".")
            idx9 = search_string.find(" " + value + ".")
            final = max(idx1, idx2, idx3, idx4, idx5, idx6, idx7, idx8, idx9)

            if final >= 0:
                matching_values_dict[column].append(value)

    return matching_values_dict

In [104]:
unifiedJobData = pd.read_csv(fileName)

Competencies = {"Knowledge": [], "Platform": [], "Framework":[], "ProgrammingLanguage":[], "Tool":[]}

for comp in COMPETENCIES_LIST:
    temp = pd.read_csv(DEFAULT_PATH + "Competency/" + comp + ".csv")
    Competencies[comp] = temp[comp[:1].lower() + comp[1:]].dropna().to_list()

def detectCompetencyFromJobData(jobData, Competencies):
    i = 0
    n = len(jobData)
    for index, row in jobData.iterrows():
        skill_requirement = row['requirement']
        i = i + 1
        print(f"Loading process {i/n * 100} +-*/%", end='\r')
        
        if pd.isna(skill_requirement) or len(skill_requirement) < 2:
            for comp in COMPETENCIES_LIST:
                jobData.at[index, comp] = pd.NA
            continue    
        competency_values = detectCompetency(Competencies, skill_requirement)
            
        #Add competency values as new columns
        for competency, values in competency_values.items():
            jobData.at[index, competency] = ','.join(values)

        
        

unifiedJobData.head(5)


Unnamed: 0,companyName,jobName,location,requirement,salary,timePost,website,jobType
0,CSX,Software Architect I | Messaging & Data Stream...,Remote in United States,Job Summary CSX Technology is transforming the...,Competitive,14-01-2024,Indeed,Software Architect
1,EX2 Outcoding,Software Architect,Remote,Become an Outcoder as a Software Architect We ...,Competitive,14-01-2024,Indeed,Software Architect
2,Innominds Software,Software Architect,"San Jose, CA 95131 (North San Jose area)","Job duration: 40 Hours / Week, Permanent posit...",Competitive,14-01-2024,Indeed,Software Architect
3,"Karsun Solutions, LLC",Software Design Architect,Remote,Overview: The Software Design Architect is res...,Competitive,14-01-2024,Indeed,Software Architect
4,INTEL,Cloud Software Architect SW-Defined Storage,Remote,"Job Description Architects, designs and provid...",Competitive,14-01-2024,Indeed,Software Architect


In [110]:
detectCompetencyFromJobData(unifiedJobData, Competencies)
unifiedJobData = unifiedJobData.fillna("")
unifiedJobData.head(5)

Loading process 100.0 +-*/%382887 +-*/%%%

Unnamed: 0,companyName,jobName,location,requirement,salary,timePost,website,jobType,Knowledge,Platform,Framework,ProgrammingLanguage,Tool
0,CSX,Software Architect I | Messaging & Data Stream...,Remote in United States,Job Summary CSX Technology is transforming the...,Competitive,14-01-2024,Indeed,Software Architect,"AGILE,DATABASE,DEVOPS,SCRUM,DATABASE DESIGN,RE...","KUBERNETES,AZURE,WEB DEVELOPMENT,SPRING,CLOUD","SPRING,JUNIT,UNIT,DATABASE,EXCEL","JAVA,AZURE,KUBERNETES","EXCEL,KUBERNETES,GRAFANA,DEVOPS,SOFTWARE DEVEL..."
1,EX2 Outcoding,Software Architect,Remote,Become an Outcoder as a Software Architect We ...,Competitive,14-01-2024,Indeed,Software Architect,"SOLID,COMPUTER SCIENCE,FUNCTIONS,BUSINESS,MODE...",,.NET,"JAVA,JAVASCRIPT,UML","SOFTWARE DEVELOPMENT,SOFTWARE DESIGN,SOLID,UML..."
2,Innominds Software,Software Architect,"San Jose, CA 95131 (North San Jose area)","Job duration: 40 Hours / Week, Permanent posit...",Competitive,14-01-2024,Indeed,Software Architect,"AGILE,DATABASE,UI,DATABASE DESIGN,MICROSERVICE...","KUBERNETES,AWS,KAFKA,APACHE KAFKA,APACHE,API,M...","UNIT,MYSQL,POSTMAN,REQUESTS,DATABASE,GITLAB","MYSQL,AWS,DOCKER,KUBERNETES,APACHE,KAFKA","MYSQL,DOCKER,POSTMAN,KUBERNETES,ORACLE,GITLAB,..."
3,"Karsun Solutions, LLC",Software Design Architect,Remote,Overview: The Software Design Architect is res...,Competitive,14-01-2024,Indeed,Software Architect,"AGILE,MICROSERVICES,PROTOTYPE,DATABASES,APIS,S...","AWS,NODEJS,REACT,CLOUD",REACT,"SQL,UML,AWS","MICROSERVICES,COMPUTING,NOSQL,SQL,SERVERLESS,N..."
4,INTEL,Cloud Software Architect SW-Defined Storage,Remote,"Job Description Architects, designs and provid...",Competitive,14-01-2024,Indeed,Software Architect,"AGILE,DATABASE,DEVOPS,SECURITY,MICROSERVICES,C...","GCP,AZURE,API,CLOUD","PYTORCH,DATABASE","AZURE,GCP","MICROSERVICES,DEVOPS,API,AI,DATABASE,SECURITY,..."


In [111]:
unifiedJobData = unifiedJobData.fillna("")
fullJobData = unifiedJobData.groupby(['companyName', 'location', 'timePost', 'website', 'jobType']).agg({
    'jobName': 'count',
    'Knowledge': lambda x: ','.join(x),
    'Platform': lambda x: ','.join(x),
    'Framework': lambda x: ','.join(x),
    'ProgrammingLanguage': lambda x: ','.join(x),
    'Tool': lambda x: ','.join(x)
}).reset_index()
fullJobData = fullJobData.rename(columns={'jobName': 'jobCount'})

# Convert the 'timePost' column to datetime format
fullJobData['timePost'] = pd.to_datetime(fullJobData['timePost'], format='%d-%m-%Y')

# Extract the date, month, and year into separate columns
fullJobData['date'] = fullJobData['timePost'].dt.day
fullJobData['month'] = fullJobData['timePost'].dt.month
fullJobData['year'] = fullJobData['timePost'].dt.year
fullJobData = fullJobData.replace("", pd.NA)
fullJobData.head(5)

Unnamed: 0,companyName,location,timePost,website,jobType,jobCount,Knowledge,Platform,Framework,ProgrammingLanguage,Tool,date,month,year
0,AAA Minneapolis,"Saint Louis Park, MN 55416 (Wolfe Park area)",2024-01-14,Indeed,Database Administrator,3,",,BUSINESS INTELLIGENCE,DATA WAREHOUSE,DATA VI...",",,GOOGLE ANALYTICS,TABLEAU,DATA ANALYSIS,GOOGL...",",,SSIS,DATA ANALYSIS,DATA VISUALIZATION,DATABA...",",,SQL,GOOGLE DATA STUDIO",",,EXCEL,TABLEAU,SSIS,POWER BI,KPIS,BI,DATABASE...",14,1,2024
1,ACB - Ngân Hàng TMCP Á Châu,Hồ Chí Minh,2024-01-08,CareerBuilder,Business Analyst,1,"DATABASE,PROGRAMMING,COMPUTER SCIENCE,BUSINESS...",,DATABASE,SQL,"DATABASE,SQL,VIETNAMESE,ENGLISH,DATABASE SCHEM...",8,1,2024
2,ADPMN IT SOLUTIONS,Georgia,2024-01-14,Indeed,Database Administrator,1,,,,,,14,1,2024
3,"ADS, Inc.","Austin, TX",2024-01-14,Indeed,Frontend Developer,1,"COLOR,WEB DEVELOPMENT,WEB DESIGN,MOBILE APPS,D...","WEB DEVELOPMENT,REACT",REACT,"SQL,PYTHON,JAVASCRIPT,HTML,CSS,JS","ADOBE ILLUSTRATOR,WEB DEVELOPMENT,SQL,WEB DESI...",14,1,2024
4,"ADS, Inc.","Austin, TX",2024-01-14,Indeed,Mobile Developer,1,"COLOR,WEB DEVELOPMENT,WEB DESIGN,MOBILE APPS,D...","WEB DEVELOPMENT,REACT",REACT,"SQL,PYTHON,JAVASCRIPT,HTML,CSS,JS","ADOBE ILLUSTRATOR,WEB DEVELOPMENT,SQL,WEB DESI...",14,1,2024


## Convert clean data to csv

In [112]:
fullJobData.to_csv(DEFAULT_PATH + "job2024clean.csv", index=False)