## Import libraries

In [2]:
import csv
import os

import pandas as pd
from neo4j import GraphDatabase


## Define the global variables

In [3]:
# Job titles kept as configuration (not referenced by the cells visible in this notebook).
JOB_LIST = [
    "Software Architect",
    "Business Analyst",
    "Data Scientist",
    "Game Development",
    "Database Administrator",
    "Data Engineer",
    "Data Analysts",
    "Software Engineer",
    "Web Development",
    "Devops Engineer",
    "Network Engineer",
    "UI Designer",
    "Tester",
    "Mobile Developer",
    "Backend Developer",
    "Frontend Developer",
]

# Base directory that output file names are appended to (keep the trailing slash).
DEFAULT_PATH = "/Users/nguyenvanviet/Work/Courses/DataGen/MyPaperDataSet/JobData/"

# Competency types: used as Neo4j node labels in the MATCH queries and as the
# keys of the competency dictionaries.
COMPETENCIES_LIST = [
    "Knowledge",
    "Platform",
    "Framework",
    "ProgrammingLanguage",
    "Tool",
]

# Source web sites (not referenced by the cells visible in this notebook).
WEB_LIST = [
    "Indeed",
    "CareerBuilder",
]

## Connect to Neo4j

In [4]:

# Connection settings are read from the environment so that real credentials do
# not have to be committed with the notebook. The fallbacks are the original
# local-development values, so the cell behaves as before when nothing is set.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7690")  # Adjust the URI based on your Neo4j server configuration
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "12345678")

# Create a Neo4j driver instance (shared by the query helpers defined later)
driver = GraphDatabase.driver(uri, auth=(username, password))
driver


<neo4j._sync.driver.BoltDriver at 0x7f8ed86850a0>

## Convert query to csv

In [8]:
def queryToCsv(query, fileName):
    """Run a Cypher query and write one property of each returned node to CSV.

    Parameters
    ----------
    query : str
        Cypher statement; every record must expose the node under the key ``k``.
    fileName : str
        Competency name. Its lower-cased form (``programmingLanguage`` for
        ``"ProgrammingLanguage"``) is both the property read from each node and
        the CSV column header. The file is written to
        ``Compentencies/<fileName>.csv`` relative to the working directory.

    Returns
    -------
    None
        The result is only written to disk and a confirmation is printed.
    """
    # "ProgrammingLanguage" is the one name whose property is camelCase rather
    # than fully lower-cased.
    if fileName == "ProgrammingLanguage":
        columnName = "programmingLanguage"
    else:
        columnName = fileName.lower()

    # Pull the property out of every record while the session is still open.
    values = []
    with driver.session() as session:
        for record in session.run(query):
            values.append(record['k'][columnName])

    # Single-column frame, saved without the index.
    target = "Compentencies/" + fileName + ".csv"
    pd.DataFrame(values, columns=[columnName]).to_csv(target, index=False, encoding="utf-8")
    print("Query results saved to ", fileName)

def queryToDataFrame(query, key):
    """Run a Cypher query and return the cleaned values of one node property.

    The property read from each returned node ``k`` is the lower-cased ``key``
    (``programmingLanguage`` for ``"ProgrammingLanguage"``). The values are
    stripped of surrounding whitespace, de-duplicated, the junk entry ``'A'``
    and missing values are dropped, and the result is also written to
    ``DEFAULT_PATH + "Competency/" + key + ".csv"``.

    Parameters
    ----------
    query : str
        Cypher statement; every record must expose the node under the key ``k``.
    key : str
        Competency name, used for the property name and the output file name.

    Returns
    -------
    pandas.Series
        The cleaned values, named after the property. The index keeps the
        original row positions of the surviving values.
    """
    columnName = "programmingLanguage" if key == "ProgrammingLanguage" else key.lower()

    with driver.session() as session:
        data = [record['k'][columnName] for record in session.run(query)]

    # dtype=object keeps the raw Python values (including None) untouched.
    values = pd.Series(data, name=columnName, dtype="object")

    # Strip only real strings: a None / NaN property used to raise
    # AttributeError here, before dropna() below ever had a chance to remove
    # it. Series.map is also available on pandas versions without DataFrame.map.
    values = values.map(lambda v: v.strip() if isinstance(v, str) else v)

    # Same cleaning order as before: de-duplicate, turn the nonsense value 'A'
    # into a missing value, then drop everything that is missing.
    values = values.drop_duplicates().replace('A', pd.NA).dropna()

    values.to_csv(DEFAULT_PATH + "Competency/" + key + ".csv", index=False)
    return values

def ListToSet(df):
    """Return the set of whitespace-stripped items obtained by iterating ``df``.

    Every item must provide ``.strip()``; duplicates (after stripping) collapse
    into a single entry.
    """
    return {item.strip() for item in df}


## detect the competency entities here
def findCompetency(df, search_string):
    """Find which known competency values are mentioned in a piece of text.

    A value counts as mentioned when it appears in the text delimited on both
    sides by a space, a comma or a period; the start and the end of the text
    count as delimiters too. Matching is case-insensitive.

    Parameters
    ----------
    df : mapping
        Maps every name in ``COMPETENCIES_LIST`` to an iterable of candidate
        values (for example the ``Competencies`` dictionary of lists).
    search_string : str
        Free text to scan, e.g. a skill requirement or a job description.

    Returns
    -------
    dict
        One entry per name in ``COMPETENCIES_LIST`` holding the list of values
        found, in the order they appear in ``df``. Types without any match map
        to an empty list.
    """
    delimiters = " ,."

    # Pad with spaces so a value at the very beginning or end of the text is
    # bounded by a delimiter as well; without the padding such values were
    # silently missed.
    haystack = " " + search_string.upper() + " "

    matching_values_dict = {}

    for column in COMPETENCIES_LIST:
        matches = []

        for value in df[column]:
            # Blank entries would match any delimiter pair, and non-text
            # entries (e.g. NaN) cannot be concatenated: skip both.
            if not isinstance(value, str) or value == '':
                continue

            # The text is upper-cased, so the candidate has to be as well;
            # the value is still reported exactly as stored.
            needle = value.upper()
            if any(before + needle + after in haystack
                   for before in delimiters
                   for after in delimiters):
                matches.append(value)

        matching_values_dict[column] = matches

    return matching_values_dict


## Main

In [9]:
# One list of cleaned values per competency type, keyed by the type name.
Competencies = dict(Knowledge=[], Platform=[], Framework=[], ProgrammingLanguage=[], Tool=[])

# Each competency type doubles as the node label to query; keep the cleaned
# values as a plain Python list.
for comp in COMPETENCIES_LIST:
    query = f"MATCH (k: {comp}) RETURN k"
    Competencies[comp] = queryToDataFrame(query, comp).tolist()

# Show what was collected for every competency type.
for comp in COMPETENCIES_LIST:
    print(Competencies[comp])


## Reading course data

In [37]:
# Load the course data set that the following cells annotate.
courseDf = pd.read_csv(
    "/Users/nguyenvanviet/Work/Courses/DataGen/MyPaperDataSet/courseFinalVer2.csv"
)


## Process the course data, then rewrite it to CSV

In [44]:
# Annotate every course with the competencies mentioned in its skill requirements
for index, row in courseDf.iterrows():
    skill_requirement = row['skillrequirement']

    if pd.isna(skill_requirement):
        # No requirement text: every Need* column ends up as an empty string
        competency_values = {comp: [] for comp in COMPETENCIES_LIST}
    else:
        competency_values = findCompetency(Competencies, skill_requirement)

    # One "Need<Competency>" column per type, matched values joined by commas
    for competency, values in competency_values.items():
        courseDf.at[index, "Need"+competency] = ','.join(values)

courseDf.head(5)


Unnamed: 0,Name,Link,Rating,Enroll,Instructor,Time,Level,skillrequirement,SkillWillLearn,Description,...,Practice Task,Unnamed: 24,Unnamed: 25,WebName,WebURL,NeedKnowledge,NeedPlatform,NeedFramework,NeedProgrammingLanguage,NeedTool
0,Bacula 1: the open source backup software,https://www.udemy.com/course/bacula-backup-sof...,4.2,697.0,Heitor Faria,8 hours on-demand video,,2 virtual or physical Linux machines for Bacul...,"Make the student ready to fully plan, install,...",The course presents the theory and practice of...,...,"run and after backup scripts, installing and c...",,,udemy,https://www.udemy.com,,"LINUX,WINDOWS,BACULA",LINUX,,"LINUX,WINDOWS"
1,Learning Android Apps Development from Scratch,https://www.udemy.com/course/learning-android-...,4.0,,Sarah Alenzei,2.5 hours on-demand video,,No Previous programming skills required,Develop Android App. Publish apps at google pl...,Start your Android Development journey here le...,...,"publish apps, publish their own apps, creating...",,,udemy,https://www.udemy.com,PROGRAMMING,,,,
2,Free Android Development Tutorial - How to pub...,https://www.udemy.com/course/how-to-publish-an...,4.5,55646.0,Cristian Gradisteanu,32min,,,,Ever wanted to know what are the required step...,...,"update your previous published app, generate a...",,,udemy,https://www.udemy.com,,,,,
3,Bentley STAAD Pro - Structural Analysis & Desi...,https://www.udemy.com/course/staadpro-3d-struc...,4.2,1055.0,SS eAcademy .,6 hours on-demand video,,Basic knowledge of Structural Analysis and Des...,Introduction of STAAD Pro and Structural Engin...,STAAD Pro stands for Structural Analysis and d...,...,"apply different type of load, modify beam and ...",,,udemy,https://www.udemy.com,"ANALYSIS,SOFTWARE",,,,"STRUCTURAL ANALYSIS AND DESIGN,STAAD PRO,AUTOC..."
4,Software Testing/ QA: Learn Basic Testing with...,https://www.udemy.com/course/software-qa-testi...,3.6,3138.0,Syam Mohan,2 hours on-demand video,,"Nothing special, Any one can join!",Writing test cases. Strong testing skills. Goo...,NOTE: This is a software testing beginner cour...,...,"writing test cases, bug tracking, live debug",,,udemy,https://www.udemy.com,,,,,


## Clean the NaN values

In [46]:
# Work on a copy in which every missing value is an empty string; courseDf
# itself is left unchanged.
courseDfCleaner = courseDf.fillna(value='')
courseDfCleaner.head(n=5)


Unnamed: 0,Name,Link,Rating,Enroll,Instructor,Time,Level,skillrequirement,SkillWillLearn,Description,...,Practice Task,Unnamed: 24,Unnamed: 25,WebName,WebURL,NeedKnowledge,NeedPlatform,NeedFramework,NeedProgrammingLanguage,NeedTool
0,Bacula 1: the open source backup software,https://www.udemy.com/course/bacula-backup-sof...,4.2,697.0,Heitor Faria,8 hours on-demand video,,2 virtual or physical Linux machines for Bacul...,"Make the student ready to fully plan, install,...",The course presents the theory and practice of...,...,"run and after backup scripts, installing and c...",,,udemy,https://www.udemy.com,,"LINUX,WINDOWS,BACULA",LINUX,,"LINUX,WINDOWS"
1,Learning Android Apps Development from Scratch,https://www.udemy.com/course/learning-android-...,4.0,,Sarah Alenzei,2.5 hours on-demand video,,No Previous programming skills required,Develop Android App. Publish apps at google pl...,Start your Android Development journey here le...,...,"publish apps, publish their own apps, creating...",,,udemy,https://www.udemy.com,PROGRAMMING,,,,
2,Free Android Development Tutorial - How to pub...,https://www.udemy.com/course/how-to-publish-an...,4.5,55646.0,Cristian Gradisteanu,32min,,,,Ever wanted to know what are the required step...,...,"update your previous published app, generate a...",,,udemy,https://www.udemy.com,,,,,
3,Bentley STAAD Pro - Structural Analysis & Desi...,https://www.udemy.com/course/staadpro-3d-struc...,4.2,1055.0,SS eAcademy .,6 hours on-demand video,,Basic knowledge of Structural Analysis and Des...,Introduction of STAAD Pro and Structural Engin...,STAAD Pro stands for Structural Analysis and d...,...,"apply different type of load, modify beam and ...",,,udemy,https://www.udemy.com,"ANALYSIS,SOFTWARE",,,,"STRUCTURAL ANALYSIS AND DESIGN,STAAD PRO,AUTOC..."
4,Software Testing/ QA: Learn Basic Testing with...,https://www.udemy.com/course/software-qa-testi...,3.6,3138.0,Syam Mohan,2 hours on-demand video,,"Nothing special, Any one can join!",Writing test cases. Strong testing skills. Goo...,NOTE: This is a software testing beginner cour...,...,"writing test cases, bug tracking, live debug",,,udemy,https://www.udemy.com,,,,,


## save to csv file

In [33]:
# Save the cleaned DataFrame to a new CSV file. The file is named
# "courseCleaner", so it must receive courseDfCleaner (NaN replaced by '')
# rather than the uncleaned courseDf that was written here before.
output_file_path = DEFAULT_PATH + 'courseCleaner.csv'  # Replace with the desired output file path
courseDfCleaner.to_csv(output_file_path, index=False)

## test area

In [45]:
# Sample job-posting texts for spot-checking findCompetency by hand; of the three, only desc3 is passed to it in this cell.
desc = "Are you a Business Analyst who wants to help make a difference conceptualizing the business needs for MilitaryChildcare.com and telework 100%? Then we have a great opportunity for you! As a Business Analyst you will: Work with a team of driven, supportive, and highly skilled professionals. Receive a robust benefits package that includes our Employee Stock Ownership Plan! (ESOP). Enjoy flexibility managing your work hours and personal needs with a single accrual leave plan. A week in the life of a Business Analyst: Translate high-level business needs into detailed requirements for new capabilities and enhancements to existing capabilities. Write comprehensive requirement documents resulting in a detailed and complete understanding of project deliverables. Work with customers, software engineers, architects, and other team members to capture business needs and drive quality solutions. Participate in system design and review test cases to ensure requirements are being met and addresses all impacted areas of the system. Perform functional validation to ensure the system aligns with requirements. Create wireframes and workflows using Figma, SnagIt, Excel, and Visio. Adhere to compliance standards and ensure all necessary approvals have been obtained throughout the project lifecycle. Support Tier 2 Support Desk working with Tier 1 Support Desk and technical team to triage issues and communicate timelines and status. Provide input to Weekly and Monthly Status Report(s). Other Business Analyst Team duties as required Job Requirements Bachelor’s degree 5 - 7 years of related experience US Citizenship required Understanding of analysis techniques Familiarity with Agile development methodology Experience with Figma, SnagIt, Microsoft Office tools, Visio Founded in 1975, AMERICAN SYSTEMS is one of the largest employee-owned companies in the United States. 
We are a government services contractor focused on delivering Strategic Solutions to complex national priority programs with 100+ locations worldwide. Through our focus on quality, strong cultural beliefs, and innovation we deliver excellence every day. Company Awards: Forbes National Best Midsize Companies Energage National Best Workplaces, National Washington Post Best Workplaces Veteran Hiring Awards: U.S. Department of Labor Hire Vets Medallion BEST FOR VETS by Military Times TOP 10 MILITARY FRIENDLY COMPANY by MilitaryFriendly.com AMERICAN SYSTEMS is committed to pay transparency for our applicants and employee-owners. The salary range for this position is $70,000 – 90,000. Actual compensation will be determined based on several factors permitted by law. AMERICAN SYSTEMS provides for the welfare of its employees and their dependents through a comprehensive benefits program by offering healthcare benefits, paid leave, retirement plans (including ESOP and 401k), insurance programs, and education and training assistance. EOE Minorities/Women/Disabled/Veterans/Gender Identity/Sexual Orientation"
test2 = "MathWorks has a hybrid work model that enables staff members to split their time between office and home. The hybrid model provides the advantage of having both in-person time with colleagues and flexible at-home life optimizations. Learn More: https://www.mathworks.com/company/jobs/resources/applying-and-interviewing.html#onboarding. Working under the direction of the Manager or Senior Team Lead, will be responsible for designing and developing sophisticated software for small self-contained projects; designing and implementing moderately-complex software features and components of control systems software; writing high quality code; designing extensible software using software engineering principles, patterns and methodologies; executing performance monitoring and integration; participating in software development projects of moderate scope, from requirements gathering and design to implementation, qualification and validation; assisting cross-functional team members in engineering, quality engineering, user experience, and documentation; increasing productivity and bringing quality features to market; and working with subject matter experts in pursuit of continuous improvement of designs and strategies. MathWorks nurtures growth, appreciates diversity, encourages initiative, values teamwork, shares success, and rewards excellence. Minimum Qualifications: Education and Experience: Master’s degree in Engineering, Computer Science, or a closely related field (or foreign education equivalent) and no experience. OR Bachelor’s degree in Engineering, Computer Science, or a closely related field (or foreign education equivalent) and three (3) years of experience as a Software Engineer (or related occupation) performing development, testing or technical support of control design automation toolboxes. Special Requirements: Demonstrated expertise in development, testing or technical support of controls-based simulation software. 
Demonstrated expertise in object-oriented design and analysis, including writing algorithms, applying design patterns, and programming in C++ and MATLAB or Simulink. Demonstrated expertise writing algorithms and data structures using modern C++ standard libraries - smart pointers, move semantics, templates and lambda functions - and using C++ Standard Template Library for code optimization. Demonstrated expertise in the full software development life cycle (SDLC), including functional design, architecture design, implementation, and testing, according to scrum-based Agile methodologies. [Expertise may be gained during Graduate program.]"
desc3 = "SKILL AND EXPERIENCE 2 years of experience in similar roles. Experience with SQL Databases, particularly Oracle and PostgreSQL (PostgreSQL is a plus). Proficiency in Python, Airflow, Docker, MinIO, Crawler libraries, Data Dictionary Platforms, etc. Strong analytical skills, logical thinking, problem-solving abilities, and effective communication. Adaptability with various programming languages. Nice to have experience with OTP/Captcha. Familiarity with encryption algorithms and hash codes such as SHA256. Additional experience with Java, JavaScript, Spark, Kafka, RabbitMQ is a plus. Strong data analysis skills and the ability to derive meaningful insights from data. Capacity to work well under high-pressure conditions. Demonstrated teamwork, independence, a can-do attitude, and self-study skills."

# Spot-check the matcher on the third sample text: for each competency type,
# print its name followed by the values that were found.
result = findCompetency(Competencies, desc3)
for col in COMPETENCIES_LIST:
    print(col + ": ", result[col], sep="\n")

## separate the competencies into single components
## remove the NaN values and duplicates

Knowledge: 
['ALGORITHMS', 'DATA ANALYSIS', 'DATABASES', 'PROGRAMMING', 'SQL', 'ANALYSIS', 'SPARK']
Platform: 
['KAFKA', 'AIRFLOW', 'SPARK', 'DATA ANALYSIS']
Framework: 
['SPARK', 'DATA ANALYSIS', 'POSTGRESQL']
ProgrammingLanguage: 
['SQL', 'PYTHON', 'JAVA', 'JAVASCRIPT', 'DOCKER', 'KAFKA', 'SPARK']
Tool: 
['POSTGRESQL', 'DOCKER', 'ORACLE', 'SQL', 'DATA ANALYSIS', 'RABBITMQ', 'DATABASES', 'ANALYSIS']
