## Load libraries & Declare global variables

In [1]:
import pandas as pd
from neo4j import GraphDatabase
import csv


JOB_LIST = ["Software Architect", "Business Analyst", "Data Scientist", "Game Development",
            "Database Administrator", "Data Engineer", "Data Analysts","Software Engineer", "Web Development"]
DEFAULT_PATH = "/Users/nguyenvanviet/Work/Courses/DataGen/MyPaperDataSet/"
COMPETENCIES_LIST = ["Knowledge", "Platform", "Framework", "ProgrammingLanguage", "Tool"]
JOB_NAME = "Data Analysts"

## Connect to Neo4J database

In [2]:

uri = "bolt://localhost:7690"  # Adjust the URI based on your Neo4j server configuration
username = "neo4j"
password = "12345678"

# Create a Neo4j driver instance
driver = GraphDatabase.driver(uri, auth=(username, password))
driver

<neo4j._sync.driver.BoltDriver at 0x7fc4a12264f0>

## Query to get Career compentency


**Sample Query**\
\
MATCH (ce:Career {name: 'Data Engineer'})
OPTIONAL MATCH (c)-[:BELONG_TO_CAREER]->(ce)
OPTIONAL MATCH (c)-[:REQUIRE_KNOWLEDGE]->(knowledgeNode)
OPTIONAL MATCH (c)-[:REQUIRE_TOOL]->(toolNode)
OPTIONAL MATCH (c)-[:REQUIRE_FRAMEWORK]->(frameworkNode)
OPTIONAL MATCH (c)-[:REQUIRE_PLATFORM]->(platformNode)
OPTIONAL MATCH (c)-[:REQUIRE_PROGRAMMING_LANGUAGE]->(languageNode)

RETURN  knowledgeNode, toolNode, frameworkNode, platformNode, languageNode;

In [3]:

def capitalizeAfterWhitespace(input_string):
    words = input_string.split()
    capitalized_words = [word.capitalize() for word in words]
    return ' '.join(capitalized_words)



def getCareeerCompentency(careerName):

    # set up the query string
    careerName = capitalizeAfterWhitespace(careerName)
    query = f"MATCH (ce:Career {{name: '{careerName}' }})\
        OPTIONAL MATCH (c)-[:BELONG_TO_CAREER]->(ce)\
        OPTIONAL MATCH (c)-[:REQUIRE_KNOWLEDGE]->(knowledgeNode)\
        OPTIONAL MATCH (c)-[:REQUIRE_TOOL]->(toolNode)\
        OPTIONAL MATCH (c)-[:REQUIRE_FRAMEWORK]->(frameworkNode)\
        OPTIONAL MATCH (c)-[:REQUIRE_PLATFORM]->(platformNode)\
        OPTIONAL MATCH (c)-[:REQUIRE_PROGRAMMING_LANGUAGE]->(languageNode)\
        RETURN  COLLECT(DISTINCT knowledgeNode.knowledge) AS knowledgeProperties, COLLECT(DISTINCT toolNode.tool) AS toolProperties, \
               COLLECT(DISTINCT frameworkNode.framework) AS frameworkProperties, COLLECT(DISTINCT platformNode.platform) AS platformProperties, \
               COLLECT(DISTINCT languageNode.programmingLanguage) AS languageProperties;"
    
    # get the results query to dict
    result_dict = {}

    with driver.session() as session:
        result = session.run(query)
        for record in result:
            result_dict['knowledge'] = record['knowledgeProperties']
            result_dict['tool'] = record['toolProperties']
            result_dict['framework'] = record['frameworkProperties']
            result_dict['platform'] = record['platformProperties']
            result_dict['programmingLanguage'] = record['languageProperties']

    return result_dict




## Get && format compentency data before query

In [4]:
competency = getCareeerCompentency(JOB_NAME)


def fomatedString(sign, attr, st):
    if len(st) < 1: return ''
    formatted_string = '[' + ', '.join(f"'{item}'" for item in st) + ']'
    return sign + '.' + attr + " IN " + formatted_string

kl = "knowledge";           klRelation = "TEACH_KNOWLEDGE"
pl = "programmingLanguage"; plRelation = "TEACH_PROGRAMMING_LANGUAGE"
tl = "tool";                tlRelation = "TEACH_TOOL"
pf = "platform";            pfRelation = "TEACH_PLATFORM"
fw = "framework";           fwRelation = "TEACH_FRAMEWORK"

klCondition = fomatedString("kl", kl, competency[kl])
plCondition = fomatedString("pl", pl, competency[pl])
tlCondition = fomatedString("tl", tl, competency[tl])
pfCondition = fomatedString("pf", pf, competency[pf])
fwCondition = fomatedString("fw", fw, competency[fw])



# Query for course recommendation

### Query neo4j to dataframe

In [5]:
def queryNeo4jToDF(query):
    with driver.session() as session:
        result = session.run(query)

        # Convert result to DataFrame
        columns = result.keys()
        data = [record.values() for record in result]
        df = pd.DataFrame(data, columns=columns)

    return df

### Set Up Query

In [6]:
def toUpperFirstChar(s):
    return s[0].upper() + s[1:]

def queryEachCompe(sign, entity, attr, condition, relationship, outputName):
    if condition == "": condition = "1 = 1"
    return f"MATCH ({sign}:{entity}) where  {condition}\
        OPTIONAL MATCH (rc:Course)<-[:BELONG_TO_COURSE]-(course:FactCourse)-[:{relationship}]-> ({sign})\
        return  rc.name as `Course Name`,\
        rc.link as `Course Link`,\
        toInteger(rc.enroll) as Enroll, rc.rating as Rating,\
        COLLECT({sign}.{attr}) as `{outputName}` order by Enroll desc"


In [7]:
queryUltraProMax =  f"MATCH (pl:ProgrammingLanguage) where  {plCondition}\
    OPTIONAL MATCH (rc:Course)<-[:BELONG_TO_COURSE]-(course:FactCourse)-[:TEACH_PROGRAMMING_LANGUAGE]-> (pl)\
    OPTIONAL MATCH (kl:Knowledge) where  {klCondition}\
    OPTIONAL MATCH (course)-[:TEACH_KNOWLEDGE]-> (kl)\
    OPTIONAL MATCH (fw:Framework) where  {fwCondition}\
    OPTIONAL MATCH (course)-[:TEACH_FRAMEWORK]-> (fw)\
    OPTIONAL MATCH (pf:Platform) where  {pfCondition}\
    OPTIONAL MATCH (course)-[:TEACH_FRAMEWORK]-> (pf)\
    OPTIONAL MATCH (tl:Tool) where  {tlCondition}\
    OPTIONAL MATCH (course)-[:TEACH_FRAMEWORK]-> (tl)\
    return  rc.name as `Course Name`,\
        pl.programmingLanguage as `Programming Language`,  COLLECT(DISTINCT fw.framework) as Framework ,  COLLECT(DISTINCT kl.knowledge) as Knowledge,\
        COLLECT(DISTINCT pf.platform) as Platform ,  COLLECT(DISTINCT tl.tool) as Tool,\
        rc.link as `Course Link`,\
        toInteger(rc.enroll) as Enroll order by Enroll desc"

plOnly = queryEachCompe('pl', toUpperFirstChar(pl), pl, plCondition, plRelation, "Programming Language")
fwOnly = queryEachCompe('fw', toUpperFirstChar(fw), fw, fwCondition, fwRelation, "Framework")
tlOnly = queryEachCompe('tl', toUpperFirstChar(tl), tl, tlCondition, tlRelation, "Tool")
klOnly = queryEachCompe('kl', toUpperFirstChar(kl), kl, klCondition, klRelation, "Knowledge")
pfOnly = queryEachCompe('pf', toUpperFirstChar(pf), pf, pfCondition, pfRelation, "Platform")

print(queryUltraProMax)
print(pfOnly)


MATCH (pl:ProgrammingLanguage) where  pl.programmingLanguage IN ['SQL', 'R', 'PYTHON']    OPTIONAL MATCH (rc:Course)<-[:BELONG_TO_COURSE]-(course:FactCourse)-[:TEACH_PROGRAMMING_LANGUAGE]-> (pl)    OPTIONAL MATCH (kl:Knowledge) where  kl.knowledge IN ['DATA ANALYSIS', 'STATISTIC', 'REPORTING', 'BI', 'DATA WAREHOUSE', 'BIG DATA', 'MACHINE LEARNING', 'DATA VISUALIZATION', 'DATA MODELING', 'DATA MINING']    OPTIONAL MATCH (course)-[:TEACH_KNOWLEDGE]-> (kl)    OPTIONAL MATCH (fw:Framework) where  fw.framework IN ['SPARK', 'PANDAS', 'HADOOP']    OPTIONAL MATCH (course)-[:TEACH_FRAMEWORK]-> (fw)    OPTIONAL MATCH (pf:Platform) where  pf.platform IN ['GOOGLE ANALYTICS', 'AWS']    OPTIONAL MATCH (course)-[:TEACH_FRAMEWORK]-> (pf)    OPTIONAL MATCH (tl:Tool) where  tl.tool IN ['POWER BI', 'TABLEAU', 'SAS']    OPTIONAL MATCH (course)-[:TEACH_FRAMEWORK]-> (tl)    return  rc.name as `Course Name`,        pl.programmingLanguage as `Programming Language`,  COLLECT(DISTINCT fw.framework) as Framework

## Result

In [8]:
res = []

def queryProcess(query):
    df = queryNeo4jToDF(query)
    df = df.dropna(subset=['Course Name'])
    df['Enroll'].fillna(1, inplace=True)
    return df

res.append(queryProcess(pfOnly))
res.append(queryProcess(fwOnly))
res.append(queryProcess(klOnly))
res.append(queryProcess(tlOnly))
res.append(queryProcess(plOnly))

finalResult = res[0]

for i in range(1, len(res)):
    finalResult = finalResult.merge(res[i], how='outer', on=['Course Name', 'Course Link', 'Enroll', 'Rating'])
#finalResult.fillna("", inplace=True)
finalResult.head(10)


Unnamed: 0,Course Name,Course Link,Enroll,Rating,Platform,Framework,Knowledge,Tool,Programming Language
0,Cloud Data Engineering,https://www.coursera.org/learn/cloud-data-engi...,1,4.5,[AWS],,[MACHINE LEARNING],,[PYTHON]
1,Administrador Cloud Computing con Amazon Web S...,https://www.udemy.com/course/administrador-clo...,1,4.0,[AWS],,,,
2,Communicating Data Science Results,https://www.coursera.org/learn/data-results,1,3.6,[AWS],,[BIG DATA],,
3,Free Web Development Tutorial - Learn How to C...,https://www.udemy.com/course/learn-how-to-crea...,1,4.0,[GOOGLE ANALYTICS],,,,
4,Exploring ?and ?Preparing ?your ?Data with Big...,https://www.coursera.org/learn/gcp-exploring-p...,1,4.7,[GOOGLE ANALYTICS],,[BIG DATA],,[SQL]
5,Business Website for Beginners and Solopreneurs,https://www.udemy.com/course/the-complete-busi...,1,3.6,[GOOGLE ANALYTICS],,,,
6,Web Analytics,https://www.udemy.com/course/web-analytics2/,1,4.0,[GOOGLE ANALYTICS],,,,
7,Scalable Machine Learning on Big Data using Ap...,https://www.coursera.org/learn/machine-learnin...,1,3.8,,[SPARK],"[MACHINE LEARNING, BIG DATA]",,"[SQL, PYTHON]"
8,Big Data Modeling and Management Systems,https://www.coursera.org/learn/big-data-manage...,1,4.4,,[SPARK],[BIG DATA],,
9,Machine Learning With Big Data,https://www.coursera.org/learn/big-data-machin...,1,4.6,,[SPARK],"[MACHINE LEARNING, BIG DATA]",,


## Display results

In [9]:
finalResult['Matched'] = 5 - finalResult.isnull().sum(axis=1)


sorted_result = finalResult.sort_values(by=['Matched', 'Enroll', 'Rating'], ascending=[False, False, False])
sorted_result.fillna("-", inplace=True)
sorted_result['Course Link'] = "Click here"
print (len(sorted_result))
sorted_result.head(10)


168


Unnamed: 0,Course Name,Course Link,Enroll,Rating,Platform,Framework,Knowledge,Tool,Programming Language,Matched
101,Introduction to Accounting Data Analytics and ...,Click here,1,4.8,-,-,[BIG DATA],[TABLEAU],"[PYTHON, R]",3
4,Exploring ?and ?Preparing ?your ?Data with Big...,Click here,1,4.7,[GOOGLE ANALYTICS],-,[BIG DATA],-,[SQL],3
16,Big Data Analysis with Scala and Spark,Click here,1,4.7,-,"[SPARK, HADOOP]","[DATA ANALYSIS, BIG DATA]",-,[PYTHON],3
35,How to Win a Data Science Competition: Learn f...,Click here,1,4.7,-,[PANDAS],[MACHINE LEARNING],-,[PYTHON],3
32,Data Analysis Using Python,Click here,1,4.6,-,[PANDAS],"[DATA ANALYSIS, DATA VISUALIZATION]",-,[PYTHON],3
0,Cloud Data Engineering,Click here,1,4.5,[AWS],-,[MACHINE LEARNING],-,[PYTHON],3
26,Spatial Data Science and Applications,Click here,1,4.4,-,[HADOOP],[DATA ANALYSIS],-,[R],3
83,Data Management and Visualization,Click here,1,4.4,-,-,[DATA ANALYSIS],[SAS],[PYTHON],3
11,Data Manipulation at Scale: Systems and Algori...,Click here,1,4.3,-,"[SPARK, HADOOP]","[DATA ANALYSIS, BIG DATA]",-,[SQL],3
31,AI Workflow: Business Priorities and Data Inge...,Click here,1,4.3,-,[PANDAS],[MACHINE LEARNING],-,[PYTHON],3


In [10]:

# Apply formatting to the DataFrame for text alignment and background color
styled_df = sorted_result.head(10).reset_index(drop=True).style.set_table_styles([
    {'selector': 'th', 'props': [('text-align', 'center')]},
    {'selector': 'td', 'props': [('text-align', 'left')]},
    {'selector': 'table', 'props': [('background-color', 'lightgray')]}
])
styled_df.format({'Enroll': '{:,.0f}'.format, }\)
styled_df.format({'Rating': '{:,.1f}'.format})

# Print the formatted DataFrame using head()
styled_df

Unnamed: 0,Course Name,Course Link,Enroll,Rating,Platform,Framework,Knowledge,Tool,Programming Language,Matched
0,Introduction to Accounting Data Analytics and Visualization,Click here,1,4.8,-,-,['BIG DATA'],['TABLEAU'],"['PYTHON', 'R']",3
1,Exploring ?and ?Preparing ?your ?Data with BigQuery,Click here,1,4.7,['GOOGLE ANALYTICS'],-,['BIG DATA'],-,['SQL'],3
2,Big Data Analysis with Scala and Spark,Click here,1,4.7,-,"['SPARK', 'HADOOP']","['DATA ANALYSIS', 'BIG DATA']",-,['PYTHON'],3
3,How to Win a Data Science Competition: Learn from Top Kagglers,Click here,1,4.7,-,['PANDAS'],['MACHINE LEARNING'],-,['PYTHON'],3
4,Data Analysis Using Python,Click here,1,4.6,-,['PANDAS'],"['DATA ANALYSIS', 'DATA VISUALIZATION']",-,['PYTHON'],3
5,Cloud Data Engineering,Click here,1,4.5,['AWS'],-,['MACHINE LEARNING'],-,['PYTHON'],3
6,Spatial Data Science and Applications,Click here,1,4.4,-,['HADOOP'],['DATA ANALYSIS'],-,['R'],3
7,Data Management and Visualization,Click here,1,4.4,-,-,['DATA ANALYSIS'],['SAS'],['PYTHON'],3
8,Data Manipulation at Scale: Systems and Algorithms,Click here,1,4.3,-,"['SPARK', 'HADOOP']","['DATA ANALYSIS', 'BIG DATA']",-,['SQL'],3
9,AI Workflow: Business Priorities and Data Ingestion,Click here,1,4.3,-,['PANDAS'],['MACHINE LEARNING'],-,['PYTHON'],3
