## Import libraries

In [1]:
import ast
import csv
import os

import pandas as pd
from neo4j import GraphDatabase



## Define the global variables

In [2]:
# Career titles covered by the data set.
# NOTE(review): not referenced by any of the cells visible here - confirm it is still needed.
JOB_LIST = ["Software Architect", "Business Analyst", "Data Scientist", "Game Development",
            "Database Administrator", "Data Engineer", "Data Analysts","Software Engineer", "Web Development",
            "Devops Engineer", "Network Engineer", "UI Designer", "Tester", "Mobile Developer", "Backend Developer", "Frontend Developer"]
# Output directory for every exported CSV. File names are appended with plain string
# concatenation, so the trailing slash is required.
# NOTE(review): machine-specific absolute path - the notebook will not run elsewhere without editing it.
DEFAULT_PATH = "/Users/nguyenvanviet/Work/Courses/DataGen/RecommandationCourseWeb-Neo4j/MyPaperDataSet/ETLCode/GDWData/"
# Names matching the competency node labels used in the Cypher queries below.
COMPETENCIES_LIST = ["Knowledge", "Platform", "Framework", "ProgrammingLanguage", "Tool"]
# Presumably the job-board websites the postings were collected from - TODO confirm.
WEB_LIST = ["Indeed", "CareerBuilder"]

## Connect to Neo4j

In [3]:

# Connection settings are read from the environment so credentials do not have to live
# in the notebook. The fallbacks are the previous hard-coded values, so an unconfigured
# local setup keeps working unchanged.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")  # Adjust the URI based on your Neo4j server configuration
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "12345678")

# Create a Neo4j driver instance shared by every query helper in this notebook
driver = GraphDatabase.driver(uri, auth=(username, password))
driver


<neo4j._sync.driver.BoltDriver at 0x7f8170d78430>

## Convert query to pandas dataframe

In [4]:
def queryToDataFrame(query):
    """Run a Cypher query and return all of its records as a pandas DataFrame.

    Args:
        query: Cypher statement passed as-is to ``session.run``.

    Returns:
        A DataFrame with one row per returned record; the column names are the
        keys reported by the query result.
    """
    with driver.session() as neo4jSession:
        cursor = neo4jSession.run(query)
        header = cursor.keys()
        # Pull every record while the session is still open.
        rows = list(cursor)

    frame = pd.DataFrame(rows, columns=header)
    return frame



---

# Get data area
---

In [5]:
def queryToCsv(query, fileName):
    """Run a Cypher query, save the result as CSV under DEFAULT_PATH and return it.

    Args:
        query: Cypher statement to execute.
        fileName: Path of the CSV file relative to DEFAULT_PATH (it is appended to
            DEFAULT_PATH by plain string concatenation).

    Returns:
        The query result as a pandas DataFrame (the same data that was written).
    """
    targetPath = DEFAULT_PATH + fileName
    # Reuse the shared helper instead of duplicating the session/record handling.
    df = queryToDataFrame(query)
    df.to_csv(targetPath, index=False)
    return df


### Get max, min Id
def getMinMaxIdOfEntity(entityName):
    """Return the internal node-ID range of a label, padded by one on each side.

    Args:
        entityName: Node label. It is interpolated directly into the Cypher text,
            so only pass trusted, hard-coded label names.

    Returns:
        A tuple ``(minId - 1, maxId + 1)`` where minId/maxId are the smallest and
        largest ``id(n)`` among nodes carrying the label.

    Raises:
        ValueError: If no node with the given label exists.
    """
    query = f"match(n:{entityName}) return min(id(n)) as minId, max(id(n)) as maxId"
    bounds = queryToDataFrame(query)
    # With no matching nodes the aggregates come back as null; fail with a clear
    # message instead of an opaque TypeError on `None - 1`.
    if bounds.empty or pd.isna(bounds["minId"][0]) or pd.isna(bounds["maxId"][0]):
        raise ValueError(f"No nodes found with label {entityName!r}")
    maxId = bounds["maxId"][0]
    minId = bounds["minId"][0]
    return minId - 1, maxId + 1

In [18]:

def parse_knowledge(knowledge_str):
    """Turn the CSV representation of a knowledge list back into a Python list.

    Args:
        knowledge_str: Usually the string form of a list (e.g. "['SQL', 'ETL']").
            Already-parsed lists/tuples and missing values (None/NaN) are accepted too.

    Returns:
        The parsed value (a list for list literals). Missing or unparsable input
        yields an empty list.
    """
    # Already a sequence: nothing to parse, and literal_eval would reject it.
    if isinstance(knowledge_str, (list, tuple)):
        return list(knowledge_str)

    # Missing cell (None, or the float NaN pandas uses for empty CSV fields):
    # an empty list is the expected result, not an error worth reporting.
    if knowledge_str is None or (isinstance(knowledge_str, float) and knowledge_str != knowledge_str):
        return []

    try:
        # Convert the string representation of the list to an actual list
        return ast.literal_eval(knowledge_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
        # Best-effort parsing: report the malformed value and keep going.
        print(f"Error parsing knowledge column: {e}")
        return []
    

def separateKnowledge(fileName):
    """Explode the list-valued 'Knowledge' column of a CSV into one row per item.

    The file is read, its 'Knowledge' column is parsed with ``parse_knowledge`` and
    exploded, and the result is written back to the same path (overwriting it).

    Args:
        fileName: Full path of the CSV file to rewrite in place.
    """
    df = pd.read_csv(fileName)

    def _parse_if_serialized(value):
        # Only serialized lists need parsing. Plain values (a file that was already
        # exploded) and missing cells are kept as they are, so running this twice
        # on the same file no longer wipes the column.
        if isinstance(value, str) and value.lstrip().startswith("["):
            return parse_knowledge(value)
        return value

    df['Knowledge'] = df['Knowledge'].apply(_parse_if_serialized)

    exploded_df = df.explode('Knowledge')
    exploded_df.to_csv(fileName, index=False)

## Get Job Fact family

In [23]:
# Padded node-ID bounds of FactJobPosting, used to page through the postings below.
# NOTE: this binds the names `min` and `max`, shadowing Python's builtins for every
# cell executed afterwards in the same kernel.
min, max = getMinMaxIdOfEntity("FactJobPosting")
# Number of consecutive node IDs covered by each query window.
step = 50
## ----------------------------------------------------------------
## for job data without competency
## ----------------------------------------------------------------



def timeConvert(data):
    """Merge the Year/Month/Date columns into a single 'Time' column (YYYY-MM-DD).

    Args:
        data: DataFrame that may contain 'Date', 'Month' and 'Year' columns.

    Returns:
        ``data`` itself when it has no 'Date' column; otherwise a new DataFrame
        without 'Date', 'Month' and 'Year' and with a 'Time' column holding
        zero-padded date strings. Rows where any of the three parts is missing
        get None. The input frame is not modified.
    """
    if "Date" not in data.columns:
        return data

    def _formatDay(year, month, day):
        # Optional matches can leave date parts empty; keep those rows instead of
        # failing on the integer format.
        if pd.isna(year) or pd.isna(month) or pd.isna(day):
            return None
        return f"{int(year)}-{int(month):02d}-{int(day):02d}"

    # Work column-wise: a row-wise apply upcasts all-numeric rows to float, which
    # breaks the ':02d' format, and assigning the result would mutate the caller's frame.
    times = [
        _formatDay(year, month, day)
        for year, month, day in zip(data["Year"], data["Month"], data["Date"])
    ]
    return data.drop(columns=["Date", "Month", "Year"]).assign(Time=times)

    
jobDataFileName = DEFAULT_PATH + "JobDataForRDW.csv"

# Page through FactJobPosting in windows of `step` node IDs. `min`, `max` and `step`
# are the integers assigned at the top of this cell (min/max are NOT the builtins here).
windowStarts = range(min-1, max+step + 1, step)

# Collect one frame per window and concatenate once at the end; concatenating inside
# the loop re-copies everything loaded so far on every iteration.
jobFrames = []
for windowNo, i in enumerate(windowStarts, start=1):
    # NOTE(review): every optional match multiplies the rows of a posting by the number
    # of matches (see the repeated ID 41 rows in the output) - confirm this is intended.
    jobQuery = f''' match(f: FactJobPosting)-[:Belong_to_career]->(c: Career)
    where id(f) < {i+step} and id(f)>= {i}
    optional match(f)-[:Published_on]->(w:Website)
    optional match(f)-[:Recruited_by]->(o:Organization)
    optional match(f)-[:Posted_at_time]->(d:Date)-[:part_of]->(m:Month)-[:part_of]->(y:Year)
    optional  match(f)-[:Located_at]->(l: Location)
        optional match(f)-[:Required_programmingLanguage]->(pl : ProgrammingLanguage) where pl.level is null
        optional match(f)-[:Required_knowledge]->(kl: Knowledge)
        optional match(f)-[:Required_tool]->(tl: Tool)
        optional match(f)-[:Required_framework]->(fw: Framework) where fw.level is null
        optional match(f)-[:Required_platform]->(pf: Platform)
        return pl.programmingLanguage as ProgrammingLanguage, 
            fw.framework as Framework,
            collect(distinct kl.knowledge) as Knowledge, tl.tool as Tool, 
            pf.platform as Platform,
            f.totalJobPost as jobCount, c.name as Career, l.location as Location, o.name as Organization,
        w.name as Web,  d.day as Date, m.month as Month, y.year as Year, id(f) as ID
 '''
    jobDataTemp = queryToDataFrame(jobQuery)
    jobFrames.append(jobDataTemp)
    # Progress is the share of windows processed, so it ends at exactly 100 %.
    print(f"Loading process {windowNo / len(windowStarts) * 100} %", end='\r')

jobData = pd.concat(jobFrames, ignore_index=True, sort=False) if jobFrames else pd.DataFrame()

jobData.to_csv(jobDataFileName, index=False)
jobData.head(10)



Loading process 100.37065637065636 %4 %

Unnamed: 0,ProgrammingLanguage,Framework,Knowledge,Tool,Platform,jobCount,Career,Location,Organization,Web,Date,Month,Year,ID
0,SQL,SSIS,"[BUSINESS INTELLIGENCE, DATA WAREHOUSE, DATA V...",TABLEAU,GOOGLE ANALYTICS,3,Database Administrator,"Saint Louis Park, MN 55416 (Wolfe Park area)",AAA Minneapolis,Indeed,14,1,2024,41
1,SQL,SSIS,"[BUSINESS INTELLIGENCE, DATA WAREHOUSE, DATA V...",TABLEAU,TABLEAU,3,Database Administrator,"Saint Louis Park, MN 55416 (Wolfe Park area)",AAA Minneapolis,Indeed,14,1,2024,41
2,SQL,SSIS,"[BUSINESS INTELLIGENCE, DATA WAREHOUSE, DATA V...",TABLEAU,DATA ANALYSIS,3,Database Administrator,"Saint Louis Park, MN 55416 (Wolfe Park area)",AAA Minneapolis,Indeed,14,1,2024,41
3,SQL,SSIS,"[BUSINESS INTELLIGENCE, DATA WAREHOUSE, DATA V...",TABLEAU,GOOGLE DATA STUDIO,3,Database Administrator,"Saint Louis Park, MN 55416 (Wolfe Park area)",AAA Minneapolis,Indeed,14,1,2024,41
4,SQL,SSIS,"[BUSINESS INTELLIGENCE, DATA WAREHOUSE, DATA V...",TABLEAU,GOOGLE,3,Database Administrator,"Saint Louis Park, MN 55416 (Wolfe Park area)",AAA Minneapolis,Indeed,14,1,2024,41
5,GOOGLE DATA STUDIO,SSIS,"[BUSINESS INTELLIGENCE, DATA WAREHOUSE, DATA V...",TABLEAU,GOOGLE ANALYTICS,3,Database Administrator,"Saint Louis Park, MN 55416 (Wolfe Park area)",AAA Minneapolis,Indeed,14,1,2024,41
6,GOOGLE DATA STUDIO,SSIS,"[BUSINESS INTELLIGENCE, DATA WAREHOUSE, DATA V...",TABLEAU,TABLEAU,3,Database Administrator,"Saint Louis Park, MN 55416 (Wolfe Park area)",AAA Minneapolis,Indeed,14,1,2024,41
7,GOOGLE DATA STUDIO,SSIS,"[BUSINESS INTELLIGENCE, DATA WAREHOUSE, DATA V...",TABLEAU,DATA ANALYSIS,3,Database Administrator,"Saint Louis Park, MN 55416 (Wolfe Park area)",AAA Minneapolis,Indeed,14,1,2024,41
8,GOOGLE DATA STUDIO,SSIS,"[BUSINESS INTELLIGENCE, DATA WAREHOUSE, DATA V...",TABLEAU,GOOGLE DATA STUDIO,3,Database Administrator,"Saint Louis Park, MN 55416 (Wolfe Park area)",AAA Minneapolis,Indeed,14,1,2024,41
9,GOOGLE DATA STUDIO,SSIS,"[BUSINESS INTELLIGENCE, DATA WAREHOUSE, DATA V...",TABLEAU,GOOGLE,3,Database Administrator,"Saint Louis Park, MN 55416 (Wolfe Park area)",AAA Minneapolis,Indeed,14,1,2024,41


## Get Course Fact family

In [22]:
# load all data of fact course

# Padded node-ID bounds of FactCourse.
# NOTE: `min`/`max` shadow Python's builtins for the rest of the kernel session.
min, max = getMinMaxIdOfEntity("FactCourse")
step = 50
# Extend the upper bound so the last window is certain to include the highest ID.
max = max + step + 1
fileName = DEFAULT_PATH + "courseDataForRDW.csv"

# Page through FactCourse in windows of `step` node IDs; frames are collected in a list
# and concatenated once, instead of re-copying the accumulated frame on every iteration.
windowStarts = range(min-1, max, step)
courseFrames = []
for windowNo, i in enumerate(windowStarts, start=1):
    jobQuery = f''' match(f: FactCourse)-[:Belong_to_course]->(c: Course)
    where id(f) < {i+step} and id(f)>= {i}
    optional match(f)-[:Posted_on]->(w:Website)
    optional match(f)-[:Belong_to]->(o:Organization)
    optional match(c)-[:Taught_by]->(i: Instructor)
    optional match(f)-[:Taught_programmingLanguage]->(pl : ProgrammingLanguage) 
    optional match(f)-[:Taught_knowledge]->(kl: Knowledge) 
    optional match(f)-[:Taught_platform]->(pf: Platform)
    optional match(f)-[:Taught_tool]->(tl: Tool) 
    optional match(f)-[:Taught_framework]->(fw: Framework)
    return c.name as CourseName, c.link as Link, c.level as Level, c.duration as Duration, c.price as Price, f.enroll as Enroll, f.rating as Rate,
        pl.programmingLanguage as ProgrammingLanguage, pl.level as ProgrammingLanguageLevel,
        kl.knowledge as Knowledge, fw.framework as Framework, fw.level as FrameworkLevel, tl.tool as Tool, 
        pf.platform as Platform, o.name as Organization, w.name as Website, w.url as LinkToWebsite, id(f) as ID
 '''
    dataTemp = queryToDataFrame(jobQuery)
    courseFrames.append(dataTemp)
    # Progress is the share of windows processed, so it ends at exactly 100 %.
    print(f"Loading process {windowNo / len(windowStarts) * 100} %", end='\r')

data = pd.concat(courseFrames, ignore_index=True, sort=False) if courseFrames else pd.DataFrame()

data.to_csv(fileName, index=False)
data.head(10)


Loading process 99.97619047619047 %%8 %

Unnamed: 0,CourseName,Link,Level,Duration,Price,Enroll,Rate,ProgrammingLanguage,ProgrammingLanguageLevel,Knowledge,Framework,FrameworkLevel,Tool,Platform,Organization,Website,LinkToWebsite,ID
0,Advanced Data Science Capstone,https://www.coursera.org/learn/advanced-data-s...,ADVANCED,9.0,0.0,13,4.6,,,DATA EXPLORATION,,,,,IBM,coursera,www.coursera.org,0
1,Advanced Data Science Capstone,https://www.coursera.org/learn/advanced-data-s...,ADVANCED,9.0,0.0,13,4.6,,,ALGORITHMS,,,,,IBM,coursera,www.coursera.org,0
2,Advanced Data Science Capstone,https://www.coursera.org/learn/advanced-data-s...,ADVANCED,9.0,0.0,13,4.6,,,MACHINE LEARNING,,,,,IBM,coursera,www.coursera.org,0
3,Advanced Data Science Capstone,https://www.coursera.org/learn/advanced-data-s...,ADVANCED,9.0,0.0,13,4.6,,,DEEP LEARNING,,,,,IBM,coursera,www.coursera.org,0
4,Advanced Data Science Capstone,https://www.coursera.org/learn/advanced-data-s...,ADVANCED,9.0,0.0,13,4.6,,,VISUALIZATION,,,,,IBM,coursera,www.coursera.org,0
5,Advanced Data Science Capstone,https://www.coursera.org/learn/advanced-data-s...,ADVANCED,9.0,0.0,13,4.6,,,PARALLEL DATA PROCESSING,,,,,IBM,coursera,www.coursera.org,0
6,Accounting Data Analytics with Python,https://www.coursera.org/learn/accounting-data...,INTERMEDIATE,32.0,0.0,8,4.1,PYTHON,INTERMEDIATE,DESCRIPTIVE STATISTICS,PANDAS,INTERMEDIATE,,,University of Illinois at Urbana-Champaign,coursera,www.coursera.org,4
7,Accounting Data Analytics with Python,https://www.coursera.org/learn/accounting-data...,INTERMEDIATE,32.0,0.0,8,4.1,PYTHON,INTERMEDIATE,DATABASES,PANDAS,INTERMEDIATE,,,University of Illinois at Urbana-Champaign,coursera,www.coursera.org,4
8,Accounting Data Analytics with Python,https://www.coursera.org/learn/accounting-data...,INTERMEDIATE,32.0,0.0,8,4.1,PYTHON,INTERMEDIATE,DATA ANALYTIC,PANDAS,INTERMEDIATE,,,University of Illinois at Urbana-Champaign,coursera,www.coursera.org,4
9,Accounting Data Analytics with Python,https://www.coursera.org/learn/accounting-data...,INTERMEDIATE,32.0,0.0,8,4.1,PYTHON,INTERMEDIATE,RELATIONAL DATABASES,PANDAS,INTERMEDIATE,,,University of Illinois at Urbana-Champaign,coursera,www.coursera.org,4


## Get Competency family

In [19]:

# Padded node-ID bounds of ProgrammingLanguage.
# NOTE: `min`/`max` shadow Python's builtins for the rest of the kernel session.
min, max = getMinMaxIdOfEntity("ProgrammingLanguage")
step = 10
# Extend the upper bound so the last window is certain to include the highest ID.
max = max + step + 1

# Page through ProgrammingLanguage nodes in windows of `step` IDs; frames are collected
# in a list and concatenated once, instead of re-copying the accumulated frame each time.
windowStarts = range(min - 1, max, step)
competencyFrames = []
for windowNo, i in enumerate(windowStarts, start=1):
    query = f'''match(pl : ProgrammingLanguage)     where id(pl) < {i+step} and id(pl)>= {i}
            optional match(pl)-[:Have_framework]->(fw: Framework)
            optional match(pl)-[:Use_tool]->(tl: Tool)
            optional match(pl)-[:Relate_to_knowledge]->(kl: Knowledge)
            optional match(fw)-[:Deploy_to_platform]->(pf: Platform)
            return pl.programmingLanguage as ProgrammingLanguage, pl.level as ProgrammingLanguageLevel,
                fw.framework as Framework, fw.level as FrameworkLevel,
                collect(distinct kl.knowledge) as Knowledge, tl.tool as Tool, 
                pf.platform as Platform
        '''
    dataTemp = queryToDataFrame(query)
    competencyFrames.append(dataTemp)
    # Progress is the share of windows processed, so it ends at exactly 100 %.
    print(f"Loading process {windowNo / len(windowStarts) * 100} %", end='\r')

data = pd.concat(competencyFrames, ignore_index=True, sort=False) if competencyFrames else pd.DataFrame()

# The Knowledge column holds lists here; they are written to the CSV in their string form.
fileName = DEFAULT_PATH + "competencyDataForRDW.csv"

data.to_csv(fileName, index=False)


Loading process 99.88147646461226 %%% %

In [20]:

# `fileName` is whatever path the previously executed cell assigned (the captured
# output shows the competency CSV); print it to make the target explicit.
print(fileName)
# Rewrites that CSV in place with one row per Knowledge item.
separateKnowledge(fileName)

/Users/nguyenvanviet/Work/Courses/DataGen/RecommandationCourseWeb-Neo4j/MyPaperDataSet/ETLCode/GDWData/competencyDataForRDW.csv


## Get User family

In [19]:
# Export every user that knows at least one programming language, together with the
# other competencies linked to that user (one row per combination of matches).
# NOTE(review): `Known_Platform` (capital P) and `Known_frameworks` (plural) are named
# differently from the other Known_* relationship types - confirm against the graph schema.
# NOTE(review): the export contains personal data (name, email, date of birth, phone).
query = '''
        match(u:User)-[:Known_programmingLanguage]->(pl:ProgrammingLanguage)
        optional match(u)-[:Known_knowledge]->(kl:Knowledge)
        optional match(u)-[:Known_Platform]->(pf:Platform)
        optional match(u)-[:Known_frameworks]->(fw:Framework)
        optional match(u)-[:Known_tool]->(tl:Tool)
        return pl.programmingLanguage as ProgrammingLanguage, pl.level as ProgrammingLanguageLevel,
            fw.framework as Framework, fw.level as FrameworkLevel,
            kl.knowledge as Knowledge, tl.tool as Tool, 
            pf.platform as Platform,
            u.username as Username, u.name as FullName, u.email as Email, u.dateOfBirth as DateOfBirth, u.phone as Phone
        '''

fileName = DEFAULT_PATH + "userDataForRDW.csv"

dataC = queryToDataFrame(query)
# Row count of the export, shown as a quick sanity check.
print(dataC.shape[0])
dataC.to_csv(fileName, index=False)

1080


---
### Dimension information
---

In [11]:
# Dimension export: the name of every Organization node, one row per node.
query = "Match(o:Organization) return o.name as Organization"
# queryToCsv prepends DEFAULT_PATH to the file name, writes the CSV and returns the
# frame, which is displayed because it is the last expression of the cell.
queryToCsv(query, "DimData/Organization.csv")

Unnamed: 0,Organization
0,AAA Minneapolis
1,ACB - Ngân Hàng TMCP Á Châu
2,ADPMN IT SOLUTIONS
3,IBM
4,Duke University
...,...
678,iTalent PLUS
679,iVision
680,minware
681,sweetgreen


In [15]:
# Sanity check of the padded ID bounds returned for FactJobPosting
# (getMinMaxIdOfEntity returns minId - 1 and maxId + 1).
# NOTE: `min`/`max` shadow Python's builtins for the rest of the kernel session.
min, max = getMinMaxIdOfEntity("FactJobPosting")
print("max" , max)
print("min" , min)

max 6515
min 40
