In [1]:
import time
import pandas as pd
from ast import literal_eval

import warnings
# Settings the warnings to be ignored 
warnings.filterwarnings('ignore') 

import sys
sys.path.insert(1, 'C:/Users/tom/projects/skill-skeleton/utils/')
sys.path.insert(2, 'C:/Users/tom/projects/skill-skeleton/utils/neo4j/')
from connection import Neo4jConnection
import query as query
import manage as manage
import kb_util



def add_skills(rows, batch_size=10000):
    # Adds skill nodes to the Neo4j graph as a batch job.  

    query = '''UNWIND $rows AS row   
    MERGE (Skill {name: row.skills})
    RETURN count(*) as total
    '''     
    
    return insert_data(query, rows, batch_size)

def fix_new_skills(rows, batch_size=10000):
    
    parameters = {'cat': "BD-ML-AI"}   
    
    query = '''UNWIND $rows AS row    
    MATCH (s:Skill {name: row.skills})
    WHERE s.category IS NULL
    SET s.category = $cat   
    RETURN count(*) as total
    '''      
    
    return insert_data(query, rows, batch_size, parameters)


def add_cvs(rows, batch_size=10000):
    # Adds cvs nodes to the Neo4j db as a batch job.

    query = '''UNWIND $rows AS row
    MERGE (cv:CV {id: row.id, url: row.url, resume: row.data})
    RETURN count(*) as total
    '''
    return insert_data(query, rows, batch_size)


def add_skill_links(rows, batch_size=5000):
    # Adds profile nodes and (:CV)--(:Skill)
    # relationships to the Neo4j graph as a batch job.  (Note the smaller batch
    # size due to the fact that this function is adding much more data than the
    # add_skills() function.)

    query = '''
    UNWIND $rows as row   
    WITH row
    MATCH (s:Skill {name: row.skills})   
    MATCH (cv:CV {id: row.id})
    MERGE (cv)-[:CONTAINS]->(s)
    RETURN count(distinct s) as total
    '''

    return insert_data(query, rows, batch_size)


def add_profile_link(rows, batch_size=5000):
    # Adds profile nodes and (:CV)--(:Profile)
    # relationships to the Neo4j graph as a batch job.  (Note the smaller batch
    # size due to the fact that this function is adding much more data than the
    # add_skills() function.)

    query = '''   
    UNWIND $rows as row   
    WITH row
    MATCH (p:Profile {name: row.profile})   
    MATCH (cv:CV {id: row.id})
    MERGE (cv)-[:IS]->(p)
    RETURN count(distinct cv) as total
    '''

    return insert_data(query, rows, batch_size)

def add_profile_skill_links(rows, batch_size=5000):
    # Adds profile nodes and (:Profile)--(:Skill)
    # relationships to the Neo4j graph as a batch job.  (Note the smaller batch
    # size due to the fact that this function is adding much more data than the
    # add_skills() function.)

    query = '''
    UNWIND $rows as row   
    WITH row
    MATCH (p:Profile {name: row.profile})   
    MATCH (s:Skill {name: row.skills})
    MERGE (p)-[:HAS]->(s)
    RETURN count(distinct s) as total
    '''

    return insert_data(query, rows, batch_size)


def insert_data(query, rows, batch_size = 10000, parameters=None):
    # Function to handle the updating the Neo4j database in batch mode.

    total = 0
    batch = 0
    start = time.time()
    result = None

    while batch * batch_size < len(rows):

        if parameters is None:
            param = {'rows': rows[batch*batch_size:(batch+1)*batch_size].to_dict('records')}        
        else:
            param = parameters | {'rows': rows[batch*batch_size:(batch+1)*batch_size].to_dict('records')}

        print(param)
        
        res = conn.query(query, parameters=param)
        total += res[0]['total']
        batch += 1
        result = {"total":total, "batches":batch, "time":time.time()-start}
        print(result)

    return result


def get_skills_by_cv(df):    
    return df[['id','skills']]


def get_skills_by_profile(df):
    return df[['profile','skills']]
     

def get_cvs(df):       
    return df.drop_duplicates()
    

def populate_db(df):    
     
    df = df[df['skills']!="[]"]   
    cvs = get_cvs(df)    
       
    
    cv_skills = get_skills_by_cv(df)    
    cv_skills['skills'] = cv_skills['skills'].apply(kb_util.fix)
    cv_skills['skills'] = cv_skills['skills'].apply(literal_eval)    
    exploded_cv_skills = cv_skills.explode('skills').drop_duplicates().dropna()    
    
    profile_skills = get_skills_by_profile(df)
    profile_skills['skills'] = profile_skills['skills'].apply(kb_util.fix)    
    profile_skills['skills'] = profile_skills['skills'].apply(literal_eval)      
    exploded_profile_skills = profile_skills.explode('skills').drop_duplicates().dropna()
    
    
    add_cvs(cvs)        
    add_skills(exploded_cv_skills)
    add_skill_links(exploded_cv_skills)
    
    add_profile_link(cvs)
    add_profile_skill_links(exploded_profile_skills)
    fix_new_skills(exploded_cv_skills)
        
    
def configure_db():
    conn.query('CREATE CONSTRAINT cvs IF NOT EXISTS FOR (cv:CV) REQUIRE cv.id IS UNIQUE')
    conn.query('CREATE TEXT INDEX cv_text_index_id IF NOT EXISTS FOR (cv:CV) ON (cv.id)')
    conn.query('CREATE TEXT INDEX skill_text_index_name IF NOT EXISTS FOR (s:Skill) ON (s.name)')


def setup_db(file):
    df = pd.read_csv(file,delimiter='|',dtype={'profile':str, 'url':str, 'id':str, 'data':str, 'skills':str})
    configure_db()
    populate_db(df)


def delete_cv_skill_link():
    conn.query('MATCH (cv:CV)-[e:CONTAINS]->(s:Skill) delete e')


def delete_cv_profile_link():
    conn.query('MATCH (cv:CV)-[e:IS]->(p:Profile) delete e')


def delete_cvs():
    conn.query('MATCH (cv:CV) DETACH DELETE cv')
    
    
def delete_skills(category):    
    parameters = {'cat': category}    
    conn.query('MATCH (s:Skill) WHERE s.category = $cat DETACH DELETE s', parameters=parameters)


def refresh_matcher_and_scores(conn):
    manage.refresh_all_profile_skill_scores(conn)
    
    skill_without_duplicates = query.get_all_skills(conn)
    kb_util.create_matcher_from_db('C:/Users/tom/projects/skill-skeleton/models/NER/finalized_matcher.sav',skill_without_duplicates, save=True)

    
   
conn = Neo4jConnection(uri="bolt://localhost:7687", 
                       user="neo4j",              
                       pwd="neo4jneo4j")    

In [6]:
#delete_cv_skill_link()
#delete_cvs()
#delete_skills("BD-ML-AI")
#delete_cv_profile_link()

setup_db("livecareer-final-data-DE-IT2.csv")
#refresh_matcher_and_scores(conn)

{'rows': [{'profile': 'Data Engineer', 'url': 'https://www.livecareer.com/resume-search/r/data-engineer-iii-e867612a74144501b4d70b51d5b2f7e9', 'id': '5533667039977703598659155174781970519', 'data': 'Jessica    Claire                                   609 Johnson Ave       49204     Tulsa     OK   100 Montgomery St 10th Floor    H   555 4321000    C       resumesampleexamplecom    Date of Birth         India                      single                    Summary      Seasoned Senior Data Engineer possessing indepth knowledge of RDBSM environments ETL techniques architecture data modeling and integration between systems Offering 10 years of background managing various aspects of development design and delivery of database solutions Analytical passionate and aspiring leader bringing proven communication and organizational abilities Seeking a fulltime remote position        Skills            Expertise in Microsoft SQL Server Management Studio 20052019     Microsoft Certified Professional D