In [32]:
import pandas as pd

In [54]:
# Read JD dataset from CSV files, Run EDA Analysis
list_data_fields = ['LinkedIn/output/JD_Business Intelligence.csv','LinkedIn/output/JD_Data Analyst.csv',
                    'LinkedIn/output/JD_Data Architect.csv','LinkedIn/output/JD_Data Engineer.csv',
                    'LinkedIn/output/JD_Data Scientist.csv','LinkedIn/output/JD_Business Analyst.csv','LinkedIn/output/JD_Database Administrator.csv']

list_software_fields = ['LinkedIn/output/JD_Back End Developer.csv','LinkedIn/output/JD_Back-end Developer.csv',
                        'LinkedIn/output/JD_DevOps.csv','LinkedIn/output/JD_Front End Developer.csv','LinkedIn/output/JD_Front-end Developer.csv',
                        'LinkedIn/output/JD_Product Manager.csv','LinkedIn/output/JD_Software Engineer.csv','LinkedIn/output/JD_Tester.csv']

In [55]:
def read_and_clean_single_role(input_path=''):
    df = pd.read_csv(input_path,sep='|')

    # Drop Null Values for description column
    df = df.dropna(axis=0,subset=['description'])

    # Remove the same description & company & jobTitle & location & type
    df = df.drop_duplicates(subset=['jobTitle','location','company','type','description'],keep='first')

    return df

def create_field_DataFrame(input_list = list):
    list_dfs=[]
    for item in input_list:
        print(item)
        df_tmp = read_and_clean_single_role('../'+item)
        list_dfs.append(df_tmp)

    df = pd.concat(list_dfs)
    df = df.reset_index()
    return df

<h3>READ Data Science JDs in English</h3>

In [53]:
def read_JDs(path_jobs_en='',list_fields_input=list):
    # list_jd_all_fields = list_data_fields + list_software_fields
    df1 = create_field_DataFrame(list_fields_input)
    df2 = pd.read_csv(path_jobs_en)
    return df1,df2

In [52]:
df1,df2 = read_JDs('../DataPreprocessing/ds_job_lang.csv')
df1['language'] = df2['language']

LinkedIn/output/JD_Business Intelligence.csv
LinkedIn/output/JD_Data Analyst.csv
LinkedIn/output/JD_Data Architect.csv
LinkedIn/output/JD_Data Engineer.csv
LinkedIn/output/JD_Data Scientist.csv
LinkedIn/output/JD_Business Analyst.csv
LinkedIn/output/JD_Database Administrator.csv
LinkedIn/output/JD_Back End Developer.csv
LinkedIn/output/JD_Back-end Developer.csv
LinkedIn/output/JD_DevOps.csv
LinkedIn/output/JD_Front End Developer.csv
LinkedIn/output/JD_Front-end Developer.csv
LinkedIn/output/JD_Product Manager.csv
LinkedIn/output/JD_Software Engineer.csv
LinkedIn/output/JD_Tester.csv


In [38]:
df_ds = df1[df1['language']=='en'].reset_index(drop=True)

In [39]:
df_ds

Unnamed: 0,index,id,jobTitle,location,company,type,description,language
0,0,2975673294,Data Engineer in Business Intelligence Team (B...,"Ho Chi Minh City, Ho Chi Minh City, Vietnam",Agoda,Full-time · Entry level,About the job\nAbout Agoda\n\nAgoda is an onli...,en
1,1,2993666500,"Business Intelligence Intern, Kobiton","Ho Chi Minh City, Ho Chi Minh City, Vietnam","KMS Technology, Inc.",Internship · Internship,About the job\nThis job is sourced from a job ...,en
2,2,2975674182,"Business Intelligence Developer, Product Team ...","Ho Chi Minh City, Ho Chi Minh City, Vietnam",Agoda,Full-time · Associate,About the job\nAbout Agoda\n\nAgoda is an onli...,en
3,3,2993740820,"Business Intelligence (Edtech Company, Upto 10...","Hanoi, Hanoi, Vietnam",X3english Limited Company,"₫15,000,000/month - ₫23,000,000/month · Full-t...",About the job\nThis job is sourced from a job ...,en
4,4,2945707463,Senior Business Intelligence Analyst (0824),"Hanoi, Hanoi, Vietnam",Techcombank (TCB),Full-time · Associate,About the job\nJob Purpose\nThe Job Holder Res...,en
...,...,...,...,...,...,...,...,...
427,16,2990265063,"Database Administrator (DBA), based in Da Nang","Đà Nang, Da Nang City, Vietnam","KMS Technology, Inc.",Full-time · Associate,About the job\nAs a senior DBA you will have t...,en
428,17,3008519916,Junior/Senior Database Administrator (DBA),"Ho Chi Minh City, Ho Chi Minh City, Vietnam","KMS Technology, Inc.",Full-time · Mid-Senior level,About the job\nAs a senior DBA you will have t...,en
429,18,2698740162,[Local Product] Database Administrator,"Ho Chi Minh City, Ho Chi Minh City, Vietnam",Shopee,Full-time · Mid-Senior level,About the job\nAbout The Team\n\nThe Product D...,en
430,19,2973389208,"[Local Product] Database Administrator (MySQL,...","Ho Chi Minh City, Ho Chi Minh City, Vietnam",Shopee,Full-time · Entry level,About the job\nWhat Will You Do\nResponsible f...,en


In [40]:
keywords_tier_1  = ["yêu cầu","requirements","need to","qualifications","must have",'looking for']
keywords_tier_2  = ["skills","experience","success"] 


In [41]:
def find_substring_pos(text='',substring=''):
    result = text.lower().find(substring)
    # print(result)
    return result

In [136]:
find_substring_pos("Requirements","requirements")

0

In [42]:
def get_min_key_pos(input_dict:dict):
    # Get key-value whose value is minimum and value > -1
    min_key = -1
    min_value = max(input_dict.values())
    for key in input_dict.keys():
        if  input_dict[key] <= min_value and input_dict[key] > -1:
            # print(key,input_dict[key])
            min_value = input_dict[key]
            min_key = key
    # print(min_key,min_value)
    return min_key,min_value

In [14]:
d = {'skills': -1, 'experience': -1,'looking for': -1, 'success': -1}
get_min_key_pos(d)

(-1, -1)

In [43]:
def cut_jd_requirements(text=''):
    requirements_text = ''
    dictionary_tier_1 = {}
    dictionary_tier_2 = {}
    for item in keywords_tier_1:
        dictionary_tier_1[item] = find_substring_pos(text,item)
    for item in keywords_tier_2:
        dictionary_tier_2[item] = find_substring_pos(text,item)
    
    k1,v1 = get_min_key_pos(dictionary_tier_1)
    k2,v2 = get_min_key_pos(dictionary_tier_2)
    # print(dictionary_tier_1)
    # print(dictionary_tier_2)
    if v1 != -1:
        requirements_text = text[v1:-1]
    else:
        requirements_text = text[v2:-1]
    # l = list(dictionary_tier_1.values())
    # print(type(l),l)
    return requirements_text

In [44]:
def add_column_requirements(df:pd.DataFrame):
    cut_description_list = []
    for i in range(len(df)):
        text = df.loc[i]['description']
        requirements_text = cut_jd_requirements(text=text)
        cut_description_list.append(requirements_text)

    df['desc'] = cut_description_list
    return df

In [45]:
df3 = add_column_requirements(df_ds)
df3.rename(columns={'id':'_id'},inplace=True)
df3 = df3[['_id','jobTitle','location','company','type','desc']]

In [46]:
df3.drop_duplicates(subset=['_id'],keep='first',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [47]:
df3

Unnamed: 0,_id,jobTitle,location,company,type,desc
0,2975673294,Data Engineer in Business Intelligence Team (B...,"Ho Chi Minh City, Ho Chi Minh City, Vietnam",Agoda,Full-time · Entry level,requirements. Our ultimate goal is to enable a...
1,2993666500,"Business Intelligence Intern, Kobiton","Ho Chi Minh City, Ho Chi Minh City, Vietnam","KMS Technology, Inc.",Internship · Internship,need to be successful: General: * 4th year/ ne...
2,2975674182,"Business Intelligence Developer, Product Team ...","Ho Chi Minh City, Ho Chi Minh City, Vietnam",Agoda,Full-time · Associate,requirements. Our ultimate goal is to enable a...
3,2993740820,"Business Intelligence (Edtech Company, Upto 10...","Hanoi, Hanoi, Vietnam",X3english Limited Company,"₫15,000,000/month - ₫23,000,000/month · Full-t...",experiences with Data Analysis or Business Ana...
4,2945707463,Senior Business Intelligence Analyst (0824),"Hanoi, Hanoi, Vietnam",Techcombank (TCB),Full-time · Associate,requirements from within and outside of the tr...
...,...,...,...,...,...,...
427,2990265063,"Database Administrator (DBA), based in Da Nang","Đà Nang, Da Nang City, Vietnam","KMS Technology, Inc.",Full-time · Associate,Qualifications\n\n1+ years of experience with ...
428,3008519916,Junior/Senior Database Administrator (DBA),"Ho Chi Minh City, Ho Chi Minh City, Vietnam","KMS Technology, Inc.",Full-time · Mid-Senior level,Qualifications\n\nGeneral requirement:\n\nAt l...
429,2698740162,[Local Product] Database Administrator,"Ho Chi Minh City, Ho Chi Minh City, Vietnam",Shopee,Full-time · Mid-Senior level,Requirements\nBachelor's degree or higher in C...
430,2973389208,"[Local Product] Database Administrator (MySQL,...","Ho Chi Minh City, Ho Chi Minh City, Vietnam",Shopee,Full-time · Entry level,experience with SQL query/SQL programming\nGoo...


In [48]:
import pymongo
myclient = pymongo.MongoClient("mongodb://mongoadmin:admin@13.67.48.201:27017/")
mydb = myclient["LinkedIn"]
mycol = mydb["JobsDescription"]
mycol.insert_many(df3.to_dict('records'))

BulkWriteError: batch op errors occurred, full error: {'writeErrors': [{'index': 218, 'code': 11000, 'keyPattern': {'_id': 1}, 'keyValue': {'_id': 2970953472}, 'errmsg': 'E11000 duplicate key error collection: LinkedIn.JobsDescription index: _id_ dup key: { _id: 2970953472 }', 'op': {'_id': 2970953472, 'jobTitle': '(Senior/Expert) Software Engineer/ML/Data Scientist - Search Platform', 'location': 'Ho Chi Minh City, Ho Chi Minh City, Vietnam', 'company': 'TIKI', 'type': 'Full-time · Associate', 'desc': 'looking for a specific product on Tiki, it is our technology that delivers you outstanding search results.\n\nChallenges we are facing:\nHow to serve search responses in high performance and fault tolerant manner while supporting 10k product updates per second in realtime.\nHow to extract as much valuable information as possible from non-structured product data and product reviews written in natural language.\nHow to turn that information into ranking features that help to increase search relevancy for our customers.\nHow to process customer queries and extract the right features to help the ranking system to produce better results.\nHow to integrate “rich” results like recommended filters, sellers or custom mini apps into search in a way that is helpful for customers.\nHow to make sure that the improvements are "truly" there with the right A/B testing process.\nWith all those challenges we apply cutting edge engineering, machine learning and statistical methods, so working with us is a great opportunity to apply your knowledge while also learn something new at the same time.\n\nRequirements:\nKnowledge of basic algorithms and data structures\nKnowledge of C++ and/or Java\n4+ years of experience\nExperience with high-load systems\nExperience with NLP, Computer Vision and/or recommendation systems\nExperience with cloud services (is a plus)\nGood score on codeforces or good rank on Kaggle (is a plus'}}], 'writeConcernErrors': [], 'nInserted': 218, 'nUpserted': 0, 'nMatched': 0, 'nModified': 0, 'nRemoved': 0, 'upserted': []}