In [32]:
import pandas as pd

In [54]:
# Job-description (JD) CSV exports to load, grouped by job family.
# Every file follows the same naming pattern: LinkedIn/output/JD_<role>.csv
list_data_fields = [
    'LinkedIn/output/JD_' + role + '.csv'
    for role in (
        'Business Intelligence',
        'Data Analyst',
        'Data Architect',
        'Data Engineer',
        'Data Scientist',
        'Business Analyst',
        'Database Administrator',
    )
]

list_software_fields = [
    'LinkedIn/output/JD_' + role + '.csv'
    for role in (
        'Back End Developer',
        'Back-end Developer',
        'DevOps',
        'Front End Developer',
        'Front-end Developer',
        'Product Manager',
        'Software Engineer',
        'Tester',
    )
]

In [55]:
def read_and_clean_single_role(input_path=''):
    """Load one pipe-delimited JD export and discard unusable or repeated rows.

    ``input_path`` is handed straight to ``pd.read_csv`` with ``sep='|'``.

    Two filters are applied, in this order:
      1. rows whose ``description`` is missing are removed;
      2. rows that repeat an earlier row on all of ``jobTitle``, ``location``,
         ``company``, ``type`` and ``description`` are removed (the first
         occurrence is the one kept).

    Returns the filtered DataFrame. Row labels are NOT renumbered, so gaps
    remain where rows were removed.
    """
    identity_columns = ['jobTitle', 'location', 'company', 'type', 'description']
    return (
        pd.read_csv(input_path, sep='|')
        .dropna(axis=0, subset=['description'])
        .drop_duplicates(subset=identity_columns, keep='first')
    )

def create_field_DataFrame(input_list=None, base_dir='../'):
    """Load, clean and stack the JD exports of every role in one job family.

    Parameters
    ----------
    input_list : iterable of str
        CSV paths, each resolved against ``base_dir``. Required: the previous
        default was the builtin ``list`` type itself, which cannot be iterated.
    base_dir : str, optional
        Directory the paths are joined onto. Defaults to ``'../'``, the prefix
        that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows of all files, one file after another, renumbered
        0..n-1. Because ``reset_index()`` is called without ``drop=True``, the
        row label each row had inside its own file is kept in an ``index`` column.

    Raises
    ------
    ValueError
        If ``input_list`` is omitted, or is empty (raised by ``pd.concat``).
    """
    if input_list is None:
        raise ValueError('input_list is required: pass the list of CSV paths to load')

    import os  # local import keeps this cell self-contained

    list_dfs = []
    for item in input_list:
        print(item)  # progress trace: one line per file
        list_dfs.append(read_and_clean_single_role(os.path.join(base_dir, item)))

    return pd.concat(list_dfs).reset_index()

<h3>Read Data Science and Software JDs in English</h3>

In [56]:
def read_JDs(path_jobs_en='', list_fields_input=None):
    """Load the cleaned JDs of one job family plus a companion CSV.

    Parameters
    ----------
    path_jobs_en : str
        Path of the companion CSV, read with ``pd.read_csv`` defaults. The
        calling cells in this notebook take its ``language`` column.
    list_fields_input : iterable of str
        CSV paths forwarded to ``create_field_DataFrame``. Required: the
        previous default was the builtin ``list`` type, not a usable value.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(stacked JDs, companion frame)``, in that order.

    Raises
    ------
    ValueError
        If ``list_fields_input`` is omitted.
    """
    if list_fields_input is None:
        raise ValueError('list_fields_input is required: pass the list of JD CSV paths')

    df_jobs = create_field_DataFrame(list_fields_input)
    df_companion = pd.read_csv(path_jobs_en)
    return df_jobs, df_companion

In [58]:
# Data-role postings plus the companion frame that carries their `language` column.
df1_ds, df2_ds = read_JDs(
    path_jobs_en='../DataPreprocessing/ds_job_lang.csv',
    list_fields_input=list_data_fields,
)

# Column assignment aligns on row labels; rows of df1_ds with no matching label get NaN.
df1_ds['language'] = df2_ds['language']

# English postings only, renumbered from 0.
df_ds = df1_ds.loc[df1_ds['language'] == 'en'].reset_index(drop=True)

LinkedIn/output/JD_Business Intelligence.csv
LinkedIn/output/JD_Data Analyst.csv
LinkedIn/output/JD_Data Architect.csv
LinkedIn/output/JD_Data Engineer.csv
LinkedIn/output/JD_Data Scientist.csv
LinkedIn/output/JD_Business Analyst.csv
LinkedIn/output/JD_Database Administrator.csv


In [60]:
# Software-role postings plus the companion frame that carries their `language` column.
df1_sw, df2_sw = read_JDs(
    path_jobs_en='../DataPreprocessing/sw_job_lang.csv',
    list_fields_input=list_software_fields,
)

# Column assignment aligns on row labels; rows of df1_sw with no matching label get NaN.
df1_sw['language'] = df2_sw['language']

# English postings only, renumbered from 0.
df_sw = df1_sw.loc[df1_sw['language'] == 'en'].reset_index(drop=True)

LinkedIn/output/JD_Back End Developer.csv
LinkedIn/output/JD_Back-end Developer.csv
LinkedIn/output/JD_DevOps.csv
LinkedIn/output/JD_Front End Developer.csv
LinkedIn/output/JD_Front-end Developer.csv
LinkedIn/output/JD_Product Manager.csv
LinkedIn/output/JD_Software Engineer.csv
LinkedIn/output/JD_Tester.csv


In [61]:
df_ds

Unnamed: 0,index,id,jobTitle,location,company,type,description,language
0,0,2975673294,Data Engineer in Business Intelligence Team (B...,"Ho Chi Minh City, Ho Chi Minh City, Vietnam",Agoda,Full-time · Entry level,About the job\nAbout Agoda\n\nAgoda is an onli...,en
1,1,2993666500,"Business Intelligence Intern, Kobiton","Ho Chi Minh City, Ho Chi Minh City, Vietnam","KMS Technology, Inc.",Internship · Internship,About the job\nThis job is sourced from a job ...,en
2,2,2975674182,"Business Intelligence Developer, Product Team ...","Ho Chi Minh City, Ho Chi Minh City, Vietnam",Agoda,Full-time · Associate,About the job\nAbout Agoda\n\nAgoda is an onli...,en
3,3,2993740820,"Business Intelligence (Edtech Company, Upto 10...","Hanoi, Hanoi, Vietnam",X3english Limited Company,"₫15,000,000/month - ₫23,000,000/month · Full-t...",About the job\nThis job is sourced from a job ...,en
4,4,2945707463,Senior Business Intelligence Analyst (0824),"Hanoi, Hanoi, Vietnam",Techcombank (TCB),Full-time · Associate,About the job\nJob Purpose\nThe Job Holder Res...,en
...,...,...,...,...,...,...,...,...
427,16,2990265063,"Database Administrator (DBA), based in Da Nang","Đà Nang, Da Nang City, Vietnam","KMS Technology, Inc.",Full-time · Associate,About the job\nAs a senior DBA you will have t...,en
428,17,3008519916,Junior/Senior Database Administrator (DBA),"Ho Chi Minh City, Ho Chi Minh City, Vietnam","KMS Technology, Inc.",Full-time · Mid-Senior level,About the job\nAs a senior DBA you will have t...,en
429,18,2698740162,[Local Product] Database Administrator,"Ho Chi Minh City, Ho Chi Minh City, Vietnam",Shopee,Full-time · Mid-Senior level,About the job\nAbout The Team\n\nThe Product D...,en
430,19,2973389208,"[Local Product] Database Administrator (MySQL,...","Ho Chi Minh City, Ho Chi Minh City, Vietnam",Shopee,Full-time · Entry level,About the job\nWhat Will You Do\nResponsible f...,en


In [62]:
df_sw

Unnamed: 0,index,id,jobTitle,location,company,type,description,language
0,0,3008981684,Back End Developer,"Ho Chi Minh City, Vietnam",Sendo.vn,Full-time,About the job\nJob description\nWrite scalable...,en
1,3,2983100486,Back End Developer,"Tân Bình, Ho Chi Minh City, Vietnam",WATA,Full-time,About the job\nWATA is actively looking for Mi...,en
2,6,2992995558,Back End Developer,"Ho Chi Minh City, Vietnam",Axon Active - Agile Offshore Software Developm...,Full-time,About the job\nDescription\nComing to Axon Act...,en
3,7,3008544604,Back End Developer,"Ho Chi Minh City, Vietnam",Swop,Full-time,About the job\n[SAIGON- VIETNAM] SWOP.COMPANY\...,en
4,8,3012111122,Back End Developer,"Ho Chi Minh City, Vietnam",MLTech Soft,Full-time,About the job\nJOB DESCRIPTION\nFresher Back-E...,en
...,...,...,...,...,...,...,...,...
642,191,2920919416,Senior Automation Tester,Da Nang Metropolitan Area,EPAM Anywhere,Full-time · Mid-Senior level,About the job\nApply to this vacancy and get a...,en
643,192,2920916624,Senior Automation Tester,"Haiphong, Hai Phong City, Vietnam",EPAM Anywhere,Full-time · Mid-Senior level,About the job\nApply to this vacancy and get a...,en
644,193,2920917626,Senior Automation Tester,"Hanoi, Hanoi, Vietnam",EPAM Anywhere,Full-time · Mid-Senior level,About the job\nApply to this vacancy and get a...,en
645,194,2690306027,QC Manager (Tester),"Hanoi, Hanoi, Vietnam",DAC Data Science Vietnam,Full-time,About the job\nJob Description\nOversee all te...,en


In [98]:
# Stack the English data-role rows on top of the English software-role rows.
df_total_en = pd.concat(objs=[df_ds, df_sw], axis='index')

In [99]:
# Keep the first row seen for each posting `id`; later rows with the same id are dropped.
df_total_en = df_total_en.drop_duplicates(subset=['id'], keep='first')

In [104]:
# Renumber the rows 0..n-1, discarding the old row labels.
df_total_en = df_total_en.reset_index(drop=True)

In [105]:
# Expose the posting id as `_id`, the field name MongoDB uses as a document's primary key.
df_total_en = df_total_en.rename(columns={'id': '_id'})

In [106]:
df_total_en

Unnamed: 0,index,_id,jobTitle,location,company,type,description,language
0,0,2975673294,Data Engineer in Business Intelligence Team (B...,"Ho Chi Minh City, Ho Chi Minh City, Vietnam",Agoda,Full-time · Entry level,About the job\nAbout Agoda\n\nAgoda is an onli...,en
1,1,2993666500,"Business Intelligence Intern, Kobiton","Ho Chi Minh City, Ho Chi Minh City, Vietnam","KMS Technology, Inc.",Internship · Internship,About the job\nThis job is sourced from a job ...,en
2,2,2975674182,"Business Intelligence Developer, Product Team ...","Ho Chi Minh City, Ho Chi Minh City, Vietnam",Agoda,Full-time · Associate,About the job\nAbout Agoda\n\nAgoda is an onli...,en
3,3,2993740820,"Business Intelligence (Edtech Company, Upto 10...","Hanoi, Hanoi, Vietnam",X3english Limited Company,"₫15,000,000/month - ₫23,000,000/month · Full-t...",About the job\nThis job is sourced from a job ...,en
4,4,2945707463,Senior Business Intelligence Analyst (0824),"Hanoi, Hanoi, Vietnam",Techcombank (TCB),Full-time · Associate,About the job\nJob Purpose\nThe Job Holder Res...,en
...,...,...,...,...,...,...,...,...
1064,191,2920919416,Senior Automation Tester,Da Nang Metropolitan Area,EPAM Anywhere,Full-time · Mid-Senior level,About the job\nApply to this vacancy and get a...,en
1065,192,2920916624,Senior Automation Tester,"Haiphong, Hai Phong City, Vietnam",EPAM Anywhere,Full-time · Mid-Senior level,About the job\nApply to this vacancy and get a...,en
1066,193,2920917626,Senior Automation Tester,"Hanoi, Hanoi, Vietnam",EPAM Anywhere,Full-time · Mid-Senior level,About the job\nApply to this vacancy and get a...,en
1067,194,2690306027,QC Manager (Tester),"Hanoi, Hanoi, Vietnam",DAC Data Science Vietnam,Full-time,About the job\nJob Description\nOversee all te...,en


In [107]:
# Phrases used to locate where the requirements part of a job description begins.
# They are written in lowercase because the search lower-cases the description text.
# Tier 1: strong cues, tried first ("yêu cầu" is Vietnamese for "requirements").
keywords_tier_1 = [
    "yêu cầu",
    "requirements",
    "need to",
    "qualifications",
    "must have",
    "looking for",
]

# Tier 2: weaker cues, used only when no tier-1 phrase is found.
keywords_tier_2 = [
    "skills",
    "experience",
    "success",
]


In [108]:
def find_substring_pos(text='', substring=''):
    """Return the position of the first case-insensitive match of ``substring``.

    Both ``text`` and ``substring`` are lower-cased before searching.
    Previously only ``text`` was, so a needle containing an uppercase letter
    could never match.

    Parameters
    ----------
    text : str
        The string to search in.
    substring : str
        The string to look for; letter case is ignored.

    Returns
    -------
    int
        Zero-based index of the first match inside the lower-cased text, or
        ``-1`` when there is no match. An empty ``substring`` matches at 0.
    """
    return text.lower().find(substring.lower())

In [109]:
find_substring_pos("Requirements","requirements")

0

In [110]:
def get_min_key_pos(input_dict:dict):
    """Pick the entry with the smallest value that is greater than -1.

    Parameters
    ----------
    input_dict : dict
        Maps a key to an integer position, where -1 means "not found".

    Returns
    -------
    tuple
        ``(key, value)`` of the smallest value greater than -1. When several
        keys share that value, the one that comes last in the dict wins.
        If no value is greater than -1 the key is ``-1`` and the value is the
        largest value present (``-1`` for the usual all-not-found case).
        An empty dict yields ``(-1, -1)`` instead of raising.
    """
    if not input_dict:
        # max() below would raise ValueError on an empty dict
        return -1, -1

    min_key = -1
    # Start from the largest value so that any found position can replace it.
    min_value = max(input_dict.values())
    for key, value in input_dict.items():
        if -1 < value <= min_value:
            min_key, min_value = key, value
    return min_key, min_value

In [111]:
d = {'skills': -1, 'experience': -1,'looking for': -1, 'success': -1}
get_min_key_pos(d)

(-1, -1)

In [112]:
def cut_jd_requirements(text=''):
    """Return the part of a job description that starts at its requirements cue.

    Every phrase in the module-level ``keywords_tier_1`` and ``keywords_tier_2``
    lists is searched for in ``text``. The cut starts at the earliest tier-1
    match; tier 2 is consulted only when no tier-1 phrase is found.

    Parameters
    ----------
    text : str
        Full job-description text.

    Returns
    -------
    str
        ``text`` from the chosen match through to the end of the string, or
        ``''`` when none of the phrases occurs.
    """
    tier_1_positions = {kw: find_substring_pos(text, kw) for kw in keywords_tier_1}
    tier_2_positions = {kw: find_substring_pos(text, kw) for kw in keywords_tier_2}

    _, tier_1_start = get_min_key_pos(tier_1_positions)
    _, tier_2_start = get_min_key_pos(tier_2_positions)

    # Slice to the end of the string: the former `[start:-1]` dropped the last character.
    if tier_1_start != -1:
        return text[tier_1_start:]
    if tier_2_start != -1:
        return text[tier_2_start:]

    # Nothing matched. The old `text[-1:-1]` evaluated to '' here; keep that result explicitly.
    return ''

In [113]:
def add_column_requirements(df:pd.DataFrame):
    """Add a ``desc`` column holding the requirements excerpt of each description.

    ``cut_jd_requirements`` is applied to every value of the ``description``
    column, in row order. Rows are visited positionally, so the frame no longer
    needs a default 0..n-1 index (the former ``df.loc[i]`` lookup did).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``description`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.
    """
    df['desc'] = [cut_jd_requirements(text=text) for text in df['description']]
    return df

In [114]:
print(type(df_total_en))

<class 'pandas.core.frame.DataFrame'>


In [115]:

# Build the `desc` requirements excerpt for every posting (this also adds the column to
# df_total_en itself), then keep only the identifying fields and the excerpt.
df = add_column_requirements(df_total_en).loc[
    :, ['_id', 'jobTitle', 'location', 'company', 'type', 'desc']
]

In [116]:
df

Unnamed: 0,_id,jobTitle,location,company,type,desc
0,2975673294,Data Engineer in Business Intelligence Team (B...,"Ho Chi Minh City, Ho Chi Minh City, Vietnam",Agoda,Full-time · Entry level,requirements. Our ultimate goal is to enable a...
1,2993666500,"Business Intelligence Intern, Kobiton","Ho Chi Minh City, Ho Chi Minh City, Vietnam","KMS Technology, Inc.",Internship · Internship,need to be successful: General: * 4th year/ ne...
2,2975674182,"Business Intelligence Developer, Product Team ...","Ho Chi Minh City, Ho Chi Minh City, Vietnam",Agoda,Full-time · Associate,requirements. Our ultimate goal is to enable a...
3,2993740820,"Business Intelligence (Edtech Company, Upto 10...","Hanoi, Hanoi, Vietnam",X3english Limited Company,"₫15,000,000/month - ₫23,000,000/month · Full-t...",experiences with Data Analysis or Business Ana...
4,2945707463,Senior Business Intelligence Analyst (0824),"Hanoi, Hanoi, Vietnam",Techcombank (TCB),Full-time · Associate,requirements from within and outside of the tr...
...,...,...,...,...,...,...
1064,2920919416,Senior Automation Tester,Da Nang Metropolitan Area,EPAM Anywhere,Full-time · Mid-Senior level,looking for a remote Senior Automation Tester ...
1065,2920916624,Senior Automation Tester,"Haiphong, Hai Phong City, Vietnam",EPAM Anywhere,Full-time · Mid-Senior level,looking for a remote Senior Automation Tester ...
1066,2920917626,Senior Automation Tester,"Hanoi, Hanoi, Vietnam",EPAM Anywhere,Full-time · Mid-Senior level,looking for a remote Senior Automation Tester ...
1067,2690306027,QC Manager (Tester),"Hanoi, Hanoi, Vietnam",DAC Data Science Vietnam,Full-time,requirements/change in requirements of the Pro...


In [117]:
import os

import pymongo

# The connection string carries a username and password, so it is read from the
# environment instead of being stored in the notebook. Set it before starting Jupyter:
#   export MONGODB_URI="mongodb://<user>:<password>@<host>:27017/"
# NOTE(review): a literal admin URI was previously committed in this cell; rotate that password.
myclient = pymongo.MongoClient(os.environ["MONGODB_URI"])
mydb = myclient["LinkedIn"]
mycol = mydb["JobsDescription"]

# One document per posting. Each record carries `_id`, so running this cell a second time
# against the same collection is rejected by MongoDB with duplicate-key errors.
mycol.insert_many(df.to_dict('records'))

<pymongo.results.InsertManyResult at 0x1234e3f70>