In [1]:
import pandas as pd

In [2]:
df1 = pd.read_csv('../data/jobposts/20250525-vancouver.csv')
df2 = pd.read_csv('../data/jobposts/20250526-vancouver.csv')

df = pd.concat([df1, df2])

In [3]:
# count
print(df.shape)
print(df['job_url'].unique().shape)

(38240, 6)
(21087,)


In [14]:
# clean keywords
tmp = df['position'].str.lower() + ' ' + df['company'].str.lower()
tmp = tmp.apply(lambda text : ''.join(c if c.isalpha() else ' ' for c in text))
tmp = tmp.str.split().apply(set)
df['keywords'] = tmp.apply(lambda kws: {w for w in kws if len(w) > 2})

# get word frequencies
keywords = ' '.join(df['keywords'].str.join(' ')).split()
wf_data = [{'keyword' : w, 'count' : keywords.count(w)} for w in set(keywords)]

# rank keywords
wf = pd.DataFrame(wf_data)
wf = wf.sort_values('count', ascending=False)
wf['rank'] = range(len(wf))
wf = wf.set_index('keyword')

rank_dict = wf.to_dict()['rank']
df['word_ranks'] = df['keywords'].apply(lambda words : [rank_dict[word] for word in words if word in rank_dict])
df['min_rank'] = df['word_ranks'].apply(lambda x : min(x) if x else None)

# characteristic rank of job in Vancouver
df['mean_rank'] = df['word_ranks'].apply(lambda x : sum(x)/len(x) if x else None).round(2)
df = df.sort_values('mean_rank')

df['kws_rank'] = df['keywords'].apply(
    lambda kws : ', '.join(
        f'{kw} ({rank_dict[kw]:,})' for kw in sorted(kws, key=lambda kw:rank_dict[kw])
        )
    )

df['job_count'] = 1
pvt = df.pivot_table(
    index=['position', 'company', 'location', 'min_rank', 'mean_rank', 'kws_rank'],
    values='job_count',
    aggfunc='count'
    )
pvt = pvt.reset_index()
pvt = pvt.sort_values('mean_rank')
pvt = pvt.reset_index(drop=True)

In [13]:
df

Unnamed: 0,position,company,location,status,job_url,firm_url,keywords,word_ranks,min_rank,mean_rank,kws_rank,job_count
5127,Community Health Nurse - Registered Nurse,Fraser Health Authority,"Port Moody, British Columbia, Canada",Be an early applicant,https://ca.linkedin.com/jobs/view/community-he...,https://ca.linkedin.com/company/fraser-health-...,"{health, community, authority, fraser, registe...","[0, 35, 1, 2, 10, 5]",0,8.83,"health (0), authority (1), fraser (2), nurse (...",1
5072,Community Health Nurse - Registered Nurse,Fraser Health Authority,"Burnaby, British Columbia, Canada",Be an early applicant,https://ca.linkedin.com/jobs/view/community-he...,https://ca.linkedin.com/company/fraser-health-...,"{health, community, authority, fraser, registe...","[0, 35, 1, 2, 10, 5]",0,8.83,"health (0), authority (1), fraser (2), nurse (...",1
5090,Community Health Nurse - Registered Nurse,Fraser Health Authority,"Burnaby, British Columbia, Canada",Be an early applicant,https://ca.linkedin.com/jobs/view/community-he...,https://ca.linkedin.com/company/fraser-health-...,"{health, community, authority, fraser, registe...","[0, 35, 1, 2, 10, 5]",0,8.83,"health (0), authority (1), fraser (2), nurse (...",1
5072,Community Health Nurse - Registered Nurse,Fraser Health Authority,"Burnaby, British Columbia, Canada",Be an early applicant,https://ca.linkedin.com/jobs/view/community-he...,https://ca.linkedin.com/company/fraser-health-...,"{health, community, authority, fraser, registe...","[0, 35, 1, 2, 10, 5]",0,8.83,"health (0), authority (1), fraser (2), nurse (...",1
5090,Community Health Nurse - Registered Nurse,Fraser Health Authority,"Burnaby, British Columbia, Canada",Be an early applicant,https://ca.linkedin.com/jobs/view/community-he...,https://ca.linkedin.com/company/fraser-health-...,"{health, community, authority, fraser, registe...","[0, 35, 1, 2, 10, 5]",0,8.83,"health (0), authority (1), fraser (2), nurse (...",1
...,...,...,...,...,...,...,...,...,...,...,...,...
17362,nanny,Sujata,"White Rock, British Columbia, Canada",Be an early applicant,https://ca.linkedin.com/jobs/view/nanny-at-suj...,https://in.linkedin.com/company/sujata?trk=pub...,"{nanny, sujata}","[4996, 8266]",4996,6631.00,"nanny (4,996), sujata (8,266)",1
20877,Representante de AtenciÃ³n al Cliente de Distr...,NFI,"Delta, British Columbia, Canada",Be an early applicant,https://ca.linkedin.com/jobs/view/representant...,https://www.linkedin.com/company/nfi?trk=publi...,"{atenciã, cliente, nfi, distribuciã, represent...","[8297, 8313, 2544, 7870, 8406]",2544,7086.00,"nfi (2,544), distribuciã (7,870), atenciã (8,2...",1
16857,Policier- FR,CN,"North Vancouver, British Columbia, Canada",Be an early applicant,https://ca.linkedin.com/jobs/view/policier-fr-...,https://ca.linkedin.com/company/cn?trk=public_...,{policier},[7237],7237,7237.00,"policier (7,237)",1
16857,Policier- FR,CN,"North Vancouver, British Columbia, Canada",Be an early applicant,https://ca.linkedin.com/jobs/view/policier-fr-...,https://ca.linkedin.com/company/cn?trk=public_...,{policier},[7237],7237,7237.00,"policier (7,237)",1


In [None]:
includes = ['policy', 'research']
excludes = ['assistant', 'university']

# initialize condition
cond = pvt['kws_rank'].isna()

for w in includes:
    cond = cond | (pvt['kws_rank'].str.contains(w))

for w in excludes:
    cond = cond & (~pvt['kws_rank'].str.contains(w))


pvt[cond].reset_index(drop=True)

Unnamed: 0,position,company,min_rank,mean_rank,kws_rank,job_count
0,Regional Medical Director - Learning and Research,Fraser Health Authority,0,75.00,"health (0), authority (1), fraser (2), and (13...",2
1,"Coordinator, Research Finance",Fraser Health Authority,0,76.67,"health (0), authority (1), fraser (2), coordin...",2
2,Senior Research Engineer,RBC,6,100.00,"senior (6), engineer (9), research (167), rbc ...",2
3,"Analyst, Research",ITjobs.ca,44,106.00,"analyst (44), itjobs (107), research (167)",2
4,"Research Projects Coordinator, Medical Oncolog...",Provincial Health Services Authority,0,107.82,"health (0), authority (1), vancouver (4), serv...",2
...,...,...,...,...,...,...
70,Medical Image Annotation & Research Associate,Prenuvo,18,2037.67,"associate (18), medical (62), research (167), ...",2
71,Co-op Researcher - Multimodal Large Language M...,Huawei Canada,11,2083.70,"canada (11), language (282), huawei (759), res...",2
72,Senior Security Researcher,Truffle Security Co.,6,2222.00,"senior (6), security (90), researcher (1,328),...",2
73,Research Associate in Computational Mathematics,TIMEZYX,18,2519.00,"associate (18), research (167), mathematics (2...",2
