In [6]:
import pandas as pd
# Let pandas display up to 100 rows before truncating; adjust the number as needed.
pd.options.display.max_rows = 100

In [2]:
# Load a single job-postings CSV into `df`.
# To analyse several files together, read each one (earlier runs used
# 20250525-vancouver.csv and 20250526-vancouver.csv) and combine them with
# pd.concat([df1, df2]).
jobposts_csv = '../data/jobposts/20250623-vancouver.csv'
df = pd.read_csv(jobposts_csv)

In [3]:
# Sanity check: (rows, columns) of the frame, then the number of distinct
# job URLs. Matching row counts mean no URL appears more than once.
for dims in (df.shape, df['job_url'].unique().shape):
    print(dims)

(13027, 6)
(13027,)


In [4]:
# Rank every posting by how common its title/company keywords are.
#
# Adds to `df`:  keywords, word_ranks, min_rank, mean_rank, kws_rank, job_count
# Also defines:  keywords (flat list), wf (keyword -> count, rank),
#                rank_dict (keyword -> rank), pvt (one row per distinct posting)

# --- clean keywords ---
# Missing titles/companies become '' so a single NaN cannot crash the
# character loop below (iterating a float NaN raises TypeError).
title_company = (
    df['position'].fillna('').astype(str).str.lower()
    + ' '
    + df['company'].fillna('').astype(str).str.lower()
)
# Replace every non-letter with a space so punctuation and digits act as
# word separators.
letters_only = title_company.apply(
    lambda text: ''.join(c if c.isalpha() else ' ' for c in text)
)
# One set per posting: duplicates collapse, words of 1-2 characters are dropped.
df['keywords'] = letters_only.str.split().apply(
    lambda words: {w for w in words if len(w) > 2}
)

# --- get word frequencies ---
# Keywords are sets, so each posting contributes a word at most once: the
# count is the number of postings mentioning the word. value_counts() does
# this in one pass instead of calling list.count() once per distinct word.
keywords = [w for kws in df['keywords'] for w in kws]
wf = (
    pd.Series(keywords, dtype='object')
    .value_counts()
    .rename_axis('keyword')
    .reset_index(name='count')
)

# --- rank keywords (0 = most frequent) ---
# Ties are broken alphabetically so ranks are reproducible; previously the
# order of tied words followed set iteration order, which changes between
# kernel restarts.
wf = wf.sort_values(['count', 'keyword'], ascending=[False, True]).reset_index(drop=True)
wf['rank'] = range(len(wf))
wf = wf.set_index('keyword')

rank_dict = wf['rank'].to_dict()
# Every keyword of every posting was counted above, so each one has a rank.
df['word_ranks'] = df['keywords'].apply(lambda words: sorted(rank_dict[w] for w in words))
df['min_rank'] = df['word_ranks'].apply(lambda ranks: min(ranks) if ranks else None)

# characteristic rank of job in Vancouver
df['mean_rank'] = df['word_ranks'].apply(
    lambda ranks: sum(ranks) / len(ranks) if ranks else None
).round(2)
df = df.sort_values('mean_rank', kind='mergesort')

# Human-readable summary, most common keyword first: "health (0), authority (1)".
df['kws_rank'] = df['keywords'].apply(
    lambda kws: ', '.join(
        f'{kw} ({rank_dict[kw]:,})' for kw in sorted(kws, key=rank_dict.get)
    )
)

# --- collapse identical postings and count them ---
# groupby(dropna=False) keeps postings whose location (or rank) is missing;
# pivot_table silently dropped any row with a NaN in one of the key columns.
df['job_count'] = 1
group_cols = ['position', 'company', 'location', 'min_rank', 'mean_rank', 'kws_rank']
pvt = (
    df.groupby(group_cols, dropna=False)['job_count']
    .count()
    .reset_index()
    .sort_values('mean_rank', kind='mergesort')
    .reset_index(drop=True)
)

In [None]:
# Filter the ranked postings by keyword.
includes = ['policy', 'research']  # keep rows containing ANY of these
excludes = ['assistant', 'university']  # then drop rows containing ANY of these

# Terms are matched as plain substrings of the 'kws_rank' text
# (regex=False), so characters such as '+' or '(' in a term are taken
# literally. Rows with a missing 'kws_rank' never match (na=False).
kws_text = pvt['kws_rank']

# Start from "nothing selected" and OR in each include term.
cond = pd.Series(False, index=pvt.index)
for w in includes:
    cond = cond | kws_text.str.contains(w, regex=False, na=False)

# Remove anything that mentions an excluded term.
for w in excludes:
    cond = cond & ~kws_text.str.contains(w, regex=False, na=False)

pvt[cond].reset_index(drop=True)

Unnamed: 0,position,company,location,min_rank,mean_rank,kws_rank,job_count
0,"Coordinator, Research Finance",Fraser Health Authority,"Surrey, British Columbia, Canada",0,50.5,"health (0), authority (1), fraser (4), coordin...",1
1,Regional Medical Director - Learning and Research,Fraser Health Authority,"Surrey, British Columbia, Canada",0,68.78,"health (0), authority (1), fraser (4), and (11...",1
2,"Research Projects Coordinator, Medical Oncolog...",Provincial Health Services Authority,"Vancouver, British Columbia, Canada",0,91.45,"health (0), authority (1), vancouver (2), serv...",1
3,Senior Research Engineer,RBC,"Vancouver, British Columbia, Canada",14,97.75,"senior (14), engineer (47), research (125), rb...",1
4,Research Specialist,Deloitte,"Vancouver, British Columbia, Canada",23,119.0,"specialist (23), research (125), deloitte (209)",1
5,"Administrator, Research Administration and Ope...",BC Cancer,"Vancouver, British Columbia, Canada",11,147.67,"and (11), operations (71), cancer (123), resea...",1
6,"Clinical Trials Nurse Coordinator, Research Cl...",Provincial Health Services Authority,"Vancouver, British Columbia, Canada",0,159.55,"health (0), authority (1), vancouver (2), nurs...",1
7,"Co-op Student, Research Genome â Clinical - ...",Provincial Health Services Authority,"Vancouver, British Columbia, Canada",0,163.4,"health (0), authority (1), vancouver (2), serv...",1
8,"Co-op Student, Research Genome - Operations - ...",Provincial Health Services Authority,"Vancouver, British Columbia, Canada",0,166.1,"health (0), authority (1), services (6), provi...",2
9,"Unit Clerk, Research Clinical Trials - BC Canc...",Provincial Health Services Authority,"Vancouver, British Columbia, Canada",0,169.82,"health (0), authority (1), vancouver (2), serv...",2
