In [33]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import numpy as np
import pandas as pd
import string
from pandarallel import pandarallel

pandarallel.initialize()

path = r"./resources/software_developer_united_states_1971_20191023_1.csv"


INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [34]:
def extract_description(df: pd.DataFrame):
    s = df["job_description"]
    s = s.parallel_apply(
        lambda s: s.lower()
        .translate(str.maketrans("", "", string.punctuation + "\xa0"))
        .translate(str.maketrans(string.whitespace, " " * len(string.whitespace)))
    )
    return s


def tokenize(s: pd.Series):
    return s.parallel_apply(lambda s: word_tokenize(s))


def transformation_pipe(df: pd.DataFrame):
    s = extract_description(df)
    s = tokenize(s)
    return s

def f7(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]


In [35]:
rows = pd.read_csv(path)
descriptions = transformation_pipe(rows)
descriptions


0       [the, chosen, sr, software, developer, will, b...
1       [position, c, lead, software, developer, locat...
2       [senior, software, developer, hoboken, nj, sta...
3       [our, client, a, multinational, publishing, an...
4       [position, c, lead, software, developer, locat...
                              ...                        
9995    [software, developer, –, asheville, nc, positi...
9996    [business, group, highlights, civilian, state,...
9997    [job, description, the, candidate, must, be, e...
9998    [please, only, apply, if, you, do, not, need, ...
9999    [company, information, solid, reputation, pass...
Name: job_description, Length: 10000, dtype: object

In [39]:
def inverse_indexing(parsed_description):
    print("stemming")
    stemmed_description = stem_descriptions(parsed_description)
    all_unique_term = list(set([element for list_ in stemmed_description.values for element in list_]))

    invert_idx = {}
    print("indexing")
    for i, s in enumerate(all_unique_term):
        invert_idx[s] = set(stemmed_description.loc[stemmed_description.apply(lambda x: s in
    x)].index)
    print("complete")
    return invert_idx

def stem_descriptions(parsed_description):
    sw_set = set(stopwords.words()) - {"c"}
    no_sw_description = parsed_description.apply(
        lambda x: [w for w in x if w not in sw_set]
    )
    ps = PorterStemmer()
    return no_sw_description.apply(
        lambda x: f7([ps.stem(w) for w in x])
    )

def search(invert_idx, query):
    ps = PorterStemmer()
    processed_query = [s.lower() for s in query.split()]
    stemmed = [ps.stem(s) for s in processed_query]
    matched = list(set.intersection(*[invert_idx[s] for s in stemmed]))
    return matched



In [40]:
from multiprocessing import Pool
from collections import ChainMap

splitted_descriptions = np.array_split(descriptions, 8)
invert_idx = {}
with Pool(8) as p:
    invert_idx = p.map(inverse_indexing, splitted_descriptions)
invert_idx
inverse_index = dict(ChainMap(*invert_idx))
inverse_index


  return bound(*args, **kwds)


stemming
stemming
stemming
stemming
stemming
stemming
stemming
stemming
indexing
indexing
indexing
indexing
indexing
indexing
indexing
indexing
complete
complete
complete
complete
complete
complete
complete
complete


{'cool': {213,
  224,
  311,
  316,
  345,
  481,
  551,
  552,
  554,
  556,
  573,
  594,
  692,
  706,
  707,
  737,
  739,
  741,
  748,
  754,
  757,
  759,
  761,
  763,
  781,
  785,
  787,
  791,
  835,
  853,
  857,
  939,
  989,
  1013,
  1079,
  1089,
  1112,
  1190},
 '9th': {3232},
 'bootstrap': {1,
  4,
  17,
  35,
  36,
  39,
  60,
  61,
  73,
  74,
  75,
  133,
  148,
  155,
  167,
  182,
  184,
  221,
  230,
  255,
  258,
  274,
  333,
  383,
  384,
  394,
  422,
  424,
  441,
  452,
  454,
  481,
  557,
  563,
  564,
  571,
  583,
  605,
  606,
  610,
  615,
  680,
  690,
  709,
  721,
  799,
  804,
  808,
  827,
  837,
  849,
  851,
  857,
  858,
  865,
  869,
  881,
  924,
  941,
  943,
  994,
  1015,
  1031,
  1044,
  1052,
  1055,
  1059,
  1064,
  1067,
  1076,
  1099,
  1107,
  1116,
  1228},
 'compris': {20,
  56,
  83,
  156,
  157,
  256,
  265,
  362,
  381,
  454,
  458,
  465,
  652,
  868,
  943,
  1003,
  1083,
  1143,
  1168,
  1180,
  1185},
 'icf': {3

In [41]:
query = 'java oracle'
matched = search(inverse_index, query)
print(descriptions.loc[matched].apply(lambda x: ' '.join(x)).head(10).to_markdown())


|      | job_description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                