In [63]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
from pandarallel import pandarallel
import string
import numpy as np
import timeit

pandarallel.initialize()

path = r"./resources/software_developer_united_states_1971_20191023_1.csv"


INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [64]:
def extract_description(df: pd.DataFrame):
    s = df["job_description"]
    s = s.parallel_apply(
        lambda s: s.lower()
        .translate(str.maketrans("", "", string.punctuation + "\xa0"))
        .translate(str.maketrans(string.whitespace, " " * len(string.whitespace)))
    )
    return s


def tokenize(s: pd.Series):
    return s.parallel_apply(lambda s: set(word_tokenize(s)))

def remove_stopwords(s: pd.Series):
    stopwords_set = set(stopwords.words())
    return s.parallel_apply(lambda s: s.difference(stopwords_set))

def filter_word_length(s: pd.Series, n: int):
    return s.parallel_apply(lambda s: {word for word in s if len(word) >= n})

def stem(s: pd.Series):
    ps = PorterStemmer()
    return s.parallel_apply(lambda s: {ps.stem(word) for word in s})


def transformation_pipe(df: pd.DataFrame):
    s = extract_description(df)
    s = tokenize(s)
    s = remove_stopwords(s)
    s = filter_word_length(s, 3)
    s = stem(s)
    return s


In [65]:
rows = pd.read_csv(path)
descriptions = transformation_pipe(rows)
descriptions


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def benchmark(size):
    cv = CountVectorizer(analyzer=lambda x: x)
    print(f"{size=} counting")
    X = cv.fit_transform(descriptions[:size])
    XX = X.toarray()
    print(f"{size=} starting")
    result = [timeit.timeit(lambda: np.matmul(XX, XX.T), number=1),
    timeit.timeit(lambda: X.todok()*X.T.todok(),number=1),
    timeit.timeit(lambda: X.tolil()*X.T.tolil(),number=1),
    timeit.timeit(lambda: X.tocoo()*X.T.tocoo(),number=1),
    timeit.timeit(lambda: X.tocsc()*X.T.tocsc(),number=1)]
    print(f"{size=} finished")
    return pd.Series(result, index=["array", "dok", "lil", "coo", "csc"])



In [None]:
sizes = [len(descriptions) // 2 ** n for n in range(0, 6)]

perf_df = pd.DataFrame(sizes, columns=["size"])
perf_df


Unnamed: 0,size
0,312
1,625
2,1250
3,2500
4,5000
5,10000


In [None]:
perf_df.join(perf_df["size"].parallel_apply(lambda x: benchmark(x)))



size=312 countingsize=625 countingsize=1250 counting
size=5000 countingsize=10000 counting
size=2500 counting



size=312 starting
size=625 starting
size=1250 starting
size=2500 starting
size=312 finished
size=625 finished
size=1250 finished


KeyboardInterrupt: 