In [1]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import numpy as np
import re
import pandas as pd
import string
from pandarallel import pandarallel
import typing
from scipy import sparse

pandarallel.initialize()
nltk.download("stopwords")
nltk.download("punkt")

%load_ext line_profiler


INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mansmooth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mansmooth/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 12-17 - Bag of words

In [2]:
def create_stem_cache(cleaned_description: pd.DataFrame):
    tokenized_description = cleaned_description.parallel_apply(
        lambda s: word_tokenize(s)
    )
    concated: np.ndarray = pd.unique(tokenized_description.explode())
    ps = PorterStemmer()
    stem_cache = {}
    for s in concated:
        stem_cache[s] = ps.stem(s)
    return stem_cache


def create_custom_preprocessor(stop_dict: set[str], stem_cache):
    def custom_preprocessor(s):
        ps = PorterStemmer()
        s = re.sub(r"[^A-Za-z]", " ", s)
        s = re.sub(r"\s+", " ", s)
        s = word_tokenize(s)
        s = set(s).difference(stop_dict)
        s = [word for word in s if len(word) > 2]
        s = [stem_cache[w] if w in stem_cache else ps.stem(w) for w in s]
        s = " ".join(s)
        return s

    return custom_preprocessor


def sk_vectorize(texts, cleaned_description, stop_dict, stem_cache):
    my_custom_preprocessor = create_custom_preprocessor(stop_dict, stem_cache)
    vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor)
    vectorizer.fit(cleaned_description)
    query = vectorizer.transform(texts)
    print(query)
    print(vectorizer.inverse_transform(query))


def extract_description(df: pd.DataFrame):
    s = df["job_description"]
    s = s.parallel_apply(
        lambda s: s.lower()
        .translate(str.maketrans("", "", string.punctuation + "\xa0"))
        .translate(str.maketrans(string.whitespace, " " * len(string.whitespace)))
    )
    return s


In [44]:
m1 = pd.read_csv("./resources/software_developer_united_states_1971_20191023_1.csv")
cleaned_description = extract_description(m1)
cleaned_description.drop_duplicates(inplace=True)
stem_cache = create_stem_cache(cleaned_description)
stop_dict = set(stopwords.words("english"))


In [44]:
sk_vectorize(
    ["python is simpler than java", "java is simpler than python"], cleaned_description, stop_dict, stem_cache
)


  (0, 13947)	1
  (0, 21383)	1
  (0, 24234)	1
  (1, 13947)	1
  (1, 21383)	1
  (1, 24234)	1
[array(['java', 'python', 'simpler'], dtype='<U124'), array(['java', 'python', 'simpler'], dtype='<U124')]


In [48]:
my_custom_preprocessor = create_custom_preprocessor(stop_dict, stem_cache)
bigram_vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor, ngram_range=(1, 2))
bigram_vectorizer.fit(cleaned_description)
print(len(bigram_vectorizer.get_feature_names_out()))


333891


In [49]:
my_custom_preprocessor = create_custom_preprocessor(stop_dict, stem_cache)
trigram_vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor, ngram_range=(1, 3))
trigram_vectorizer.fit(cleaned_description)
print(len(trigram_vectorizer.get_feature_names_out()))


1067597


# 25 - tf-idf

In [24]:
my_custom_preprocessor = create_custom_preprocessor(stop_dict, stem_cache)
vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor)
X: sparse.csr_matrix = vectorizer.fit_transform(cleaned_description)
X


<10000x30513 sparse matrix of type '<class 'numpy.int64'>'
	with 1825009 stored elements in Compressed Sparse Row format>

In [14]:
def log10_1p(x: np.matrix[any, any]) -> np.matrix[any, any]:
   return np.log1p(x) / np.log(10)

N = len(cleaned_description)

df = X.sum(axis=0)
idf = log10_1p((N / df))
tf = log10_1p(X)

tf_idf: sparse.coo_matrix = tf.multiply(idf)


In [15]:
X_df = pd.DataFrame.sparse.from_spmatrix(tf_idf, columns=vectorizer.get_feature_names_out())
max_term = X_df.sum().sort_values()[-20:].sort_index().index
X_df[max_term].head(20)


Unnamed: 0,applic,code,design,environ,experi,includ,new,requir,respons,servic,skill,softwar,solut,system,team,technolog,test,use,work,year
0,0.099334,0.162478,0.120492,0.112247,0.089573,0.095101,0.0,0.067702,0.094726,0.104807,0.106943,0.09422,0.108942,0.0,0.088365,0.087232,0.169165,0.148854,0.064118,0.156659
1,0.062673,0.0,0.076022,0.0,0.089573,0.0,0.127458,0.067702,0.094726,0.104807,0.106943,0.09422,0.108942,0.0,0.088365,0.13826,0.0,0.0,0.064118,0.098841
2,0.099334,0.162478,0.0,0.0,0.089573,0.0,0.0,0.067702,0.0,0.0,0.106943,0.09422,0.108942,0.09219,0.088365,0.0,0.13406,0.0,0.101624,0.098841
3,0.0,0.162478,0.0,0.112247,0.089573,0.0,0.0,0.0,0.0,0.0,0.0,0.09422,0.108942,0.09219,0.140055,0.0,0.084583,0.0,0.064118,0.0
4,0.062673,0.0,0.076022,0.0,0.089573,0.0,0.127458,0.067702,0.094726,0.104807,0.106943,0.09422,0.108942,0.0,0.088365,0.13826,0.0,0.0,0.064118,0.098841
5,0.099334,0.102512,0.076022,0.0,0.089573,0.0,0.0,0.067702,0.094726,0.0,0.106943,0.09422,0.0,0.0,0.0,0.087232,0.0,0.0,0.0,0.098841
6,0.062673,0.0,0.076022,0.0,0.089573,0.0,0.0,0.067702,0.0,0.0,0.106943,0.09422,0.0,0.09219,0.140055,0.087232,0.0,0.093916,0.064118,0.098841
7,0.062673,0.102512,0.076022,0.0,0.089573,0.0,0.127458,0.067702,0.094726,0.0,0.106943,0.09422,0.108942,0.09219,0.17673,0.087232,0.13406,0.148854,0.101624,0.098841
8,0.125346,0.162478,0.076022,0.177907,0.089573,0.095101,0.127458,0.107306,0.094726,0.104807,0.169501,0.09422,0.172669,0.146117,0.088365,0.13826,0.13406,0.093916,0.101624,0.098841
9,0.0,0.102512,0.076022,0.0,0.0,0.150731,0.0,0.067702,0.094726,0.0,0.0,0.09422,0.0,0.09219,0.140055,0.087232,0.084583,0.093916,0.0,0.0


# 27 - Bigram, Trigram Performance

In [31]:
my_custom_preprocessor =create_custom_preprocessor(stop_dict, stem_cache)
bigram_vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor, ngram_range=(2, 2))
bi_X: sparse.csr_matrix = bigram_vectorizer.fit_transform(cleaned_description)
bi_X


<10000x302996 sparse matrix of type '<class 'numpy.int64'>'
	with 2020211 stored elements in Compressed Sparse Row format>

In [29]:
def log10_1p(x: np.matrix[any, any]) -> np.matrix[any, any]:
   return np.log1p(x) / np.log(10)

bi_N = len(cleaned_description)

bi_df = bi_X.sum(axis=0)
bi_idf = log10_1p((bi_N / bi_df))
bi_tf = log10_1p(bi_X)

bi_tf_idf: sparse.coo_matrix = bi_tf.multiply(bi_idf)


In [36]:
bi_X_df = pd.DataFrame.sparse.from_spmatrix(bi_tf_idf, columns=bigram_vectorizer.get_feature_names_out())
bi_max_term = bi_X_df.sum().sort_values()[-20:].sort_index().index
bi_X_df[bi_max_term].head(20)


Unnamed: 0,comput inform,degre base,develop part,employ servic,environ develop,java posit,nation work,practic develop,requir respons,requir support,skill requir,softwar creat,softwar provid,strong test,system technolog,team solut,technolog say,test web,use technolog,web field
0,0.0,0.0,0.0,0.0,0.2049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.244844,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167149,0.0,0.0,0.0,0.0,0.235233,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.27624,0.0,0.0,0.0,0.167149,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.2049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.212829,0.0,0.0,0.0,0.0,0.242438,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167149,0.0,0.0,0.0,0.0,0.235233,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.244844,0.0
7,0.236843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25431,0.0,0.167149,0.0,0.0,0.0,0.0,0.235233,0.0,0.0,0.244844,0.0
8,0.0,0.0,0.0,0.241727,0.0,0.0,0.0,0.0,0.0,0.0,0.167149,0.281294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.274829,0.0,0.0,0.0,0.0,0.212829,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
my_custom_preprocessor =create_custom_preprocessor(stop_dict, stem_cache)
trigram_vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor, ngram_range=(3, 3))
tri_X: sparse.csr_matrix = trigram_vectorizer.fit_transform(cleaned_description)
tri_X


<10000x731723 sparse matrix of type '<class 'numpy.int64'>'
	with 2010453 stored elements in Compressed Sparse Row format>

In [38]:
def log10_1p(x: np.matrix[any, any]) -> np.matrix[any, any]:
   return np.log1p(x) / np.log(10)

tri_N = len(cleaned_description)

tri_df = tri_X.sum(axis=0)
tri_idf = log10_1p((tri_N / tri_df))
tri_tf = log10_1p(tri_X)

tri_tf_idf: sparse.coo_matrix = tri_tf.multiply(tri_idf)


In [39]:
tri_X_df = pd.DataFrame.sparse.from_spmatrix(tri_tf_idf, columns=trigram_vectorizer.get_feature_names_out())
tri_max_term = tri_X_df.sum().sort_values()[-20:].sort_index().index
tri_X_df[tri_max_term].head(20)


Unnamed: 0,abil strong test,comput inform analysi,develop part implement,ensur nation work,environ write develop,gender degre base,ident ensur nation,need custom qualif,problem team solut,requir respons project,skill requir requir,skill requir respons,softwar provid creat,strong test web,system team solut,team great solut,test web field,time use technolog,understand java posit,use technolog say
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.442416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.411178,0.0,0.315651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.393868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
tri_sum = tri_X_df.sum().sort_values(ascending=False)
tri_sum


skill requir respons       309.969290
comput inform analysi      231.006186
softwar provid creat       216.313802
team great solut           208.445272
skill requir requir        203.629780
                              ...    
hire javascript passion      1.204133
hire javascript node         1.204133
hire javascript modifi       1.204133
hire javascript method       1.204133
lead technolog data          1.204133
Length: 731723, dtype: Sparse[float64, 0]

Trigram is slower than Bigram which is slower than Unigram due to increasing amount of features.