In [1]:
from bs4 import BeautifulSoup
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from collections import Counter
from itertools import compress

In [2]:
is_working_with_easy_dataset = False

In [3]:
if is_working_with_easy_dataset:
    input_train_filename = "jd_easy_train.csv"
    input_test_filename = "jd_difficult_train.csv"
else:
    input_train_filename = "jd_difficult_train.csv"
    input_test_filename = "jd_difficult_test.csv"

jd_train = pd.read_csv(input_train_filename, keep_default_na=False)
jd_test = pd.read_csv(input_test_filename, keep_default_na=False)

In [4]:
def get_text(html_text):
    soup = BeautifulSoup(html_text)
    return soup.get_text()

def get_tokenized_text(txt):
    return nltk.word_tokenize(txt)

def get_nouns_and_adjs(tokenized_text):
    noun_and_adj_tags = [
        "NN", "NNS", "NNP", "NNPS",
        "JJ", "JJR", "JJS"]
    word_tags = nltk.pos_tag(tokenized_text)
    return [word_tag[1] in noun_and_adj_tags for word_tag in word_tags]

def lemmatize(word):
    return WordNetLemmatizer().lemmatize(word)

def get_top_lemmatized_noun_adj(dataset, num_stems):
    # Get stems by category
    stems_by_cat = {}
    for index, data in dataset.iterrows():
        category = data[0]
        is_noun_or_adj = data[3]
        lemmatized_tokens = data[4]
        lemmatized_nouns_and_adj = list(compress(lemmatized_tokens, is_noun_or_adj))
        if category not in stems_by_cat:
            stems_by_cat[category] = Counter()
        stems_by_cat[category].update(lemmatized_nouns_and_adj)

    # Filter to top num_stems per category
    top_stems_by_cat = {}
    for category, stems in stems_by_cat.items():
        top_stems_by_cat[category] = list(map(lambda x: x[0], stems.most_common(num_stems)))
    
    # Get bag of unique stems among top stems for all categories
    top_stems = set()
    for stems in top_stems_by_cat.values():
        for stem in stems:
            top_stems.add(stem)
    top_stems = list(top_stems)
    return top_stems

def get_tfidf(dataset, top_stems):
    vectorizer = TfidfVectorizer(vocabulary=top_stems)
    vectors = vectorizer.fit_transform(dataset.lemmatized_tokens.map(lambda x: " ".join(x)))
    return vectors.toarray()

In [5]:
jd_train["tokens"] = jd_train.description.map(
    lambda x: get_text(x)).map(
    lambda x: get_tokenized_text(x))

In [6]:
jd_train["is_noun_or_adj"] = [get_nouns_and_adjs(text) for text in jd_train.tokens]

In [7]:
jd_train["lemmatized_tokens"] = jd_train.tokens.map(
    lambda x: [lemmatize(word) for word in x])

In [8]:
jd_train.head()

Unnamed: 0,category,description,tokens,is_noun_or_adj,lemmatized_tokens
0,software+engineer,"<div class=""jobsearch-jobDescriptionText"" dir=...","[Well-established, firm, in, the, semiconducto...","[True, True, False, False, True, True, True, T...","[Well-established, firm, in, the, semiconducto..."
1,software+engineer,"<div class=""jobsearch-jobDescriptionText"" dir=...","[A, world, empowered, by, autonomy, ., We, bui...","[False, True, False, False, True, False, False...","[A, world, empowered, by, autonomy, ., We, bui..."
2,software+engineer,"<div class=""jobsearch-jobDescriptionText"" dir=...","[Hi, there, ,, our, client, is, currently, loo...","[True, False, False, False, True, False, False...","[Hi, there, ,, our, client, is, currently, loo..."
3,ui+ux,"<div class=""jobsearch-jobDescriptionText"" dir=...","[Technical, Skill, :, UX/UI, /, HTML, /, SQLÊ,...","[True, True, False, True, True, True, True, Tr...","[Technical, Skill, :, UX/UI, /, HTML, /, SQLÊ,..."
4,ui+ux,"<div class=""jobsearch-jobDescriptionText"" dir=...","[Do, you, want, to, be, a, part, of, a, global...","[False, False, False, False, False, False, Tru...","[Do, you, want, to, be, a, part, of, a, global..."


In [9]:
top_stems = get_top_lemmatized_noun_adj(jd_train, 50)
len(top_stems)

100

In [10]:
description_top_stem_vector_train = pd.DataFrame(get_tfidf(jd_train, top_stems), columns=top_stems)
bag_train = pd.concat([jd_train.category, description_top_stem_vector_train], axis=1)
bag_train.head()



Unnamed: 0,category,assessment,Management,Work,cyber,ÊÊ,Data,control,design,Experience,...,Technology,role,Java,process,Strong,skill,software,business,Ability,experience
0,software+engineer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.143276,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.345453,0.0,0.0,0.32394
1,software+engineer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061751,0.0,...,0.0,0.082574,0.0,0.0,0.0,0.057996,0.0,0.125683,0.0,0.511923
2,software+engineer,0.0,0.0,0.0,0.0,0.0,0.0,0.290684,0.499494,0.0,...,0.0,0.0,0.0,0.203606,0.0,0.0,0.100361,0.0,0.0,0.062741
3,ui+ux,0.120286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.160089,0.0,0.0732,0.0,0.112439,0.0,0.0,0.0,0.225563
4,ui+ux,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.115759,0.0,0.10586,0.0,0.040652,0.0,0.0,0.0,0.097861


In [11]:
jd_test["tokens"] = jd_test.description.map(
    lambda x: get_text(x)).map(
    lambda x: get_tokenized_text(x))

jd_test["is_noun_or_adj"] = [get_nouns_and_adjs(text) for text in jd_test.tokens]

jd_test["lemmatized_tokens"] = jd_test.tokens.map(
    lambda x: [lemmatize(word) for word in x])

description_top_stem_vector_test = pd.DataFrame(get_tfidf(jd_test, top_stems), columns=top_stems)
bag_test = pd.concat([jd_test.category, description_top_stem_vector_test], axis=1)
bag_test.head()



Unnamed: 0,category,assessment,Management,Work,cyber,ÊÊ,Data,control,design,Experience,...,Technology,role,Java,process,Strong,skill,software,business,Ability,experience
0,data+analyst,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.248572,0.0,0.523485,0.0,0.064633
1,cyber+security,0.0,0.0,0.0,0.175017,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.151252
2,software+engineer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.102496,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038436
3,ui+ux,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147327,0.0,...,0.0,0.0,0.0,0.0,0.0,0.141651,0.044691,0.335602,0.0,0.110494
4,ui+ux,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.102945,0.0,...,0.0,0.0,0.0,0.0,0.0,0.148468,0.0,0.0,0.0,0.115812


In [12]:
if is_working_with_easy_dataset:
    output_train_filename = "bag_easy_train.csv"
    output_test_filename = "bag_easy_test.csv"
else:
    output_train_filename = "bag_difficult_train.csv"
    output_test_filename = "bag_difficult_test.csv"

bag_train.to_csv(output_train_filename, index=False)
bag_test.to_csv(output_test_filename, index=False)