In [1]:
from bs4 import BeautifulSoup
import nltk
from nltk.stem import WordNetLemmatizer
import pandas as pd
from collections import Counter

In [2]:
is_working_with_easy_dataset = True

In [3]:
if is_working_with_easy_dataset:
    input_train_filename = "jd_easy_train.csv"
    input_test_filename = "jd_difficult_train.csv"
else:
    input_train_filename = "jd_difficult_train.csv"
    input_test_filename = "jd_difficult_test.csv"

jd_train = pd.read_csv(input_train_filename, keep_default_na=False)
jd_test = pd.read_csv(input_test_filename, keep_default_na=False)

In [9]:
def get_text(html_text):
    soup = BeautifulSoup(html_text)
    return soup.get_text()

def get_tokenized_text(txt):
    return nltk.word_tokenize(txt)

def get_nouns_and_adjs(tokenized_text):
    noun_and_adjs_tags = [
        "NN", "NNS", "NNP", "NNPS",
        "JJ", "JJR", "JJS"]
    return [word for (word, pos) in nltk.pos_tag(tokenized_text) if pos in noun_and_adjs_tags]

def lemmatize(word):
    return WordNetLemmatizer().lemmatize(word)

def get_top_stems(dataset, num_stems):
    # Get stems by category
    stems_by_cat = {}
    for index, data in dataset.iterrows():
        category = data[0]
        description_stems = data[2]
        if category not in stems_by_cat:
            stems_by_cat[category] = Counter()
        stems_by_cat[category].update(description_stems)

    # Filter to top num_stems per category
    top_stems_by_cat = {}
    for category, stems in stems_by_cat.items():
        top_stems_by_cat[category] = list(map(lambda x: x[0], stems.most_common(num_stems)))
    
    # Get bag of unique stems among top stems for all categories
    top_stems = set()
    for stems in top_stems_by_cat.values():
        for stem in stems:
            top_stems.add(stem)
    top_stems = list(top_stems)
    return top_stems

def project_docs_to_top_stems(docs, top_stems):
    vectors = []
    for doc in docs:
        vectors.append(project_doc_to_top_stems(doc, top_stems))
    return vectors

def project_doc_to_top_stems(doc, top_stems):
    vector = []
    for stem in top_stems:
        vector.append(doc.count(stem))
    return vector

In [10]:
jd_train["stems"] = jd_train.description.map(
    lambda x: get_text(x)).map(
    lambda x: get_tokenized_text(x)).map(
    lambda x: get_nouns_and_adjs(x)).map(
    lambda x: [lemmatize(word) for word in x])

jd_train.head()

Unnamed: 0,category,description,stems
0,software+engineer,"<div class=""jobsearch-jobDescriptionText"" dir=...","[Roles, Responsibilities, Responsibilities, Re..."
1,arts,"<div class=""jobsearch-jobDescriptionText"" dir=...","[LECTURER, –, GAME, ART, –, SINGAPORE, CAMPUS,..."
2,hr,"<div class=""jobsearch-jobDescriptionText"" dir=...","[HR, PROJECT, SPECIALIST, team, Singapore, ope..."
3,arts,"<div class=""jobsearch-jobDescriptionText"" dir=...","[Job, Description, client, interior, designing..."
4,arts,"<div class=""jobsearch-jobDescriptionText"" dir=...","[Responsibilities, Design, EDM, email, campaig..."


In [7]:
top_stems = get_top_stems(jd_train, 50)
len(top_stems)

116

In [11]:
description_top_stem_vector_train = pd.DataFrame(project_docs_to_top_stems(jd_train.description, top_stems), columns=top_stems)
bag_train = pd.concat([jd_train.category, description_top_stem_vector_train], axis=1)
bag_train

Unnamed: 0,category,role,solution,resume,data,Good,application,ability,design,skill,...,change,’,account,company,Sales,Diploma,industry,video,code,plan
0,software+engineer,0,3,0,1,0,1,0,2,0,...,1,0,0,0,0,0,0,0,1,0
1,arts,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,hr,0,0,0,7,1,1,1,0,2,...,2,0,0,0,0,1,0,0,0,0
3,arts,0,0,1,0,0,0,0,4,1,...,0,0,0,1,0,1,2,3,0,0
4,arts,0,0,0,0,1,0,0,3,1,...,0,0,0,0,0,0,0,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3177,software+engineer,0,3,0,1,4,3,0,3,2,...,0,0,0,0,0,1,1,0,1,0
3178,sales,0,1,0,0,0,0,0,0,0,...,0,2,0,0,3,0,0,0,0,0
3179,arts,0,0,0,0,0,0,0,9,3,...,0,1,0,1,0,0,0,0,0,0
3180,hr,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,1,0,0,0,1


In [13]:
jd_test.description = jd_test.description.map(lambda x : get_text(x))
description_top_stem_vector_test = pd.DataFrame(project_docs_to_top_stems(jd_test.description, top_stems), columns=top_stems)
bag_test = pd.concat([jd_test.category, description_top_stem_vector_test], axis=1)
bag_test

Unnamed: 0,category,role,solution,resume,data,Good,application,ability,design,skill,...,change,’,account,company,Sales,Diploma,industry,video,code,plan
0,software+engineer,0,0,0,0,0,2,0,0,0,...,0,0,0,1,0,1,2,0,0,0
1,software+engineer,0,0,0,11,0,0,0,0,1,...,1,2,0,1,0,0,0,0,0,1
2,software+engineer,0,0,0,0,0,0,0,7,0,...,0,0,0,1,0,1,0,0,0,0
3,ui+ux,2,1,0,0,1,1,1,0,1,...,0,0,0,0,0,1,0,0,0,1
4,ui+ux,2,0,0,2,0,0,1,0,1,...,0,0,0,3,0,0,1,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3175,software+engineer,1,9,0,0,2,9,10,1,7,...,0,0,0,0,0,0,1,0,2,0
3176,cyber+security,1,0,0,2,2,3,0,0,4,...,0,2,0,1,0,0,1,0,0,1
3177,ui+ux,0,2,0,0,1,1,0,1,0,...,0,0,0,0,1,0,0,0,0,0
3178,data+analyst,2,4,0,3,0,7,7,3,5,...,1,5,2,0,0,0,2,0,0,0


In [14]:
if is_working_with_easy_dataset:
    output_train_filename = "bag_easy_train.csv"
    output_test_filename = "bag_easy_test.csv"
else:
    output_train_filename = "bag_difficult_train.csv"
    output_test_filename = "bag_difficult_test.csv"

bag_train.to_csv(output_train_filename, index=False)
bag_test.to_csv(output_test_filename, index=False)