In [24]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
from collections import Counter

In [25]:
is_working_with_easy_dataset = True

In [26]:
if is_working_with_easy_dataset:
    input_train_filename = "jd_easy_train.csv"
    input_test_filename = "jd_difficult_train.csv"
else:
    input_train_filename = "jd_difficult_train.csv"
    input_test_filename = "jd_difficult_test.csv"

jd_train = pd.read_csv(input_train_filename, keep_default_na=False)
jd_test = pd.read_csv(input_test_filename, keep_default_na=False)

In [27]:
def remove_html(txt):
    txt = txt.lower()
    txt = txt.split("Job Type")
    txt = txt[0]
    txt = re.findall("\>(.*?)\<", txt)
    txt = " ".join(txt)
    txt = txt.split()
    stop_words = stopwords.words("english")
    txt = [x for x in txt if x not in stop_words]
    txt = " ".join(txt)
    return txt

def get_tokenized_text(txt):
    return nltk.word_tokenize(txt)

def get_nouns_and_adjs(tokenized_text):
    noun_and_adjs_tags = [
        "NN", "NNS", "NNP", "NNPS",
        "JJ", "JJR", "JJS"]
    return [word for (word, pos) in nltk.pos_tag(tokenized_text) if pos in noun_and_adjs_tags]

def lemmatize(word, lemmatizer):
    return lemmatizer.lemmatize(word)

def get_top_stems(dataset, num_stems):
    # Get stems by category
    stems_by_cat = {}
    for index, data in dataset.iterrows():
        category = data[0]
        description_stems = data[2]
        if category not in stems_by_cat:
            stems_by_cat[category] = Counter()
        stems_by_cat[category].update(description_stems)

    # Filter to top num_stems per category
    top_stems_by_cat = {}
    for category, stems in stems_by_cat.items():
        top_stems_by_cat[category] = list(map(lambda x: x[0], stems.most_common(num_stems)))
    
    # Get bag of unique stems among top stems for all categories
    top_stems = set()
    for stems in top_stems_by_cat.values():
        for stem in stems:
            top_stems.add(stem)
    top_stems = list(top_stems)
    return top_stems

def project_docs_to_top_stems(docs, top_stems):
    vectors = []
    for doc in docs:
        vectors.append(project_doc_to_top_stems(doc, top_stems))
    return vectors

def project_doc_to_top_stems(doc, top_stems):
    vector = []
    for stem in top_stems:
        vector.append(doc.count(stem))
    return vector

In [28]:
lemmatizer = WordNetLemmatizer()
jd_train["stems"] = jd_train.description.map(
    lambda x: [lemmatize(word, lemmatizer) for word in get_nouns_and_adjs(get_tokenized_text(x))])

jd_train.head()

Unnamed: 0,category,description,stems
0,software+engineer,"<div class=""jobsearch-jobDescriptionText"" dir=...","[<, div, class=, jobsearch-jobDescriptionText,..."
1,software+engineer,"<div class=""jobsearch-jobDescriptionText"" dir=...","[<, div, class=, jobsearch-jobDescriptionText,..."
2,software+engineer,"<div class=""jobsearch-jobDescriptionText"" dir=...","[<, div, class=, jobsearch-jobDescriptionText,..."
3,ui+ux,"<div class=""jobsearch-jobDescriptionText"" dir=...","[<, div, class=, jobsearch-jobDescriptionText,..."
4,ui+ux,"<div class=""jobsearch-jobDescriptionText"" dir=...","[<, div, class=, jobsearch-jobDescriptionText,..."


In [29]:
top_stems = get_top_stems(jd_train, 50)
len(top_stems)

89

In [30]:
description_top_stem_vector_train = pd.DataFrame(project_docs_to_top_stems(jd_train.description, top_stems), columns=top_stems)
bag_train = pd.concat([jd_train.category, description_top_stem_vector_train], axis=1)
bag_train

Unnamed: 0,test,development,service,br/,analytics,technology,system,ul,Job,/,...,ÊÊ,Singapore,Security,b,jobDescriptionText,OT,class=,audit,li,cyber
0,1,2,0,1,0,0,0,12,1,35,...,0,1,0,29,2,0,5,0,40,0
1,0,0,2,1,0,1,3,21,1,52,...,0,1,0,47,2,0,1,0,55,0
2,0,0,0,4,0,0,0,41,1,62,...,0,0,0,41,2,1,1,0,48,0
3,0,0,0,1,0,1,0,71,2,90,...,0,1,5,29,2,0,1,0,74,0
4,0,0,0,12,0,0,0,21,0,82,...,0,0,0,104,2,0,8,0,69,0


In [31]:
jd_test.description = jd_test.description.map(lambda x : remove_html(x))
description_top_stem_vector_test = pd.DataFrame(project_docs_to_top_stems(jd_test.description, top_stems), columns=top_stems)
bag_test = pd.concat([jd_test.category, description_top_stem_vector_test], axis=1)
bag_test

Unnamed: 0,test,development,service,br/,analytics,technology,system,ul,Job,/,...,ÊÊ,Singapore,Security,b,jobDescriptionText,OT,class=,audit,li,cyber
0,0,3,1,0,1,0,0,3,0,3,...,0,0,0,22,0,0,0,0,12,0
1,0,0,1,0,0,0,1,1,0,5,...,0,0,0,11,0,0,0,0,8,1
2,0,0,0,0,0,2,7,4,0,9,...,0,0,0,18,0,0,0,0,9,0
3,6,6,0,0,0,2,4,5,0,2,...,0,0,0,33,0,0,0,0,23,0
4,2,0,2,0,0,1,0,1,0,12,...,0,0,0,44,0,0,0,0,13,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
790,1,0,4,0,0,1,0,2,0,2,...,0,0,0,14,0,0,0,0,11,0
791,1,3,0,0,5,4,1,1,0,0,...,0,0,0,23,0,0,0,0,16,2
792,0,2,0,0,0,0,0,3,0,2,...,0,0,0,22,0,0,0,0,17,0
793,0,2,1,0,0,0,0,4,0,1,...,0,0,0,14,0,0,0,0,3,0


In [32]:
if is_working_with_easy_dataset:
    output_train_filename = "bag_easy_train.csv"
    output_test_filename = "bag_easy_test.csv"
else:
    output_train_filename = "bag_difficult_train.csv"
    output_test_filename = "bag_difficult_test.csv"

bag_train.to_csv(output_train_filename, index=False)
bag_test.to_csv(output_test_filename, index=False)