In [1]:
from bs4 import BeautifulSoup
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from collections import Counter
from itertools import compress

In [2]:
is_working_with_easy_dataset = True

In [3]:
if is_working_with_easy_dataset:
    input_train_filename = "jd_easy_train.csv"
    input_test_filename = "jd_difficult_train.csv"
else:
    input_train_filename = "jd_difficult_train.csv"
    input_test_filename = "jd_difficult_test.csv"

jd_train = pd.read_csv(input_train_filename, keep_default_na=False)
jd_test = pd.read_csv(input_test_filename, keep_default_na=False)

In [4]:
def get_text(html_text):
    soup = BeautifulSoup(html_text)
    return soup.get_text()

def get_tokenized_text(txt):
    return nltk.word_tokenize(txt)

def get_nouns_and_adjs(tokenized_text):
    noun_and_adj_tags = [
        "NN", "NNS", "NNP", "NNPS",
        "JJ", "JJR", "JJS"]
    word_tags = nltk.pos_tag(tokenized_text)
    return [word_tag[1] in noun_and_adj_tags for word_tag in word_tags]

def lemmatize(word):
    return WordNetLemmatizer().lemmatize(word)

def get_top_lemmatized_noun_adj(dataset, num_stems):
    # Get stems by category
    stems_by_cat = {}
    for index, data in dataset.iterrows():
        category = data[0]
        is_noun_or_adj = data[3]
        lemmatized_tokens = data[4]
        lemmatized_nouns_and_adj = list(compress(lemmatized_tokens, is_noun_or_adj))
        if category not in stems_by_cat:
            stems_by_cat[category] = Counter()
        stems_by_cat[category].update(lemmatized_nouns_and_adj)

    # Filter to top num_stems per category
    top_stems_by_cat = {}
    for category, stems in stems_by_cat.items():
        top_stems_by_cat[category] = list(map(lambda x: x[0], stems.most_common(num_stems)))
    
    # Get bag of unique stems among top stems for all categories
    top_stems = set()
    for stems in top_stems_by_cat.values():
        for stem in stems:
            top_stems.add(stem)
    top_stems = list(top_stems)
    return top_stems

def get_tfidf(dataset, top_stems):
    vectorizer = TfidfVectorizer(vocabulary=top_stems)
    vectors = vectorizer.fit_transform(dataset.lemmatized_tokens.map(lambda x: " ".join(x)))
    return vectors.toarray()

In [5]:
jd_train["tokens"] = jd_train.description.map(
    lambda x: get_text(x)).map(
    lambda x: get_tokenized_text(x))

In [6]:
jd_train["is_noun_or_adj"] = [get_nouns_and_adjs(text) for text in jd_train.tokens]

In [7]:
jd_train["lemmatized_tokens"] = jd_train.tokens.map(
    lambda x: [lemmatize(word) for word in x])

In [8]:
jd_train.head()

Unnamed: 0,category,description,tokens,is_noun_or_adj,lemmatized_tokens
0,software+engineer,"<div class=""jobsearch-jobDescriptionText"" dir=...","[Roles, &, Responsibilities, Responsibilities,...","[True, False, True, True, False, True, False, ...","[Roles, &, Responsibilities, Responsibilities,..."
1,arts,"<div class=""jobsearch-jobDescriptionText"" dir=...","[LECTURER, –, GAME, ART, –, SINGAPORE, CAMPUS,...","[True, True, True, True, True, True, True, Fal...","[LECTURER, –, GAME, ART, –, SINGAPORE, CAMPUS,..."
2,hr,"<div class=""jobsearch-jobDescriptionText"" dir=...","[HR, PROJECT, SPECIALIST, Our, team, in, Singa...","[True, True, True, False, True, False, True, F...","[HR, PROJECT, SPECIALIST, Our, team, in, Singa..."
3,arts,"<div class=""jobsearch-jobDescriptionText"" dir=...","[Job, Description, Our, client, is, in, the, i...","[True, True, False, True, False, False, False,...","[Job, Description, Our, client, is, in, the, i..."
4,arts,"<div class=""jobsearch-jobDescriptionText"" dir=...","[Responsibilities, Design, EDM, ,, email, camp...","[True, True, True, False, True, True, False, T...","[Responsibilities, Design, EDM, ,, email, camp..."


In [9]:
top_stems = get_top_lemmatized_noun_adj(jd_train, 50)
len(top_stems)

116

In [10]:
description_top_stem_vector_train = pd.DataFrame(get_tfidf(jd_train, top_stems), columns=top_stems)
bag_train = pd.concat([jd_train.category, description_top_stem_vector_train], axis=1)
bag_train.head()



Unnamed: 0,category,initiative,account,quality,creative,test,Work,system,Photoshop,solution,...,new,candidate,good,asset,experience,project,graphic,Good,matter,order
0,software+engineer,0.0,0.0,0.134146,0.0,0.0,0.0,0.370467,0.0,0.392305,...,0.199652,0.0,0.0,0.0,0.130771,0.0,0.0,0.0,0.0,0.0
1,arts,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.06027,0.0,0.168427,0.0,0.0,0.0
2,hr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.076998,0.0,0.14348,0.331219,0.0,0.0,0.122337,0.0
3,arts,0.0,0.0,0.0,0.113493,0.0,0.0,0.0,0.0,0.0,...,0.0,0.072032,0.0,0.0,0.092451,0.0,0.258356,0.0,0.0,0.0
4,arts,0.0,0.0,0.0,0.149018,0.0,0.0,0.0,0.0,0.0,...,0.046332,0.0,0.097715,0.0,0.030347,0.0,0.254421,0.0,0.077626,0.0


In [11]:
jd_test["tokens"] = jd_test.description.map(
    lambda x: get_text(x)).map(
    lambda x: get_tokenized_text(x))

jd_test["is_noun_or_adj"] = [get_nouns_and_adjs(text) for text in jd_test.tokens]

jd_test["lemmatized_tokens"] = jd_test.tokens.map(
    lambda x: [lemmatize(word) for word in x])

description_top_stem_vector_test = pd.DataFrame(get_tfidf(jd_test, top_stems), columns=top_stems)
bag_test = pd.concat([jd_test.category, description_top_stem_vector_test], axis=1)
bag_test.head()



Unnamed: 0,category,initiative,account,quality,creative,test,Work,system,Photoshop,solution,...,new,candidate,good,asset,experience,project,graphic,Good,matter,order
0,software+engineer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.14912,0.0,0.286011,0.277,0.0,0.0,0.0,0.0
1,software+engineer,0.0,0.0,0.0,0.0,0.0,0.0,0.19869,0.0,0.0,...,0.070526,0.0,0.072206,0.0,0.507803,0.0,0.0,0.0,0.0,0.0
2,software+engineer,0.0,0.0,0.0,0.0,0.0,0.0,0.322524,0.0,0.0,...,0.0,0.129112,0.0,0.0,0.074936,0.108862,0.0,0.0,0.0,0.0
3,ui+ux,0.0,0.0,0.0,0.0,0.111267,0.0,0.0,0.0,0.082686,...,0.083054,0.09367,0.085034,0.0,0.271825,0.39489,0.0,0.0,0.0,0.191427
4,ui+ux,0.0,0.0,0.032258,0.0,0.0,0.0,0.0,0.0,0.0,...,0.050744,0.0,0.0,0.04503,0.049824,0.120635,0.061388,0.0,0.0,0.116957


In [12]:
if is_working_with_easy_dataset:
    output_train_filename = "bag_easy_train.csv"
    output_test_filename = "bag_easy_test.csv"
else:
    output_train_filename = "bag_difficult_train.csv"
    output_test_filename = "bag_difficult_test.csv"

bag_train.to_csv(output_train_filename, index=False)
bag_test.to_csv(output_test_filename, index=False)