# Training

In [14]:
from bag_of_words import *

In [15]:
# Dataset switch read by the later cells: True selects the "*_easy_*" file
# names, False selects the "*_difficult_*" ones.
is_working_with_easy_dataset = False

In [16]:
# Choose the train/test job-description CSV pair for the selected dataset.
input_train_filename, input_test_filename = (
    ("jd_easy_train.csv", "jd_easy_test.csv")
    if is_working_with_easy_dataset
    else ("jd_difficult_train.csv", "jd_difficult_test.csv")
)

# keep_default_na=False: pandas' default NA markers (e.g. "NA", empty cells)
# are kept as text instead of being converted to NaN.
jd_train, jd_test = (
    pd.read_csv(csv_path, keep_default_na=False)
    for csv_path in (input_train_filename, input_test_filename)
)

In [17]:
# description -> get_text -> get_tokenized_text, applied row by row.
# The recorded preview shows description holding raw HTML and tokens holding
# word lists, so get_text presumably strips the markup (confirm in bag_of_words).
jd_train["tokens"] = jd_train.description.map(get_text).map(get_tokenized_text)

In [18]:
# One result per row, computed from that row's token list. In the recorded
# preview each result is a list of booleans, apparently one flag per token.
jd_train["is_noun_or_adj"] = jd_train.tokens.map(get_nouns_and_adjs)

In [19]:
# Lemmatize every token individually; each row keeps a list of the same length
# and order as its "tokens" entry.
jd_train["lemmatized_tokens"] = jd_train.tokens.map(
    lambda token_list: [lemmatize(token) for token in token_list]
)

In [20]:
# Preview the first rows to eyeball the derived columns
# (tokens, is_noun_or_adj, lemmatized_tokens).
jd_train.head()

Unnamed: 0,category,description,tokens,is_noun_or_adj,lemmatized_tokens
0,software+engineer,"<div class=""jobsearch-jobDescriptionText"" dir=...","[Well-established, firm, in, the, semiconducto...","[True, True, False, False, True, True, True, T...","[Well-established, firm, in, the, semiconducto..."
1,software+engineer,"<div class=""jobsearch-jobDescriptionText"" dir=...","[A, world, empowered, by, autonomy, ., We, bui...","[False, True, False, False, True, False, False...","[A, world, empowered, by, autonomy, ., We, bui..."
2,software+engineer,"<div class=""jobsearch-jobDescriptionText"" dir=...","[Hi, there, ,, our, client, is, currently, loo...","[True, False, False, False, True, False, False...","[Hi, there, ,, our, client, is, currently, loo..."
3,ui+ux,"<div class=""jobsearch-jobDescriptionText"" dir=...","[Technical, Skill, :, UX/UI, /, HTML, /, SQLÊ,...","[True, True, False, True, True, True, True, Tr...","[Technical, Skill, :, UX/UI, /, HTML, /, SQLÊ,..."
4,ui+ux,"<div class=""jobsearch-jobDescriptionText"" dir=...","[Do, you, want, to, be, a, part, of, a, global...","[False, False, False, False, False, False, Tru...","[Do, you, want, to, be, a, part, of, a, global..."


In [21]:
# Build the vocabulary that becomes the feature columns of the bag-of-words
# frames below.
# NOTE(review): the recorded run produced 100 entries for an argument of 50, so
# 50 is not the total vocabulary size -- presumably a per-category cutoff;
# confirm against get_top_lemmatized_noun_adj in bag_of_words.
top_stems = get_top_lemmatized_noun_adj(jd_train, 50)
len(top_stems)

100

In [22]:
# Persist the vocabulary so the testing section can reload it: one stem per
# row, written without an index column or a header line.
top_stem_filename = (
    "top_stems_easy.csv" if is_working_with_easy_dataset else "top_stems_difficult.csv"
)
pd.DataFrame(top_stems).to_csv(top_stem_filename, header=False, index=False)

In [23]:
# Feature matrix for the training rows: one column per entry of top_stems.
description_top_stem_vector_train = pd.DataFrame(
    get_tfidf(jd_train.lemmatized_tokens, top_stems),
    columns=top_stems,
)

# Put the label column next to the features; both frames carry the same
# default row index, so the column-wise concat lines rows up one to one.
bag_train = pd.concat(
    [jd_train["category"], description_top_stem_vector_train],
    axis="columns",
)
bag_train.head()



Unnamed: 0,category,Job,knowledge,assessment,information,Experience,vulnerability,ÊÊ,Business,skill,...,solution,UX,quality,analytical,control,audit,new,team,opportunity,technology
0,software+engineer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117808,0.0,0.143129
1,software+engineer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.071098,0.152323,0.0,0.123375
2,software+engineer,0.0,0.080771,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.290684,0.0,0.0,0.068451,0.0,0.0
3,ui+ux,0.0,0.0,0.120286,0.073285,0.0,0.133085,0.0,0.0,0.112439,...,0.068614,0.0,0.0,0.0,0.0,0.0,0.068919,0.098438,0.0,0.059797
4,ui+ux,0.0,0.041995,0.0,0.0,0.0,0.0,0.0,0.0,0.040652,...,0.0,0.0,0.063359,0.0,0.0,0.0,0.09967,0.142358,0.229782,0.043239


# Testing

In [24]:
# Reload the vocabulary written by the training section so the test features
# use exactly the same columns.
if is_working_with_easy_dataset:
    top_stem_filename = "top_stems_easy.csv"
else:
    top_stem_filename = "top_stems_difficult.csv"

# dtype=str + keep_default_na=False: read every stem back verbatim as text.
# With pandas' defaults a stem such as "NA", "N/A", "null" or "nan" would come
# back as float NaN (and an all-numeric vocabulary as numbers), so the test
# columns would silently differ from the training columns.
top_stems = pd.read_csv(
    top_stem_filename,
    header=None,
    dtype=str,
    keep_default_na=False,
)[0].tolist()  # header=None labels the single column 0

In [25]:
# Same preprocessing as the training section, applied to the test rows:
# description -> text -> tokens, then per-row noun/adjective results and lemmas.
jd_test["tokens"] = jd_test.description.map(get_text).map(get_tokenized_text)
jd_test["is_noun_or_adj"] = jd_test.tokens.map(get_nouns_and_adjs)
jd_test["lemmatized_tokens"] = jd_test.tokens.map(
    lambda token_list: [lemmatize(token) for token in token_list]
)

# Feature matrix restricted to the vocabulary reloaded from disk.
# NOTE(review): get_tfidf only receives the test rows here; if it derives
# document frequencies from its input, these weights are not on the same scale
# as the training ones -- confirm in bag_of_words.
description_top_stem_vector_test = pd.DataFrame(
    get_tfidf(jd_test.lemmatized_tokens, top_stems),
    columns=top_stems,
)
bag_test = pd.concat(
    [jd_test["category"], description_top_stem_vector_test],
    axis="columns",
)
bag_test.head()



Unnamed: 0,category,Job,knowledge,assessment,information,Experience,vulnerability,ÊÊ,Business,skill,...,solution,UX,quality,analytical,control,audit,new,team,opportunity,technology
0,data+analyst,0.0,0.173064,0.0,0.0,0.0,0.0,0.0,0.0,0.248572,...,0.0,0.0,0.0,0.132009,0.0,0.0,0.0,0.0,0.116499,0.0
1,cyber+security,0.0,0.10125,0.0,0.127235,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085353,0.136315,0.0
2,software+engineer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.175637,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13856,0.100835
3,ui+ux,0.0,0.073967,0.0,0.046475,0.0,0.0,0.0,0.0,0.141651,...,0.462844,0.0,0.05411,0.05642,0.0,0.0,0.042015,0.09353,0.049791,0.07247
4,ui+ux,0.0,0.051684,0.0,0.064949,0.0,0.0,0.0,0.0,0.148468,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043569,0.0,0.202553


In [26]:
# Output file names follow the same easy/difficult switch as the inputs.
output_train_filename, output_test_filename = (
    ("bag_easy_train.csv", "bag_easy_test.csv")
    if is_working_with_easy_dataset
    else ("bag_difficult_train.csv", "bag_difficult_test.csv")
)

# Write label + feature columns only; the row index is not saved.
bag_train.to_csv(path_or_buf=output_train_filename, index=False)
bag_test.to_csv(path_or_buf=output_test_filename, index=False)