In [None]:
# import library
from data_util import *
import pandas as pd
from sklearn.utils import shuffle
import csv
import sys
csv.field_size_limit(sys.maxsize)

# Data preparation

In [None]:
# Create the required dataset from a company system id (passed as a string)
# import time
# start_time = time.time()
# df = get_csv_by_system("417")
# end_time = time.time()
# print(end_time - start_time)

# Preprocessing

In [None]:
'''
This function is used to create a KD tree
from job titles embedding.
'''
from sklearn.neighbors import KDTree
def create_kd_tree(vecs):
    """Build a scikit-learn ``KDTree`` over ``vecs`` and return it.

    The tree is always constructed with ``leaf_size=2``.
    """
    return KDTree(vecs, leaf_size=2)

In [None]:
'''
Given a job title query vector and a KD tree, this
function returns k nearest neighbour indexes in the
KD tree job titles embedding.
'''
from sklearn.neighbors import KDTree
def knn(k, tree, query):
    """Return the indices of the ``k`` nearest neighbours of ``query``.

    ``tree.query`` must return a ``(distances, indices)`` pair; the
    distances are discarded and only the indices are handed back.
    """
    _distances, neighbour_indices = tree.query(query, k=k)
    return neighbour_indices

In [None]:
def filter_job_titles(row, job_titles_distinct):
    """Return ``row`` itself when its job title is allowed, otherwise NaN.

    ``row`` is indexed with ``'job_title'`` and the value is looked up in
    ``job_titles_distinct``.
    """
    is_allowed = row['job_title'] in job_titles_distinct
    return row if is_allowed else np.nan

In [None]:
'''
This function filters all the job titles with insufficient
CV data(ie. the number of rows of CVs are less than a predefined
threshold such as 90 here).
'''
def filter_by_cv_count_for_job_title(df, threshold=90):
    """Keep only the rows whose job title has more than ``threshold`` CVs.

    Args:
        df: DataFrame with at least a ``job_title`` column, one row per CV.
        threshold: Row count a job title must strictly exceed to be kept
            (a title with exactly ``threshold`` rows is dropped).

    Returns:
        The rows of ``df`` whose ``job_title`` occurs more than ``threshold``
        times, with the original columns and index. Rows with a missing
        ``job_title`` are never kept.
    """
    # value_counts() skips missing titles, so NaN titles can never qualify.
    title_counts = df['job_title'].value_counts()
    frequent_titles = title_counts.index[title_counts > threshold]
    # A boolean mask avoids one Python call per row and does not rely on
    # dropna(), which would also discard kept rows that merely have a NaN
    # in some unrelated column.
    return df[df['job_title'].isin(frequent_titles)]

In [None]:
'''
Loading data from a file path.
'''
import pandas as pd
from sklearn.utils import shuffle
# Relative path of the CSV holding the CVs and their outcomes.
file_path = 'data/417_CVs_outcome.csv'
df = pd.read_csv(file_path)
# Drop the CVs of job titles that do not have enough rows.
df = filter_by_cv_count_for_job_title(df)
# Multiplying by 1 turns a boolean flag into 0/1.
# presumably `interview` is stored as True/False in the CSV — TODO confirm
df['interview'] = df['interview'] * 1
# df = df.sample(n=35000)
# df_interviewed = df.query('interview == 1')
# df_failed = df.query('interview == 0')
# df = pd.concat([df_interviewed.sample(n=50), df_failed.sample(n=50)])
# df.to_csv('data/CVs_outcome_total.csv', index=False)

In [None]:
# file_path = 'data/CVs_outcome_total.csv'
# df = pd.read_csv(file_path)

In [None]:
# Run `normalize_jobtitle` on every row and keep the returned rows as the new frame.
# presumably the helper comes from data_util and cleans the `job_title` column — confirm there
df = df.apply(lambda row: normalize_jobtitle(row), axis=1)

In [None]:
# Build the job-title lookup from the `job_title` column and persist it to disk.
# presumably maps each distinct title to its embedding vector (it is used that way
# further down via .keys()/.values()) — confirm in data_util
job_vec_dict = create_jobtitle_vec_dict(df['job_title'])
save_obj(job_vec_dict, "data/job_vec_dict.pickle")

In [None]:
# Index the vectors in dict iteration order: a position returned by the tree is
# later mapped back to a title through list(job_vec_dict.keys()), so the two
# orders must stay aligned.
dict_values = list(job_vec_dict.values())
tree = create_kd_tree(dict_values)
save_obj(tree, "data/kd_tree.pickle")

In [None]:
# Run `encoder_jobtitle` on every row with the title lookup built above.
# presumably this adds the `job_title_vec` column selected in the split cell — confirm in data_util
df = df.apply(lambda row: encoder_jobtitle(row, job_vec_dict), axis=1)

In [None]:
# Preview the first rows of the encoded frame.
df.head()

In [None]:
import math
def split_dataframe(df):
    """Cut ``df`` into consecutive train / validation / test slices.

    The cut points sit at 60% and 80% of ``len(df)``, rounded down, and the
    original row order is kept inside every slice.

    Returns:
        A ``(train, val, test)`` tuple of slices of ``df``.
    """
    n_rows = len(df)
    train_end = math.floor(n_rows * 0.6)
    val_end = math.floor(n_rows * 0.8)
    return df[0:train_end], df[train_end:val_end], df[val_end:n_rows]

In [None]:
'''
Split data into training, validation and test set with
proportions 60%, 20% and 20% respectively.
'''
# Fixed seed so the shuffled splits written to disk are identical on every run.
SPLIT_SEED = 42

df = df[["text", "interview", "job_title_vec"]]
# Split each outcome class on its own so every subset receives the same
# 60/20/20 share of interviewed and non-interviewed CVs.
df_interviewed = df.query('interview == 1')
df_failed = df.query('interview == 0')
data_interviewed_split = split_dataframe(df_interviewed)
data_failed_split = split_dataframe(df_failed)
# Mix the two classes inside each subset; random_state makes the order reproducible.
train = shuffle(pd.concat([data_interviewed_split[0], data_failed_split[0]]),
                random_state=SPLIT_SEED)
val = shuffle(pd.concat([data_interviewed_split[1], data_failed_split[1]]),
              random_state=SPLIT_SEED)
test = shuffle(pd.concat([data_interviewed_split[2], data_failed_split[2]]),
               random_state=SPLIT_SEED)
train.to_csv('data/CVs_outcome_train.csv', index=False)
val.to_csv('data/CVs_outcome_val.csv', index=False)
test.to_csv('data/CVs_outcome_test.csv', index=False)

In [None]:
'''
This function calculates the weight for the loss function to use.
The weight is calculated by counting all the job title and interview
state pairs and normalizing the counts.
'''
from multiprocessing import Pool
import time
from tqdm import tqdm_notebook
import torch
from ast import literal_eval
def get_loss_weight(df):
    """Compute the relative frequency of every (job title vector, interview) pair.

    Args:
        df: DataFrame with a ``job_title_vec`` column and an ``interview``
            column. Each vector is passed through ``literal_eval(str(...))``,
            so it may be a Python literal or its string form.

    Returns:
        A ``collections.defaultdict(float)`` keyed by
        ``str((vector_tensor, interview_tensor))`` whose values are the pair
        counts divided by ``len(df)``. Keys that were never seen read as 0.0.

    Side effects:
        Prints the elapsed wall-clock time in seconds.
    """
    # Local import: no cell of the notebook imports `collections` explicitly.
    import collections

    weight_dict = collections.defaultdict(float)
    start_time = time.time()
    # The original also opened a multiprocessing Pool that was never given any
    # work; it only spawned idle worker processes, so it has been removed.
    pairs = zip(df['job_title_vec'], df['interview'])
    for job_title_vec, interview in tqdm_notebook(pairs, total=len(df)):
        job_title_vec = literal_eval(str(job_title_vec))
        pair = (torch.tensor(job_title_vec), torch.tensor(float(interview)))
        # The string form of the tensor pair is the lookup key; it has to stay
        # in sync with the key built in get_batch_loss_weight.
        weight_dict[str(pair)] += 1.0

    # Normalise the counts into fractions of the whole frame.
    length = len(df)
    for key in weight_dict:
        weight_dict[key] /= length
    print(time.time() - start_time)
    return weight_dict

In [None]:
import pandas as pd
# Reload the three splits written by the split cell.
train = pd.read_csv('data/CVs_outcome_train.csv')
val = pd.read_csv('data/CVs_outcome_val.csv')
test = pd.read_csv('data/CVs_outcome_test.csv')
# The loss weights are pair frequencies over ALL splits combined, not only the training set.
data_combined = pd.concat([train, val, test])
weight_dict = get_loss_weight(data_combined)

In [None]:
# Model configurations
# Upper bound of the training loop (the loop runs range(start_epoch, epochs)).
epochs = 50
# Width of the embedding layer; must match the width of the pretrained vectors copied into it.
embedding_dim = 50
# Hidden size of each LSTM direction.
hidden_dim = 150

batch_size = 16
# Model output width = length of the first job-title vector in job_vec_dict.
label_dim = len(list(job_vec_dict.values())[0])

In [None]:
# loading data into batches using pytorch.
from torchtext import data
# Field for the CV text: tokenised with `text_tokenize` and lower-cased.
text_field = data.Field(sequential=True, tokenize=text_tokenize, lower=True)
# Raw (non-vocabulary) field for the labels, converted by the two label hooks.
# presumably the hooks come from data_util and turn the stored labels into tensors — TODO confirm
label_field = data.RawField(preprocessing=label_field_preprocessing,
                            postprocessing=label_field_postprocessing)

In [None]:
# Build the batch iterators for the three splits.
# presumably load_data also builds text_field.vocab (load_model reads text_field.vocab.vectors) — confirm in data_util
train_iter, dev_iter, test_iter = load_data(text_field, label_field, batch_size)

# Model implementation

In [None]:
import numpy as np
from sklearn import metrics
import torch
def calc_score(labels, preds):
    """Return the ROC AUC of ``preds`` measured against ``labels``.

    Both arguments are iterables of tensors; every tensor is moved to the
    CPU and copied into a numpy array before scoring.
    """
    def _as_numpy(tensors):
        return np.array([t.cpu().data.clone().numpy() for t in tensors])

    return metrics.roc_auc_score(_as_numpy(labels), _as_numpy(preds))

In [None]:
def calc_accuracy(labels, preds):
    """Return the fraction of positions where ``labels`` and ``preds`` agree.

    Positions are taken from ``labels``; ``preds`` is indexed at the same
    positions. An empty ``labels`` raises ``ZeroDivisionError``.
    """
    hits = sum(1.0 for idx in range(len(labels)) if labels[idx] == preds[idx])
    return hits / len(labels)

In [None]:
def weighted_mse_loss(preds, labels, weight):
    """Return the sum of the squared errors, each scaled by ``weight``."""
    squared_error = (preds - labels) ** 2
    return torch.sum(weight * squared_error)

In [None]:
import numpy as np
def get_batch_loss_weight(vec_label, state_label, weight_dict):
    """Look up the loss weight of every example in a batch.

    The key of example ``i`` is ``str((vec_label[i], state_label[i]))`` and is
    read from ``weight_dict``. The number of examples is the first dimension
    of ``vec_label``.

    Returns:
        A float32 tensor holding one weight per example.
    """
    n_examples = vec_label.size()[0]
    weights = [
        weight_dict[str((vec_label[idx], state_label[idx]))]
        for idx in range(n_examples)
    ]
    return torch.tensor(np.array(weights, dtype=np.float32))

In [None]:
import os
def save_checkpoint(state, path):
    """Write ``state`` to ``path`` with ``torch.save``, replacing an existing file.

    Args:
        state: Picklable object to store (the notebook passes a dict).
        path: Destination file path.
    """
    print("=> Saving a new model to path: " + path)
    # Delete a stale file through the OS API instead of a shell `rm`: this
    # works for paths containing spaces or shell metacharacters and does not
    # depend on a POSIX shell being available.
    if os.path.isfile(path):
        os.remove(path)
    torch.save(state, path)  # save checkpoint

In [None]:
def reload_checkpoint(path):
    """Load and return the checkpoint object stored at ``path``.

    When CUDA is not available the stored tensors are remapped to the CPU, so
    a checkpoint written on a GPU machine can still be loaded; with CUDA
    available the tensors keep the device they were saved from.
    """
    map_location = None if torch.cuda.is_available() else "cpu"
    return torch.load(path, map_location=map_location)

In [None]:
def load_model(load_checkpoint, path):
    """Create a ``Model`` from the notebook-level configuration.

    The model is moved to CUDA when it is available. If ``load_checkpoint``
    is truthy, the ``'state_dict'`` entry of the checkpoint at ``path`` is
    loaded into it.
    """
    model = Model(
        embedding_dim=embedding_dim,
        hidden_dim=hidden_dim,
        label_size=label_dim,
        batch_size=batch_size,
        pretrained_vec=text_field.vocab.vectors,
    )
    if torch.cuda.is_available():
        model.cuda("cuda")
    if not load_checkpoint:
        return model
    saved = reload_checkpoint(path)
    model.load_state_dict(saved['state_dict'])
    return model

In [None]:
from tqdm import tqdm_notebook
import torch.nn.functional as F
def train_epoch(model, train_iter, loss_function, epoch, weight_dict):
    """Train ``model`` for one pass over ``train_iter``.

    Args:
        model: Network exposing ``batch_size``, ``hidden`` and ``init_hidden()``.
        train_iter: Sized iterable of batches with ``text``, ``job_title_vec``
            and ``interview`` tensor attributes.
        loss_function: Loss module; its ``weight`` attribute is overwritten for
            every batch and reset to ``None`` before returning.
        epoch: Zero-based epoch index, used only in the progress-bar label.
        weight_dict: Per-pair weights as produced by ``get_loss_weight``.

    Returns:
        ``(avg_loss, score)``: the mean batch loss (a 0-dim tensor) and the
        ROC AUC of the predictions over the whole epoch.
    """
    # Same CPU fallback as load_model / Model.init_hidden instead of an
    # unconditional .cuda() that crashes on machines without a GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE(review): a new Adam optimizer is created on every call, so its
    # moment estimates restart at each epoch — confirm this is intended.
    trainable = [para for para in model.parameters() if para.requires_grad]
    optimizer = optim.Adam(trainable, lr=1e-3)
    model.train()
    avg_loss = 0.0
    all_labels = []
    all_preds = []
    for batch in tqdm_notebook(train_iter, desc='Train epoch ' + str(epoch + 1)):
        text = batch.text.to(device)
        vec_label = batch.job_title_vec.to(device)
        state_label = batch.interview.to(device)
        all_labels += list(state_label.data)
        # Dimension 1 of the text tensor is used as the batch size; the last
        # batch can be smaller, so the hidden state is rebuilt per batch.
        model.batch_size = text.shape[1]
        model.hidden = model.init_hidden()
        model.zero_grad()
        outputs = model(text)
        # Rescale cosine similarity from [-1, 1] to [0, 1].
        state_preds = (F.cosine_similarity(outputs, vec_label) + 1) / 2
        # Store detached predictions so the autograd graph of every batch is
        # not kept alive until the end of the epoch.
        all_preds += list(state_preds.detach())
        batch_weight = \
            get_batch_loss_weight(vec_label.cpu(), state_label.cpu(), weight_dict)
        loss_function.weight = batch_weight.to(device)
        loss = loss_function(state_preds, state_label)
        # Accumulate a detached copy: summing the live loss tensors chains all
        # per-batch graphs together and leaks memory over the epoch.
        avg_loss += loss.detach()
        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()
    avg_loss /= len(train_iter)
    score = calc_score(all_labels, all_preds)
    # Leave the loss unweighted for whoever uses it next (e.g. evaluate).
    loss_function.weight = None
    return avg_loss, score

In [None]:
def evaluate(model, data, loss_function, name):
    """Evaluate ``model`` on ``data`` without tracking gradients.

    Args:
        model: Network exposing ``batch_size``, ``hidden`` and ``init_hidden()``.
        data: Sized iterable of batches with ``text``, ``job_title_vec`` and
            ``interview`` tensor attributes.
        loss_function: Loss module, applied with whatever ``weight`` it
            currently carries.
        name: Label printed in front of the metrics.

    Returns:
        ``(avg_loss, score)``: the mean batch loss (a 0-dim tensor) and the
        ROC AUC over all batches. Both are also printed.
    """
    # Same CPU fallback as load_model / Model.init_hidden instead of an
    # unconditional .cuda() that crashes on machines without a GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    avg_loss = 0.0
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for batch in data:
            text = batch.text.to(device)
            vec_label = batch.job_title_vec.to(device)
            state_label = batch.interview.to(device)
            all_labels += list(state_label.data)
            # Dimension 1 of the text tensor is used as the batch size.
            model.batch_size = text.shape[1]
            model.hidden = model.init_hidden()
            outputs = model(text)
            # Rescale cosine similarity from [-1, 1] to [0, 1].
            state_preds = (F.cosine_similarity(outputs, vec_label) + 1) / 2
            all_preds += list(state_preds)
            loss = loss_function(state_preds, state_label)
            avg_loss += loss
            torch.cuda.empty_cache()
        avg_loss /= len(data)
        score = calc_score(all_labels, all_preds)
    print(name + ': loss %.4f score %.4f' % (avg_loss, score))
    return avg_loss, score

In [None]:
import torch
import os
def predict(data):
    """Run the best saved model over ``data`` and collect its raw outputs.

    The model is loaded from ``interview_models/best_model_gpu.pth`` under the
    current working directory.

    Args:
        data: Iterable of batches exposing a ``text`` tensor attribute.

    Returns:
        A list with one output tensor per example, in iteration order. The
        tensors stay on the device the model ran on.
    """
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "interview_models"))
    model_path = out_dir + '/best_model_gpu.pth'
    model = load_model(True, model_path)
    model.eval()
    # Same CPU fallback as load_model / Model.init_hidden.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    all_preds = []
    # Inference only: skip autograd bookkeeping so no graph is retained.
    with torch.no_grad():
        for batch in data:
            text = batch.text.to(device)
            # Dimension 1 of the text tensor is used as the batch size.
            model.batch_size = text.shape[1]
            model.hidden = model.init_hidden()
            outputs = model(text)  # stray "ss" after this call was a syntax error
            all_preds += [x for x in outputs]
            torch.cuda.empty_cache()
    return all_preds

In [None]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm

class Model(nn.Module):
    """Single-layer bidirectional LSTM over frozen pretrained embeddings.

    The LSTM output at the last time step is projected to a vector of size
    ``label_size``.

    Args:
        embedding_dim: Width of the embedding vectors (LSTM input size).
        hidden_dim: Hidden size of each LSTM direction.
        label_size: Width of the output vector.
        batch_size: Batch size used to shape the initial hidden state; callers
            overwrite ``self.batch_size`` before each forward pass.
        pretrained_vec: Tensor of pretrained embeddings, one row per token,
            copied into the (frozen) embedding layer.
        dropout: Stored on the instance but not used by any layer here.
    """

    def __init__(self, embedding_dim, hidden_dim, label_size, batch_size,
                 pretrained_vec, dropout=0.0):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.dropout = dropout
        self.num_layers = 1
        self.embeddings = nn.Embedding(len(pretrained_vec), embedding_dim)
        self.embeddings.weight.data.copy_(pretrained_vec)
        # Keep the pretrained embeddings fixed during training.
        self.embeddings.weight.requires_grad = False

        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim,
                            num_layers=self.num_layers, bidirectional=True)
        # Two directions are concatenated, hence hidden_dim * 2 inputs.
        self.hidden2label = nn.Linear(hidden_dim * 2, label_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a zeroed ``(h0, c0)`` pair for the current ``batch_size``.

        The tensors are created on the device the model's weights live on, so
        the state always matches the model (CPU, or CUDA once the model has
        been moved there).
        """
        device = self.embeddings.weight.device
        # 2 directions * num_layers stacked states.
        shape = (2 * self.num_layers, self.batch_size, self.hidden_dim)
        h0 = torch.zeros(shape, device=device)
        c0 = torch.zeros(shape, device=device)
        return (h0, c0)

    def forward(self, sentence):
        """Embed ``sentence``, run the LSTM and project the last time step.

        ``self.hidden`` is consumed as the initial state and replaced by the
        final LSTM state.
        """
        x = self.embeddings(sentence)
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        # NOTE(review): at the last time step the backward direction has only
        # seen one token — confirm this is the intended sequence summary.
        y = self.hidden2label(lstm_out[-1])
        return y
    

In [None]:
# Set to True to resume training from the checkpoint at model_path.
load_checkpoint = False
# Defaults for a fresh run; overwritten below when resuming.
best_dev_score = 0
start_epoch = 0
out_dir = os.path.abspath(os.path.join(os.path.curdir, "interview_models"))
model_path = out_dir + '/model_gpu.pth'
if load_checkpoint:
    checkpoint = reload_checkpoint(model_path)
    # The checkpoint key is named 'dev_accuracy' but the training loop stores
    # the best dev score under it.
    best_dev_score = checkpoint['dev_accuracy']
    start_epoch = checkpoint['start_epoch']
# load_model reads the checkpoint a second time to restore the weights.
model = load_model(load_checkpoint, model_path)

In [None]:
import torch
import torch.nn as nn
from torch import optim
import time, random
import os
from tqdm import tqdm
import numpy as np

# NOTE(review): train_epoch/evaluate feed this loss values already rescaled to
# [0, 1], while BCEWithLogitsLoss applies a sigmoid to its input itself.
# nn.BCELoss looks like the matching loss — confirm before changing, since it
# rejects inputs outside [0, 1].
loss_function = nn.BCEWithLogitsLoss()
train_scores = []
dev_scores = []

out_dir = os.path.abspath(os.path.join(os.path.curdir, "interview_models"))
best_model_path = out_dir + '/best_model_gpu.pth'
result_path = out_dir + "/model_result_gpu.txt"

print("Writing to {}\n".format(out_dir))
os.makedirs(out_dir, exist_ok=True)

for epoch in range(start_epoch, epochs):
    train_avg_loss, train_score = \
        train_epoch(model, train_iter, loss_function, epoch, weight_dict)
    tqdm.write('Train: loss %.4f score %.4f' % (train_avg_loss, train_score))
    dev_avg_loss, dev_score = evaluate(model, dev_iter, loss_function, 'Dev')
    train_scores.append(train_score)
    dev_scores.append(dev_score)

    # Start a fresh log on the very first epoch, append on every later one
    # (including the first epoch of a resumed run). The context manager closes
    # the file even if a later step raises.
    with open(result_path, "w" if epoch == 0 else "a") as result_file:
        result_file.write('Train: loss %.4f score %.4f\n' % (train_avg_loss, train_score))
        result_file.write('Dev: loss %.4f score %.4f\n' % (dev_avg_loss, dev_score))

    # Update the best score BEFORE writing the resumable checkpoint, so a
    # resumed run does not start from a best score that is one epoch stale.
    is_best = dev_score > best_dev_score
    if is_best:
        best_dev_score = dev_score

    checkpoint_state = {
        'start_epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'dev_accuracy': best_dev_score
    }
    save_checkpoint(checkpoint_state, model_path)
    if is_best:
        save_checkpoint(checkpoint_state, best_model_path)

# `model` keeps changing after its best epoch, so a plain `best_model = model`
# alias would test the last-epoch weights. Reload the best weights from disk.
if os.path.exists(best_model_path):
    best_model = load_model(True, best_model_path)
else:
    best_model = model

test_avg_loss, test_score = evaluate(best_model, test_iter, loss_function, 'Final Test')
with open(result_path, "a") as result_file:
    result_file.write('Test: loss %.4f score %.4f\n' % (test_avg_loss, test_score))

In [None]:
# Number of epochs actually recorded in this session; used as the x-axis length below.
# Note that this overwrites the configured `epochs` value from the model configuration cell.
epochs = len(train_scores)

In [None]:
import matplotlib.pyplot as plt
# 1-based epoch numbers for the x axis.
epoch_nums = [i+1 for i in range(epochs)]
# Training (red) and validation (blue) ROC AUC per epoch.
plt.plot(epoch_nums, train_scores, label='training score', color='r')
plt.plot(epoch_nums, dev_scores, label='validation score', color='b')
plt.xlabel('Epoch num')
plt.ylabel('Auc Roc score')
plt.title('Changes in training and validation scores when epoch num increases')
plt.legend()
# The figure is written to the current working directory.
plt.savefig('performance_gpu.png')

In [None]:
# Displays the repr of the bound method (it is not called), which includes the model's layer summary.
model.parameters

# Predict job title

In [None]:
import pandas as pd
# Reload the saved test split; note this rebinds `df`, which earlier held the full dataset.
df = pd.read_csv('data/CVs_outcome_test.csv')

In [None]:
# predict() returns a list of per-example tensors and a list has no .cpu(),
# so move every prediction to the CPU individually (needed for .numpy() below).
preds = [pred.cpu() for pred in predict(test_iter)]

In [None]:
# Display the raw predicted vectors.
preds

In [None]:
from multiprocessing import Pool
from tqdm import tqdm_notebook
# pool = Pool()
# Reload the title lookup and the KD tree saved during preprocessing.
# NOTE(review): the files are .pickle, so load_obj presumably unpickles them — only load trusted files.
job_vec_dict = load_obj("data/job_vec_dict.pickle")
tree = load_obj("data/kd_tree.pickle")
dict_values = list(job_vec_dict.values())
# Position i of dict_keys is the title whose vector sits at position i of the tree.
dict_keys = list(job_vec_dict.keys())
titles_preds = []
for i in tqdm_notebook(range(len(preds))):
    # Wrap the vector in an outer array so the tree receives a single-row 2-D query.
    query = np.array([preds[i].data.clone().numpy()])
    # Index of the single nearest job-title vector.
    ind = list(knn(1, tree, query)[0])
    titles_preds.append(dict_keys[ind[0]])
# pool.close()
# pool.join() 

In [None]:
# Display the distinct predicted job titles.
set(titles_preds)

# Data Analysis

In [None]:
# Deduplicate the predicted titles; set() gives no ordering guarantee, so the list order is arbitrary.
titles_preds = list(set(titles_preds))
# presumably returns one embedding per title, in the same order — confirm in data_util
embeddings_preds = encode_sentences(titles_preds)

In [None]:
from sklearn.decomposition import PCA
# Reduce the title embeddings to 50 principal components before t-SNE.
# NOTE(review): PCA needs n_components <= min(n_samples, n_features); this would
# fail if fewer than 50 distinct titles were predicted — confirm.
pca = PCA(n_components=50)
embeddings_pca = pca.fit_transform(embeddings_preds)
# Share of the total variance kept by the retained components.
print(sum(pca.explained_variance_ratio_))  

In [None]:
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt

# Project the PCA-reduced embeddings to 2-D for plotting.
# No random_state is passed to TSNE here, so the layout may differ between runs.
n_components = 2
embeddings_tsne = TSNE(n_components=n_components, verbose=2).fit_transform(embeddings_pca)

In [None]:
# colors = ['#630C3A', '#39C8C6', '#D3500C', '#FFB139', "#00FFFF"]
# Very large canvas (100 x 100 inches) so the per-point title labels stay readable.
fig, ax = plt.subplots(figsize=(100,100))
ax.scatter(embeddings_tsne[:,0], embeddings_tsne[:,1])
# Label every point with its job title; row i of embeddings_tsne is paired with titles_preds[i].
for i, txt in enumerate(titles_preds):
    ax.annotate(txt, (embeddings_tsne[i,0], embeddings_tsne[i,1]))
#     ax.scatter(embeddings_tsne[i,0], embeddings_tsne[i,1], color=colors[clusters.labels_[i]])
plt.xlabel("job title tnse x", fontsize=35)
plt.ylabel("job title tnse y", fontsize=35)
plt.title("visualisation of job average embedding into dimension of " + str(n_components), fontsize=35)
plt.show()