In [1]:
!pip install --upgrade gensim



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim
import sklearn
import nltk
from collections import Counter
%matplotlib inline

In [3]:
import os
# Debugging aids for CUDA errors. They are set here, ahead of the cell that imports torch.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"  # Synchronous error reporting
# NOTE(review): PyTorch's error text refers to TORCH_USE_CUDA_DSA as something to compile
# with; confirm that exporting it at runtime has any effect on a prebuilt torch.
os.environ['TORCH_USE_CUDA_DSA'] = "1"    # Enable device-side assertions (if available)

In [4]:
# Load the training table and normalise the columns the model consumes.
tokenizer = nltk.WordPunctTokenizer()
data = pd.read_csv("./Train_rev1.zip", compression='zip', index_col=None)

# The regression target is log(1 + salary), stored as float32.
data['Log1pSalary'] = np.log1p(data['SalaryNormalized']).astype('float32')
text_columns = ["Title", "FullDescription"]
categorical_columns = ["Category", "Company", "LocationNormalized", "ContractType", "ContractTime"]
TARGET_COLUMN = "Log1pSalary"

# Missing categorical values become the literal string 'NaN', i.e. a category of their own.
data[categorical_columns] = data[categorical_columns].fillna('NaN')


def normalize_text(text):
    """Tokenize ``text`` with the module-level tokenizer, join the tokens with
    single spaces and lowercase the result."""
    return ' '.join(tokenizer.tokenize(text)).lower()


# Both text columns get the same treatment. Missing values are replaced with an
# empty string first, so the tokenizer never receives a float NaN.
for column in text_columns:
    data[column] = data[column].fillna("").apply(normalize_text)

In [5]:
# Count how often every token occurs in the two (already space-joined) text columns.
# Rows are split one at a time so that no corpus-sized string or list is materialised.
token_counts = Counter()
for column in ("FullDescription", "Title"):
    for line in data[column]:
        token_counts.update(line.split())

# Tokens seen fewer than `min_count` times are left out of the vocabulary.
min_count = 10

tokens = sorted(t for t, c in token_counts.items() if c >= min_count)

# Special tokens take ids 0 and 1. The corpus text was lowercased before counting,
# so these upper-case names cannot coincide with a real token.
UNK, PAD = "UNK", "PAD"
tokens = [UNK, PAD] + tokens
token_to_id = {t: i for i, t in enumerate(tokens)}
UNK_IX, PAD_IX = token_to_id[UNK], token_to_id[PAD]

In [6]:
def as_matrix(sequences, max_len=None):
    """Convert a batch of token sequences into a padded matrix of token ids.

    :param sequences: sequence of whitespace-separated strings or of token lists;
        the type of the first element decides whether the rows are split.
    :param max_len: optional cap on the number of columns; longer rows are truncated.
        ``None`` (or any falsy value) means "as wide as the longest row".
    :returns: int32 array of shape [len(sequences), width]. Words missing from
        ``token_to_id`` map to ``UNK_IX``; short rows are right-padded with ``PAD_IX``.
        An empty ``sequences`` yields an array of shape (0, 0).
    """
    # Nothing to index or take a maximum over: return an empty matrix instead of raising.
    if len(sequences) == 0:
        return np.full((0, 0), np.int32(PAD_IX))

    if isinstance(sequences[0], str):
        sequences = [line.split() for line in sequences]

    longest = max(map(len, sequences))
    width = min(longest, max_len) if max_len else longest

    matrix = np.full((len(sequences), width), np.int32(PAD_IX))
    for i, seq in enumerate(sequences):
        row_ix = [token_to_id.get(word, UNK_IX) for word in seq[:width]]
        matrix[i, :len(row_ix)] = row_ix

    return matrix

In [7]:
from sklearn.feature_extraction import DictVectorizer

# Keep the 1000 most frequent company names; every other company is pooled under "Other".
company_frequencies = Counter(data['Company']).most_common(1000)
top_companies, top_counts = zip(*company_frequencies)
recognized_companies = set(top_companies)
is_recognized = data["Company"].isin(recognized_companies)
data["Company"] = data["Company"].where(is_recognized, "Other")

# Fit the vectorizer on one {column: value} dict per row; it outputs dense float32 features.
categorical_vectorizer = DictVectorizer(dtype=np.float32, sparse=False)
categorical_rows = data[categorical_columns].apply(dict, axis=1)
categorical_vectorizer.fit(categorical_rows)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation (fixed seed) and renumber both parts from 0.
data_train, data_val = (
    part.reset_index(drop=True)
    for part in train_test_split(data, test_size=0.2, random_state=42)
)

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F


# Use the GPU when torch reports one as available, otherwise stay on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'


def to_tensors(batch, device):
    """Turn every array in ``batch`` into a torch tensor placed on ``device``.

    The "FullDescription" and "Title" entries are created as int64; every other
    entry keeps the dtype torch infers from its data.
    Returns a new dict with the same keys.
    """
    token_keys = ("FullDescription", "Title")
    return {
        name: torch.tensor(
            values,
            device=device,
            dtype=torch.int64 if name in token_keys else None,
        )
        for name, values in batch.items()
    }

def make_batch(data, max_len=None, word_dropout=0, device=device):
    """
    Assemble one model-ready batch (a dict of torch tensors) from a dataframe slice.
    :param data: dataframe slice holding the text and categorical columns; the target column is optional
    :param max_len: truncation length handed to as_matrix for both text fields
    :param word_dropout: replaces FullDescription token indices with UNK_IX with this probability
    :param device: device on which the tensors are created
    :returns: a dict with 'Title' and 'FullDescription' (int64 [batch, len]), 'Categorical',
        and TARGET_COLUMN when that column is present in data
    """
    # Word dropout is applied to the description matrix only.
    description_ids = as_matrix(data["FullDescription"].values, max_len)
    if word_dropout != 0:
        description_ids = apply_word_dropout(description_ids, 1. - word_dropout)

    category_rows = data[categorical_columns].apply(dict, axis=1)
    batch = {
        "Title": as_matrix(data["Title"].values, max_len),
        "FullDescription": description_ids,
        'Categorical': categorical_vectorizer.transform(category_rows),
    }

    if TARGET_COLUMN in data.columns:
        batch[TARGET_COLUMN] = data[TARGET_COLUMN].values

    return to_tensors(batch, device)

def apply_word_dropout(matrix, keep_prop, replace_with=UNK_IX, pad_ix=PAD_IX,):
    """Randomly replace entries of ``matrix`` with ``replace_with``.

    Each position is kept with probability ``keep_prop`` and otherwise replaced;
    positions equal to ``pad_ix`` are never replaced. Returns a new array.
    """
    # 1 marks a position drawn for replacement, 0 a position that is kept.
    drop_flags = np.random.choice(2, np.shape(matrix), p=[keep_prop, 1 - keep_prop])
    drop_flags = drop_flags & (matrix != pad_ix)
    replacement = np.full_like(matrix, replace_with)
    return np.choose(drop_flags, [matrix, replacement])

In [10]:
import gensim.downloader as api
# Load the 'glove-twitter-100' vectors through gensim's downloader API.
w2v = api.load('glove-twitter-100')
cnt = 0  # not used by the cells visible here; kept in case another cell reads it
emb_size = w2v.vector_size  # take the width from the loaded vectors instead of hard-coding it

# One row per vocabulary token; every row is filled in the loop below.
pretrained_vecs = np.empty((len(tokens), emb_size))
for i, t in enumerate(tokens):
    # The special tokens are looked up under their lowercased spelling,
    # all other tokens under their own spelling.
    lookup = t.lower() if t in ('UNK', 'PAD') else t
    if lookup in w2v.key_to_index:
        pretrained_vecs[i] = w2v.get_vector(lookup)
    else:
        # No pretrained vector available (this now also covers the special tokens,
        # which previously raised KeyError when absent): random normal init.
        pretrained_vecs[i] = np.random.randn(emb_size)

In [11]:
class SalaryPredictor(nn.Module):
    """Convolutional regressor over title tokens, description tokens and categorical features.

    Both text fields share one embedding table. Each field goes through its own
    Conv1d, a ReLU and a max over the sequence axis. The categorical vector goes
    through a Linear + ReLU. The three results are concatenated and mapped to a
    single value per example by a final Linear layer.
    """

    def __init__(self, n_tokens=len(tokens), n_cat_features=len(categorical_vectorizer.vocabulary_), hid_size=64, emb_dim=16):
        """
        :param n_tokens: number of rows of the embedding table
        :param n_cat_features: width of the 'Categorical' input
        :param hid_size: accepted for interface compatibility; not used by the layers below
        :param emb_dim: embedding width (input channels of both convolutions)
        """
        super().__init__()
        conv_out, k_size, cat_out = 200, 3, 256
        self.emb = nn.Embedding(num_embeddings=n_tokens, embedding_dim=emb_dim)
        self.conv_title = nn.Conv1d(emb_dim, conv_out, k_size)
        self.conv_descr = nn.Conv1d(emb_dim, conv_out, k_size)
        self.categ = nn.Linear(n_cat_features, cat_out)
        self.final = nn.Linear(cat_out + 2 * conv_out, 1)

    def _encode_text(self, token_ids, conv):
        """Embed ``token_ids`` [batch, seq_len], apply ``conv`` + ReLU and max-pool over the sequence."""
        embs = self.emb(token_ids).transpose(2, 1)  # -> [batch, emb_dim, seq_len]
        # Conv1d cannot run on a sequence shorter than its kernel; zero-pad such
        # inputs on the right up to the kernel size. Longer inputs are untouched.
        shortfall = conv.kernel_size[0] - embs.shape[-1]
        if shortfall > 0:
            embs = F.pad(embs, (0, shortfall))
        pooled, _ = torch.max(F.relu(conv(embs)), 2)
        return pooled

    def forward(self, batch):
        """
        :param batch: dict with int64 'Title' and 'FullDescription' matrices and a float 'Categorical' matrix
        :returns: tensor of shape [batch] with one prediction per example
        """
        title_pooled = self._encode_text(batch['Title'], self.conv_title)
        descr_pooled = self._encode_text(batch['FullDescription'], self.conv_descr)

        cat_result = F.relu(self.categ(batch['Categorical']))

        concat = torch.cat((title_pooled, descr_pooled, cat_result), 1)

        # Drop only the trailing size-1 feature axis. A bare squeeze() would also
        # drop the batch axis for a batch of one and return a 0-d tensor.
        return self.final(concat).squeeze(-1)

In [12]:
# Smoke test: push the first 100 training rows through a freshly initialised model
# and sanity-check the prediction shape, its diversity and the loss value.
model = SalaryPredictor().to(device)
batch = make_batch(data_train[:100], device=device)
criterion = nn.MSELoss()

dummy_pred = model(batch)
dummy_loss = criterion(dummy_pred, batch[TARGET_COLUMN])

assert tuple(dummy_pred.shape) == (100,)
n_unique_preds = torch.unique(dummy_pred).numel()
assert n_unique_preds > 20, "model returns suspiciously few unique outputs. Check your initialization"
assert dummy_loss.ndim == 0 and 0. <= dummy_loss <= 250., "make sure you minimize MSE"

In [13]:
def iterate_minibatches(data, batch_size=256, shuffle=True, cycle=False, device=device, **kwargs):
    """ Yield batches built by make_batch from consecutive chunks of row positions.

    Rows are visited in a fresh random order on every pass when ``shuffle`` is true,
    otherwise in their stored order. With ``cycle`` the passes repeat indefinitely;
    without it the generator stops after one pass. Extra keyword arguments are
    forwarded to make_batch.
    """
    first_pass = True
    while first_pass or cycle:
        first_pass = False

        order = np.arange(len(data))
        if shuffle:
            order = np.random.permutation(order)

        for lo in range(0, len(order), batch_size):
            chunk = data.iloc[order[lo : lo + batch_size]]
            yield make_batch(chunk, device=device, **kwargs)

In [31]:
from tqdm.auto import tqdm

# Rows per minibatch: the default batch size of print_metrics and the batch size of the training loop.
BATCH_SIZE = 32
# Number of passes over data_train made by the training loop.
EPOCHS = 1

In [15]:
def print_metrics(model, data, batch_size=BATCH_SIZE, name="", device=torch.device('cpu'), **kw):
    """Evaluate ``model`` on ``data`` and print its mean squared and mean absolute error.

    :param model: callable mapping a batch dict to one prediction per example
    :param data: dataframe to evaluate on, iterated in order without shuffling
    :param batch_size: rows per evaluation batch
    :param name: label printed in front of "results:"
    :param device: device the batches are created on; the default is the CPU, so
        pass the model's device explicitly when the model lives elsewhere
    :param kw: forwarded to iterate_minibatches (and from there to make_batch)
    :returns: (mse, mae) as Python floats; both are NaN when ``data`` yields no rows

    The model is switched to eval mode and left in it.
    """
    squared_error = abs_error = 0.0
    num_samples = 0
    model.eval()
    with torch.no_grad():
        for batch in iterate_minibatches(data, batch_size=batch_size, shuffle=False, device=device, **kw):
            # Flatten both sides so a 0-d prediction (batch of one squeezed to a
            # scalar) is still counted and compared as a single example.
            target = batch[TARGET_COLUMN].reshape(-1)
            residual = model(batch).reshape(-1) - target
            squared_error += torch.sum(torch.square(residual)).item()
            abs_error += torch.sum(torch.abs(residual)).item()
            num_samples += target.numel()

    if num_samples == 0:
        # No batches were produced: report NaN instead of dividing by zero.
        mse = mae = float('nan')
    else:
        mse = squared_error / num_samples
        mae = abs_error / num_samples

    print("%s results:" % (name or ""))
    print("Mean square error: %.5f" % mse)
    print("Mean absolute error: %.5f" % mae)
    return mse, mae

In [30]:
# Build a model whose embedding width matches the pretrained vectors, then swap
# its embedding table for one initialised from `pretrained_vecs`.
model = SalaryPredictor(emb_dim=w2v.vector_size).to(device)
pretrained_weights = torch.tensor(pretrained_vecs, dtype=torch.float32)
model.emb = nn.Embedding.from_pretrained(pretrained_weights, freeze=False).to(device)
# Assigning `requires_grad` on the module only creates an unused attribute;
# the flag that autograd reads lives on the weight parameter.
model.emb.weight.requires_grad_(True)
# Summed (not averaged) squared error per batch, optimised with plain SGD.
criterion = nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

In [32]:
# Make sure the embedding weights are trainable: the flag lives on the weight
# parameter, and setting `requires_grad` on the module itself has no effect.
model.emb.weight.requires_grad_(True)

# Ceiling division so the progress bar also counts the final, partial batch.
n_train_batches = (len(data_train) + BATCH_SIZE - 1) // BATCH_SIZE

for epoch in range(EPOCHS):
    print(f"epoch: {epoch}")
    model.train()  # print_metrics leaves the model in eval mode, so switch back every epoch
    train_batches = iterate_minibatches(data_train, batch_size=BATCH_SIZE, device=device)
    for batch in tqdm(train_batches, total=n_train_batches):
        pred = model(batch)
        loss = criterion(pred, batch[TARGET_COLUMN])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation metrics after each epoch.
    print_metrics(model, data_val, device=device)

epoch: 0


  0%|          | 0/6119 [00:00<?, ?it/s]

 results:
Mean square error: 0.12411
Mean absolute error: 0.27132
