### Loading Train Set from File

In [1]:
import pickle
import numpy as np
from scipy.sparse import load_npz

In [2]:
with open('train_texts.pkl', 'rb') as f:
    train_texts = pickle.load(f)
    
train_cat = load_npz('train_cat.npz')

with open('train_label.npy', 'rb') as f:
    train_label = np.load(f)

In [3]:
max_label = np.max(train_label)

In [4]:
max_label = 62852500

In [5]:
train_label = train_label / max_label

### Train-Validation Split

In [7]:
import math

In [8]:
train_cat = train_cat.toarray()

In [9]:
np.random.seed(1)

In [10]:
train_len = len(train_texts)
indices = np.arange(train_len)
np.random.shuffle(indices, )
train_idx = indices[:math.floor(0.9*train_len)]
val_idx = indices[math.floor(0.9*train_len):]

In [11]:
val_texts = [train_texts[i] for i in val_idx]
val_cat = train_cat[val_idx]
val_label = train_label[val_idx]

In [12]:
train_texts = [train_texts[i] for i in train_idx]
train_cat = train_cat[train_idx]
train_label = train_label[train_idx]

In [13]:
val_cat.shape

(6884, 3238)

### Creating PyTorch Datasets

In [14]:
import torch as T
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [15]:
class Vocabulary:
    def __init__(self, min_df):
        self.itos = {0: '<pad>', 1: '<unk>'}
        self.stoi = {'<pad>': 0, '<unk>': 1}
        self.min_df = min_df
        self.tokenizer = hazm.WordTokenizer(
            join_verb_parts=False,
            separate_emoji=True,
            replace_links=True,
            replace_IDs=False,
            replace_emails=True,
            replace_numbers=False,
            replace_hashtags=False
        )

    def __len__(self):
        return len(self.itos)

    def build_vocabulary(self, sentence_list):
        frequencies = {}
        idx = 2

        for sentence in sentence_list:
            for word in self.tokenizer.tokenize(sentence):
                if word not in frequencies:
                    frequencies[word] = 1
                else:
                    frequencies[word] += 1

                if frequencies[word] == self.min_df:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        tokenized_text = self.tokenizer.tokenize(text)
        
        return [
            self.stoi[token] if token in self.stoi else self.stoi['<unk>']
            for token in tokenized_text
        ]

In [16]:
class DigikalaDataset(Dataset):
    def __init__(self, cat_mat, text_list, labels, vocab):
        self.cat_mat = cat_mat
        self.labels = labels
        self.vocab = vocab
            
        self.text_list = [self.vocab.numericalize(text) for text in text_list]
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, index):
        cat_row = self.cat_mat[index]
        numer_text = self.text_list[index]
        label = self.labels[index]
        return T.tensor(cat_row, dtype=T.float32), T.tensor(numer_text, dtype=T.long), label

In [17]:
class Collate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        cat_rows = [item[0] for item in batch]
        cat_rows = T.vstack(cat_rows)
        numer_texts = [item[1] for item in batch]
        text_lengths = [text.shape[0] for text in numer_texts]
        numer_texts = pad_sequence(numer_texts, batch_first=True, padding_value=self.pad_idx)
        labels = [item[2] for item in batch]
        labels = T.tensor(labels, dtype=T.float32)
        
        return cat_rows, numer_texts, labels, text_lengths

In [18]:
def get_loader(cat_mat, text_list, labels, vocab, batch_size=32, shuffle=True):
    
    dataset = DigikalaDataset(cat_mat, text_list, labels, vocab)
    
    pad_idx = dataset.vocab.stoi['<pad>']

    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=Collate(pad_idx=pad_idx)
    )

    return loader

### Creating Loader Objects

In [19]:
import hazm

In [20]:
min_df=25
vocab = Vocabulary(min_df)
vocab.build_vocabulary(train_texts)

In [21]:
vocab.__len__()

1866

In [22]:
train_loader = get_loader(train_cat, train_texts, train_label, vocab, batch_size=256)

In [23]:
val_loader = get_loader(val_cat, val_texts, val_label, vocab, batch_size=128)

### Regressor

In [24]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F

In [25]:
class Regressor(nn.Module):
    def __init__(self, cat_dim, dict_dim, embedding_dim, hidden_dim):
        super().__init__()
        
        self.embedding = nn.Embedding(dict_dim, embedding_dim, padding_idx=0, dtype=T.float32)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=2, bidirectional=True, batch_first=True)

        self.cat_shrink = nn.Sequential(
            nn.Linear(cat_dim, 500),
            nn.Tanh(),            
        )
                
        self.fc = nn.Sequential(
            nn.Linear(1000, 200),
            nn.ReLU(),
            nn.Linear(200, 40),
            nn.ReLU(),
            nn.Linear(40, 1),
            nn.Sigmoid()
        )
        
        
    def forward(self, cat, text, text_lengths):
        
        embedded = self.embedding(text)
                        
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths, enforce_sorted=False, batch_first=True)
        
        packed_output, hidden = self.gru(packed_embedded)
                
        hidden = T.cat((hidden[-2], hidden[-1]), dim=1)
        
        cat_shrinked = self.cat_shrink(cat)
        
        lin_input = T.cat((cat_shrinked, hidden), dim=1)
        
        out = self.fc(lin_input)
        
        out = 2 * out - 0.5
        
        return out

In [26]:
CAT_DIM = train_cat.shape[1]
DICT_DIM = vocab.__len__()
EMBEDDING_DIM = 200
HIDDEN_DIM = 250

model = Regressor(CAT_DIM, DICT_DIM, EMBEDDING_DIM, HIDDEN_DIM)

### Training

In [27]:
import torch.optim as optim
import time
import torch.nn.functional as F
from sklearn.metrics import mean_absolute_percentage_error

In [28]:
optimizer = optim.Adam(model.parameters(), lr=0.003)
criterion = nn.MSELoss()

In [29]:
device = 'cuda'
criterion = criterion.to(device)
model = model.to(device)

In [30]:
def score(preds, y):
    return mean_absolute_percentage_error(y, preds)

In [31]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    preds = []
    labels = []
    
    model.train()
        
    for i, (cat, text, label, text_lengths) in enumerate(iterator):
        
        optimizer.zero_grad()
        
        cat = cat.to(device)
        text = text.to(device)
        label = label.to(device)
        
        predictions = model(cat, text, text_lengths).squeeze()
        loss = criterion(predictions, label)
                
        loss.backward()
        
        optimizer.step()
        
        preds += predictions.tolist()
        labels += label.tolist()
        
        epoch_loss += loss.item() * len(text)
        
    epoch_score = score(preds, labels)
                
    return epoch_loss / len(iterator.dataset), epoch_score

In [32]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    preds = []
    labels = []
    
    model.eval()
        
    for i, (cat, text, label, text_lengths) in enumerate(iterator):
                
        cat = cat.to(device)
        text = text.to(device)
        label = label.to(device)
        
        predictions = model(cat, text, text_lengths).squeeze()
        loss = criterion(predictions, label)
                
        preds += predictions.tolist()
        labels += label.tolist()
        
        epoch_loss += loss.item() * len(text)
        
    epoch_score = score(preds, labels)
                
    return epoch_loss / len(iterator.dataset), epoch_score

In [33]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
N_EPOCHS = 12

for epoch in range(N_EPOCHS):

    print(f'Epoch: {epoch+1}')
    
    start_time = time.time()
    
    train_loss, train_score = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_score = evaluate(model, val_loader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'    Train Loss: {train_loss:.5f} | Train Score: {train_score/10e8:.2f}')
    print(f'     Val. Loss: {valid_loss:.5f} |  Val. Score: {valid_score/10e8:.2f}')
    print()

Epoch: 1
Epoch Time: 0m 26s
    Train Loss: 0.79362 | Train Score: 2035.72
     Val. Loss: 0.96397 |  Val. Score: 2377.83

Epoch: 2
Epoch Time: 0m 26s
    Train Loss: 0.74519 | Train Score: 1790.26
     Val. Loss: 0.95377 |  Val. Score: 1799.79

Epoch: 3
Epoch Time: 0m 27s
    Train Loss: 0.73931 | Train Score: 1780.85
     Val. Loss: 0.95352 |  Val. Score: 1953.35

Epoch: 4
Epoch Time: 0m 28s
    Train Loss: 0.73907 | Train Score: 1748.82
     Val. Loss: 0.96189 |  Val. Score: 2237.20

Epoch: 5
Epoch Time: 0m 28s
    Train Loss: 0.73774 | Train Score: 1713.09
     Val. Loss: 0.95304 |  Val. Score: 2016.89

Epoch: 6
Epoch Time: 0m 28s
    Train Loss: 0.73475 | Train Score: 1604.86
     Val. Loss: 0.95173 |  Val. Score: 2153.64

Epoch: 7
Epoch Time: 0m 28s
    Train Loss: 0.73501 | Train Score: 1607.17
     Val. Loss: 0.95317 |  Val. Score: 2123.40

Epoch: 8
Epoch Time: 0m 27s
    Train Loss: 0.73265 | Train Score: 1588.78
     Val. Loss: 0.95146 |  Val. Score: 2113.34

Epoch: 9


### Saving The Model

In [31]:
T.save(model.state_dict(), 'model.torch')

In [32]:
with open('vocab_stoi.pkl', 'wb') as f:
    pickle.dump(vocab.stoi, f)

In [33]:
with open('vocab_itos.pkl', 'wb') as f:
    pickle.dump(vocab.itos, f)

In [187]:
max_label

2770000000

### Loading The Model

In [12]:
import pickle
import hazm

In [10]:
with open('vocab_stoi.pkl', 'rb') as f:
    vocab_stoi = pickle.load(f)

with open('vocab_itos.pkl', 'rb') as f:
    vocab_itos = pickle.load(f)

In [13]:
vocab = Vocabulary(10)
vocab.stoi = vocab_stoi
vocab.itos = vocab_itos

In [14]:
CAT_DIM = 23123
DICT_DIM = vocab.__len__()
EMBEDDING_DIM = 200
HIDDEN_DIM = 200

model = Regressor(CAT_DIM, DICT_DIM, EMBEDDING_DIM, HIDDEN_DIM)

In [15]:
model.load_state_dict(T.load('model.torch'))

<All keys matched successfully>

In [16]:
device = 'cuda'
model = model.to(device)

### Loading Test Data

In [47]:
from scipy.sparse import load_npz
import hazm
import numpy as np

In [48]:
with open('test_texts.pkl', 'rb') as f:
    test_texts = pickle.load(f)
    
test_cat = load_npz("test_cat.npz")

In [49]:
test_cat = test_cat.toarray()

In [50]:
test_loader = get_loader(test_cat, test_texts, np.zeros(len(test_texts)), vocab, batch_size=128, shuffle=False)

In [51]:
max_label = 2770000000

### Predicting

In [85]:
def predict(model, iterator):
    
    model.eval()
    
    preds = []
    
    with T.no_grad():
        for i, (cat, text, label, text_lengths) in enumerate(iterator):

            cat = cat.to(device)
            text = text.to(device)
            
            predictions = model(cat, text, text_lengths).squeeze()
            preds += predictions.tolist()
        
    return preds

In [None]:
preds = predict(model, test_loader)

In [41]:
preds = [int(p * max_label) for p in preds]

In [42]:
preds = [p if p > 0 else 0 for p in preds]

### Saving to File

In [89]:
import pandas as pd

In [90]:
test_df = pd.read_csv('./data/test.csv')

In [91]:
pred_df = pd.DataFrame()
pred_df['id'] = test_df['id']
pred_df['price'] = avg_preds

In [92]:
pred_df.head(50)

Unnamed: 0,id,price
0,0,17517130
1,1,17517130
2,2,8869557
3,3,17517130
4,4,17517130
5,5,17517130
6,6,17517130
7,7,17517130
8,8,17517130
9,9,17517130


In [93]:
pred_df.to_csv('output.csv', index=False)