### Loading from File

In [1]:
from scipy.sparse import load_npz
import pickle
import numpy as np

In [2]:
# Load the raw training texts (presumably a list of strings — they are later
# tokenized one by one; confirm against the preprocessing notebook).
# NOTE: pickle.load can execute arbitrary code; only unpickle trusted files.
with open('train_texts.pkl', 'rb') as f:
    train_texts = pickle.load(f)
    
# Categorical features, stored on disk as a SciPy sparse matrix.
train_cat = load_npz("train_cat.npz")

# Target labels as a NumPy array (used as binary targets further down).
with open('train_label.npy', 'rb') as f:
    train_label = np.load(f)

### Train-Validation Split

In [3]:
import math
import numpy as np

In [4]:
# Densify the sparse categorical matrix for the row indexing and tensor
# conversion done downstream. The guard keeps this cell re-runnable: after the
# first run `train_cat` is already a dense ndarray, which has no `.toarray()`.
if hasattr(train_cat, "toarray"):
    train_cat = train_cat.toarray()

In [5]:
# Hold out 10% of the examples for validation. The permutation is drawn from a
# seeded generator so the split (and the validation metrics that depend on it)
# is reproducible across kernel restarts.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.9

train_len = len(train_texts)
split_at = math.floor(TRAIN_FRACTION * train_len)

indices = np.random.default_rng(SPLIT_SEED).permutation(train_len)
train_idx = indices[:split_at]
val_idx = indices[split_at:]

In [6]:
# Materialise the validation split first: the next cell rebinds train_texts /
# train_cat / train_label to the training subset, after which these rows can
# no longer be selected from them.
val_texts = [train_texts[i] for i in val_idx]
val_cat = train_cat[val_idx]
val_label = train_label[val_idx]

In [7]:
# Rebind the full-data names to the training subset.
# NOTE: not idempotent — train_idx refers to positions in the full data, so
# re-running this cell without reloading indexes the already-reduced arrays.
train_texts = [train_texts[i] for i in train_idx]
train_cat = train_cat[train_idx]
train_label = train_label[train_idx]

### Creating PyTorch Datasets

In [8]:
import torch as T
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [9]:
class Vocabulary:
    """Token <-> index mapping built from a tokenized corpus.

    Index 0 is reserved for ``<pad>`` and index 1 for ``<unk>``. Every other
    token receives the next free index the moment its occurrence count reaches
    ``min_df``, so indices follow the order in which tokens cross the threshold.
    """

    def __init__(self, min_df):
        """Create an empty vocabulary holding only the two special tokens.

        Args:
            min_df: Minimum number of occurrences (counted per token
                occurrence, not per document) a token needs to be included.
                Values below 1 are treated as 1.
        """
        self.itos = {0: '<pad>', 1: '<unk>'}
        self.stoi = {'<pad>': 0, '<unk>': 1}
        self.min_df = min_df
        self.tokenizer = hazm.WordTokenizer(
            join_verb_parts=False,
            separate_emoji=True,
            replace_links=True,
            replace_IDs=False,
            replace_emails=True,
            replace_numbers=False,
            replace_hashtags=False
        )

    def __len__(self):
        """Return the number of known tokens, special tokens included."""
        return len(self.itos)

    def build_vocabulary(self, sentence_list):
        """Add every token that occurs at least ``min_df`` times to the mapping.

        Counts are local to one call. Tokens already present in the mapping
        (including the special tokens) keep their index, and new tokens are
        numbered after the highest index in use, so repeated calls extend the
        vocabulary instead of overwriting existing entries.

        Args:
            sentence_list: Iterable of raw strings to tokenize and count.
        """
        frequencies = {}
        # Continue after the largest index in use rather than restarting at 2,
        # otherwise a second call would hand out indices that are already taken.
        idx = max(self.itos, default=-1) + 1
        # A count can never equal a threshold below 1; clamp so such values
        # mean "keep every token" instead of silently keeping none.
        threshold = max(self.min_df, 1)

        for sentence in sentence_list:
            for word in self.tokenizer.tokenize(sentence):
                frequencies[word] = frequencies.get(word, 0) + 1

                # The membership test protects '<pad>'/'<unk>' (and previously
                # added tokens) from being reassigned a new index.
                if frequencies[word] == threshold and word not in self.stoi:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        """Tokenize ``text`` and map each token to its index.

        Tokens that are not in the vocabulary map to the ``<unk>`` index.

        Returns:
            A list of integer indices, one per token.
        """
        unk_idx = self.stoi['<unk>']
        return [
            self.stoi.get(token, unk_idx)
            for token in self.tokenizer.tokenize(text)
        ]

In [10]:
class DivarDataset(Dataset):
    """Dataset pairing a categorical feature row, a numericalized text and a label.

    All texts are converted to index lists once, at construction time, using
    ``vocab.numericalize``.
    """

    def __init__(self, cat_mat, text_list, labels, vocab):
        """Store the features and pre-numericalize every text.

        Args:
            cat_mat: Row-indexable matrix of categorical features.
            text_list: Iterable of raw texts, one per row of ``cat_mat``.
            labels: Indexable collection of labels; its length defines the
                dataset size.
            vocab: Object exposing ``numericalize(text) -> list[int]``.
        """
        self.cat_mat = cat_mat
        self.labels = labels
        self.vocab = vocab

        self.text_list = [self.vocab.numericalize(text) for text in text_list]

    def __len__(self):
        """Return the number of examples (taken from ``labels``)."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return ``(cat_row, token_indices, label)`` for one example.

        ``cat_row`` is a float32 tensor, ``token_indices`` an int64 tensor and
        ``label`` is returned exactly as stored.
        """
        cat_row = T.tensor(self.cat_mat[index], dtype=T.float32)
        # Pin the dtype: T.tensor([]) would infer float32 for a text with no
        # tokens, which is not a valid index tensor for an embedding lookup.
        numer_text = T.tensor(self.text_list[index], dtype=T.long)
        return cat_row, numer_text, self.labels[index]

In [11]:
class Collate:
    """Batch-collation callable that pads variable-length texts."""

    def __init__(self, pad_idx):
        """Remember the index used to fill padded positions."""
        self.pad_idx = pad_idx

    def __call__(self, batch):
        """Merge ``(cat_row, token_indices, label)`` samples into one batch.

        Returns:
            A tuple ``(cat_rows, padded_texts, labels, text_lengths)`` where
            the categorical rows are stacked vertically, the texts are
            right-padded with ``pad_idx`` to the longest text in the batch
            (batch dimension first), the labels form a float32 tensor and
            ``text_lengths`` lists the unpadded length of every text.
        """
        cat_rows = T.vstack([sample[0] for sample in batch])

        sequences = [sample[1] for sample in batch]
        text_lengths = [seq.shape[0] for seq in sequences]
        padded_texts = pad_sequence(
            sequences, batch_first=True, padding_value=self.pad_idx
        )

        labels = T.tensor([sample[2] for sample in batch], dtype=T.float32)

        return cat_rows, padded_texts, labels, text_lengths

In [12]:
def get_loader(cat_mat, text_list, labels, vocab, batch_size=32, shuffle=True):
    """Wrap the given features in a ``DivarDataset`` and return a ``DataLoader``.

    Batches are assembled by ``Collate``, which pads texts with the
    vocabulary's ``<pad>`` index.

    Args:
        cat_mat: Categorical feature matrix, one row per example.
        text_list: Raw texts, one per example.
        labels: Labels, one per example.
        vocab: Vocabulary used for numericalization and for the pad index.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the data on every pass.
    """
    dataset = DivarDataset(cat_mat, text_list, labels, vocab)
    collate_fn = Collate(pad_idx=dataset.vocab.stoi['<pad>'])

    return DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn,
    )

### Creating Loader Objects

In [13]:
import hazm

In [14]:
# Build the vocabulary from the training split only; tokens seen fewer than
# `min_df` times are left out and will map to '<unk>'.
min_df = 20
vocab = Vocabulary(min_df=min_df)
vocab.build_vocabulary(sentence_list=train_texts)

In [15]:
# Training batches are reshuffled on every pass (get_loader defaults to shuffle=True).
train_loader = get_loader(train_cat, train_texts, train_label, vocab, batch_size=128)

In [16]:
# Validation batches are only scored, never trained on, so keep them in a
# fixed order (shuffle=False) to make evaluation deterministic.
val_loader = get_loader(val_cat, val_texts, val_label, vocab, batch_size=128, shuffle=False)

### Classifier

In [17]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F

In [18]:
class Classifier(nn.Module):
    """Binary classifier combining a text CNN with dense categorical features.

    Text branch: embedding -> one Conv1d per filter size -> ReLU -> max-pool
    over time -> concatenation -> Linear(512) + ReLU.
    Categorical branch: Linear(512) + ReLU.
    Both 512-d vectors are concatenated and passed through an MLP that emits a
    single raw logit per example (no sigmoid is applied here).
    """

    def __init__(self, cat_dim, dict_dim, embedding_dim, n_filters, filter_sizes):
        """Build the layers.

        Args:
            cat_dim: Width of the categorical feature vector.
            dict_dim: Vocabulary size (number of embedding rows).
            embedding_dim: Size of each token embedding.
            n_filters: Output channels of every convolution.
            filter_sizes: Kernel sizes, one convolution per entry.
        """
        super().__init__()

        # Shortest sequence every convolution can process. Stored as a plain
        # int (not a parameter or buffer) so saved state_dicts are unaffected.
        self.min_text_len = max(filter_sizes, default=0)

        # Index 0 is the padding token; its embedding stays at zero.
        self.embedding = nn.Embedding(dict_dim, embedding_dim, padding_idx=0, dtype=T.float32)

        self.convs = nn.ModuleList(
            [nn.Conv1d(in_channels = embedding_dim,
                       out_channels = n_filters,
                       kernel_size = fs)
             for fs in filter_sizes]
        )

        self.cat_shrink = nn.Sequential(
            nn.Linear(cat_dim, 512),
            nn.ReLU(),
        )

        self.text_shrink = nn.Sequential(
            nn.Linear(len(filter_sizes)*n_filters, 512),
            nn.ReLU(),
        )

        # 1024 = 512 (categorical branch) + 512 (text branch).
        self.fc = nn.Sequential(
            nn.Linear(1024, 300),
            nn.ReLU(),
            nn.Linear(300, 150),
            nn.ReLU(),
            nn.Linear(150, 1)
        )

    def forward(self, cat, text, text_lengths):
        """Compute one logit per example.

        Args:
            cat: Float tensor of shape (batch, cat_dim).
            text: Integer tensor of token indices, shape (batch, seq_len).
            text_lengths: Unpadded text lengths; accepted for interface
                compatibility but not used by this model.

        Returns:
            Tensor of shape (batch, 1) holding raw logits.
        """
        # Conv1d raises when the input is shorter than its kernel, which
        # happens when every text in a batch is very short. Right-pad with the
        # padding index so the widest filter always fits; longer batches are
        # left untouched.
        shortfall = self.min_text_len - text.shape[1]
        if shortfall > 0:
            text = F.pad(text, (0, shortfall), value=self.embedding.padding_idx)

        embedded = self.embedding(text)
        # Conv1d expects (batch, channels, seq_len).
        embedded = embedded.permute(0, 2, 1)

        conved = [F.relu(conv(embedded)) for conv in self.convs]

        # Max over the whole time axis -> (batch, n_filters) per filter size.
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]

        text_emb = T.cat(pooled, dim = 1)
        text_shrinked = self.text_shrink(text_emb)

        cat_shrinked = self.cat_shrink(cat)

        lin_input = T.cat((cat_shrinked, text_shrinked), dim=1)

        return self.fc(lin_input)

In [19]:
# Model hyperparameters. The two input widths are derived from the data: the
# number of categorical columns and the size of the fitted vocabulary.
CAT_DIM = train_cat.shape[1]
DICT_DIM = len(vocab)
EMBEDDING_DIM = 300
N_FILTERS = 200
FILTER_SIZES = [1, 2, 4]

model = Classifier(
    cat_dim=CAT_DIM,
    dict_dim=DICT_DIM,
    embedding_dim=EMBEDDING_DIM,
    n_filters=N_FILTERS,
    filter_sizes=FILTER_SIZES,
)

### Training

In [20]:
import torch.optim as optim
from sklearn.metrics import roc_auc_score
import time

In [21]:
# Adam with its default hyperparameters.
optimizer = optim.Adam(model.parameters())
# The model outputs raw logits, so the loss applies the sigmoid internally.
criterion = nn.BCEWithLogitsLoss()

In [22]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing on the `.to('cuda')` calls.
device = 'cuda' if T.cuda.is_available() else 'cpu'
criterion = criterion.to(device)
model = model.to(device)

In [23]:
def binary_accuracy(preds, y):
    """Return the fraction of examples whose thresholded prediction equals ``y``.

    ``preds`` are raw logits: they are passed through a sigmoid and rounded to
    0/1 before being compared with the targets. The result is a 0-d tensor.
    """
    hard_preds = T.sigmoid(preds).round()
    hits = hard_preds.eq(y).float()
    return hits.sum() / len(hits)

In [24]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Each batch is moved to the module-level ``device``, scored, and used for a
    single optimizer step. Progress is printed every 500 mini-batches.

    Returns:
        ``(mean_loss, mean_accuracy)``, both averaged over the number of
        batches (not over the number of examples).
    """
    model.train()

    running_loss = 0
    running_acc = 0

    for step, (cat, text, label, text_lengths) in enumerate(iterator):

        if step % 500 == 0:
            print(f'    mini-batch {step}')

        optimizer.zero_grad()

        cat, text, label = cat.to(device), text.to(device), label.to(device)

        # (batch, 1) -> (batch,) so the logits line up with the labels.
        logits = model(cat, text, text_lengths).squeeze(1)

        loss = criterion(logits, label)
        acc = binary_accuracy(logits, label)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [25]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating its weights.

    Returns:
        ``(mean_loss, mean_accuracy, auc)``. Loss and accuracy are averaged
        over the number of batches; the ROC AUC is computed once over the
        sigmoid probabilities of all examples.
    """
    model.eval()

    running_loss = 0
    running_acc = 0
    all_probs = []
    all_labels = []

    with T.no_grad():
        for cat, text, label, text_lengths in iterator:
            cat, text, label = cat.to(device), text.to(device), label.to(device)

            # (batch, 1) -> (batch,) so the logits line up with the labels.
            logits = model(cat, text, text_lengths).squeeze(1)

            all_probs.extend(T.sigmoid(logits).tolist())
            all_labels.extend(label.tolist())

            running_loss += criterion(logits, label).item()
            running_acc += binary_accuracy(logits, label).item()

    auc = roc_auc_score(all_labels, all_probs)

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches, auc

In [26]:
def epoch_time(start_time, end_time):
    """Split the time between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated.
    """
    span = end_time - start_time
    whole_minutes = int(span / 60)
    leftover_seconds = int(span - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

### Experiments
- batch size doesn't matter
- negative sampling makes things worse
- better to use dropout on each layer, but only a small amount (=0.1-0.2)
- cat_embedding didn't help at all
- better to shrink both text and cat to the same size (=512), with relu (not tanh)
- adam default lr is good
- text separation helps a lot

In [32]:
# Number of full passes over the training data.
N_EPOCHS = 5

# Each epoch: one training pass, then one validation pass, then a summary line.
for epoch in range(N_EPOCHS):

    print(f'Epoch: {epoch+1}')
    
    # The timed span covers both the training and the validation pass.
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc, valid_auc = evaluate(model, val_loader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
       
    print(f'Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'    Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'     Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% | Val. AUC: {valid_auc:.4f}')
    print()

Epoch: 1
    mini-batch 0
    mini-batch 500
    mini-batch 1000
    mini-batch 1500
    mini-batch 2000
    mini-batch 2500
    mini-batch 3000
    mini-batch 3500
Epoch Time: 4m 39s
    Train Loss: 0.251 | Train Acc: 90.62%
     Val. Loss: 0.212 |  Val. Acc: 92.42% | Val. AUC: 0.9511

Epoch: 2
    mini-batch 0
    mini-batch 500
    mini-batch 1000
    mini-batch 1500
    mini-batch 2000
    mini-batch 2500
    mini-batch 3000
    mini-batch 3500
Epoch Time: 4m 20s
    Train Loss: 0.203 | Train Acc: 92.82%
     Val. Loss: 0.195 |  Val. Acc: 93.01% | Val. AUC: 0.9580

Epoch: 3
    mini-batch 0
    mini-batch 500
    mini-batch 1000
    mini-batch 1500
    mini-batch 2000
    mini-batch 2500
    mini-batch 3000
    mini-batch 3500
Epoch Time: 4m 22s
    Train Loss: 0.185 | Train Acc: 93.55%
     Val. Loss: 0.191 |  Val. Acc: 93.21% | Val. AUC: 0.9608

Epoch: 4
    mini-batch 0
    mini-batch 500
    mini-batch 1000
    mini-batch 1500
    mini-batch 2000
    mini-batch 2500
    mini-ba

### Saving The Model

In [82]:
# Persist the weights and the optimizer state separately; the optimizer state
# is only needed to resume training, not for inference.
T.save(model.state_dict(), 'conv.torch')
T.save(optimizer.state_dict(), 'conv_optim.torch')

In [84]:
# Save the token -> index mapping so inference can reuse the exact indices the
# embedding layer was trained with.
with open('vocab_stoi.pkl', 'wb') as f:
    pickle.dump(vocab.stoi, f)

In [85]:
# Save the inverse (index -> token) mapping alongside it.
with open('vocab_itos.pkl', 'wb') as f:
    pickle.dump(vocab.itos, f)

In [86]:
# Record the two data-dependent dimensions; the loading section below
# hard-codes these values to rebuild the model.
print(CAT_DIM, DICT_DIM)

1431 9304


### Loading Test Data & The Model

In [93]:
from scipy.sparse import load_npz
import pickle
import hazm
import numpy as np

In [94]:
# Load the test texts.
# NOTE: pickle.load can execute arbitrary code; only unpickle trusted files.
with open('test_texts.pkl', 'rb') as f:
    test_texts = pickle.load(f)
    
# Categorical test features, stored on disk as a SciPy sparse matrix.
test_cat = load_npz("test_cat.npz")

In [95]:
# Densify the sparse test matrix. The guard keeps this cell re-runnable: after
# the first run `test_cat` is already a dense ndarray, which has no `.toarray()`.
if hasattr(test_cat, "toarray"):
    test_cat = test_cat.toarray()

In [96]:
# Restore the token <-> index mappings written in the saving section.
# NOTE: pickle.load can execute arbitrary code; only unpickle trusted files.
with open('vocab_stoi.pkl', 'rb') as f:
    vocab_stoi = pickle.load(f)

with open('vocab_itos.pkl', 'rb') as f:
    vocab_itos = pickle.load(f)

In [97]:
# 20 mirrors the min_df used when the vocabulary was built. The constructor is
# called only to get a configured tokenizer; the mappings are replaced with the
# pickled ones, so build_vocabulary is not run again.
vocab = Vocabulary(20)
vocab.stoi = vocab_stoi
vocab.itos = vocab_itos

In [98]:
# The zero labels are placeholders (predict ignores them). shuffle=False keeps
# the batches in row order so predictions can be matched to rows positionally.
test_loader = get_loader(test_cat, test_texts, np.zeros(len(test_texts)), vocab, batch_size=128, shuffle=False)

In [101]:
# Rebuild the architecture with the same dimensions that were printed when the
# model was saved; they must match the checkpoint for the weights to load.
CAT_DIM = 1431
DICT_DIM = 9304
EMBEDDING_DIM = 300
N_FILTERS = 200
FILTER_SIZES = [1, 2, 4]

model = Classifier(CAT_DIM, DICT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES)
# map_location='cpu' lets a checkpoint that was saved from a CUDA model load on
# a machine without a GPU; the model is moved to the target device afterwards.
model.load_state_dict(T.load('conv.torch', map_location='cpu'))

<All keys matched successfully>

In [None]:
# NOTE(review): `optimizer` is not re-created in this section — it is the Adam
# instance built in the training section from the earlier `model` object, so
# restoring its state here does not attach it to the `model` constructed above.
# Only relevant when resuming training; confirm before relying on it.
optimizer.load_state_dict(T.load('conv_optim.torch'))

In [102]:
# Prefer the GPU, but fall back to the CPU so inference still runs on machines
# without CUDA.
device = 'cuda' if T.cuda.is_available() else 'cpu'
model = model.to(device)

### Predicting

In [103]:
import pandas as pd

In [104]:
def predict(model, iterator):
    """Return the predicted probability for every example in ``iterator``.

    The model is put in eval mode and run without gradient tracking; its
    logits are passed through a sigmoid. Labels yielded by the iterator are
    ignored. Probabilities are returned as a flat Python list in iteration
    order.
    """
    model.eval()

    scores = []

    with T.no_grad():
        for cat, text, _label, text_lengths in iterator:
            logits = model(cat.to(device), text.to(device), text_lengths).squeeze(1)
            scores.extend(T.sigmoid(logits).tolist())

    return scores

In [105]:
# One probability per test row, in loader order (the loader is not shuffled).
preds = predict(model, test_loader)

In [106]:
# Pair each prediction with its post id. Predictions are attached by position,
# so `preds` must be in the same row order as the parquet file.
test_df = pd.read_parquet('DMC-Phase1-Validation.parquet', engine='fastparquet')
pred_df = pd.DataFrame({'post_id': test_df['post_id'], 'predictions': preds})

In [107]:
# Quick sanity check of the first few predictions.
pred_df.head(5)

Unnamed: 0,post_id,predictions
0,c16685db-c7b2-403e-b56d-4a745d7e4686,0.977904
1,e65f2de9-acd2-4f03-9395-24f89e1fed32,0.656862
2,cdf973fe-0b45-49d5-b5d6-bbca65c87adc,0.076129
3,e29d3726-6f7e-42f2-9684-26f1cd3405f8,0.978551
4,37fb59d9-be82-4985-84ed-9132732b2144,0.001803


In [108]:
# Write the predictions without the DataFrame index column.
pred_df.to_csv('pred.csv', index=False)