### Loading from File

In [1]:
from scipy.sparse import load_npz
import pickle
import numpy as np

In [2]:
# Load the three training artifacts: raw texts (pickled list), the sparse
# categorical feature matrix, and the label vector.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts produced by a trusted pipeline.
with open('train_texts.pkl', 'rb') as texts_file:
    train_texts = pickle.load(texts_file)

train_cat = load_npz("train_cat.npz")

train_label = np.load('train_label.npy')

### Train-Validation Split

In [3]:
import math
import numpy as np

In [4]:
# Densify the sparse categorical matrix so its rows can be indexed with an
# index array and passed to torch.tensor. Guarded so that re-running this cell
# is a no-op instead of raising AttributeError on an already-dense ndarray.
if hasattr(train_cat, 'toarray'):
    train_cat = train_cat.toarray()

In [5]:
# Seed every RNG the notebook draws from so a Restart & Run All is repeatable:
# NumPy drives the train/validation shuffle, PyTorch drives weight
# initialisation and DataLoader shuffling.
import torch

SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

In [6]:
# Shuffle the row positions once with NumPy's global RNG and cut them
# 90% / 10% into training and validation index sets.
train_len = len(train_texts)
split_at = math.floor(0.9 * train_len)

indices = np.arange(train_len)
np.random.shuffle(indices)

train_idx, val_idx = indices[:split_at], indices[split_at:]

In [7]:
# Materialise the validation subset first: the next cell rebinds train_texts,
# train_cat and train_label to the training rows only, so these lookups must
# run while those names still refer to the full data.
val_texts = [train_texts[i] for i in val_idx]
val_cat = train_cat[val_idx]
val_label = train_label[val_idx]

In [8]:
# Narrow the three training containers to the training indices.
# NOTE: this overwrites the full data in place, so the split cells cannot be
# re-run without reloading the files first.
train_texts = [train_texts[i] for i in train_idx]
train_cat = train_cat[train_idx]
train_label = train_label[train_idx]

### Creating PyTorch Datasets

In [9]:
import torch as T
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [10]:
class Vocabulary:
    """Token <-> integer-id mapping built from a corpus with a frequency cut-off.

    Ids 0 and 1 are reserved for '<pad>' and '<unk>'. Every other token gets
    the next free id at the moment its running count reaches ``min_df``.

    Args:
        min_df: minimum number of occurrences a token needs to enter the
            vocabulary. Values below 1 are treated as 1 (keep every token).
        max_len: maximum number of tokens kept by ``numericalize``;
            ``None`` disables truncation. Defaults to 128.
    """

    def __init__(self, min_df, max_len=128):
        self.itos = {0: '<pad>', 1: '<unk>'}
        self.stoi = {'<pad>': 0, '<unk>': 1}
        self.min_df = min_df
        self.max_len = max_len
        self.tokenizer = hazm.WordTokenizer(
            join_verb_parts=False,
            separate_emoji=True,
            replace_links=True,
            replace_IDs=False,
            replace_emails=True,
            replace_numbers=False,
            replace_hashtags=False
        )

    def __len__(self):
        """Number of ids in the vocabulary, including the two reserved ones."""
        return len(self.itos)

    def build_vocabulary(self, sentence_list):
        """Add every token that occurs at least ``min_df`` times in ``sentence_list``.

        Safe to call more than once: ids continue after the highest id already
        assigned, and tokens that already have an id (including the reserved
        '<pad>' / '<unk>' strings) are never reassigned.
        """
        frequencies = {}
        # Continue numbering after existing entries instead of restarting at 2,
        # which would overwrite earlier id -> token entries on a second call.
        idx = max(self.itos, default=-1) + 1
        # A running count can never equal a threshold below 1, which would
        # silently leave the vocabulary empty.
        threshold = max(1, self.min_df)

        for sentence in sentence_list:
            for word in self.tokenizer.tokenize(sentence):
                frequencies[word] = frequencies.get(word, 0) + 1

                if frequencies[word] == threshold and word not in self.stoi:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        """Tokenize ``text`` and map it to ids, truncated to ``max_len`` tokens.

        Tokens that are not in the vocabulary map to the '<unk>' id.
        """
        tokenized_text = self.tokenizer.tokenize(text)[:self.max_len]
        unk_idx = self.stoi['<unk>']

        return [self.stoi.get(token, unk_idx) for token in tokenized_text]

In [11]:
class DivarDataset(Dataset):
    """Dataset pairing each categorical feature row with its numericalized text and label.

    Args:
        cat_mat: row-indexable matrix of categorical features.
        text_list: iterable of raw text strings, one per row.
        labels: sequence of labels, one per row.
        vocab: object exposing ``numericalize(text)`` and a ``stoi`` mapping
            that contains '<unk>'.
    """

    def __init__(self, cat_mat, text_list, labels, vocab):
        self.cat_mat = cat_mat
        self.labels = labels
        self.vocab = vocab

        # Numericalize once up front so __getitem__ does no tokenization.
        unk_idx = self.vocab.stoi['<unk>']
        # A text that tokenizes to nothing would become a zero-length sequence,
        # which pack_padded_sequence rejects; represent it as a single '<unk>'.
        self.text_list = [
            self.vocab.numericalize(text) or [unk_idx] for text in text_list
        ]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        """Return ``(float32 feature tensor, int64 token-id tensor, label)`` for one row."""
        cat_row = self.cat_mat[index]
        numer_text = self.text_list[index]
        label = self.labels[index]
        # dtype is explicit so the id tensor is int64 regardless of its contents
        # (torch.tensor([]) would otherwise default to float32).
        return (
            T.tensor(cat_row, dtype=T.float32),
            T.tensor(numer_text, dtype=T.long),
            label,
        )

In [12]:
class Collate:
    """Batch-collation callable that pads variable-length token sequences.

    Args:
        pad_idx: id used to fill sequences up to the longest one in the batch.
    """

    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        """Merge ``(cat_row, token_ids, label)`` samples into batch tensors.

        Returns:
            ``(cat_rows, numer_texts, labels, text_lengths)``: stacked feature
            rows, token ids padded with ``pad_idx`` (batch first), an int64
            label tensor, and the unpadded sequence lengths as a Python list.
        """
        features, sequences, targets = [], [], []
        for sample in batch:
            features.append(sample[0])
            sequences.append(sample[1])
            targets.append(sample[2])

        # Lengths are taken before padding; the model needs them for packing.
        text_lengths = [seq.shape[0] for seq in sequences]

        cat_rows = T.vstack(features)
        numer_texts = pad_sequence(
            sequences, batch_first=True, padding_value=self.pad_idx
        )
        labels = T.tensor(targets, dtype=T.long)

        return cat_rows, numer_texts, labels, text_lengths

In [13]:
def get_loader(cat_mat, text_list, labels, vocab, batch_size=32, shuffle=True):
    """Wrap the given features, texts and labels in a padded-batch DataLoader.

    Builds a DivarDataset from the inputs and attaches a Collate instance
    that pads with the vocabulary's '<pad>' id.

    Returns:
        A ``DataLoader`` yielding ``(cat_rows, numer_texts, labels, text_lengths)``.
    """
    dataset = DivarDataset(cat_mat, text_list, labels, vocab)
    collate_fn = Collate(pad_idx=dataset.vocab.stoi['<pad>'])

    return DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn,
    )

### Creating Loader Objects

In [14]:
import hazm

In [15]:
# Fit the vocabulary on the training texts only; a token receives an id once
# it has been seen `min_df` times.
min_df = 25
vocab = Vocabulary(min_df=min_df)
vocab.build_vocabulary(sentence_list=train_texts)

In [16]:
# Training batches; get_loader's default shuffle=True applies.
train_loader = get_loader(
    cat_mat=train_cat,
    text_list=train_texts,
    labels=train_label,
    vocab=vocab,
    batch_size=256,
)

In [17]:
# Validation batches, numericalized with the vocabulary fitted on the training
# texts. get_loader's default shuffle=True applies here as well.
val_loader = get_loader(
    cat_mat=val_cat,
    text_list=val_texts,
    labels=val_label,
    vocab=vocab,
    batch_size=128,
)

### Classifier

In [18]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F

In [19]:
class Classifier(nn.Module):
    """Multi-class classifier over a categorical feature vector plus a token sequence.

    Token ids are embedded and run through a 2-layer bidirectional GRU. The
    last two entries of the GRU's final hidden state are concatenated with a
    tanh-projected copy of the categorical features and passed to a
    three-layer MLP that emits one logit per class.

    Args:
        cat_dim: width of the categorical feature vector.
        dict_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: GRU hidden size per direction.
        num_classes: number of output logits. Defaults to 9.
        cat_hidden_dim: width of the projected categorical features.
            Defaults to 600.
    """

    def __init__(self, cat_dim, dict_dim, embedding_dim, hidden_dim,
                 num_classes=9, cat_hidden_dim=600):
        super().__init__()

        # padding_idx=0 must agree with the vocabulary's '<pad>' id.
        self.embedding = nn.Embedding(dict_dim, embedding_dim, padding_idx=0, dtype=T.float32)
        # The attribute is named `lstm` although the layer is a GRU; the name is
        # kept because it is part of the state_dict keys of saved checkpoints.
        self.lstm = nn.GRU(embedding_dim, hidden_dim, num_layers=2, bidirectional=True, batch_first=True)

        self.cat_shrink = nn.Sequential(
            nn.Linear(cat_dim, cat_hidden_dim),
            nn.Tanh(),
        )

        # Input width is derived from its two sources instead of a fixed 1200,
        # which was only correct for hidden_dim == 300.
        self.fc = nn.Sequential(
            nn.Linear(cat_hidden_dim + 2 * hidden_dim, 400),
            nn.ReLU(),
            nn.Linear(400, 100),
            nn.ReLU(),
            nn.Linear(100, num_classes)
        )

    def forward(self, cat, text, text_lengths):
        """Compute class logits for a batch.

        Args:
            cat: float tensor of categorical features, one row per sample.
            text: padded token-id tensor, batch first.
            text_lengths: unpadded length of each sequence in ``text``.

        Returns:
            Tensor of raw logits with ``num_classes`` columns.
        """
        embedded = self.embedding(text)

        # Packing lets the GRU skip the padded positions of shorter sequences.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths, enforce_sorted=False, batch_first=True)

        _, hidden = self.lstm(packed_embedded)

        # Last two slices of the final hidden state: the two directions of the
        # top GRU layer.
        hidden = T.cat((hidden[-2], hidden[-1]), dim=1)

        cat_shrinked = self.cat_shrink(cat)

        lin_input = T.cat((cat_shrinked, hidden), dim=1)

        return self.fc(lin_input)

In [20]:
# Input sizes are read off the fitted data; embedding and hidden sizes are
# chosen by hand.
CAT_DIM = train_cat.shape[1]
DICT_DIM = len(train_loader.dataset.vocab)
EMBEDDING_DIM = 256
HIDDEN_DIM = 300

model = Classifier(
    cat_dim=CAT_DIM,
    dict_dim=DICT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
)

### Training

In [21]:
import torch.optim as optim
from sklearn.metrics import roc_auc_score
import time
import torch.nn.functional as F

In [22]:
# Per-class loss weights, one entry per class id (0 through 8).
CLASS_WEIGHTS = [1.0, 2.0, 2.0, 2.0, 2.0, 4.0, 2.0, 2.0, 3.0]

optimizer = optim.AdamW(params=model.parameters(), weight_decay=0.2)
criterion = nn.CrossEntropyLoss(weight=T.tensor(CLASS_WEIGHTS))

In [23]:
# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs (slowly) on a machine without CUDA.
device = 'cuda' if T.cuda.is_available() else 'cpu'
criterion = criterion.to(device)
model = model.to(device)

In [24]:
def binary_accuracy(preds, y):
    """Fraction of entries whose rounded prediction equals the target.

    Args:
        preds: tensor of scores, rounded with ``torch.round`` before comparison.
        y: tensor of targets compared element-wise with the rounded scores.

    Returns:
        Scalar tensor: number of matches divided by the length of the first
        dimension.
    """
    hits = T.eq(T.round(preds), y).float()
    return hits.sum() / len(hits)

In [25]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator``.

    Each batch is moved to the global ``device``, the model is updated with
    ``criterion`` on the multi-class logits, and a binary accuracy
    ("class 0" vs "any other class") is tracked alongside the loss.

    Returns:
        ``(mean_loss, mean_binary_accuracy)``, both weighted by batch size and
        divided by ``len(iterator.dataset)``.
    """
    model.train()

    loss_total = 0
    acc_total = 0

    for i, batch in enumerate(iterator):
        cat, text, label, text_lengths = batch

        if i % 500 == 0:
            print(f'    mini-batch {i}')

        optimizer.zero_grad()

        cat, text, label = cat.to(device), text.to(device), label.to(device)

        logits = model(cat, text, text_lengths)
        loss = criterion(logits, label)

        # Collapse the multi-class output to a binary score: 1 - P(class 0).
        prob_nonzero = 1 - F.softmax(logits, dim=1)[:, 0]
        is_nonzero = (label != 0).float()
        acc = binary_accuracy(prob_nonzero, is_nonzero)

        loss.backward()
        optimizer.step()

        # Weight by batch size so a short final batch does not skew the mean.
        batch_size = len(text)
        loss_total += loss.item() * batch_size
        acc_total += acc.item() * batch_size

    n_samples = len(iterator.dataset)
    return loss_total / n_samples, acc_total / n_samples

In [26]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating it.

    Puts the model in eval mode (it is not switched back afterwards) and
    disables gradient tracking for the whole pass.

    Returns:
        ``(mean_loss, mean_binary_accuracy, auc)``. The first two are weighted
        by batch size and divided by ``len(iterator.dataset)``; ``auc`` is the
        ROC AUC of the binary score ``1 - P(class 0)`` against the binary
        target ``label != 0``.
    """
    model.eval()

    loss_total = 0
    acc_total = 0
    preds, labels = [], []

    with T.no_grad():
        for cat, text, label, text_lengths in iterator:
            cat, text, label = cat.to(device), text.to(device), label.to(device)

            # squeeze(1) only has an effect when dimension 1 has size 1.
            logits = model(cat, text, text_lengths).squeeze(1)
            loss = criterion(logits, label)

            prob_nonzero = 1 - F.softmax(logits, dim=1)[:, 0]
            is_nonzero = (label != 0).float()
            acc = binary_accuracy(prob_nonzero, is_nonzero)

            preds.extend(prob_nonzero.tolist())
            labels.extend(is_nonzero.tolist())

            batch_size = len(text)
            loss_total += loss.item() * batch_size
            acc_total += acc.item() * batch_size

    auc = roc_auc_score(labels, preds)

    n_samples = len(iterator.dataset)
    return loss_total / n_samples, acc_total / n_samples, auc

In [27]:
def train_early_stop(model, train_iterator, val_iterator, optimizer, criterion):
    """Train for one pass, validating and checkpointing every 250 mini-batches.

    Before mini-batch 0, 250, 500, ... the model is scored on ``val_iterator``
    and its weights are written to ``fine_gru_<auc>.torch``, so the best
    checkpoint can be picked afterwards by file name.

    NOTE: despite the name, training is never interrupted; every batch of
    ``train_iterator`` is consumed.

    Returns:
        The highest validation AUC observed during the pass (0 if
        ``train_iterator`` is empty).
    """
    best_auc = 0

    for i, (cat, text, label, text_lengths) in enumerate(train_iterator):

        if i % 250 == 0:
            _, _, valid_auc = evaluate(model, val_iterator, criterion)
            best_auc = max(best_auc, valid_auc)

            T.save(model.state_dict(), f'fine_gru_{valid_auc}.torch')
            print(f'    mini-batch {i} val auc= {valid_auc}')

            # evaluate() leaves the model in eval mode; switch back once here
            # (this branch always runs for i == 0) instead of on every batch.
            model.train()

        optimizer.zero_grad()

        cat = cat.to(device)
        text = text.to(device)
        label = label.to(device)

        predictions = model(cat, text, text_lengths)
        loss = criterion(predictions, label)

        # The per-batch loss/accuracy bookkeeping was removed: its totals were
        # never returned or printed, and each .item() call forced a device sync.
        loss.backward()

        optimizer.step()

    return best_auc

In [28]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as ints, each truncated toward zero.
    """
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    leftover_seconds = int(elapsed - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [29]:
# Main training run at the optimizer's initial learning rate.
N_EPOCHS = 3

for epoch in range(N_EPOCHS):

    print(f'Epoch: {epoch+1}')

    # The timer covers one full train + validate cycle.
    start_time = time.time()
    train_loss, train_acc = train(
        model=model, iterator=train_loader, optimizer=optimizer, criterion=criterion)
    valid_loss, valid_acc, valid_auc = evaluate(
        model=model, iterator=val_loader, criterion=criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time=start_time, end_time=end_time)

    print(f'Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'    Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'     Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% | Val. AUC: {valid_auc:.4f}')
    print()

Epoch: 1
    mini-batch 0
    mini-batch 500
    mini-batch 1000
    mini-batch 1500
Epoch Time: 6m 43s
    Train Loss: 0.547 | Train Acc: 89.25%
     Val. Loss: 0.443 |  Val. Acc: 90.96% | Val. AUC: 0.9583

Epoch: 2
    mini-batch 0
    mini-batch 500
    mini-batch 1000
    mini-batch 1500
Epoch Time: 6m 43s
    Train Loss: 0.388 | Train Acc: 92.64%
     Val. Loss: 0.382 |  Val. Acc: 93.42% | Val. AUC: 0.9685

Epoch: 3
    mini-batch 0
    mini-batch 500
    mini-batch 1000
    mini-batch 1500
Epoch Time: 6m 48s
    Train Loss: 0.334 | Train Acc: 93.50%
     Val. Loss: 0.365 |  Val. Acc: 92.13% | Val. AUC: 0.9718



In [31]:
# Lower the learning rate on the existing optimizer (its state is kept) for
# the fine-tuning epoch that follows.
FINE_TUNE_LR = 0.0001

for param_group in optimizer.param_groups:
    param_group.update(lr=FINE_TUNE_LR)

In [32]:
# One more epoch with the same train/evaluate cycle, now at the reduced
# learning rate set in the previous cell.
N_EPOCHS = 1

for epoch in range(N_EPOCHS):

    print(f'Epoch: {epoch+1}')

    start_time = time.time()
    train_loss, train_acc = train(
        model=model, iterator=train_loader, optimizer=optimizer, criterion=criterion)
    valid_loss, valid_acc, valid_auc = evaluate(
        model=model, iterator=val_loader, criterion=criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time=start_time, end_time=end_time)

    print(f'Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'    Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'     Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% | Val. AUC: {valid_auc:.4f}')
    print()

Epoch: 1
    mini-batch 0
    mini-batch 500
    mini-batch 1000
    mini-batch 1500
Epoch Time: 6m 42s
    Train Loss: 0.228 | Train Acc: 95.31%
     Val. Loss: 0.330 |  Val. Acc: 94.06% | Val. AUC: 0.9767



### Evaluate

In [43]:
# This cell used `classification_report`, `labels` and `preds` without any
# visible cell defining them, so it failed on a fresh kernel.
# NOTE(review): reconstructed as arg-max class predictions over the validation
# loader, which is presumably what produced the per-class report -- confirm.
from sklearn.metrics import classification_report

model.eval()
labels, preds = [], []
with T.no_grad():
    for cat, text, label, text_lengths in val_loader:
        logits = model(cat.to(device), text.to(device), text_lengths)
        preds += logits.argmax(dim=1).tolist()
        labels += label.tolist()

print(classification_report(labels, preds))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96     41471
           1       0.82      0.86      0.84       170
           2       0.86      0.90      0.88      2740
           3       0.80      0.83      0.82      2143
           4       0.74      0.61      0.67       137
           5       0.67      0.71      0.69      2158
           6       0.97      0.96      0.97      2702
           7       0.78      0.83      0.80       267
           8       0.73      0.80      0.76      2249

    accuracy                           0.93     54037
   macro avg       0.81      0.83      0.82     54037
weighted avg       0.93      0.93      0.93     54037



### Saving The Model

In [30]:
# Persist model and optimizer state under the same "9767" tag (validation AUC
# 0.9767). The optimizer file was previously written as 'optim_gru_9763.torch',
# which the loading cell ('optim_gru_9767.torch') could never find.
T.save(model.state_dict(), 'gru_9767.torch')
T.save(optimizer.state_dict(), 'optim_gru_9767.torch')

In [31]:
# Persist the token -> id mapping so inference can reuse the exact ids the
# embedding table was trained with.
with open('vocab_stoi_gru_9767.pkl', 'wb') as stoi_file:
    pickle.dump(vocab.stoi, stoi_file)

In [32]:
# Persist the id -> token mapping alongside the token -> id one.
with open('vocab_itos_gru_9767.pkl', 'wb') as itos_file:
    pickle.dump(vocab.itos, itos_file)

In [33]:
# Record the two data-derived model dimensions; the model-loading cell further
# down hard-codes CAT_DIM and DICT_DIM and must agree with the values printed here.
print(CAT_DIM, DICT_DIM)

1340 10652


### Loading Test Data & The Model

In [36]:
from scipy.sparse import load_npz
import pickle
import hazm
import numpy as np

In [37]:
# Load the test texts (pickled list) and the sparse categorical test matrix.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts produced by a trusted pipeline.
with open('test_texts.pkl', 'rb') as texts_file:
    test_texts = pickle.load(texts_file)

test_cat = load_npz("test_cat.npz")

In [38]:
# Densify the sparse test matrix so its rows can be passed to torch.tensor.
# Guarded so that re-running this cell is a no-op instead of raising
# AttributeError on an already-dense ndarray.
if hasattr(test_cat, 'toarray'):
    test_cat = test_cat.toarray()

In [39]:
# Restore both directions of the vocabulary mapping saved after training.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts produced by a trusted pipeline.
with open('vocab_stoi_gru_9767.pkl', 'rb') as stoi_file:
    vocab_stoi = pickle.load(stoi_file)

with open('vocab_itos_gru_9767.pkl', 'rb') as itos_file:
    vocab_itos = pickle.load(itos_file)

In [40]:
# Build a fresh Vocabulary (for its tokenizer) and replace its mappings with
# the ones loaded from disk; build_vocabulary is deliberately not called.
vocab = Vocabulary(min_df=25)
vocab.stoi, vocab.itos = vocab_stoi, vocab_itos

In [41]:
# No labels are passed for the test set; zeros act as placeholders so the same
# Dataset / Collate code can be reused. They are created as int64 because
# Collate builds an int64 label tensor from them, and float64 zeros would need
# an implicit float -> int conversion there. shuffle=False keeps the
# predictions in the same order as test_texts.
test_loader = get_loader(
    test_cat,
    test_texts,
    np.zeros(len(test_texts), dtype=np.int64),
    vocab,
    batch_size=256,
    shuffle=False,
)

In [42]:
# Dimensions must match the checkpoint being loaded; these are the values
# printed after training (1340 categorical features, 10652 vocabulary ids).
CAT_DIM = 1340
DICT_DIM = 10652
EMBEDDING_DIM = 256
HIDDEN_DIM = 300

model = Classifier(CAT_DIM, DICT_DIM, EMBEDDING_DIM, HIDDEN_DIM)
# map_location='cpu' lets a checkpoint saved from a CUDA model load on a
# machine without a GPU; the model is moved to the target device afterwards.
model.load_state_dict(T.load('gru_9767.torch', map_location='cpu'))

<All keys matched successfully>

In [None]:
# Restores optimizer state; nothing in the prediction cells below uses it.
# NOTE(review): `optimizer` is not created in this section. It is whatever
# object is left over from the training cells, and that optimizer was built
# from the parameters of the model created there, not of the `model`
# re-created by Classifier(...) just above -- confirm before resuming training.
optimizer.load_state_dict(T.load('optim_gru_9767.torch'))

In [43]:
# Use the GPU when one is present, otherwise fall back to the CPU so that
# inference still runs on a machine without CUDA.
device = 'cuda' if T.cuda.is_available() else 'cpu'
model = model.to(device)

### Predicting

In [44]:
import pandas as pd

In [45]:
def predict(model, iterator):
    """Return the binary score ``1 - P(class 0)`` for every sample in ``iterator``.

    Puts the model in eval mode and runs without gradient tracking. The label
    element of each batch is ignored.

    Returns:
        A flat Python list of floats, in the order the iterator yields samples.
    """
    model.eval()

    scores = []

    with T.no_grad():
        for cat, text, _, text_lengths in iterator:
            logits = model(cat.to(device), text.to(device), text_lengths)
            class_probs = F.softmax(logits, dim=1)
            scores.extend((1 - class_probs[:, 0]).tolist())

    return scores

In [46]:
# Score the test set; the result is one float per test row, 1 - P(class 0).
preds = predict(model=model, iterator=test_loader)

  labels = T.tensor(labels, dtype=T.long)


In [47]:
# predict() returns 1 - P(class 0); subtracting from 1 again turns each score
# back into (approximately) P(class 0).
# NOTE: this cell is not idempotent -- running it a second time flips the
# scores back.
# NOTE(review): confirm which orientation the submission format expects.
preds = [1 - p for p in preds]

In [48]:
# Attach the scores to the post ids read from the validation parquet file.
# NOTE(review): assumes the parquet rows are in the same order as test_texts
# / test_cat -- confirm.
test_df = pd.read_parquet('DMC-phase2-validation.parquet', engine='fastparquet')
pred_df = pd.DataFrame({
    'post_id': test_df['post_id'],
    'predictions': preds,
})

In [49]:
# Preview the first five rows (head's default) as a sanity check before writing.
pred_df.head()

Unnamed: 0,post_id,predictions
0,69a9fbfc-b710-4c5f-8f2f-67a204b235ea,0.987511
1,cb7b6f33-de2b-4e73-bc78-686c9e04ca00,0.970797
2,396bccd3-29da-489c-b607-afeec8ab53bd,0.979923
3,4e11ee23-d3fe-4c07-a66d-25c96e507cb2,0.996737
4,d78d9ccb-249b-465b-9f5a-3c88851ffc10,0.984913


In [50]:
# Write the submission file without the DataFrame index column.
pred_df.to_csv(path_or_buf='pred.csv', index=False)