In [0]:
!wget https://raw.githubusercontent.com/HSE-CROSS-LING-DL/cma/master/data/trk-uncovered-train-transliterated.csv -O train.csv
!wget https://raw.githubusercontent.com/HSE-CROSS-LING-DL/cma/master/data/trk-uncovered-dev-transliterated.csv -O dev.csv
!wget https://raw.githubusercontent.com/HSE-CROSS-LING-DL/cma/master/data/trk-uncovered-test-transliterated.csv -O test.csv

In [0]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch

In [0]:
train = pd.read_csv('train.csv')
dev = pd.read_csv('dev.csv')
test = pd.read_csv('test.csv')

In [0]:
train = train.dropna()
train = train.reset_index(drop=True)
train.shape

(79228, 5)

In [0]:
dev = dev.dropna()
dev = dev.reset_index(drop=True)
dev.shape

(1245, 5)

In [0]:
test = test.dropna()
test = test.reset_index(drop=True)
test.shape

(12926, 5)

In [0]:
train.head(1)

Unnamed: 0,lang,word,lemma,pos,morph
0,tat,казак,казак,NOUN,Case=Nom


In [0]:
def agg_tags(series):
    return sorted(set(series.str.cat(sep='|').split('|')))

from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer

def preprocess_dataset(dataset_df):

    dataset_agg = dataset_df.groupby(["word", "pos"]).agg({"morph" : agg_tags, 
                                                    "word": "first", "pos": "first"})

    mlb = MultiLabelBinarizer()
    dataset_df_morph_mhe = pd.DataFrame(mlb.fit_transform(dataset_agg.morph),
                    columns=map(lambda x: "morph_mhe_"+x, mlb.classes_),
                    index=dataset_agg.index)

    oher = LabelBinarizer()

    dataset_df_pos_mhe = pd.DataFrame(oher.fit_transform(dataset_agg.pos),
                    columns=map(lambda x: "pos_mhe_"+x, oher.classes_),
                    index=dataset_agg.index)


    dataset_new = dataset_agg.join(dataset_df_morph_mhe).join(dataset_df_pos_mhe)
    
    return dataset_new

train = preprocess_dataset(train)
dev = preprocess_dataset(dev)
test = preprocess_dataset(test)
test["morph_mhe_Aspect=Imp"] = test["morph_mhe_Aspect=Impf"]
test.drop("morph_mhe_Aspect=Impf", 1, inplace=True)


In [0]:
train.shape

(49110, 64)

In [0]:
test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,morph,word,pos,morph_mhe_Aspect=Perf,morph_mhe_Case=Abl,morph_mhe_Case=Acc,morph_mhe_Case=Dat,morph_mhe_Case=Gen,morph_mhe_Case=Loc,morph_mhe_Case=Nom,morph_mhe_Degree=Comp,morph_mhe_Deriv=Coop,morph_mhe_Mood=Imp,morph_mhe_Mood=Opt,morph_mhe_Number=Plur,morph_mhe_Number=Sing,morph_mhe_Number[psor]=Plur,morph_mhe_Number[psor]=Sing,"morph_mhe_Number[psor]=Sing,Plur",morph_mhe_Person=1,morph_mhe_Person=2,morph_mhe_Person=3,morph_mhe_Person[psor]=1,morph_mhe_Person[psor]=2,morph_mhe_Person[psor]=3,morph_mhe_Polarity=Neg,morph_mhe_Tense=Aor,morph_mhe_Tense=Fut,morph_mhe_Tense=Past,morph_mhe_Tense=Pres,morph_mhe_Valency=1,morph_mhe_Valency=2,morph_mhe_VerbForm=Conv,morph_mhe_VerbForm=Fin,morph_mhe_VerbForm=Part,morph_mhe_VerbForm=Vnoun,morph_mhe_Voice=Pass,morph_mhe__,pos_mhe_ADJ,pos_mhe_ADV,pos_mhe_NOUN,pos_mhe_VERB,morph_mhe_Aspect=Imp
word,pos,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1
абаза,NOUN,[Case=Nom],абаза,NOUN,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
абазалыла,NOUN,"[Case=Nom, Number=Plur]",абазалыла,NOUN,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
абазалыланы,NOUN,"[Case=Acc, Case=Gen, Number=Plur]",абазалыланы,NOUN,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
аббревиатура,NOUN,[Case=Nom],аббревиатура,NOUN,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
абзац,NOUN,[Case=Nom],абзац,NOUN,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


Data Loader

In [0]:
import torch
from torch.utils.data import Dataset, DataLoader

In [0]:
char2index = {'PAD': 0}

for word in tqdm(train.word):
  for char in word:
    if char not in char2index:
      char2index[char] = len(char2index)

100%|██████████| 49110/49110 [00:00<00:00, 600216.42it/s]


In [0]:
# unique_morphotags = sorted({morphotag 
#                      for morphoanalysis in pd.concat([test.morph, train.morph]) 
#                      for morphotag in morphoanalysis.split('|')})

# idx2morphtag = {idx: morphtag for idx, morphtag in enumerate(unique_morphotags)}
# morphtag2idx = {morphtag: idx for idx, morphtag in enumerate(unique_morphotags)}


mhe_morph_colnames = [c for c in train.columns if c.startswith("morph_mhe_")]
mhe_pos_colnames = [c for c in train.columns if c.startswith("pos_mhe_")]

for col in set(mhe_morph_colnames).difference(set(test.columns)):
    test[col] = test.apply(lambda x: 0, axis=1)
for col in set(mhe_morph_colnames).difference(set(dev.columns)):
    dev[col] = dev.apply(lambda x: 0, axis=1)


class VectorizedData(Dataset):
    
    def __init__(self, df, char2index=char2index, sequence_length=12, pad_token = 'PAD', verbose=True):
        
        super().__init__()
        
        self.x_data = []
        self.y_data = []

        self.char2index = char2index

        self.sequence_length = sequence_length
        self.pad_token = pad_token
        self.pad_index = self.char2index[self.pad_token]
        # self.fasttext_model = fasttext_model

        self.load(df, verbose=verbose)

    # def vectorize(self, word, fasttext_model=ft_model):
    #   try:
    #     return self.fasttext_model[word]
    #   except KeyError:
    #     pass

    def preprocess(self, word):
      return [c for c in word]

    def indexing(self, chars):
      return [self.char2index[char] for char in chars if char in self.char2index]

    def load(self, data, verbose=True):
        for index, row in tqdm(data.iterrows(), desc='Loading data', disable=not verbose):
            # ft_x_vector = self.vectorize(row["word"], fasttext_model)
            # ft_x_vector = np.concatenate([ft_x_vector, row[mhe_pos_colnames].astype('float')])
            chars = self.preprocess(row["word"])
            indexed_chars = self.indexing(chars)

            y_vector = np.concatenate([[], row[mhe_morph_colnames].astype('float')])
            self.x_data.append(indexed_chars)
            self.y_data.append(y_vector)

    def padding(self, sequence):
        if len(sequence) > self.sequence_length:
          sequence = sequence[:self.sequence_length]
        elif len(sequence) < self.sequence_length:
          sequence = sequence + [self.pad_index] * (self.sequence_length - len(sequence))
        return sequence

    def __len__(self):
        return len(self.x_data)
    
    def __getitem__(self, idx):
        x = self.x_data[idx]
        x = self.padding(x)
        x = torch.Tensor(x).long()
        y = self.y_data[idx]
        
        return x, y

In [0]:
train_dataset = VectorizedData(train)

Loading data: 49110it [00:43, 1130.01it/s]


In [0]:
test_dataset = VectorizedData(test)

Loading data: 8966it [00:07, 1164.15it/s]


In [0]:
dev_dataset = VectorizedData(dev)

Loading data: 1025it [00:01, 1021.30it/s]


In [0]:
train_data_loader = DataLoader(train_dataset, batch_size=64)
test_data_loader = DataLoader(test_dataset, batch_size=64)
dev_data_loader = DataLoader(dev_dataset, batch_size=64)

Модель

In [0]:
from torch import nn
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight

In [0]:
class MorphoTagger(nn.Module):

  def __init__(self, embedding_dim=104, n_classes=len(mhe_morph_colnames), \
               vocab_size=len(char2index), hidden_dim=64, seq_len=12, \
               ngrams=[2,3,4], keep_proba=0.4):
    super().__init__()

    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_dim

    self.dropout = nn.Dropout(p=keep_proba)

    self.embedding_layer = nn.Embedding(num_embeddings=self.vocab_size, \
                                        embedding_dim=self.embedding_dim)
    
    self.lstm_layer = nn.LSTM(self.embedding_dim, self.hidden_dim, batch_first=True, bidirectional=True)

    self.convs = nn.ModuleList([nn.Conv1d(in_channels=self.hidden_dim*2, 
                                          out_channels=self.hidden_dim, 
                                          kernel_size=n) for n in ngrams])
    
    self.pooling = nn.ModuleList([nn.MaxPool1d(kernel_size=seq_len-n+1) for n in ngrams])

    # self.linear1 = nn.Linear(in_features=embedding_dim, out_features=64)
    self.linear = nn.Linear(in_features=len(ngrams) * self.hidden_dim, out_features=n_classes)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x = self.embedding_layer(x)
    lstm_x, mem = self.lstm_layer(x)

    x_transposed = lstm_x.transpose(1, 2)

    conved = [conv(x_transposed) for conv in self.convs]
    pooled = [pool(conv).squeeze(-1) for pool, conv in zip(self.pooling, conved)]
    cat = self.dropout(torch.cat(pooled, 1))
    out = self.linear(cat)

    return self.sigmoid(out)

In [0]:
model = MorphoTagger()


criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters())

In [0]:
from sklearn.metrics import label_ranking_loss


epochs = 30
losses = []
best_test_loss = 10.

acc, macros, prec, rec = [], [], [], []
label_ranking_losses = []
for n_epoch in range(epochs):
    
    train_losses = []
    test_losses = []
    test_preds = []
    test_targets = []
    test_pred_class = []
    
    progress_bar = tqdm(total=len(train_data_loader.dataset), desc='Epoch {}'.format(n_epoch + 1))
    
    tmp_idx = 0
    for x, y in train_data_loader:
        optimizer.zero_grad()
        
        pred = model(x)
        y = y.float()
        
        loss = criterion(pred, y)
        
        loss.backward()
        
        optimizer.step()
        
        train_losses.append(loss.item())
        losses.append(loss.item())
        
        progress_bar.set_postfix(train_loss = np.mean(losses[-500:]))

        progress_bar.update(x.shape[0])
        
    progress_bar.close()
    
    for x, y in test_data_loader:
      with torch.no_grad():
            
        pred = model(x) # .float())

        test_preds.append(pred.numpy())
        test_targets.append(y.float().numpy())


        test_pred_class.append(np.argmax(pred, axis=1))

        loss = criterion(pred, y.float())

        test_losses.append(loss.item())
        
    mean_test_loss = np.mean(test_losses)

    test_targets = np.concatenate(test_targets).squeeze()
    test_pred_class = np.concatenate(test_preds).squeeze()
    
    # accuracy = accuracy_score(test_targets, test_pred_class)
    # precision = precision_score(test_targets, test_pred_class, average='macro')
    # recall = recall_score(test_targets, test_pred_class, average='macro')
    # f1 = f1_score(test_targets, test_pred_class, average='macro')

    # acc.append(accuracy)
    # macros.append(f1)
    # prec.append(precision)
    # rec.append(recall)
    label_ranking_losses.append(label_ranking_loss(test_targets, test_pred_class))

    print(label_ranking_losses[-1])

    if mean_test_loss < best_test_loss:
        best_test_loss = mean_test_loss
    else:
        print('Early stopping')
        break
    # print('Losses: train - {:.3f}, test - {:.3f}'.format(np.mean(train_losses), mean_test_loss))

    # print('Test: accuracy - {:.3f}, precision - {:.3f}, recall - {:.3f}, f1 macro - {:.3f}'.format(accuracy, precision, recall, f1))

Epoch 1: 100%|██████████| 49110/49110 [00:26<00:00, 1847.89it/s, train_loss=0.094]
Epoch 2:   1%|          | 256/49110 [00:00<00:26, 1844.09it/s, train_loss=0.0939]

0.07357421026940943


Epoch 2: 100%|██████████| 49110/49110 [00:26<00:00, 1862.80it/s, train_loss=0.0723]
Epoch 3:   1%|          | 256/49110 [00:00<00:26, 1851.69it/s, train_loss=0.0726]

0.07264052021242942


Epoch 3: 100%|██████████| 49110/49110 [00:26<00:00, 1874.69it/s, train_loss=0.0646]
Epoch 4:   1%|          | 256/49110 [00:00<00:28, 1697.24it/s, train_loss=0.0648]

0.07072471846551878


Epoch 4: 100%|██████████| 49110/49110 [00:26<00:00, 1853.96it/s, train_loss=0.06]


0.07245349041587791
Early stopping


In [0]:
predictions, targets = [], []
for x_batch, y_batch in test_data_loader:
    with torch.no_grad():
        pred_batch = model(x_batch)
    predictions.extend(pred_batch)
    targets.extend(y_batch.float())

In [0]:
# !cp /content/drive/My\ Drive/thresholds.npy .
class_thersholds = [0.2]*57

def score_model(preds, targets, thresholds):
    precisions, recalls, fscores = [], [], []
    precisions_sum, recalls_sum, fscores_sum = 0, 0, 0
    for idx, (pred, target) in enumerate(zip(preds, targets)):
        if idx % 100 == 0:
            print(idx, len(targets))
        tp, fp, tn, fn = [], [], [], []
        tp, fp, tn, fn = [], [], [], []
        for class_idx in range(len(pred)):
            class_is_predicted = pred[class_idx] > thresholds[class_idx]
            class_is_required = bool(target[class_idx])
            if class_is_predicted:
                if class_is_required:
                    tp.append(class_idx)
                else:
                    fp.append(class_idx)
            else:
                if not class_is_required:
                    tn.append(class_idx)
                else:
                    fn.append(class_idx)
        precision = len(tp) / (len(tp) + len(fp)) if any((tp, fp)) else 0
        recall = len(tp) / (len(tp) + len(fn)) if any((tp, fn)) else 0
        fscore = 2 * precision* recall / (precision + recall) if any((precision, recall)) else 0

        precisions.append(precision)
        precisions_sum += precision
        recalls.append(recall)
        recalls_sum += recall
        fscores.append(fscores)
        fscores_sum += fscore


    return precisions_sum/(idx+1), recalls_sum/(idx+1), fscores_sum/(idx+1)

msd_res = score_model(predictions, targets, class_thersholds)

0 8966
100 8966
200 8966
300 8966
400 8966
500 8966
600 8966
700 8966
800 8966
900 8966
1000 8966
1100 8966
1200 8966
1300 8966
1400 8966
1500 8966
1600 8966
1700 8966
1800 8966
1900 8966
2000 8966
2100 8966
2200 8966
2300 8966
2400 8966
2500 8966
2600 8966
2700 8966
2800 8966
2900 8966
3000 8966
3100 8966
3200 8966
3300 8966
3400 8966
3500 8966
3600 8966
3700 8966
3800 8966
3900 8966
4000 8966
4100 8966
4200 8966
4300 8966
4400 8966
4500 8966
4600 8966
4700 8966
4800 8966
4900 8966
5000 8966
5100 8966
5200 8966
5300 8966
5400 8966
5500 8966
5600 8966
5700 8966
5800 8966
5900 8966
6000 8966
6100 8966
6200 8966
6300 8966
6400 8966
6500 8966
6600 8966
6700 8966
6800 8966
6900 8966
7000 8966
7100 8966
7200 8966
7300 8966
7400 8966
7500 8966
7600 8966
7700 8966
7800 8966
7900 8966
8000 8966
8100 8966
8200 8966
8300 8966
8400 8966
8500 8966
8600 8966
8700 8966
8800 8966
8900 8966


In [0]:
msd_res

(0.5155925740864236, 0.6507549242464463, 0.5369942173729173)

In [0]:
np.save("/content/drive/My Drive/msd_res.npy", msd_res)

In [0]:
msd_res