In [1]:
!wget https://raw.githubusercontent.com/HSE-CROSS-LING-DL/cma/master/data/trk-uncovered-train-transliterated.csv -O train.csv
!wget https://raw.githubusercontent.com/HSE-CROSS-LING-DL/cma/master/data/trk-uncovered-dev-transliterated.csv -O test.csv

--2019-12-12 14:49:58--  https://raw.githubusercontent.com/HSE-CROSS-LING-DL/cma/master/data/trk-uncovered-train-transliterated.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5650505 (5.4M) [text/plain]
Saving to: ‘train.csv’


2019-12-12 14:49:58 (46.0 MB/s) - ‘train.csv’ saved [5650505/5650505]

--2019-12-12 14:50:00--  https://raw.githubusercontent.com/HSE-CROSS-LING-DL/cma/master/data/trk-uncovered-dev-transliterated.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 92536 (90K) [text/plain]
Saving to: ‘test.csv’


2019-12-12 14:5

In [2]:
# Mount Google Drive under /content/drive (requires the Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
from tqdm import tqdm

In [0]:
# Read the downloaded train split into `data` and the dev split into `test`.
data, test = map(pd.read_csv, ('train.csv', 'test.csv'))

In [5]:
# (rows, columns) of the training frame as read from train.csv.
data.shape

(79228, 5)

In [6]:
# Remove rows with missing values, renumber the index from 0, then show the resulting shape.
data = data.dropna().reset_index(drop=True)
data.shape

(79228, 5)

In [7]:
# Remove rows with missing values, renumber the index from 0, then show the resulting shape.
test = test.dropna().reset_index(drop=True)
test.shape

(1245, 5)

In [8]:
# Share of each POS tag in the training data, in percent.
data['pos'].value_counts().mul(100).div(data.shape[0])

NOUN    53.789064
VERB    40.141112
ADJ      5.231736
ADV      0.838088
Name: pos, dtype: float64

In [9]:
# Preview the first rows of the training frame.
data.head()

Unnamed: 0,lang,word,lemma,pos,morph
0,tat,казак,казак,NOUN,Case=Nom
1,tur,мuамеле,мuамеле,NOUN,Case=Nom
2,tat,заманы,заман,NOUN,"Case=Nom|Number[psor]=Sing,Plur|Person[psor]=3"
3,tur,конuт,конuт,NOUN,Case=Nom
4,tur,корuмасы,корu,VERB,"Case=Nom|Number[psor]=Sing,Plur|Person[psor]=3..."


In [0]:
import gensim

# Load the pretrained fastText binary stored on the mounted Drive.
# NOTE(review): the path is relative, so this presumably relies on the working
# directory being /content (where Drive is mounted) — confirm.
# NOTE(review): `load_fasttext_format` is deprecated in later gensim releases in
# favour of `gensim.models.fasttext.load_facebook_model` — check the installed version.
ft_model = gensim.models.FastText.load_fasttext_format('drive/My Drive/fasttext_multilingual/multilingual.bin')

POS-теги:

In [0]:
# Give every POS tag an integer id, numbered in order of first appearance in the training data.
pos_mapper = {pos: n for n, pos in enumerate(data.pos.unique())}

# Fail early on tags the mapper does not know: `.map` would silently turn them into NaN,
# which makes the column float and breaks the integer class targets used for training.
unseen_tags = sorted(set(test.pos.unique()) - set(pos_mapper), key=str)
if unseen_tags:
    raise ValueError('test set contains POS tags missing from the training set: {}'.format(unseen_tags))

data['pos_num'] = data.pos.map(pos_mapper)
test['pos_num'] = test.pos.map(pos_mapper)

In [12]:
# Preview the training frame, including its pos_num column.
data.head()

Unnamed: 0,lang,word,lemma,pos,morph,pos_num
0,tat,казак,казак,NOUN,Case=Nom,0
1,tur,мuамеле,мuамеле,NOUN,Case=Nom,0
2,tat,заманы,заман,NOUN,"Case=Nom|Number[psor]=Sing,Plur|Person[psor]=3",0
3,tur,конuт,конuт,NOUN,Case=Nom,0
4,tur,корuмасы,корu,VERB,"Case=Nom|Number[psor]=Sing,Plur|Person[psor]=3...",1


In [13]:
# Preview the test frame, including its pos_num column.
test.head()

Unnamed: 0,lang,word,lemma,pos,morph,pos_num
0,crh,егленcелер,егленcе,NOUN,Case=Nom|Number=Plur,0
1,crh,сüргüнликниң,сüргüнлик,NOUN,Case=Gen,0
2,crh,монархийада,монархийа,NOUN,Case=Loc,0
3,crh,кöстергенлери,кöстер,VERB,"Case=Nom|Number=Plur|Number[psor]=Sing,Plur|Pe...",1
4,crh,öксüз,öксüз,NOUN,Case=Nom,0


Data Loader

In [0]:
import torch
from torch.utils.data import Dataset, DataLoader

In [0]:
class VectorizedData(Dataset):
    """Map-style dataset pairing a fastText vector with a label for each word.

    All words are vectorized once, at construction time, and kept in memory.

    Args:
        x_data: iterable of words to vectorize.
        y_data: labels aligned position-by-position with ``x_data``.
        fasttext_model: embedding source. Either a model exposing its vectors
            as ``.wv`` or any object that is indexable by word and has a
            ``vector_size`` attribute. When ``None`` (the default) the
            notebook-level ``ft_model`` is looked up at call time.
        verbose: show a progress bar while vectorizing.
    """

    def __init__(self, x_data, y_data, fasttext_model=None, verbose=True):
        import numpy as np

        super().__init__()

        if fasttext_model is None:
            # Resolved lazily so the class can be defined before the model is loaded.
            fasttext_model = ft_model

        self.x_data = []
        # Stored as an array so __getitem__ indexes by position; a pandas Series
        # would be indexed by label, which only matches after reset_index.
        self.y_data = np.asarray(y_data)
        self.fasttext_model = fasttext_model

        self.load(x_data, verbose=verbose)

    def vectorize(self, word, fasttext_model=None):
        """Return the embedding of ``word``.

        Args:
            word: the word to look up.
            fasttext_model: optional override of the model given to the
                constructor.

        Returns:
            The vector returned by the model, or a float32 zero vector of
            length ``vector_size`` when the lookup raises ``KeyError``.
        """
        import numpy as np

        model = self.fasttext_model if fasttext_model is None else fasttext_model
        # Indexing the model object directly is the deprecated lookup path;
        # go through the keyed vectors (`.wv`) whenever the object has them.
        vectors = getattr(model, 'wv', model)
        try:
            return vectors[word]
        except KeyError:
            # Returning None here would make batching fail later, so fall back
            # to a zero vector of the right size instead.
            return np.zeros(vectors.vector_size, dtype=np.float32)

    def load(self, data, fasttext_model=None, verbose=True):
        """Vectorize every word in ``data`` and append the result to ``self.x_data``.

        Args:
            data: iterable of words.
            fasttext_model: optional override forwarded to ``vectorize``.
            verbose: show a progress bar while vectorizing.
        """
        data_iterator = tqdm(data, desc='Loading data', disable=not verbose)

        for word in data_iterator:
            self.x_data.append(self.vectorize(word, fasttext_model))

    def __len__(self):
        """Number of vectorized words."""
        return len(self.x_data)

    def __getitem__(self, idx):
        """Return the ``(vector, label)`` pair stored at position ``idx``."""
        return self.x_data[idx], self.y_data[idx]

In [16]:
# Vectorize the training words; labels are the integer POS ids.
train_dataset = VectorizedData(x_data=data['word'], y_data=data['pos_num'])

  from ipykernel import kernelapp as app
Loading data: 100%|██████████| 79228/79228 [00:00<00:00, 82525.56it/s]


In [17]:
# Vectorize the test words; labels are the integer POS ids.
test_dataset = VectorizedData(x_data=test['word'], y_data=test['pos_num'])

  from ipykernel import kernelapp as app
Loading data: 100%|██████████| 1245/1245 [00:00<00:00, 14307.77it/s]


In [0]:
# Reshuffle the training samples every epoch so the optimizer does not see the
# same batches in the same file order each time; evaluation order stays fixed.
train_data_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=64)

Модель

In [0]:
from torch import nn
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

In [0]:
class POSTagger(nn.Module):
    """Single linear layer mapping an embedding to one score per class.

    Args:
        embedding_dim: size of the input vectors.
        n_classes: number of output scores.
    """

    def __init__(self, embedding_dim=100, n_classes=4):
        super().__init__()
        self.linear = nn.Linear(embedding_dim, n_classes)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the raw scores."""
        scores = self.linear(x)
        return scores

In [0]:
# Tagger with its default sizes, cross-entropy loss, and Adam with default settings.
model = POSTagger()

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [22]:
epochs = 15
losses = []  # every training batch loss across all epochs; feeds the running mean in the progress bar

# Per-epoch test metrics: accuracy, macro F1, macro precision, macro recall.
acc, macros, prec, rec = [], [], [], []
for n_epoch in range(epochs):

    train_losses = []
    test_losses = []
    test_preds = []
    test_targets = []
    test_pred_class = []

    # ---- training pass ----
    model.train()
    # The context manager closes the bar even if a batch raises.
    with tqdm(total=len(train_data_loader.dataset), desc='Epoch {}'.format(n_epoch + 1)) as progress_bar:
        for x, y in train_data_loader:
            optimizer.zero_grad()

            pred = model(x)
            loss = criterion(pred, y)

            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())
            losses.append(loss.item())

            # Show the mean of the most recent 500 batch losses.
            progress_bar.set_postfix(train_loss=np.mean(losses[-500:]))
            progress_bar.update(x.shape[0])

    # ---- evaluation pass ----
    model.eval()
    with torch.no_grad():
        for x, y in test_data_loader:
            pred = model(x)

            test_preds.append(pred.numpy())
            test_targets.append(y.numpy())
            # Take the arg-max on the tensor itself instead of handing a
            # torch.Tensor to numpy and relying on implicit conversion.
            test_pred_class.append(pred.argmax(dim=1).numpy())

            loss = criterion(pred, y)
            test_losses.append(loss.item())

    mean_test_loss = np.mean(test_losses)

    # Batches are 1-D already; no squeeze, which would collapse a single-sample
    # result to a 0-d array.
    test_targets = np.concatenate(test_targets)
    test_pred_class = np.concatenate(test_pred_class)

    accuracy = accuracy_score(test_targets, test_pred_class)
    precision = precision_score(test_targets, test_pred_class, average='macro')
    recall = recall_score(test_targets, test_pred_class, average='macro')
    f1 = f1_score(test_targets, test_pred_class, average='macro')

    acc.append(accuracy)
    macros.append(f1)
    prec.append(precision)
    rec.append(recall)

    print()
    print('Losses: train - {:.3f}, test - {:.3f}'.format(np.mean(train_losses), mean_test_loss))

    print('Test: accuracy - {:.3f}, precision - {:.3f}, recall - {:.3f}, f1 macro - {:.3f}'.format(accuracy, precision, recall, f1))

Epoch 1: 100%|██████████| 79228/79228 [00:07<00:00, 10973.48it/s, train_loss=0.975]
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
Epoch 2:   3%|▎         | 2048/79228 [00:00<00:07, 10540.33it/s, train_loss=0.97]


Losses: train - 1.076, test - 0.921
Test: accuracy - 0.607, precision - 0.258, recall - 0.256, f1 macro - 0.216


Epoch 2: 100%|██████████| 79228/79228 [00:07<00:00, 11208.62it/s, train_loss=0.901]
Epoch 3:   3%|▎         | 2240/79228 [00:00<00:06, 11576.67it/s, train_loss=0.9]  


Losses: train - 0.913, test - 0.865
Test: accuracy - 0.607, precision - 0.259, recall - 0.257, f1 macro - 0.221


Epoch 3: 100%|██████████| 79228/79228 [00:06<00:00, 11392.15it/s, train_loss=0.885]
Epoch 4:   3%|▎         | 2112/79228 [00:00<00:06, 11663.34it/s, train_loss=0.885]


Losses: train - 0.887, test - 0.852
Test: accuracy - 0.598, precision - 0.238, recall - 0.251, f1 macro - 0.211


Epoch 4: 100%|██████████| 79228/79228 [00:06<00:00, 11480.71it/s, train_loss=0.879]
Epoch 5:   3%|▎         | 2304/79228 [00:00<00:06, 12194.35it/s, train_loss=0.878]


Losses: train - 0.879, test - 0.847
Test: accuracy - 0.596, precision - 0.237, recall - 0.251, f1 macro - 0.212


Epoch 5: 100%|██████████| 79228/79228 [00:07<00:00, 11026.87it/s, train_loss=0.876]
Epoch 6:   3%|▎         | 2176/79228 [00:00<00:06, 11292.45it/s, train_loss=0.876]


Losses: train - 0.875, test - 0.845
Test: accuracy - 0.594, precision - 0.233, recall - 0.250, f1 macro - 0.211


Epoch 6: 100%|██████████| 79228/79228 [00:06<00:00, 11521.50it/s, train_loss=0.875]
Epoch 7:   3%|▎         | 2112/79228 [00:00<00:06, 11109.03it/s, train_loss=0.875]


Losses: train - 0.873, test - 0.844
Test: accuracy - 0.597, precision - 0.239, recall - 0.252, f1 macro - 0.214


Epoch 7: 100%|██████████| 79228/79228 [00:06<00:00, 11623.77it/s, train_loss=0.874]
Epoch 8:   3%|▎         | 2368/79228 [00:00<00:06, 12303.54it/s, train_loss=0.873]


Losses: train - 0.872, test - 0.844
Test: accuracy - 0.600, precision - 0.249, recall - 0.255, f1 macro - 0.220


Epoch 8: 100%|██████████| 79228/79228 [00:06<00:00, 11907.91it/s, train_loss=0.873]
Epoch 9:   3%|▎         | 2368/79228 [00:00<00:05, 13314.06it/s, train_loss=0.872]


Losses: train - 0.871, test - 0.843
Test: accuracy - 0.606, precision - 0.508, recall - 0.263, f1 macro - 0.235


Epoch 9: 100%|██████████| 79228/79228 [00:06<00:00, 11325.10it/s, train_loss=0.872]
Epoch 10:   3%|▎         | 2176/79228 [00:00<00:06, 11094.72it/s, train_loss=0.872]


Losses: train - 0.871, test - 0.843
Test: accuracy - 0.609, precision - 0.514, recall - 0.265, f1 macro - 0.239


Epoch 10: 100%|██████████| 79228/79228 [00:07<00:00, 11121.65it/s, train_loss=0.872]
Epoch 11:   3%|▎         | 2240/79228 [00:00<00:06, 11266.34it/s, train_loss=0.872]


Losses: train - 0.870, test - 0.843
Test: accuracy - 0.608, precision - 0.513, recall - 0.265, f1 macro - 0.240


Epoch 11: 100%|██████████| 79228/79228 [00:06<00:00, 11418.72it/s, train_loss=0.871]
Epoch 12:   3%|▎         | 2368/79228 [00:00<00:06, 12716.79it/s, train_loss=0.871]


Losses: train - 0.870, test - 0.843
Test: accuracy - 0.610, precision - 0.514, recall - 0.266, f1 macro - 0.240


Epoch 12: 100%|██████████| 79228/79228 [00:06<00:00, 11493.99it/s, train_loss=0.871]
Epoch 13:   3%|▎         | 2240/79228 [00:00<00:06, 11740.53it/s, train_loss=0.871]


Losses: train - 0.870, test - 0.843
Test: accuracy - 0.606, precision - 0.509, recall - 0.265, f1 macro - 0.239


Epoch 13: 100%|██████████| 79228/79228 [00:07<00:00, 11159.03it/s, train_loss=0.871]
Epoch 14:   3%|▎         | 2176/79228 [00:00<00:06, 11715.61it/s, train_loss=0.871]


Losses: train - 0.869, test - 0.843
Test: accuracy - 0.606, precision - 0.508, recall - 0.264, f1 macro - 0.239


Epoch 14: 100%|██████████| 79228/79228 [00:07<00:00, 11208.72it/s, train_loss=0.871]
Epoch 15:   3%|▎         | 2368/79228 [00:00<00:06, 12040.36it/s, train_loss=0.87]


Losses: train - 0.869, test - 0.843
Test: accuracy - 0.604, precision - 0.505, recall - 0.264, f1 macro - 0.239


Epoch 15: 100%|██████████| 79228/79228 [00:06<00:00, 11647.40it/s, train_loss=0.87]



Losses: train - 0.869, test - 0.843
Test: accuracy - 0.603, precision - 0.504, recall - 0.263, f1 macro - 0.238
