In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/silero-stress-predictor/sample_submission.csv
/kaggle/input/silero-stress-predictor/train.csv
/kaggle/input/silero-stress-predictor/test.csv


# Research part

Importing libraries

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

# import multiprocessing
# from gensim.models import Word2Vec

import warnings
warnings.filterwarnings("ignore")

Reading the data

In [3]:
df = pd.read_csv('/kaggle/input/silero-stress-predictor/train.csv')
df.head()

Unnamed: 0,id,word,stress,num_syllables,lemma
0,0,румяной,2,3,румяный
1,1,цифрами,1,3,цифра
2,2,слугами,1,3,слуга
3,3,выбирает,3,4,выбирать
4,4,управдом,3,3,управдом


A function for feature extracting

Setting constants

In [4]:
MAX_SYLLABUSES_COUNT = 6
N_CLASSES = 6

Function for naive syllabus extraction. The main thing is that each syllabus is based on single vowel. Also, it adds padding syllabuses based on constants above

In [5]:
def simple_syllabus_extractor(x):
    vowels = ["ё", "у", "е", "ы", "о", "э", "я", "и", "ю", "а"]
    
    syllabuses = [''] * MAX_SYLLABUSES_COUNT
    
    k = 0
    sub_syl = ''
    for e in x:
        sub_syl += e
        if e in vowels:
            syllabuses[k] = sub_syl
            sub_syl = ''
            k += 1
    
    if sub_syl:
        syllabuses[k - 1] += sub_syl
            
    return syllabuses

An example of syllabus extraction

In [6]:
simple_syllabus_extractor('румяной')

['ру', 'мя', 'ной', '', '', '']

Setting dictionary that maps token to its index

In [7]:
d_ind = 2
d = {'<UNKN>': 0, '': 1}

Function for numericalizing a word: it just split the word into naive syllabuses and the use the dictionary above to convert the syllabuses to corresponding indexes

In [8]:
def numericalize_word(x, fill_dict=False):
    """
    fill_dict: If True then global dict will be updated using x, otherwise 
    """
    global d_ind
    
    syllabuses = simple_syllabus_extractor(x)
    
    syllabuses_repr = np.zeros(shape=6, dtype=int)
        
    for i, syllabus in enumerate(syllabuses):
        if syllabus in d:
            syllabuses_repr[i] = d[syllabus]
        else:
            if fill_dict:
                d[syllabus] = d_ind
                syllabuses_repr[i] = d_ind
                d_ind += 1
            else:
                syllabuses_repr[i] = d['<UNKN>']
    
    return syllabuses_repr

Full feature set

In [9]:
X = df['word']
X[:5]

0     румяной
1     цифрами
2     слугами
3    выбирает
4    управдом
Name: word, dtype: object

Full target set

In [10]:
y = (df['stress'] - 1).to_numpy()

Function for splitting data into train, test and validation part

In [11]:
def train_val_test_split(x, y, train_size=0.7, val_size=0.1):
    #test = 1 - train - val
    x_train, x_, y_train, y_ = train_test_split(x, y, train_size=train_size, stratify=y, shuffle=True)
    x_val, x_test, y_val, y_test = train_test_split(x_, y_, train_size=val_size/(1-train_size), stratify=y_, shuffle=True)

    return x_train, y_train, x_val, y_val, x_test, y_test

Splitting full data

In [12]:
X_train_, y_train, X_val_, y_val, X_test_, y_test = train_val_test_split(X, y)

Function for numericalizing feature sets

In [13]:
def apply_boosted(X, fill_dict):
    res = []
    for i in X.index:
        res.append(numericalize_word(X.at[i], fill_dict))
    return res

Numericalizing feature sets. Remember that we need to create dictionary only based on training data

In [14]:
X_train = apply_boosted(X_train_, True)
X_val = apply_boosted(X_val_, False)
X_test = apply_boosted(X_test_, False)

In [15]:
X_train[:5], X_test[:5]

([array([2, 3, 4, 5, 6, 1]),
  array([2, 7, 8, 1, 1, 1]),
  array([ 9, 10,  1,  1,  1,  1]),
  array([11, 12, 13,  1,  1,  1]),
  array([ 2, 14, 15,  1,  1,  1])],
 [array([1028,  255,    5,    1,    1,    1]),
  array([  9, 362,  26,  90, 408,  43]),
  array([ 111, 1923,   93,    6,    1,    1]),
  array([  11, 2667, 1023,    1,    1,    1]),
  array([ 57,  50, 226, 579,  94,   1])])

Defining the size of the dictionary to use it in the model

In [16]:
LEN_D = len(d) + 1

Setting device

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Creating a class for dataset

In [18]:
class StressDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

        self.x = torch.tensor(self.x, dtype=torch.long).to(device)
        self.y = torch.tensor(self.y, dtype=torch.long).to(device)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return (self.x[idx], self.y[idx])

Creating datasets from the above data

In [19]:
dataset_train = StressDataset(X_train, y_train)
dataset_val = StressDataset(X_val, y_val)
dataset_test = StressDataset(X_test, y_test)

Creating dataloader. Batch size is pretty big, but the model is relatively simple, so we can afford it

In [20]:
batch_size = 1024

dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=False)
dataloader_val = DataLoader(dataset_train, batch_size=batch_size, shuffle=False)
dataloader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

Creating classification model.

It just uses Embedding layer with several Linear layers at the end

In [21]:
class StressTaggingModel(nn.Module):
    def __init__(self, embedding_dim, hidden_layer_size):
        super().__init__()
        
        self.embedding_dim = embedding_dim
        self.hidden_layer_size = hidden_layer_size
        
        self.emb = nn.Embedding(num_embeddings=LEN_D, embedding_dim=self.embedding_dim)
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(0.3)
        self.fc1 = nn.Linear(self.embedding_dim * MAX_SYLLABUSES_COUNT, self.hidden_layer_size)
        self.act1 = nn.LeakyReLU()
        self.fc2 = nn.Linear(self.hidden_layer_size, N_CLASSES)
        #self.act = nn.Softmax(dim=1)
        
    def forward(self, x):
        x = self.emb(x)
        x = self.flatten(x)
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.act1(x)
        x = self.fc2(x)
        #x = self.act(x)
        
        return x

Setting embedding size, hidden layer size and creating the model with these parameters

In [22]:
embedding_dim = 1024
hidden_layer_size = MAX_SYLLABUSES_COUNT ** 6

model = StressTaggingModel(embedding_dim, hidden_layer_size)

Setting optimizer and loss function

In [23]:
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

Putting model and loss to device

In [24]:
model = model.to(device)
criterion = criterion.to(device)

Defining train function. It also collect data to use it in classification report 

In [25]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0

    model.train()

    all_preds = []
    all_tags = []
    
    for batch in iterator:
        
        text = batch[0]
        tags = batch[1]
        
        optimizer.zero_grad()
        
        predictions = model(text)
        
        all_preds.append(predictions.detach().cpu().numpy())
        all_tags.append(tags.detach().cpu().numpy())

        loss = criterion(predictions, tags)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()

        
    return epoch_loss / len(iterator), np.concatenate(all_preds, 0).argmax(1).reshape(-1), np.concatenate(all_tags, 0)

Defining function for evaluating the model

In [26]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    
    model.eval()

    all_preds = []
    all_tags = []
    
    with torch.no_grad():
    
        for batch in iterator:

            text = batch[0]
            tags = batch[1]
            
            predictions = model(text)

            all_preds.append(predictions.detach().cpu().numpy())
            all_tags.append(tags.detach().cpu().numpy())
            
            loss = criterion(predictions, tags)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator), np.concatenate(all_preds, 0).argmax(1).reshape(-1), np.concatenate(all_tags, 0)

Performing training of the model. It uses classification_report at each iteration. The best model state is chosen based on validation set

In [27]:
N_EPOCHS = 35

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    train_loss, train_preds, train_tags = train(model, dataloader_train, optimizer, criterion)
    valid_loss, _, __ = evaluate(model, dataloader_val, criterion)
        
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    if epoch % 5 == 0:
        print(f'Epoch: {epoch+1:02}')
        print(f'\tTrain Loss: {train_loss:.3f}')

        print(classification_report(train_tags, train_preds))
    
        print(f'\t Val Loss: {valid_loss:.3f}')

Epoch: 01
	Train Loss: 7.546
              precision    recall  f1-score   support

           0       0.54      0.49      0.51     10677
           1       0.53      0.53      0.53     17481
           2       0.54      0.49      0.51     12931
           3       0.45      0.43      0.44      2926
           4       0.30      0.25      0.27       374
           5       0.00      0.06      0.00        17

    accuracy                           0.50     44406
   macro avg       0.39      0.38      0.38     44406
weighted avg       0.53      0.50      0.51     44406

	 Val Loss: 0.746
Epoch: 06
	Train Loss: 0.349
              precision    recall  f1-score   support

           0       0.80      0.80      0.80     10677
           1       0.83      0.85      0.84     17481
           2       0.90      0.88      0.89     12931
           3       0.95      0.93      0.94      2926
           4       0.94      0.92      0.93       374
           5       0.78      0.82      0.80        17

 

Evaluating the model on the train set

In [28]:
model.load_state_dict(torch.load('tut2-model.pt'))

test_loss, test_preds, test_tags = evaluate(model, dataloader_test, criterion)

print(f'Test Loss: {test_loss:.3f}')

print(test_preds[10:])
print(test_tags[10:])

print(classification_report(test_tags, test_preds))

Test Loss: 1.017
[3 0 1 ... 1 2 0]
[3 1 2 ... 1 2 0]
              precision    recall  f1-score   support

           0       0.77      0.80      0.79      3051
           1       0.80      0.84      0.82      4995
           2       0.89      0.83      0.86      3695
           3       0.88      0.84      0.86       836
           4       0.74      0.68      0.71       107
           5       0.00      0.00      0.00         4

    accuracy                           0.82     12688
   macro avg       0.68      0.66      0.67     12688
weighted avg       0.83      0.82      0.83     12688



# Full training

In this part we just use the labeled data to train the model and predicting targets for unlabeled data.

The pipeline is almost exact as the one used above

In [29]:
df_test = pd.read_csv('/kaggle/input/silero-stress-predictor/test.csv')
df_test.head()

Unnamed: 0,id,word,num_syllables,lemma
0,0,эпилепсия,5,эпилепсия
1,1,относящейся,5,относиться
2,2,размышлениями,6,размышление
3,3,модемы,3,модем
4,4,солнц,1,солнце


In [30]:
d_ind = 2
d = {'<UNKN>': 0, '': 1}

In [31]:
X_train_full = apply_boosted(X, True)
X_test_full = apply_boosted(df_test['word'], False)

In [32]:
X_train_full[:5], X_test_full[:5]

([array([2, 3, 4, 1, 1, 1]),
  array([5, 6, 7, 1, 1, 1]),
  array([8, 9, 7, 1, 1, 1]),
  array([10, 11, 12, 13,  1,  1]),
  array([14, 15, 16,  1,  1,  1])],
 [array([ 36, 195,  50, 234, 108,   1]),
  array([ 64, 115,  93,  92, 994,   1]),
  array([ 12, 604, 118,  51, 108,   7]),
  array([ 77,  49, 485,   1,   1,   1]),
  array([0, 1, 1, 1, 1, 1])])

In [33]:
y[:5]

array([1, 0, 0, 2, 2])

In [34]:
dataset_train_full = StressDataset(X_train_full, y)
#Fictive y_test in order to create dataloader and to form id further
dataset_test_full = StressDataset(X_test_full, df_test['id'].to_numpy(dtype=int))

In [35]:
dataloader_train_full = DataLoader(dataset_train_full, batch_size=batch_size, shuffle=False)
dataloader_test_full = DataLoader(dataset_test_full, batch_size=batch_size, shuffle=False)

In [36]:
LEN_D = len(d) + 1
LEN_D

7997

In [37]:
model_full = StressTaggingModel(embedding_dim, hidden_layer_size)

In [38]:
optimizer_full = optim.Adam(model_full.parameters(), lr=1e-3)
criterion_full = nn.CrossEntropyLoss()

In [39]:
model_full = model_full.to(device)
criterion_full = criterion_full.to(device)

In [40]:
N_EPOCHS = 35

best_full_loss = float('inf')

for epoch in range(N_EPOCHS):
    train_full_loss, train_full_preds, train_full_tags = train(model_full, dataloader_train_full, optimizer_full, criterion_full)
        
    if train_full_loss < best_full_loss:
        best_full_loss = train_full_loss
        torch.save(model_full.state_dict(), 'tut3-model.pt')
    
    if epoch % 5 == 0:
        print(f'Epoch: {epoch+1:02}')
        print(f'\tTrain Loss: {train_full_loss:.3f}')

        print(classification_report(train_full_tags, train_full_preds))

Epoch: 01
	Train Loss: 4.835
              precision    recall  f1-score   support

           0       0.55      0.50      0.52     15253
           1       0.55      0.59      0.57     24973
           2       0.60      0.57      0.59     18473
           3       0.63      0.51      0.56      4180
           4       0.29      0.34      0.31       535
           5       0.00      0.00      0.00        24

    accuracy                           0.56     63438
   macro avg       0.44      0.42      0.43     63438
weighted avg       0.56      0.56      0.56     63438

Epoch: 06
	Train Loss: 0.300
              precision    recall  f1-score   support

           0       0.83      0.83      0.83     15253
           1       0.86      0.87      0.86     24973
           2       0.93      0.91      0.92     18473
           3       0.95      0.94      0.95      4180
           4       0.92      0.90      0.91       535
           5       0.62      0.67      0.64        24

    accuracy       

In [41]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    
    model.eval()

    all_preds = []
    all_tags = []
    
    with torch.no_grad():
    
        for batch in iterator:

            text = batch[0]
            tags = batch[1]
            
            predictions = model(text)

            all_preds.append(predictions.detach().cpu().numpy())
            all_tags.append(tags.detach().cpu().numpy())
            
            #loss = criterion(predictions, tags)

            #epoch_loss += loss.item()
        
    return np.nan, np.concatenate(all_preds, 0).argmax(1).reshape(-1), np.concatenate(all_tags, 0)

In [42]:
model_full.load_state_dict(torch.load('tut3-model.pt'))

test_full_loss, test_full_preds, test_full_ids = evaluate(model_full, dataloader_test_full, criterion_full)

test_full_preds[:10], test_full_ids[:10]

(array([2, 2, 2, 1, 0, 1, 2, 1, 1, 2]), array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

In [43]:
df_submission = pd.DataFrame.from_dict({'id': test_full_ids, 'stress': test_full_preds + 1}, orient='columns')
df_submission.set_index('id', inplace=True)
df_submission.head()

Unnamed: 0_level_0,stress
id,Unnamed: 1_level_1
0,3
1,3
2,3
3,2
4,1


In [44]:
df_submission.to_csv('submission.csv')