# Solution 1

To begin with, we start by making a naive solution based on making a dictionary that stores the most frequent tags (ners) for each token (or multiple tokens if the entity consists of more than one word), and use it to predict the ners in the test dataset.

## Read Data

In [106]:
import pandas as pd

# Load the training dataset
train_df = pd.read_json("../data/dev/train.jsonl", lines=True)

train_df.head()

Unnamed: 0,ners,sentences,id
0,"[[0, 5, CITY], [16, 23, PERSON], [34, 41, PERS...",Бостон взорвали Тамерлан и Джохар Царнаевы из ...,0
1,"[[21, 28, PROFESSION], [53, 67, ORGANIZATION],...",Умер избитый до комы гитарист и сооснователь г...,1
2,"[[0, 4, PERSON], [37, 42, COUNTRY], [47, 76, O...",Путин подписал распоряжение о выходе России из...,2
3,"[[0, 11, PERSON], [36, 47, PROFESSION], [49, 6...",Бенедикт XVI носил кардиостимулятор\nПапа Римс...,3
4,"[[0, 4, PERSON], [17, 29, ORGANIZATION], [48, ...",Обама назначит в Верховный суд латиноамериканк...,4


## Exploration

In [107]:
tag_freq = {}

for ners in train_df["ners"]:
    for start, end, tag in ners:
        if tag in tag_freq.keys():
            tag_freq[tag] += 1
        else:
            tag_freq[tag] = 1

sorted_tag_freq = dict(sorted(tag_freq.items(), key=lambda item: item[1], reverse=True))
print("These are the most frequent tags in the training dataset (sorted from most to least):")
sorted_tag_freq

These are the most frequent tags in the training dataset (sorted from most to least):


{'PERSON': 5119,
 'PROFESSION': 5039,
 'ORGANIZATION': 4088,
 'EVENT': 3335,
 'DATE': 2689,
 'COUNTRY': 2510,
 'CITY': 1261,
 'NUMBER': 1107,
 'AGE': 657,
 'ORDINAL': 614,
 'NATIONALITY': 437,
 'FACILITY': 424,
 'STATE_OR_PROVINCE': 412,
 'LAW': 405,
 'AWARD': 404,
 'LOCATION': 314,
 'IDEOLOGY': 273,
 'WORK_OF_ART': 270,
 'PRODUCT': 245,
 'CRIME': 221,
 'DISEASE': 220,
 'TIME': 182,
 'MONEY': 179,
 'DISTRICT': 103,
 'PENALTY': 92,
 'RELIGION': 89,
 'PERCENT': 68,
 'LANGUAGE': 54,
 'FAMILY': 24}

## Create Dictionary

The dictionary will have the words (tokens) as keys, and the their most frequent tag as values. We will get the most frequent tag for each word by simply counting how many times the word has appeared with the specific tag in the training data.

In [108]:
from collections import Counter

def get_most_frequent_tags(train_df):
    most_freq_tag = {}

    for sentence, ners in zip(train_df["sentences"], train_df["ners"]):
        for start, end, tag in ners:

            word = sentence[start:end+1]

            if word in most_freq_tag:
                most_freq_tag[word].append(tag)
            else:
                most_freq_tag[word] = [tag]

    for key in most_freq_tag.keys():
        tag, freq = Counter(most_freq_tag[key]).most_common()[0]

        most_freq_tag[key] = tag
    
    return most_freq_tag

In [109]:
most_freq_tag = get_most_frequent_tags(train_df)

print("This is how the dictionary looks like (first 5 items):")
dict(list(most_freq_tag.items())[:5])

This is how the dictionary looks like (first 5 items):


{'Бостон': 'CITY',
 'Тамерлан': 'PERSON',
 'Царнаевы': 'PERSON',
 'Северного Кавказа': 'LOCATION',
 'спецоперация по поимке': 'EVENT'}

## Prediction

### Load Test Data

In [110]:
test_df = pd.read_json("../data/test/test.jsonl", lines=True)

test_df.head()

Unnamed: 0,senences,id
0,Владелец «Бирмингема» получил шесть лет тюрьмы...,584
1,Акция протеста на Майдане Независимости объявл...,585
2,Фольксваген может перейти под контроль Порше \...,586
3,В Москве покажут фильмы Чарли Чаплина с живой ...,587
4,Чулпан Хаматова сыграет главную роль в фильме ...,588


### Define Prediction Methods

In [111]:
import re


def tokenize_russian(text):
    # Regular expression to match words and ignore punctuation
    pattern = r'[^\w\s]|_'
    # Replace punctuations with spaces before splitting
    cleaned_text = re.sub(pattern, ' ', text)
    # Split the text by whitespace and filter out any empty tokens
    tokens = [token for token in cleaned_text.split() if token]
    return tokens

def generate_ngrams(tokens, n=4):
    ngrams = []
    for i in range(1, n+1):  # To generate n-grams for n=1 to 4
        for j in range(len(tokens) - i + 1):
            ngrams.append(' '.join(tokens[j:j+i]))
    return ngrams


def predict(df):
    test_predictions = []

    for id, sentence in zip(df["id"], df["senences"]):

        # Tokenize the sentence
        sentence_tokens = tokenize_russian(sentence)

        index = 0

        sentence_ners = []
        for ngrams in generate_ngrams(sentence_tokens, n=29):
            if ngrams in most_freq_tag:
                # Start index of current ngram in the full sentence
                start = index + sentence[index:].find(ngrams)
                # End index of current ngram in the full sentence
                end = start + len(ngrams) - 1
                # Add prediction to current sentence's ners
                sentence_ners.append((start, end, most_freq_tag[ngrams]))

                # Increase the index by the predicted word(s) length
                index += len(ngrams) + 1

        test_predictions.append(sentence_ners)

    return [{"ners": test_predictions[i], "id": test_df.iloc[i]["id"]} for i in range(len(test_df))]

### Make Predictions

In [112]:
test_predictions = predict(test_df)
test_predictions[0]

{'ners': [(30, 34, 'NUMBER'),
  (40, 45, 'PENALTY'),
  (64, 69, 'PERSON'),
  (128, 134, 'DATE'),
  (137, 137, 'NUMBER'),
  (145, 147, 'EVENT'),
  (149, 156, 'STATE_OR_PROVINCE'),
  (158, 167, 'EVENT'),
  (298, 302, 'NUMBER'),
  (320, 329, 'PENALTY'),
  (350, 351, 'NUMBER'),
  (382, 389, 'EVENT'),
  (403, 404, 'NUMBER'),
  (406, 414, 'NUMBER'),
  (472, 475, 'DATE'),
  (480, 483, 'DATE'),
  (485, 488, 'DATE'),
  (534, 537, 'DATE'),
  (562, 563, 'NUMBER'),
  (350, 350, 'NUMBER'),
  (406, 414, 'NUMBER'),
  (130, 138, 'DATE'),
  (310, 329, 'PENALTY'),
  (374, 389, 'EVENT'),
  (532, 542, 'DATE')],
 'id': 584}

### Save Predictions to .jsonl

In [113]:
import json
import numpy as np

# Helper class to convert int64 integers to normal integers: see https://stackoverflow.com/questions/50916422/python-typeerror-object-of-type-int64-is-not-json-serializable
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        return super(NpEncoder, self).default(obj)

# Helper funcion to save the list of final predictions in a ".jsonl" format
def save_json_lines(data, path):
    with open(path, 'w', encoding='utf-8') as file:
        for entry in data:
            json.dump(entry, file, ensure_ascii=False, cls=NpEncoder)
            file.write('\n')

In [114]:
!mkdir predictions

In [115]:
save_json_lines(test_predictions, "./test.jsonl")

In [116]:
!zip ./test.zip ./test.jsonl

updating: test.jsonl (deflated 78%)
