# Solution 2

This solution is adapted from this [Medium blog](https://medium.com/data-science-in-your-pocket/training-custom-ner-system-using-crfs-146e0e922851).

## Read Data

In [1]:
import pandas as pd

# Load the training dataset
train_df = pd.read_json(path_or_buf='../data/dev/train.jsonl', lines=True)

## Data Preprocessing

In [2]:
# Uncomment the following line if you don't have "ru_core_news_md"
# !python -m spacy download ru_core_news_md

In [3]:
import spacy
import ru_core_news_md

# Load Russian spacy nlp pipeline to extract features from the text
ru_nlp = ru_core_news_md.load()

# A method to extract the features from a given sentence from the dataset
def extract_features(sentence):

    def update_dict(word, pos):
        if pos == "previous":
            pos = "-"
        elif pos == "next":
            pos = "+"

        postag = word.pos_
        return {
            f"{pos}1:word.lower()": word.text.lower(),
            f"{pos}1:word.istitle()": word.text.istitle(),
            f"{pos}1:word.isupper()": word.text.isupper(),
            f"{pos}1:postag": postag,
            f"{pos}1:postag[:2]": postag[:2],
        }

    # Pass the sentence through the pipeline
    sentence = ru_nlp(sentence)
    sentence_features = []
    for i, token in enumerate(sentence):
        postag = token.pos_

        # Parameters from the Medium blog (achieved best performancee)
        token_features = {
            "bias": 1.0,
            "word.lower()": token.text.lower(),
            "word[-3:]": token.text[-3:],
            "word[-2:]": token.text[-2:],
            "word.isupper()": token.text.isupper(),
            "word.istitle()": token.text.istitle(),
            "word.isdigit()": token.text.isdigit(),
            "postag": postag,
            "postag[:2]": postag[:2],
        }

        if i == 0:
            # Add beginning of sentence token
            token_features["BOS"] = True

        else:
            # Add previous word's features
            token_features.update(update_dict(sentence[i - 1], pos="previous"))
        
        if i < len(sentence) - 1:
            # Add next word's features
            token_features.update(update_dict(sentence[i + 1], pos="next"))
        else:
            # Add end of sentence token
            token_features["EOS"] = True

        sentence_features.append(token_features)

    return sentence_features

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from tqdm import tqdm

# Generating features and labels
labels_ = []
features = []
for idx, row in tqdm(train_df.iterrows(), total=len(train_df), desc="Extracting features for training data"):
    sentence, ners = row["sentences"], row["ners"]

    doc = ru_nlp(sentence)
    tokens = []
    labels = ['O'] * len(doc)  # default label
    for start, end, label in ners:
        for token in doc:
            if token.idx == start:
                labels[token.i] = 'B-' + label  # beginning of entity
            elif start < token.idx < end:
                labels[token.i] = 'I-' + label  # inside of entity
    features.append(extract_features(sentence))
    labels_.append(labels)

Extracting features for training data: 100%|██████████| 519/519 [01:49<00:00,  4.74it/s]


In [5]:
# Create a df for features and labels
preprocessed_train_df = pd.DataFrame({'token_features': features, 'labels': labels_})

## Train

### Split Training Data

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_eval, y_train, y_eval = train_test_split(
    preprocessed_train_df['token_features'], preprocessed_train_df['labels'], test_size=0.2, random_state=42)

### Fit CRF Model

In [7]:
import sklearn_crfsuite

# Define the Conditional Random Fields model
crf = sklearn_crfsuite.CRF(
    algorithm='l2sgd',
    max_iterations=200,
    all_possible_transitions=True
)

try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass

## Evaluate

In [8]:
y_pred = crf.predict(X_eval)

In [9]:
from sklearn_crfsuite.metrics import flat_f1_score

f1 = flat_f1_score(
    y_eval, y_pred, average='weighted')
print(f"Flat F1-score on evaluation data: {f1:.4f}")

Flat F1-score on evaluation data: 0.8122


## Train for Prediction

Train the crf model on the whole dataset to get better results for the test dataset.

In [10]:
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(
    algorithm='l2sgd',
    max_iterations=200,
    all_possible_transitions=True
)

try:
    crf.fit(preprocessed_train_df['token_features'],
            preprocessed_train_df['labels'])
except AttributeError:
    pass

## 

## Predict

### Load Test Data

In [11]:
test_df = pd.read_json(path_or_buf='../data/test/test.jsonl', lines=True)

test_df.head()

Unnamed: 0,senences,id
0,Владелец «Бирмингема» получил шесть лет тюрьмы...,584
1,Акция протеста на Майдане Независимости объявл...,585
2,Фольксваген может перейти под контроль Порше \...,586
3,В Москве покажут фильмы Чарли Чаплина с живой ...,587
4,Чулпан Хаматова сыграет главную роль в фильме ...,588


### Extract features

In [12]:
features_test = []
# Iterating over each row in the test dataframe with a progress bar
for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Extracting features for test data"):
    # Extracting the sentence from the row; note the typo in 'senences' which should be 'sentences'
    sentence = row.senences
    # Extracting features using a defined function and appending to the list
    features_test.append(extract_features(sentence))

# Converting the list of features into a pandas Series
X_pred = pd.Series((feature for feature in features_test))
# Making predictions using the CRF model
y_pred = crf.predict(X_pred)

Extracting features for test data: 100%|██████████| 65/65 [00:06<00:00,  9.82it/s]


### Make Predictions

In [14]:
predictions = []
for idx, (tokens, predicted_entities) in enumerate(tqdm(zip(test_df["senences"], y_pred), total=len(y_pred), desc="Making Predictions for Test Data")):
    answer = []
    current_start_index = None
    current_entity = None

    # Process each token and its corresponding predicted entity
    for token, entity in zip(ru_nlp(tokens), predicted_entities):
        if entity.startswith('B'):
            # Beginning of a new entity
            current_start_index = token.idx
            current_end_index = token.idx + len(token.text) - 1
            current_entity = entity[2:]  # Get the entity type
            answer.append(
                [current_start_index, current_end_index, current_entity])
        elif entity.startswith('I') and current_start_index is not None:
            # Inside an entity; update the end index of the last element
            current_end_index = token.idx + len(token.text) - 1
            # Update the last entity's end index
            answer[-1][1] = current_end_index

    predictions.append(answer)

Making Predictions for Test Data: 100%|██████████| 65/65 [00:06<00:00, 10.00it/s]


## Save Submission File

In [15]:
# Save predictions to test.jsonl file
pd.DataFrame({'ners':  predictions, 'id': test_df["id"]}).to_json('test.jsonl', orient='records',lines=True)

In [16]:
!zip ./test.zip ./test.jsonl

updating: test.jsonl (deflated 73%)
