In [1]:
import random
random.seed(42)
import spacy
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# !git clone https://github.com/Lednik7/SpacyToolKit.git
# !python -m pip install -r SpacyToolKit/requirements.txt

In [23]:
from SpacyToolKit.formatting.preprocessing_v1 import *

In [24]:
train = pd.read_csv("vacs_train.csv", sep=";")
train = train.dropna(subset=["description", "key_skills"]).drop_duplicates()[["description", "key_skills"]]

In [25]:
train.head()

Unnamed: 0,description,key_skills
8,Обязанности: Составление сметы по ремонтно-от...,Ответственность | Коммуникабельность
9,В сеть магазинов детской и подростковой обуви ...,Консультативные продажи | Активные продажи | 1...
16,Менеджер по продажам грузовых шин.Вакансия тол...,Активные продажи | Холодные продажи | Работа в...
17,Менеджер по работе с клиентамиТребования: хол...,Холодные звонки | Холодные продажи | Ведение п...
18,Менеджер по продажам грузовых шин.Вакансия тол...,Активные продажи | Холодные продажи | Работа в...


In [26]:
print(f"Number of data: {len(train)}")

Number of data: 20169


In [27]:
desc = train.description.to_list()
skill = train.key_skills.to_list()

In [28]:
TRAIN_DATA = preprocessing(desc, skill, "Skills")
TRAIN_DATA = choice(delete_None(TRAIN_DATA), 6) #6 - number of entities

100%|██████████| 20169/20169 [00:01<00:00, 15559.31it/s]


In [29]:
print(f"Number of data for training: {len(TRAIN_DATA)}")

Number of data for training: 526


In [30]:
def create_blank(TRAIN_DATA):
  nlp = spacy.blank("en")
  ner = nlp.create_pipe("ner")
  nlp.add_pipe(ner, last=True)

  for _, annotations in TRAIN_DATA:
      for ent in annotations.get('entities'):
          ner.add_label(ent[2])
  return nlp, ner

In [31]:
nlp, ner = create_blank(TRAIN_DATA)

In [32]:
def begin_training(nlp, TRAIN_DATA, n_iter=1):
  basket = []
  other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

  with nlp.disable_pipes(*other_pipes):
      optimizer = nlp.begin_training()
      for itn in range(n_iter):
          print("\n Iterations:", itn+1)
          random.shuffle(TRAIN_DATA)
          losses = {}

          for text, annotations in tqdm(TRAIN_DATA):
            try:
              nlp.update(
                  [text],  # batch of texts
                  [annotations],  # batch of annotations
                  drop=0.5,  # dropout - make it harder to memorise data
                  sgd=optimizer,  # callable to update weights
                  losses=losses)
              
              basket.append((text, annotations))
            except:
              pass
          print(f"\n Losses: {losses}")

  print("Done")
  return nlp, basket

In [33]:
basket = begin_training(nlp, TRAIN_DATA, 1)[1]

  0%|          | 2/526 [00:00<00:50, 10.32it/s]


 Iterations: 1


100%|██████████| 526/526 [00:54<00:00,  9.61it/s]


 Losses: {'ner': 8540.893973494269}
Done





In [34]:
from sklearn.model_selection import train_test_split
Train, Test = train_test_split(basket, test_size=0.33, random_state=42)

In [35]:
model = begin_training(nlp, Train, 3)[0]

  0%|          | 1/339 [00:00<00:44,  7.64it/s]


 Iterations: 1


100%|██████████| 339/339 [00:37<00:00,  9.15it/s]
  1%|          | 2/339 [00:00<00:31, 10.61it/s]


 Losses: {'ner': 3498.9621085440453}

 Iterations: 2


100%|██████████| 339/339 [00:36<00:00,  9.20it/s]
  0%|          | 1/339 [00:00<00:52,  6.40it/s]


 Losses: {'ner': 3111.8867971641907}

 Iterations: 3


100%|██████████| 339/339 [00:37<00:00,  9.13it/s]


 Losses: {'ner': 3208.3077564661453}
Done





In [36]:
def show(train):
  lst = []
  for i in train[1]["entities"]:
    lst.append(train[0][i[0]:i[1]])
  return lst

**Our metrics:** mean(intersection/union) * 100

---



In [37]:
metrics = []
for text, annotations in Test:
    doc = model(text)
    predictions = set([(ent.text) for ent in doc.ents])
    y = set(show((text, annotations)))
    metrics.append(
        (len(predictions.intersection(y))) / (len(predictions.union(y)))
    )
    """print('Entities: ', predictions)
    print(y)
    print("-"*100)"""

In [38]:
import numpy as np
score = np.mean(metrics)
print(f"Score: {score*100}")

Score: 46.167260898710644
