In [2]:
!pip install pip install snorkel

Collecting snorkel
  Using cached snorkel-0.9.8-py3-none-any.whl (103 kB)
Collecting munkres>=1.0.6
  Using cached munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)
Collecting scikit-learn<0.25.0,>=0.20.2
  Downloading scikit_learn-0.24.2-cp38-cp38-macosx_10_13_x86_64.whl (7.2 MB)
     |████████████████████████████████| 7.2 MB 5.2 MB/s            
Collecting tensorboard<2.7.0,>=2.0.0
  Using cached tensorboard-2.6.0-py3-none-any.whl (5.6 MB)
Collecting numpy<1.20.0,>=1.16.5
  Downloading numpy-1.19.5-cp38-cp38-macosx_10_9_x86_64.whl (15.6 MB)
     |████████████████████████████████| 15.6 MB 8.4 MB/s            
Collecting google-auth<2,>=1.6.3
  Using cached google_auth-1.35.0-py2.py3-none-any.whl (152 kB)
Installing collected packages: numpy, google-auth, tensorboard, scikit-learn, munkres, snorkel
  Attempting uninstall: numpy
    Found existing installation: numpy 1.21.2
    Uninstalling numpy-1.21.2:
      Successfully uninstalled numpy-1.21.2
  Attempting uninstall: google-auth
    Found

In [1]:
from snorkel.preprocess.nlp import SpacyPreprocessor

In [64]:
import numpy as np
from scipy.sparse import dok_matrix, vstack, csr_matrix

ABSTAIN = -1
LOCATION = 1
ORGANIZATION = 2
PERSON = 3

loc_file = open("../data/locations/locations.txt", "r")
per_file = open("../data/people/people.txt", "r")
org_file = open("../data/organizations/organizations.txt", "r")

loc = set(loc_file.read().splitlines())
per = set(per_file.read().splitlines())
org = set(org_file.read().splitlines())


# helper functions
def dict_match(sentence, dictionary, max_ngrams=2):
   m = {}
   for i in range(len(sentence)):
       for j in range(i+1, min(len(sentence), i + max_ngrams) + 1):
           term = ' '.join(sentence[i:j])
           term = term.strip()
           if term in dictionary:
               m.update({idx:1 for idx in range(i,j+1)})
   return m
           
def create_token_L_mat(Xs, Ls, num_lfs):
   """
   Create token-level label-function matrix from label-functions indexed by sentence
   """
   Yws = []
   print(Xs)
   print(Ls)
   for sent_i in range(len(Xs)):
       ys = dok_matrix((len(Xs[sent_i]), num_lfs))
       for lf_i in range(num_lfs):
           for word_i,y in Ls[sent_i][lf_i].items():
               ys[word_i, lf_i] = y
       Yws.append(ys)
   return csr_matrix(vstack(Yws))
  
# labeling functions
def LF_is_location(s):
   matches = dict_match(s, loc)
   return {i:LOCATION if i in matches else ABSTAIN for i in range(len(s))}
   
def LF_is_organization(s):
   matches = dict_match(s, org)
   return {i:ORGANIZATION if i in matches else ABSTAIN for i in range(len(s))}

def LF_is_person(s):
   matches = dict_match(s, per)
   return {i:PERSON if i in matches else ABSTAIN for i in range(len(s))}

# training set
sents = [
   "Бойко Борисов е основател на ГЕРБ".split(),
]

lfs = [
   LF_is_location,
   LF_is_organization,
   LF_is_person
]

# apply labeling functions and transform label matrix 
L = [[lf(s) for lf in lfs] for s in sents] 
L = create_token_L_mat(sents, L, len(lfs))


# train your Snorkel label model 

[['Бойко', 'Борисов', 'е', 'основател', 'на', 'ГЕРБ']]
[[{0: -1, 1: -1, 2: -1, 3: -1, 4: -1, 5: -1}, {0: -1, 1: -1, 2: -1, 3: -1, 4: -1, 5: 2}, {0: 3, 1: 3, 2: 3, 3: -1, 4: -1, 5: -1}]]


In [65]:
print(L)

  (0, 0)	-1.0
  (0, 1)	-1.0
  (0, 2)	3.0
  (1, 0)	-1.0
  (1, 1)	-1.0
  (1, 2)	3.0
  (2, 0)	-1.0
  (2, 1)	-1.0
  (2, 2)	3.0
  (3, 0)	-1.0
  (3, 1)	-1.0
  (3, 2)	-1.0
  (4, 0)	-1.0
  (4, 1)	-1.0
  (4, 2)	-1.0
  (5, 0)	-1.0
  (5, 1)	2.0
  (5, 2)	-1.0
