Installation

In [None]:
pip install snorkel

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting snorkel
  Downloading snorkel-0.9.9-py3-none-any.whl (103 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/103.3 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 KB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting munkres>=1.0.6
  Downloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)
Installing collected packages: munkres, snorkel
Successfully installed munkres-1.1.4 snorkel-0.9.9


**Task** 

Label data as a Question or Statement with the help of Snorkel

*   Question words (Why, What, When, Who, Where)
*   A question mark (`?`) suggests a question
*   An exclamation mark (`!`) suggests a statement

In [None]:
# Toy corpus for the labeling demo: question sentences first, followed by
# statement sentences (programming quotes and one-liners).
# Some entries carry leading or trailing spaces; they are kept as-is.
data = ["What would you name your boat if you had one? ",
"What's the closest thing to real magic? ",
"Who is the messiest person you know? ",
"What will finally break the internet? ",
"What's the most useless talent you have? ",
"What would be on the gag reel of your life? ",
"What Guilty Pleasure Makes You Feel Alive?",
"What’s your favorite way to spend a day off?",
"What type of music are you into?",
"What was the best vacation you ever took and why?",
"What are your hobbies, and how did you get into them?",
"What was your favorite age growing up?",
"Was the last thing you read?",
"What's your favorite ice cream topping?",
"What was the last TV show you binge-watched?",
"Are you into podcasts or do you only listen to music?",
"Do you have a favorite holiday? Why or why not?",
"What’s your favorite sleeping position?",
"What’s your go-to guilty pleasure?",
"What’s your favorite quote from a TV show/movie/book?",
"What’s your favorite thing about your current job?",
"What annoys you most?",
"What’s the career highlight you’re most proud of?",
"What do you remember most about your first job?",
"How old were you when you started working?",
"What’s the worst job you’ve ever had?",
"What’s your favorite part of the workday?",
"What’s the best career decision you’ve ever made?",
"What’s the worst career decision you’ve ever made?",
"How are you?",
# Statements start here.
"First, solve the problem. Then, write the code!",
"Experience is the name everyone gives to their mistakes!",
" In order to be irreplaceable, one must always be different!",
"Java is to JavaScript what car is to Carpet!",
"Knowledge is power!",
"Ruby is rubbish! PHP is phpantastic!",
" Code is like humor! When you have to explain it, it’s bad.",
"Fix the cause, not the symptom!",
"Simplicity is the soul of efficiency!",
"Before software can be reusable it first has to be usable!",
"Make it work, make it right, make it fast!",
"Programmer: A machine that turns coffee into code.",
"Computers are fast; programmers keep it slow.",
"Remember that there is no code faster than no code.",
"One man’s crappy software is another man’s full-time job.",
"No code has zero defects.",
"Deleted code is debugged code.",
"It’s not a bug — it’s an undocumented feature.",
"It works on my machine.",
"It compiles; ship it.",
"There is no Ctrl-Z in life.",
"Whitespace is never white."]

In [None]:
import pandas as pd
import random

In [None]:
# Shuffle dataset
# A dedicated, seeded RNG gives the same order on every run. Sampling into a
# new list (rather than random.shuffle, which reorders `data` in place) keeps
# this cell idempotent: re-running it cannot reshuffle an already shuffled list.
SEED = 42
shuffled_sentences = random.Random(SEED).sample(data, k=len(data))

# Show full sentences instead of truncating long cells in the rendered table
pd.set_option('display.max_colwidth', None)

# Convert to Dataframe
df = pd.DataFrame({'sentences': shuffled_sentences})

df.head()

Unnamed: 0,sentences
0,What type of music are you into?
1,How are you?
2,What’s the worst job you’ve ever had?
3,"First, solve the problem. Then, write the code!"
4,What was the best vacation you ever took and why?


In [None]:
# Train & Test
#from sklearn.model_selection import train_test_split
#df_train,df_test = train_test_split(df,train_size=0.5)
     

#print(df.shape)
#print(df_train.shape)

**Define Labeling Functions**


*   Keyword searches: looking for specific words in a sentence
*   Pattern matching: looking for specific syntactical patterns
*   Third-party models: using a pre-trained model (usually a model for a different task than the one at hand)


In [None]:
from snorkel.labeling import labeling_function,PandasLFApplier,LFAnalysis

# Constants for our labels
# Integer codes returned by the labeling functions defined below.
QUESTION = 1
STATEMENT = 0
ABSTAIN = -1  # the labeling function casts no vote for the sentence

In [None]:
# Keyword search
import re

# Question words matched as whole words, case-insensitively. Plain substring
# tests would also fire inside longer words ("who" in "whole", "where" in
# "nowhere"), so the pattern is anchored on word boundaries.
_QUESTION_WORDS = ["why", "what", "when", "who", "where"]  # how
_QUESTION_WORD_RE = re.compile(
    r"\b(?:" + "|".join(_QUESTION_WORDS) + r")\b", flags=re.IGNORECASE
)

@labeling_function()
def lf_keyword_lookup(x):
  """Vote QUESTION when the sentence contains a question word.

  Args:
    x: a data point exposing the raw text as ``x.sentences``.

  Returns:
    QUESTION if any of the question words occurs as a standalone word
    (contractions such as "What's" still match), otherwise ABSTAIN.
    This function never votes STATEMENT.
  """
  return QUESTION if _QUESTION_WORD_RE.search(x.sentences) else ABSTAIN

In [None]:
# Pattern matching
import re
@labeling_function()
def lf_regex(x):
  """Label a sentence from the punctuation marks it contains.

  The patterns are tried in order, so a sentence containing a question
  mark is labeled QUESTION even if it also contains an exclamation mark.
  Returns ABSTAIN when neither mark is present.
  """
  sentence = x.sentences
  # (pattern, label) pairs, checked from first to last
  rules = (
    (r".*\?", QUESTION),
    (r".*!", STATEMENT),
  )
  for pattern, label in rules:
    if re.search(pattern, sentence, flags=re.I) is not None:
      return label
  return ABSTAIN

In [None]:
# Third-party 

from snorkel.labeling.lf.nlp import nlp_labeling_function
from snorkel.preprocess.nlp import SpacyPreprocessor

# The SpacyPreprocessor parses the text in text_field and
# stores the new enriched representation in doc_field
spacy = SpacyPreprocessor(text_field="sentences", doc_field="doc", memoize=True)

# Fine-grained (Penn Treebank) tag for wh-adverbs such as when/where/why/how.
# The coarse "ADV" part of speech also covers ordinary adverbs ("never",
# "always"), which made plain statements vote QUESTION.
WH_ADVERB_TAG = "WRB"

@labeling_function(pre=[spacy])
def lf_spacy(x):
    """Label a sentence using its final punctuation and wh-adverbs.

    Args:
        x: a data point whose parsed document is available as ``x.doc``
            (added by the ``spacy`` preprocessor above).

    Returns:
        QUESTION if the text ends with '?' or contains a wh-adverb,
        STATEMENT if it ends with '!', otherwise ABSTAIN.
    """
    # Several sentences in the corpus end with a space after the punctuation
    # mark, so trailing whitespace is dropped before testing the last character.
    text = x.doc.text.rstrip()
    if text.endswith('?') or any(token.tag_ == WH_ADVERB_TAG for token in x.doc):
        return QUESTION
    elif text.endswith('!'):
        return STATEMENT
    else:
        return ABSTAIN



In [None]:
# Apply the labeling functions
# Every labeling function in `lfs` is run on every row of `df`; the column
# order of the resulting matrix follows the order of this list.
lfs = [lf_keyword_lookup,lf_regex, lf_spacy]
applier = PandasLFApplier(lfs=lfs)
preds_matrix = applier.apply(df=df)

100%|██████████| 52/52 [00:00<00:00, 135.14it/s]


In [None]:
# Label Matrix
# One row per sentence and one column per labeling function (same order as
# `lfs`); each entry is QUESTION (1), STATEMENT (0) or ABSTAIN (-1).
preds_matrix

array([[ 1,  1,  1],
       [-1,  1,  1],
       [ 1,  1,  1],
       [-1,  0,  1],
       [ 1,  1,  1],
       [ 1,  1,  1],
       [ 1,  1,  1],
       [ 1,  1,  1],
       [ 1,  1,  1],
       [-1, -1,  1],
       [ 1,  1, -1],
       [ 1,  1,  1],
       [ 1,  1,  1],
       [ 1,  1,  1],
       [-1,  0,  1],
       [ 1,  0,  0],
       [-1,  0,  1],
       [-1, -1, -1],
       [ 1,  1,  1],
       [-1,  0,  1],
       [-1, -1, -1],
       [-1, -1, -1],
       [ 1,  1,  1],
       [ 1,  1,  1],
       [ 1,  1,  1],
       [ 1,  1,  1],
       [-1,  0,  0],
       [ 1,  0, -1],
       [-1,  0,  0],
       [ 1,  1,  1],
       [ 1,  1, -1],
       [-1,  1,  1],
       [ 1,  1,  1],
       [ 1,  1,  1],
       [-1, -1, -1],
       [-1,  0,  0],
       [ 1,  1,  1],
       [ 1,  1, -1],
       [ 1,  1,  1],
       [ 1,  1, -1],
       [ 1,  1,  1],
       [-1, -1, -1],
       [-1, -1,  1],
       [-1, -1, -1],
       [-1, -1, -1],
       [-1,  0,  0],
       [-1, -1, -1],
       [-1,  

In [None]:
# Evaluate the Performance
# Per-labeling-function statistics (polarity, coverage, overlaps, conflicts)
# computed from the label matrix; no ground-truth labels are passed in.
LFAnalysis(L=preds_matrix,lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_keyword_lookup,0,[1],0.557692,0.557692,0.038462
lf_regex,1,"[0, 1]",0.788462,0.788462,0.115385
lf_spacy,2,"[0, 1]",0.730769,0.692308,0.096154


Polarity: The set of unique labels this LF outputs (excluding abstains)

Coverage: The fraction of the dataset the LF labels

Overlaps: The fraction of the dataset where this LF and at least one other LF label

Conflicts: The fraction of the dataset where this LF and at least one other LF label and disagree


In [None]:
from snorkel.labeling.model import MajorityLabelVoter

# Combine the votes of the labeling functions into one label per sentence.
majority_model = MajorityLabelVoter()
# Optional predict() arguments not used here: tie_break_policy="abstain", return_probs=True
preds_label = majority_model.predict(L=preds_matrix)

# -1 in the result means no label was assigned (e.g. the votes were tied).
preds_label


array([ 1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  0, -1,
       -1,  1, -1, -1, -1,  1,  1,  1,  1,  0, -1,  0,  1,  1,  1,  1,  1,
       -1,  0,  1,  1,  1,  1,  1, -1,  1, -1, -1,  0, -1,  0,  1,  1,  1,
       -1])

**tie_break_policy**    
   
        - "abstain": return an abstain vote (-1)

        - "true-random": randomly choose among the tied options

        - "random": randomly choose among the tied options using a deterministic hash

In [None]:
# Store the majority-vote label next to each sentence.
# Note: this adds the column to `df` in place.
df['label'] = preds_label

df

Unnamed: 0,sentences,label
0,What type of music are you into?,1
1,How are you?,1
2,What’s the worst job you’ve ever had?,1
3,"First, solve the problem. Then, write the code!",-1
4,What was the best vacation you ever took and why?,1
5,What’s the best career decision you’ve ever made?,1
6,What's your favorite ice cream topping?,1
7,What's the most useless talent you have?,1
8,What’s your favorite quote from a TV show/movie/book?,1
9,Whitespace is never white.,1
