# Snorkel Model v2

In [1]:
import snorkel
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from snorkel.labeling import LabelingFunction, PandasLFApplier
from snorkel.labeling.model.label_model import LabelModel

In [3]:
# Load Data
Labels = pd.read_csv('Labels.csv')
Labels.columns = ['Labels']
abstract_df = pd.read_csv('preprocessed_data_Jul14.csv')
abstract_df = abstract_df['concatenated_title_abstract'].to_frame(name='concatenated_title_abstract')

# Define Labeling Functions
LABEL_DICT = {label: i for i, label in enumerate(Labels['Labels'].tolist())}

In [4]:
abstract_df.iloc[0]

concatenated_title_abstract    KINEMATIC TREATMENT OF CORONAL MASS EJECTION E...
Name: 0, dtype: object

In [3]:
# Define ABSTAIN constant
ABSTAIN = -1

def lf_observatories(row):
    observatories = pd.read_csv('Astronomical Observatories.csv')['Name'].tolist()
    abstract = row['concatenated_title_abstract']
    for observatory in observatories:
        if observatory in abstract:
            return LABEL_DICT.get(observatory, ABSTAIN)
    return ABSTAIN

def lf_radio_telescopes(row):
    telescope_names = pd.read_csv('List of radio telescopes.csv')['Name'].tolist()
    abstract = row['concatenated_title_abstract']
    for telescope_name in telescope_names:
        if telescope_name in abstract:
            return LABEL_DICT.get(telescope_name, ABSTAIN)
    return ABSTAIN

def lf_solar_optical_telescopes(row):
    telescope_names = pd.read_csv('List of Solar Telescope Optical.csv')['Name/Observatory'].tolist()
    abstract = row['concatenated_title_abstract']
    for telescope_name in telescope_names:
        if telescope_name in abstract:
            return LABEL_DICT.get(telescope_name, ABSTAIN)
    return ABSTAIN

def lf_solar_radio_telescopes(row):
    telescope_names = pd.read_csv('List of Solar Telescope Radio.csv')['Name/Observatory'].tolist()
    abstract = row['concatenated_title_abstract']
    for telescope_name in telescope_names:
        if telescope_name in abstract:
            return LABEL_DICT.get(telescope_name, ABSTAIN)
    return ABSTAIN

def lf_space_telescopes(row):
    telescope_names = pd.read_csv('List of space telescopes.csv')['Name'].tolist()
    abstract = row['concatenated_title_abstract']
    for telescope_name in telescope_names:
        if telescope_name in abstract:
            return LABEL_DICT.get(telescope_name, ABSTAIN)
    return ABSTAIN

def lf_telescope_types(row):
    telescope_types = ['radio telescope', 'solar telescope', 'space telescope']
    abstract = row['concatenated_title_abstract']
    for telescope_type in telescope_types:
        if telescope_type in abstract:
            return LABEL_DICT.get(telescope_type, ABSTAIN)
    return ABSTAIN


# Wrap your functions using Snorkel's LabelingFunction
lfs = [
    LabelingFunction(name="LF_Observatories", f=lf_observatories),
    LabelingFunction(name="LF_RadioTelescopes", f=lf_radio_telescopes),
    LabelingFunction(name="LF_SolarOpticalTelescopes", f=lf_solar_optical_telescopes),
    LabelingFunction(name="LF_SolarRadioTelescopes", f=lf_solar_radio_telescopes),
    LabelingFunction(name="LF_SpaceTelescopes", f=lf_space_telescopes),
    LabelingFunction(name="LF_TelescopeTypes", f=lf_telescope_types)
]

In [4]:
# Apply the labeling functions to the abstract data
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(abstract_df)

100%|██████████| 63999/63999 [10:09<00:00, 104.99it/s]


In [5]:
# Train a Snorkel label model
NUM_LABELS = len(Labels)
label_model = LabelModel(cardinality=NUM_LABELS)
label_model.fit(L_train, n_epochs=100, log_freq=10, seed=123)

INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/100 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.012]
 10%|█         | 10/100 [01:12<10:44,  7.16s/epoch]INFO:root:[10 epochs]: TRAIN:[loss=0.012]
 20%|██        | 20/100 [02:23<09:32,  7.16s/epoch]INFO:root:[20 epochs]: TRAIN:[loss=0.012]
 30%|███       | 30/100 [03:35<08:20,  7.15s/epoch]INFO:root:[30 epochs]: TRAIN:[loss=0.012]
 40%|████      | 40/100 [04:46<07:09,  7.17s/epoch]INFO:root:[40 epochs]: TRAIN:[loss=0.012]
 50%|█████     | 50/100 [05:58<05:57,  7.15s/epoch]INFO:root:[50 epochs]: TRAIN:[loss=0.011]
 60%|██████    | 60/100 [07:09<04:45,  7.15s/epoch]INFO:root:[60 epochs]: TRAIN:[loss=0.011]
 70%|███████   | 70/100 [08:21<03:34,  7.14s/epoch]INFO:root:[70 epochs]: TRAIN:[loss=0.011]
 80%|████████  | 80/100 [09:32<02:22,  7.14s/epoch]INFO:root:[80 epochs]: TRAIN:[loss=0.011]
 90%|█████████ | 90/100 [10:43<01:11,  7.13s/epoch]INFO:root:[90 epochs]: TRAIN:[loss=0.011]
100%|██████████| 100/100 [1

In [6]:
# Add the 'label' column to the DataFrame
abstract_df['label'] = label_model.predict(L_train)

In [7]:
# Map the labels to their respective names
abstract_df['label_name'] = abstract_df['label'].apply(lambda x: Labels.iloc[x]['Labels'] if x != ABSTAIN else 'ABSTAIN')
abstract_df

Unnamed: 0,concatenated_title_abstract,label,label_name
0,KINEMATIC TREATMENT OF CORONAL MASS EJECTION E...,-1,ABSTAIN
1,The imaging performance of the Hubble Space Te...,243,Hubble Space Telescope
2,Numerical Simulations of Mass Outflows Driven ...,954,Kepler
3,The Origin of X-shaped Radio Galaxies: Clues f...,-1,ABSTAIN
4,The Ghost of Sagittarius and Lumps in the Halo...,-1,ABSTAIN
...,...,...,...
63994,Accretion Disk Spectra of the Ultra-luminous X...,-1,ABSTAIN
63995,The ghost of a dwarf galaxy: fossils of the hi...,-1,ABSTAIN
63996,Observations of the Solar Corona from Space Sp...,-1,ABSTAIN
63997,Constraints on the topology of the Universe de...,972,WMAP


In [11]:
# save to CSV
abstract_df.to_csv('labeled_data_non-overlapped_labeling_fn_v2.csv', index=False)

In [9]:
label_counts = abstract_df['label_name'].value_counts()
print(label_counts)

label_name
ABSTAIN                                               48597
Hubble Space Telescope                                 3020
Gamma                                                  1717
Kepler                                                  950
Astron                                                  937
                                                      ...  
Siberian Solar Radio Telescope (SSRT)                     1
Paranal Observatory                                       1
Neutron Star Interior Composition Explorer (NICER)        1
Ankara University Observatory                             1
Very Small Array (VSA)                                    1
Name: count, Length: 198, dtype: int64


In [12]:
label_counts_df = label_counts.reset_index()
label_counts_df.columns = ['Label', 'Count']

# save to CSV
label_counts_df.to_csv('label_counts_non-overlapped_labeling_fn_v2.csv', index=False)
label_counts_df

Unnamed: 0,Label,Count
0,ABSTAIN,48597
1,Hubble Space Telescope,3020
2,Gamma,1717
3,Kepler,950
4,Astron,937
...,...,...
193,Siberian Solar Radio Telescope (SSRT),1
194,Paranal Observatory,1
195,Neutron Star Interior Composition Explorer (NI...,1
196,Ankara University Observatory,1
