# Snorkel Label Model (Experimental)

In [10]:
import snorkel
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from snorkel.labeling import LabelingFunction, PandasLFApplier
from snorkel.labeling.model.label_model import LabelModel

In [11]:
# Load Data
# Class vocabulary; a label's position in this list is its integer id.
labels_list = ['telescope', 'observatory', 'array']
Labels = pd.DataFrame({'Labels': labels_list})

# Read the preprocessed corpus and keep only the text column, as a one-column DataFrame.
abstract_df = (
    pd.read_csv('preprocessed_data_Jul14.csv')['concatenated_title_abstract']
    .to_frame(name='concatenated_title_abstract')
)

# Define Labeling Functions
# Label name -> integer id, plus the sentinel a labeling function returns when it casts no vote.
LABEL_DICT = {name: idx for idx, name in enumerate(Labels['Labels'])}
ABSTAIN = -1

In [12]:
# Loading CSV data outside LFs
# The reference lists are read once at cell execution time; the labeling
# functions below only look them up.
def _load_names(csv_path, column):
    """Return the non-missing entries of `column` in `csv_path` as a list of str.

    Blank cells are parsed by pandas as NaN (a float). Left in the list, such
    an entry would make the `name in abstract` substring test in the labeling
    functions raise TypeError, so missing values are dropped here and the
    remaining values are coerced to str. Order of the surviving entries is
    preserved.
    """
    return pd.read_csv(csv_path)[column].dropna().astype(str).tolist()


observatories = _load_names('Astronomical Observatories.csv', 'Name')
radio_telescope_names = _load_names('List of radio telescopes.csv', 'Name')
solar_optical_telescope_names = _load_names('List of Solar Telescope Optical.csv', 'Name/Observatory')
solar_radio_telescope_names = _load_names('List of Solar Telescope Radio.csv', 'Name/Observatory')
space_telescope_names = _load_names('List of space telescopes.csv', 'Name')

def lf_observatories(row):
    """Vote 'observatory' when the text mentions a known observatory name.

    Args:
        row: Record indexable by 'concatenated_title_abstract' (e.g. a
            DataFrame row).

    Returns:
        LABEL_DICT['observatory'] if any entry of the module-level
        `observatories` list occurs as a case-sensitive substring of the
        text; ABSTAIN otherwise, including when the text is not a string.
    """
    abstract = row['concatenated_title_abstract']
    # A missing text cell (e.g. NaN) is not a str; `in` on it would raise TypeError.
    if not isinstance(abstract, str):
        return ABSTAIN
    # Non-string names (e.g. NaN from a blank CSV cell) are skipped for the same reason.
    if any(isinstance(name, str) and name in abstract for name in observatories):
        return LABEL_DICT['observatory']
    return ABSTAIN

def lf_radio_telescopes(row):
    """Vote 'telescope' when the text mentions a known radio telescope name.

    Args:
        row: Record indexable by 'concatenated_title_abstract' (e.g. a
            DataFrame row).

    Returns:
        LABEL_DICT['telescope'] if any entry of the module-level
        `radio_telescope_names` list occurs as a case-sensitive substring of
        the text; ABSTAIN otherwise, including when the text is not a string.
    """
    abstract = row['concatenated_title_abstract']
    # A missing text cell (e.g. NaN) is not a str; `in` on it would raise TypeError.
    if not isinstance(abstract, str):
        return ABSTAIN
    # Non-string names (e.g. NaN from a blank CSV cell) are skipped for the same reason.
    if any(isinstance(name, str) and name in abstract for name in radio_telescope_names):
        return LABEL_DICT['telescope']
    return ABSTAIN

def lf_solar_optical_telescopes(row):
    """Vote 'telescope' when the text mentions a known solar optical telescope.

    Args:
        row: Record indexable by 'concatenated_title_abstract' (e.g. a
            DataFrame row).

    Returns:
        LABEL_DICT['telescope'] if any entry of the module-level
        `solar_optical_telescope_names` list occurs as a case-sensitive
        substring of the text; ABSTAIN otherwise, including when the text is
        not a string.
    """
    abstract = row['concatenated_title_abstract']
    # A missing text cell (e.g. NaN) is not a str; `in` on it would raise TypeError.
    if not isinstance(abstract, str):
        return ABSTAIN
    # Non-string names (e.g. NaN from a blank CSV cell) are skipped for the same reason.
    if any(isinstance(name, str) and name in abstract for name in solar_optical_telescope_names):
        return LABEL_DICT['telescope']
    return ABSTAIN

def lf_solar_radio_telescopes(row):
    """Vote 'telescope' when the text mentions a known solar radio telescope.

    Args:
        row: Record indexable by 'concatenated_title_abstract' (e.g. a
            DataFrame row).

    Returns:
        LABEL_DICT['telescope'] if any entry of the module-level
        `solar_radio_telescope_names` list occurs as a case-sensitive
        substring of the text; ABSTAIN otherwise, including when the text is
        not a string.
    """
    abstract = row['concatenated_title_abstract']
    # A missing text cell (e.g. NaN) is not a str; `in` on it would raise TypeError.
    if not isinstance(abstract, str):
        return ABSTAIN
    # Non-string names (e.g. NaN from a blank CSV cell) are skipped for the same reason.
    if any(isinstance(name, str) and name in abstract for name in solar_radio_telescope_names):
        return LABEL_DICT['telescope']
    return ABSTAIN

def lf_space_telescopes(row):
    """Vote 'telescope' when the text mentions a known space telescope name.

    Args:
        row: Record indexable by 'concatenated_title_abstract' (e.g. a
            DataFrame row).

    Returns:
        LABEL_DICT['telescope'] if any entry of the module-level
        `space_telescope_names` list occurs as a case-sensitive substring of
        the text; ABSTAIN otherwise, including when the text is not a string.
    """
    abstract = row['concatenated_title_abstract']
    # A missing text cell (e.g. NaN) is not a str; `in` on it would raise TypeError.
    if not isinstance(abstract, str):
        return ABSTAIN
    # Non-string names (e.g. NaN from a blank CSV cell) are skipped for the same reason.
    if any(isinstance(name, str) and name in abstract for name in space_telescope_names):
        return LABEL_DICT['telescope']
    return ABSTAIN

def lf_telescope_types(row):
    """Vote 'telescope' when the text contains a generic telescope-type phrase.

    Args:
        row: Record indexable by 'concatenated_title_abstract' (e.g. a
            DataFrame row).

    Returns:
        LABEL_DICT['telescope'] if 'radio telescope', 'solar telescope' or
        'space telescope' occurs as a substring of the text; ABSTAIN
        otherwise, including when the text is not a string.
    """
    telescope_types = ('radio telescope', 'solar telescope', 'space telescope')
    abstract = row['concatenated_title_abstract']
    # A missing text cell (e.g. NaN) is not a str; `in` on it would raise TypeError.
    if not isinstance(abstract, str):
        return ABSTAIN
    # NOTE(review): matching is case-sensitive, so "Radio Telescope" does not
    # match the lowercase phrases above — confirm whether that is intended.
    if any(phrase in abstract for phrase in telescope_types):
        return LABEL_DICT['telescope']
    return ABSTAIN

# Wrap each plain function in a Snorkel LabelingFunction under an explicit name.
# The order of the pairs fixes the order of the entries in `lfs`.
lfs = [
    LabelingFunction(name=lf_name, f=lf_func)
    for lf_name, lf_func in (
        ("LF_Observatories", lf_observatories),
        ("LF_RadioTelescopes", lf_radio_telescopes),
        ("LF_SolarOpticalTelescopes", lf_solar_optical_telescopes),
        ("LF_SolarRadioTelescopes", lf_solar_radio_telescopes),
        ("LF_SpaceTelescopes", lf_space_telescopes),
        ("LF_TelescopeTypes", lf_telescope_types),
    )
]

In [13]:
# Apply LFs
# Run the registered labeling functions over the abstracts DataFrame;
# L_train is the resulting label matrix that the label model is fit on.
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=abstract_df)

100%|██████████| 63999/63999 [00:44<00:00, 1439.22it/s]


In [14]:
# Train Snorkel label model
# One class per row of the label table.
# NOTE(review): no labeling function in this notebook votes 'array', so that
# class is counted in the cardinality but never voted for — confirm intended.
NUM_LABELS = Labels.shape[0]
label_model = LabelModel(cardinality=NUM_LABELS)
# 100 epochs with the loss logged every 10; the seed is fixed at 123.
label_model.fit(L_train=L_train, n_epochs=100, log_freq=10, seed=123)

INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/100 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.030]
INFO:root:[10 epochs]: TRAIN:[loss=0.020]
INFO:root:[20 epochs]: TRAIN:[loss=0.011]
INFO:root:[30 epochs]: TRAIN:[loss=0.007]
INFO:root:[40 epochs]: TRAIN:[loss=0.007]
INFO:root:[50 epochs]: TRAIN:[loss=0.006]
INFO:root:[60 epochs]: TRAIN:[loss=0.006]
INFO:root:[70 epochs]: TRAIN:[loss=0.006]
INFO:root:[80 epochs]: TRAIN:[loss=0.005]
INFO:root:[90 epochs]: TRAIN:[loss=0.005]
100%|██████████| 100/100 [00:00<00:00, 1419.99epoch/s]
INFO:root:Finished Training


In [15]:
# Add labels
# Integer prediction per row; rows the model leaves unlabeled carry ABSTAIN.
abstract_df['label'] = label_model.predict(L_train)

# Build the id -> name lookup once and map it over the column, instead of
# indexing the Labels DataFrame with .iloc separately for every row.
id_to_name = dict(enumerate(Labels['Labels']))
id_to_name[ABSTAIN] = 'ABSTAIN'
abstract_df['label_name'] = abstract_df['label'].map(id_to_name)

print(abstract_df['label_name'].value_counts())

label_name
ABSTAIN        48210
telescope      13555
observatory     2234
Name: count, dtype: int64
