# Snorkel Model v4

In [1]:
import snorkel
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from snorkel.labeling import LabelingFunction, PandasLFApplier
from snorkel.labeling.model.label_model import LabelModel

In [2]:
# Load Data
Labels = pd.read_csv('Labels_renamed.csv')
Labels.columns = ['Labels']
abstract_df = pd.read_csv('preprocessed_data_Jul14.csv')
abstract_df = abstract_df['concatenated_title_abstract'].to_frame(name='concatenated_title_abstract')

# Define Labeling Functions
LABEL_DICT = {label: i for i, label in enumerate(Labels['Labels'].tolist())}
ABSTAIN = -1

In [3]:
# Load CSVs 
observatories = pd.read_csv('Astronomical Observatories.csv')['Name'].tolist()
telescope_radio_names = pd.read_csv('List of radio telescopes.csv')['Name'].tolist()
telescope_solar_optical_names = pd.read_csv('List of Solar Telescope Optical.csv')['Name/Observatory'].tolist()
telescope_solar_radio_names = pd.read_csv('List of Solar Telescope Radio.csv')['Name/Observatory'].tolist()
telescope_space_names = pd.read_csv('List of space telescopes.csv')['Name'].tolist()


def lf_observatories_fn(row, observatories_list):
    abstract = row['concatenated_title_abstract']
    for observatory in observatories_list:
        if observatory in abstract:
            return LABEL_DICT.get(observatory, ABSTAIN)
    return ABSTAIN


def lf_telescope_radio_fn(row, telescope_radio_list):
    abstract = row['concatenated_title_abstract']
    for telescope in telescope_radio_list:
        if telescope in abstract:
            return LABEL_DICT.get(telescope, ABSTAIN)
    return ABSTAIN


def lf_telescope_solar_optical_fn(row, telescope_solar_optical_list):
    abstract = row['concatenated_title_abstract']
    for telescope in telescope_solar_optical_list:
        if telescope in abstract:
            return LABEL_DICT.get(telescope, ABSTAIN)
    return ABSTAIN


def lf_telescope_solar_radio_fn(row, telescope_solar_radio_list):
    abstract = row['concatenated_title_abstract']
    for telescope in telescope_solar_radio_list:
        if telescope in abstract:
            return LABEL_DICT.get(telescope, ABSTAIN)
    return ABSTAIN


def lf_telescope_space_fn(row, telescope_space_list):
    abstract = row['concatenated_title_abstract']
    for telescope in telescope_space_list:
        if telescope in abstract:
            return LABEL_DICT.get(telescope, ABSTAIN)
    return ABSTAIN


def lf_telescope_types_fn(row):
    telescope_types = ['radio telescope', 'solar telescope', 'space telescope']
    abstract = row['concatenated_title_abstract']
    for telescope_type in telescope_types:
        if telescope_type in abstract:
            return LABEL_DICT.get(telescope_type, ABSTAIN)
    return ABSTAIN

# Define them as Snorkel's LabelingFunction
lfs = [
    LabelingFunction(name="lf_observatories", f=lf_observatories_fn, resources=dict(observatories_list=observatories)),
    LabelingFunction(name="lf_radio_telescopes", f=lf_telescope_radio_fn, resources=dict(telescope_radio_list=telescope_radio_names)),
    LabelingFunction(name="lf_solar_optical_telescopes", f=lf_telescope_solar_optical_fn, resources=dict(telescope_solar_optical_list=telescope_solar_optical_names)),
    LabelingFunction(name="lf_solar_radio_telescopes", f=lf_telescope_solar_radio_fn, resources=dict(telescope_solar_radio_list=telescope_solar_radio_names)),
    LabelingFunction(name="lf_space_telescopes", f=lf_telescope_space_fn, resources=dict(telescope_space_list=telescope_space_names)),
    LabelingFunction(name="lf_telescope_types", f=lf_telescope_types_fn)
]

In [4]:
# Initialize the PandasLFApplier with the list of Labeling Functions
applier = PandasLFApplier(lfs=lfs)

# Apply the labeling functions to the abstract_df
L_train = applier.apply(df=abstract_df)

100%|██████████| 63999/63999 [00:44<00:00, 1438.13it/s]


In [5]:
# Convert labels to multi-label format
def to_multi_label_format(label_array, num_labels):
    return np.array([[1 if j in label_row else 0 for j in range(num_labels)] for label_row in label_array])

multi_label_matrix = to_multi_label_format(L_train, len(LABEL_DICT))

In [6]:
# Train the Snorkel label model
NUM_LABELS = len(LABEL_DICT)
label_model = LabelModel(cardinality=NUM_LABELS)
label_model.fit(multi_label_matrix, n_epochs=100, log_freq=10, seed=123)

INFO:root:Computing O...


MemoryError: Unable to allocate 7.13 TiB for an array with shape (990025, 990025) and data type float64

In [None]:
# Use the label model to predict the labels
predicted_labels = label_model.predict(multi_label_matrix)

In [None]:
# Add predicted labels to dataframe
abstract_df['predicted_labels'] = [set(label) for label in predicted_labels]

In [None]:
# Convert the set to a string for easy storage and manipulation
abstract_df['predicted_label_str'] = abstract_df['predicted_labels'].apply(lambda x: ';'.join(map(str, sorted(x))) if x else 'ABSTAIN')

In [None]:
abstract_df.to_csv('labeled_data_snorkel_v4_multilabel_with_predictions.csv', index=False)

In [None]:
label_counts = abstract_df['predicted_label_str'].value_counts()
label_counts_df = label_counts.reset_index()
label_counts_df.columns = ['Predicted Label', 'Count']
label_counts_df.to_csv('predicted_label_counts_Snorkel_v3_multilabel.csv', index=False)