# Snorkel Model for Weak Supervisation (Version 1)

In [7]:
import snorkel
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from snorkel.labeling import LabelingFunction, PandasLFApplier
from snorkel.labeling.model.label_model import LabelModel

In [8]:
# Load Data
Labels = pd.read_csv('Labels.csv')
Labels.columns = ['Labels']

abstract_df = pd.read_csv('preprocessed_data_Jul14.csv')
abstract_df = abstract_df['concatenated_title_abstract'].to_frame(name='concatenated_title_abstract')

In [9]:
Labels

Unnamed: 0,Labels
0,Abastumani Astrophysical Observatory
1,Abu Reyhan-e Birooni Observatory
2,Adirondack Public Observatory
3,Adolphson Observatory
4,Airdrie Public Observatory
...,...
990,ULTRASAT
991,Nancy Grace Roman Space Telescope (Wide Field ...
992,ARIEL
993,Advanced Telescope for High Energy Astrophysic...


In [10]:
# Load lists of observatories, telescope types, and telescope names from CSV files.
observatories = pd.read_csv('Astronomical Observatories.csv')['Name'].tolist()

telescope_types = ['radio telescope', 'solar telescope', 'space telescope']

telescope_names = pd.read_csv('List of radio telescopes.csv')['Name'].tolist() + pd.read_csv('List of Solar Telescope Optical.csv')['Name/Observatory'].tolist() + pd.read_csv('List of Solar Telescope Radio.csv')['Name/Observatory'].tolist() + pd.read_csv('List of space telescopes.csv')['Name'].tolist()


In [11]:
# Define Labeling Functions
LABEL_DICT = {label: i for i, label in enumerate(Labels['Labels'].tolist())}


# Define ABSTAIN constant
ABSTAIN = -1

def labeling_function_1(row):
    abstract = row['concatenated_title_abstract']
    for observatory in observatories:
        if observatory in abstract:
            return LABEL_DICT.get(observatory, ABSTAIN)
    return ABSTAIN

def labeling_function_2(row):
    abstract = row['concatenated_title_abstract']
    for telescope_type in telescope_types:
        if telescope_type in abstract:
            return LABEL_DICT.get(telescope_type, ABSTAIN)
    return ABSTAIN

def labeling_function_3(row):
    abstract = row['concatenated_title_abstract']
    for telescope_name in telescope_names:
        if telescope_name in abstract:
            return LABEL_DICT.get(telescope_name, ABSTAIN)
    return ABSTAIN


In [12]:
# Wrap your functions using Snorkel's LabelingFunction
labeling_function_1 = LabelingFunction(name="LF1", f=labeling_function_1)
labeling_function_2 = LabelingFunction(name="LF2", f=labeling_function_2)
labeling_function_3 = LabelingFunction(name="LF3", f=labeling_function_3)

# List of labeling functions
lfs = [labeling_function_1, labeling_function_2, labeling_function_3]

# Apply the labeling functions to the abstract data

applier = PandasLFApplier(lfs=lfs)

# Manually apply labeling functions
n_rows, n_lfs = len(abstract_df), len(lfs)
L_train = np.empty((n_rows, n_lfs), dtype=int)
L_train.fill(ABSTAIN)

#for i, abstract in enumerate(abstract_df):
#    row = {'concatenated_title_abstract': abstract}
#    for j, lf in enumerate(lfs):
#        L_train[i, j] = lf(row)
for i, row in abstract_df.iterrows():
    for j, lf in enumerate(lfs):
        L_train[i, j] = lf(row)


# Train a Snorkel label model
NUM_LABELS = len(Labels)
label_model = LabelModel(cardinality=NUM_LABELS)
label_model.fit(L_train, n_epochs=100, log_freq=10, seed=123)

# Add the 'label' column to the DataFrame
abstract_df['label'] = label_model.predict(L_train)



''''# Define a SnorkelFlow
flow = snorkel.SnorkelFlow(lfs=[labeling_function_1, labeling_function_2, labeling_function_3])

# Apply the labeling functions to the abstract data
L_train = flow.apply(abstract_df)

# Train a Snorkel labeling model
label_model = flow.train_label_model(L_train, n_epochs=100, log_freq=10, seed=123)

# Predict the labels for the abstract data
abstract_df['label'] = label_model.predict(L_train)'''''



INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/100 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.011]
 10%|█         | 10/100 [00:18<02:43,  1.81s/epoch]INFO:root:[10 epochs]: TRAIN:[loss=0.011]
 20%|██        | 20/100 [00:36<02:24,  1.81s/epoch]INFO:root:[20 epochs]: TRAIN:[loss=0.011]
 30%|███       | 30/100 [00:54<02:06,  1.81s/epoch]INFO:root:[30 epochs]: TRAIN:[loss=0.011]
 40%|████      | 40/100 [01:12<01:48,  1.81s/epoch]INFO:root:[40 epochs]: TRAIN:[loss=0.011]
 50%|█████     | 50/100 [01:30<01:30,  1.81s/epoch]INFO:root:[50 epochs]: TRAIN:[loss=0.011]
 60%|██████    | 60/100 [01:49<01:12,  1.81s/epoch]INFO:root:[60 epochs]: TRAIN:[loss=0.011]
 70%|███████   | 70/100 [02:07<00:54,  1.81s/epoch]INFO:root:[70 epochs]: TRAIN:[loss=0.011]
 80%|████████  | 80/100 [02:25<00:36,  1.81s/epoch]INFO:root:[80 epochs]: TRAIN:[loss=0.011]
 90%|█████████ | 90/100 [02:43<00:18,  1.81s/epoch]INFO:root:[90 epochs]: TRAIN:[loss=0.011]
100%|██████████| 100/100 [0

"'# Define a SnorkelFlow\nflow = snorkel.SnorkelFlow(lfs=[labeling_function_1, labeling_function_2, labeling_function_3])\n\n# Apply the labeling functions to the abstract data\nL_train = flow.apply(abstract_df)\n\n# Train a Snorkel labeling model\nlabel_model = flow.train_label_model(L_train, n_epochs=100, log_freq=10, seed=123)\n\n# Predict the labels for the abstract data\nabstract_df['label'] = label_model.predict(L_train)"

In [13]:
# Map the labels to their respective names
abstract_df['label_name'] = abstract_df['label'].apply(lambda x: Labels.iloc[x]['Labels'] if x != ABSTAIN else 'ABSTAIN')
abstract_df

Unnamed: 0,concatenated_title_abstract,label,label_name
0,KINEMATIC TREATMENT OF CORONAL MASS EJECTION E...,-1,ABSTAIN
1,The imaging performance of the Hubble Space Te...,243,Hubble Space Telescope
2,Numerical Simulations of Mass Outflows Driven ...,954,Kepler
3,The Origin of X-shaped Radio Galaxies: Clues f...,-1,ABSTAIN
4,The Ghost of Sagittarius and Lumps in the Halo...,-1,ABSTAIN
...,...,...,...
63994,Accretion Disk Spectra of the Ultra-luminous X...,-1,ABSTAIN
63995,The ghost of a dwarf galaxy: fossils of the hi...,-1,ABSTAIN
63996,Observations of the Solar Corona from Space Sp...,-1,ABSTAIN
63997,Constraints on the topology of the Universe de...,972,WMAP


In [31]:
# save to CSV
abstract_df.to_csv('labeled_data_non-overlapped_labeling_fn.csv', index=False)

In [15]:
label_counts = abstract_df['label_name'].value_counts()
print(label_counts)

label_name
ABSTAIN                                 48597
Hubble Space Telescope                   2991
Gamma                                    1717
Kepler                                    950
Astron                                    936
                                        ...  
Kwasan Observatory, Kyoto University        1
Long Wavelength Array                       1
Mount Stromlo Observatory                   1
Kanzelhoehe Solar Observatory               1
Kamioka Observatory                         1
Name: count, Length: 199, dtype: int64


In [32]:
label_counts_df = label_counts.reset_index()
label_counts_df.columns = ['Label', 'Count']

# save to CSV
label_counts_df.to_csv('label_counts_non-overlapped_labeling_fn.csv', index=False)
label_counts_df

Unnamed: 0,Label,Count
0,ABSTAIN,48597
1,Hubble Space Telescope,2991
2,Gamma,1717
3,Kepler,950
4,Astron,936
...,...,...
194,"Kwasan Observatory, Kyoto University",1
195,Long Wavelength Array,1
196,Mount Stromlo Observatory,1
197,Kanzelhoehe Solar Observatory,1


# Downstream Model (Logistic Regression)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [18]:
# Filter out ABSTAIN labeled data
train_data = abstract_df[abstract_df['label'] != ABSTAIN]

In [19]:
# Extract features using TF-IDF.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(train_data['concatenated_title_abstract'])
y = train_data['label']

In [20]:
# Split the data into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Train a classifier.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Evaluate on test set
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          15       0.00      0.00      0.00         3
          16       1.00      1.00      1.00         4
          18       1.00      0.22      0.36         9
          22       1.00      0.38      0.55         8
          25       0.00      0.00      0.00         1
          32       0.00      0.00      0.00         1
          36       0.78      0.75      0.77        24
          42       0.00      0.00      0.00         1
          48       0.00      0.00      0.00         2
          49       0.00      0.00      0.00         1
          58       0.00      0.00      0.00         1
          84       0.00      0.00      0.00         1
          88       0.00      0.00      0.00         3
         104       1.00      0.29      0.45        24
         109       1.00      0.69      0.81        16
         130       0.00      0.00      0.00         4
         136       0.00      0.00      0.00         1
         149       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# Predict on the abstained data.
abstained_data = abstract_df[abstract_df['label'] == ABSTAIN]
X_abstained = vectorizer.transform(abstained_data['concatenated_title_abstract'])

abstained_data['predicted_label'] = clf.predict(X_abstained)
abstained_data['predicted_label_name'] = abstained_data['predicted_label'].apply(lambda x: Labels.iloc[x]['Labels'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abstained_data['predicted_label'] = clf.predict(X_abstained)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abstained_data['predicted_label_name'] = abstained_data['predicted_label'].apply(lambda x: Labels.iloc[x]['Labels'])


In [23]:
abstained_data

Unnamed: 0,concatenated_title_abstract,label,label_name,predicted_label,predicted_label_name
0,KINEMATIC TREATMENT OF CORONAL MASS EJECTION E...,-1,ABSTAIN,917,Astron
3,The Origin of X-shaped Radio Galaxies: Clues f...,-1,ABSTAIN,638,Very Large Array
4,The Ghost of Sagittarius and Lumps in the Halo...,-1,ABSTAIN,952,Hipparcos
5,Spectral Models of the Type IC Supernova SN 19...,-1,ABSTAIN,243,Hubble Space Telescope
6,Around-the-Clock Observations of the Q0957+561...,-1,ABSTAIN,917,Astron
...,...,...,...,...,...
63989,"Transport of Magnetic Fields in Convective, Ac...",-1,ABSTAIN,899,Gamma
63990,Observational properties of extreme supernovae...,-1,ABSTAIN,917,Astron
63994,Accretion Disk Spectra of the Ultra-luminous X...,-1,ABSTAIN,954,Kepler
63995,The ghost of a dwarf galaxy: fossils of the hi...,-1,ABSTAIN,243,Hubble Space Telescope


In [24]:
# Extract features using TF-IDF.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_all = vectorizer.fit_transform(train_data['concatenated_title_abstract'])
y_all = train_data['label']

# Train a classifier using all the data
clf_final = LogisticRegression(max_iter=1000)
clf_final.fit(X_all, y_all)

In [25]:
# Predict on the abstained data using the retrained model.
X_abstained = vectorizer.transform(abstained_data['concatenated_title_abstract'])

abstained_data['predicted_label'] = clf_final.predict(X_abstained)
abstained_data['predicted_label_name'] = abstained_data['predicted_label'].apply(lambda x: Labels.iloc[x]['Labels'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abstained_data['predicted_label'] = clf_final.predict(X_abstained)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abstained_data['predicted_label_name'] = abstained_data['predicted_label'].apply(lambda x: Labels.iloc[x]['Labels'])


In [27]:
# Display the results
abstained_data

Unnamed: 0,concatenated_title_abstract,label,label_name,predicted_label,predicted_label_name
0,KINEMATIC TREATMENT OF CORONAL MASS EJECTION E...,-1,ABSTAIN,917,Astron
3,The Origin of X-shaped Radio Galaxies: Clues f...,-1,ABSTAIN,638,Very Large Array
4,The Ghost of Sagittarius and Lumps in the Halo...,-1,ABSTAIN,952,Hipparcos
5,Spectral Models of the Type IC Supernova SN 19...,-1,ABSTAIN,243,Hubble Space Telescope
6,Around-the-Clock Observations of the Q0957+561...,-1,ABSTAIN,917,Astron
...,...,...,...,...,...
63989,"Transport of Magnetic Fields in Convective, Ac...",-1,ABSTAIN,899,Gamma
63990,Observational properties of extreme supernovae...,-1,ABSTAIN,917,Astron
63994,Accretion Disk Spectra of the Ultra-luminous X...,-1,ABSTAIN,954,Kepler
63995,The ghost of a dwarf galaxy: fossils of the hi...,-1,ABSTAIN,243,Hubble Space Telescope


In [28]:
label_counts_LR = abstained_data['predicted_label_name'].value_counts()
label_counts_LR

predicted_label_name
Hubble Space Telescope                           16737
Kepler                                            8493
Astron                                            6342
Gamma                                             5134
IRAS                                              2179
ROSAT                                             1264
Planck                                            1097
WMAP                                              1054
Very Large Array                                  1006
Hipparcos                                          979
XMM-Newton                                         962
Spitzer Space Telescope                            857
Solar and Heliospheric Observatory (SOHO)          665
LIGO                                               480
Hinode                                             323
Very Large Array (VLA)                             151
Australia Telescope Compact Array                   89
Rossi X-ray Timing Explorer (RXTE)          

In [29]:
# save to CSV
abstained_data.to_csv('LR_abstained_data_v1.csv', index=False)

In [30]:
label_counts_LR.to_csv('label_counts_LR_v1.csv', index=False)