# Telescope Label Analysis

In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
Labels = pd.read_csv('Labels.csv')

# Rename the first column to 'Labels'
Labels.columns = ['Labels']
Labels

Unnamed: 0,Labels
0,Abastumani Astrophysical Observatory
1,Abu Reyhan-e Birooni Observatory
2,Adirondack Public Observatory
3,Adolphson Observatory
4,Airdrie Public Observatory
...,...
1037,ULTRASAT
1038,Nancy Grace Roman Space Telescope (Wide Field ...
1039,ARIEL
1040,Advanced Telescope for High Energy Astrophysic...


In [4]:
abstract_df = pd.read_csv('preprocessed_data_Jul14.csv')
abstract_df = abstract_df['concatenated_title_abstract']
abstract_df

0        KINEMATIC TREATMENT OF CORONAL MASS EJECTION E...
1        The imaging performance of the Hubble Space Te...
2        Numerical Simulations of Mass Outflows Driven ...
3        The Origin of X-shaped Radio Galaxies: Clues f...
4        The Ghost of Sagittarius and Lumps in the Halo...
                               ...                        
63994    Accretion Disk Spectra of the Ultra-luminous X...
63995    The ghost of a dwarf galaxy: fossils of the hi...
63996    Observations of the Solar Corona from Space Sp...
63997    Constraints on the topology of the Universe de...
63998    Formation of Kuiper-belt binaries through mult...
Name: concatenated_title_abstract, Length: 63999, dtype: object

In [5]:
full_text = pd.read_csv('preprocessed_FullText_data.csv')
full_text = full_text['fullText']
full_text

  full_text = pd.read_csv('preprocessed_FullText_data.csv')


0        ['THE FLASH SPECTRUM. BY S. A. MITCHELL. (Read...
1        ['MAY 9, 1913] SCIENCE must be sought the key ...
2        ['OCTOBER 4, 1912] SCIENCE astronomy, increasi...
3        ["SCIENTIFIC NOTES AND NEWS A SPECIAL act of c...
4        ['Wfeek-ly morbidity and mnortality tatble, ci...
                               ...                        
24995    ['Atmos. Chem. Phys., 16, 7709–7724, 2016\nwww...
24996    ['Ann. Geophys., 31, 187–196, 2013\nwww.ann-ge...
24997    ['Research Article\nTrack-to-Track Association...
24998    ['Nonlin. Processes Geophys., 26, 91–108, 2019...
24999    ['Contribution of Ancestral Lines in the Devel...
Name: fullText, Length: 25000, dtype: object

In [23]:
# Count the occurrences of the words in abstracts data
telescope_count = abstract_df.str.count('telescope').sum()
satellite_count = abstract_df.str.count('satellite').sum()
array_count = abstract_df.str.count('array').sum()

# results
print(f"'telescope' appears {telescope_count} times in abstracts data.")
print(f"'satellite' appears {satellite_count} times in abstracts data.")
print(f"'array' appears {array_count} times in abstracts data.")

'telescope' appears 5758 times in abstracts data.
'satellite' appears 6072 times in abstracts data.
'array' appears 1126 times in abstracts data.


In [24]:
# Count the occurrences of the words in full text data
telescope_count = full_text.str.count('telescope').sum()
satellite_count = full_text.str.count('satellite').sum()
array_count = full_text.str.count('array').sum()

# results
print(f"'telescope' appears {telescope_count} times in full text data.")
print(f"'satellite' appears {satellite_count} times in full text data.")
print(f"'array' appears {array_count} times in full text data.")

'telescope' appears 7887 times in full text data.
'satellite' appears 72784 times in full text data.
'array' appears 11717 times in full text data.


# Abstract Analysis

In [6]:
# Count the occurrences of each label
label_counts = {}
for label in Labels['Labels']:
    escaped_label = re.escape(label)
    label_counts[label] = abstract_df.str.contains(escaped_label, case=False, na=False).sum()

# Results
for label, count in label_counts.items():
    print(f"'{label}' appears {count} times.")

'Abastumani Astrophysical Observatory' appears 0 times.
'Abu Reyhan-e Birooni Observatory' appears 0 times.
'Adirondack Public Observatory' appears 0 times.
'Adolphson Observatory' appears 0 times.
'Airdrie Public Observatory' appears 0 times.
'Aker Observatory' appears 0 times.
'Aldershot Observatory' appears 0 times.
'Algonquin Radio Observatory' appears 0 times.
'Allegheny Observatory' appears 1 times.
'Ametlla de Mar Observatory' appears 0 times.
'AMiBA' appears 0 times.
'Anderson Mesa Station' appears 0 times.
'Angell Hall Observatory' appears 0 times.
'Ankara University Observatory' appears 1 times.
'Antarctic Muon and Neutrino Detector Array' appears 0 times.
'Antarctic Submillimeter Telescope and Remote Observatory' appears 10 times.
'ANTARES' appears 28 times.
'Apache-Sitgreaves Observatory' appears 0 times.
'Apache Point Observatory' appears 37 times.
'Apatity Cosmic-Ray Station' appears 0 times.
'Apollo Observatory' appears 0 times.
'Arcetri Observatory' appears 0 times.
'Ar

In [12]:
def generate_bigrams(label):
    #Generate bigrams from a given label.
    tokens = label.split()
    return [" ".join(tokens[i:i+2]) for i in range(len(tokens)-1)]


# Process each label and count occurrences of its tokens and bigrams
for idx, label in enumerate(Labels['Labels']):
    # Token counts
    tokens = label.split()  # Split the label into tokens
    token_counts = [abstract_df.str.contains(re.escape(token), case=False, na=False).sum() for token in tokens]
    print(f"{tokens} appears {token_counts} times in abstracts data.")
    
    # Bigram counts
    bigrams = generate_bigrams(label)
    bigram_counts = [abstract_df.str.contains(re.escape(bigram), case=False, na=False).sum() for bigram in bigrams]
    print(f"{bigrams} appears {bigram_counts} times in abstracts data.")
    print('-' * 80) 

['Abastumani', 'Astrophysical', 'Observatory'] appears [5, 1907, 3950] times in abstracts data.
['Abastumani Astrophysical', 'Astrophysical Observatory'] appears [0, 112] times in abstracts data.
--------------------------------------------------------------------------------
['Abu', 'Reyhan-e', 'Birooni', 'Observatory'] appears [6344, 0, 0, 3950] times in abstracts data.
['Abu Reyhan-e', 'Reyhan-e Birooni', 'Birooni Observatory'] appears [0, 0, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Adirondack', 'Public', 'Observatory'] appears [0, 668, 3950] times in abstracts data.
['Adirondack Public', 'Public Observatory'] appears [0, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Adolphson', 'Observatory'] appears [0, 3950] times in abstracts data.
['Adolphson Observatory'] appears [0] times in abstracts data.
-----------------------------------------------------

['Astronomy Tower', 'Tower of', 'of the', 'the Sorbonne'] appears [0, 3, 58406, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Astrophysical', 'Institute', 'Potsdam'] appears [1907, 314, 21] times in abstracts data.
['Astrophysical Institute', 'Institute Potsdam'] appears [9, 3] times in abstracts data.
--------------------------------------------------------------------------------
['Atacama', 'Pathfinder', 'Experiment', '(APEX)'] appears [132, 77, 2619, 2] times in abstracts data.
['Atacama Pathfinder', 'Pathfinder Experiment', 'Experiment (APEX)'] appears [3, 4, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Australia', 'Telescope', 'Compact', 'Array'] appears [530, 10808, 4627, 2905] times in abstracts data.
['Australia Telescope', 'Telescope Compact', 'Compact Array'] appears [255, 217, 218] times in abstracts data.
---------------------------------------

['Bohyunsan Optical', 'Optical Astronomy', 'Astronomy Observatory', 'Observatory (BOAO)'] appears [2, 24, 92, 0] times in abstracts data.
--------------------------------------------------------------------------------
['BOOTES'] appears [67] times in abstracts data.
[] appears [] times in abstracts data.
--------------------------------------------------------------------------------
['Bordeaux', 'Observatory'] appears [2, 3950] times in abstracts data.
['Bordeaux Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Bosscha', 'Observatory'] appears [0, 3950] times in abstracts data.
['Bosscha Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Boswell', 'Observatory', '(defunct)'] appears [0, 3950, 0] times in abstracts data.
['Boswell Observatory', 'Observatory (defunct)'] appears [0, 0] times in abstracts data.
-----------

['Centennial', 'Observatory'] appears [7, 3950] times in abstracts data.
['Centennial Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Center', 'for', 'Astrophysics', '|', 'Harvard', '&', 'Smithsonian'] appears [4689, 56668, 1088, 575, 78, 3299, 32] times in abstracts data.
['Center for', 'for Astrophysics', 'Astrophysics |', '| Harvard', 'Harvard &', '& Smithsonian'] appears [122, 117, 2, 2, 4, 4] times in abstracts data.
--------------------------------------------------------------------------------
['Cerro', 'Armazones', 'Observatory'] appears [140, 3, 3950] times in abstracts data.
['Cerro Armazones', 'Armazones Observatory'] appears [3, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Cerro', 'Tololo', 'Inter-American', 'Observatory'] appears [140, 133, 93, 3950] times in abstracts data.
['Cerro Tololo', 'Tololo Inter-American', 'Inter

['Crawford', 'Observatory'] appears [3, 3950] times in abstracts data.
['Crawford Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Creighton', 'University', 'Observatory', '(defunct)'] appears [0, 617, 3950, 0] times in abstracts data.
['Creighton University', 'University Observatory', 'Observatory (defunct)'] appears [0, 28, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Crimean', 'Astrophysical', 'Observatory'] appears [23, 1907, 3950] times in abstracts data.
['Crimean Astrophysical', 'Astrophysical Observatory'] appears [14, 112] times in abstracts data.
--------------------------------------------------------------------------------
['Črni', 'Vrh', 'Observatory'] appears [0, 0, 3950] times in abstracts data.
['Črni Vrh', 'Vrh Observatory'] appears [0, 0] times in abstracts data.
--------------------------------------------------------

['European Southern', 'Southern Observatory', 'Observatory La', 'La Silla', 'Silla Observatory', 'Observatory New', 'New Technology', 'Technology Telescope', 'Telescope Llano', 'Llano de', 'de Chajnantor', 'Chajnantor Observatory', 'Observatory Very', 'Very Large', 'Large Telescope', 'Telescope Paranal', 'Paranal Observatory', 'Observatory European', 'European Extremely', 'Extremely Large', 'Large Telescope'] appears [114, 114, 13, 29, 3, 1, 43, 41, 0, 2, 0, 0, 24, 1584, 515, 0, 1, 0, 1, 111, 515] times in abstracts data.
--------------------------------------------------------------------------------
['Fabra', 'Observatory'] appears [0, 3950] times in abstracts data.
['Fabra Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Fan', 'Mountain', 'Observatory'] appears [379, 76, 3950] times in abstracts data.
['Fan Mountain', 'Mountain Observatory'] appears [0, 24] times in abstracts data.
-----------------

['Gornergrat', '(HFSJG)', '(defunct)'] appears [0, 0, 0] times in abstracts data.
['Gornergrat (HFSJG)', '(HFSJG) (defunct)'] appears [0, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Granat', '(defunct)'] appears [16, 0] times in abstracts data.
['Granat (defunct)'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Gran', 'Telescopio', 'Canarias'] appears [673, 26, 32] times in abstracts data.
['Gran Telescopio', 'Telescopio Canarias'] appears [19, 19] times in abstracts data.
--------------------------------------------------------------------------------
['Grant', 'O.', 'Gale', 'Observatory'] appears [37, 3837, 335, 3950] times in abstracts data.
['Grant O.', 'O. Gale', 'Gale Observatory'] appears [0, 0, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Green', 'Bank', 'Telescope,'] appears

['Highland', 'Road', 'Park', 'Observatory'] appears [6, 5570, 529, 3950] times in abstracts data.
['Highland Road', 'Road Park', 'Park Observatory'] appears [0, 0, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Hinode'] appears [98] times in abstracts data.
[] appears [] times in abstracts data.
--------------------------------------------------------------------------------
['Hirsch', 'Observatory'] appears [1, 3950] times in abstracts data.
['Hirsch Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Hobbs', 'Observatory'] appears [6, 3950] times in abstracts data.
['Hobbs Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Hoher', 'List', 'Observatory'] appears [0, 2429, 3950] times in abstracts data.
['Hoher List', 'List Observatory'] appears [0, 0] times in

['IXPE'] appears [3] times in abstracts data.
[] appears [] times in abstracts data.
--------------------------------------------------------------------------------
['Jack', 'C.', 'Davis', 'Observatory'] appears [91, 5539, 59, 3950] times in abstracts data.
['Jack C.', 'C. Davis', 'Davis Observatory'] appears [0, 0, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Astronomical', 'Observatory', 'of', 'the', 'Jagiellonian', 'University'] appears [1486, 3950, 63810, 63868, 6, 617] times in abstracts data.
['Astronomical Observatory', 'Observatory of', 'of the', 'the Jagiellonian', 'Jagiellonian University'] appears [113, 127, 58406, 4, 6] times in abstracts data.
--------------------------------------------------------------------------------
['James', 'Clerk', 'Maxwell', 'Telescope'] appears [318, 166, 310, 10808] times in abstracts data.
['James Clerk', 'Clerk Maxwell', 'Maxwell Telescope'] appears [164, 164, 161] times in a

['Lee', 'Observatory'] appears [116, 3950] times in abstracts data.
['Lee Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Leiden', 'Observatory'] appears [28, 3950] times in abstracts data.
['Leiden Observatory'] appears [8] times in abstracts data.
--------------------------------------------------------------------------------
['Leipzig', 'Observatory'] appears [1, 3950] times in abstracts data.
['Leipzig Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Leoncito', 'Astronomical', 'Complex'] appears [7, 1486, 2936] times in abstracts data.
['Leoncito Astronomical', 'Astronomical Complex'] appears [0, 2] times in abstracts data.
--------------------------------------------------------------------------------
['Levenhagen', 'Observatory'] appears [0, 3950] times in abstracts data.
['Levenhagen Observatory'] appears [0]

['Marseille', 'Observatory'] appears [0, 3950] times in abstracts data.
['Marseille Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Martz', 'Observatory'] appears [0, 3950] times in abstracts data.
['Martz Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Mauna', 'Kea', 'Observatory'] appears [85, 89, 3950] times in abstracts data.
['Mauna Kea', 'Kea Observatory'] appears [59, 10] times in abstracts data.
--------------------------------------------------------------------------------
['Mauna', 'Loa', 'Solar', 'Observatory'] appears [85, 2727, 10845, 3950] times in abstracts data.
['Mauna Loa', 'Loa Solar', 'Solar Observatory'] appears [26, 17, 141] times in abstracts data.
--------------------------------------------------------------------------------
['Maynard', 'F.', 'Jordan', 'Observatory'] appears [0, 1900, 16, 3

['Montevideo', 'National', 'Observatory'] appears [0, 808, 3950] times in abstracts data.
['Montevideo National', 'National Observatory'] appears [0, 71] times in abstracts data.
--------------------------------------------------------------------------------
['Moore', 'Observatory'] appears [32, 3950] times in abstracts data.
['Moore Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Morgan–Monroe', 'Observatory'] appears [0, 3950] times in abstracts data.
['Morgan–Monroe Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Morris', 'Observatory'] appears [31, 3950] times in abstracts data.
['Morris Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Morrison', 'Observatory'] appears [9, 3950] times in abstracts data.
['Morrison Observatory'] 

['National Optical', 'Optical Astronomy', 'Astronomy Observatory', 'Observatory Cerro', 'Cerro Tololo', 'Tololo Inter-American', 'Inter-American Observatory', 'Observatory Gemini', 'Gemini Observatory', 'Observatory Kitt', 'Kitt Peak', 'Peak National', 'National Observatory', 'Observatory National', 'National Solar', 'Solar Observatory'] appears [21, 24, 92, 0, 114, 93, 89, 0, 9, 2, 158, 52, 71, 0, 37, 141] times in abstracts data.
--------------------------------------------------------------------------------
['National', 'Optical', 'Observatory'] appears [808, 12092, 3950] times in abstracts data.
['National Optical', 'Optical Observatory'] appears [21, 5] times in abstracts data.
--------------------------------------------------------------------------------
['National', 'Radio', 'Astronomy', 'Observatory', 'Atacama', 'Large', 'Millimeter', 'Array', 'Green', 'Bank', 'Telescope', 'Very', 'Large', 'Array', 'Very', 'Long', 'Baseline', 'Array'] appears [808, 9524, 1242, 3950, 132, 233

['Observatorio', 'Solar', 'Carl', 'Sagan'] appears [25, 10845, 1262, 8] times in abstracts data.
['Observatorio Solar', 'Solar Carl', 'Carl Sagan'] appears [0, 0, 4] times in abstracts data.
--------------------------------------------------------------------------------
['Observatory', 'House', '(defunct)'] appears [3950, 102, 0] times in abstracts data.
['Observatory House', 'House (defunct)'] appears [0, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Observatory', 'of', 'the', 'rue', 'Serpente', '(defunct)'] appears [3950, 63810, 63868, 1297, 0, 0] times in abstracts data.
['Observatory of', 'of the', 'the rue', 'rue Serpente', 'Serpente (defunct)'] appears [127, 58406, 0, 0, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Oil', 'Region', 'Astronomical', 'Observatory'] appears [275, 15323, 1486, 3950] times in abstracts data.
['Oil Region', 'Region Astronomi

['Pico', 'dos', 'Dias', 'Observatory'] appears [24, 215, 12, 3950] times in abstracts data.
['Pico dos', 'dos Dias', 'Dias Observatory'] appears [2, 2, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Piera', 'Observatory'] appears [1, 3950] times in abstracts data.
['Piera Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Pierre', 'Auger', 'Observatory'] appears [45, 65, 3950] times in abstracts data.
['Pierre Auger', 'Auger Observatory'] appears [39, 37] times in abstracts data.
--------------------------------------------------------------------------------
['Pine', 'Bluff', 'Observatory'] appears [209, 0, 3950] times in abstracts data.
['Pine Bluff', 'Bluff Observatory'] appears [0, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Pine', 'Mountain', 'Observatory'] appears [209,

['Roger Twitchell', 'Twitchell Observatory[3]'] appears [0, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Rogers', 'Observatory'] appears [7, 3950] times in abstracts data.
['Rogers Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Rolnick', 'Observatory'] appears [0, 3950] times in abstracts data.
['Rolnick Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Rome', 'Observatory'] appears [4213, 3950] times in abstracts data.
['Rome Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Roseland', 'Observatory'] appears [0, 3950] times in abstracts data.
['Roseland Observatory'] appears [0] times in abstracts data.
-------------------------------------------------------------------

['Smithsonian Astrophysical', 'Astrophysical Observatory'] appears [4, 112] times in abstracts data.
--------------------------------------------------------------------------------
['Sobaeksan', 'Optical', 'Astronomy', 'Observatory'] appears [0, 12092, 1242, 3950] times in abstracts data.
['Sobaeksan Optical', 'Optical Astronomy', 'Astronomy Observatory'] appears [0, 24, 92] times in abstracts data.
--------------------------------------------------------------------------------
['Sola', 'Fide', 'Observatory'] appears [12282, 1913, 3950] times in abstracts data.
['Sola Fide', 'Fide Observatory'] appears [0, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Solar', 'and', 'Heliospheric', 'Observatory', '(SOHO)'] appears [10845, 62845, 457, 3950, 141] times in abstracts data.
['Solar and', 'and Heliospheric', 'Heliospheric Observatory', 'Observatory (SOHO)'] appears [484, 252, 233, 138] times in abstracts data.
---------------

['Stull Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Stuttgart', 'Observatory'] appears [1, 3950] times in abstracts data.
['Stuttgart Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Submillimeter', 'Array'] appears [843, 2905] times in abstracts data.
['Submillimeter Array'] appears [152] times in abstracts data.
--------------------------------------------------------------------------------
['St.', 'Thomas', 'Observatory'] appears [3170, 54, 3950] times in abstracts data.
['St. Thomas', 'Thomas Observatory'] appears [0, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Sudbury', 'Neutrino', 'Observatory'] appears [15, 1141, 3950] times in abstracts data.
['Sudbury Neutrino', 'Neutrino Observatory'] appears [15, 33] times in abstracts data.
-----------

['University', 'of', 'Illinois', 'Astronomical', 'Observatory'] appears [617, 63810, 60, 1486, 3950] times in abstracts data.
['University of', 'of Illinois', 'Illinois Astronomical', 'Astronomical Observatory'] appears [354, 4, 0, 113] times in abstracts data.
--------------------------------------------------------------------------------
['University', 'of', 'London', 'Observatory'] appears [617, 63810, 26, 3950] times in abstracts data.
['University of', 'of London', 'London Observatory'] appears [354, 3, 0] times in abstracts data.
--------------------------------------------------------------------------------
['University', 'of', 'Maryland', 'Observatory'] appears [617, 63810, 82, 3950] times in abstracts data.
['University of', 'of Maryland', 'Maryland Observatory'] appears [354, 17, 0] times in abstracts data.
--------------------------------------------------------------------------------
['University', 'of', 'Michigan-Dearborn', 'Observatory'] appears [617, 63810, 0, 3950] t

['Very Large', 'Large Array'] appears [1584, 682] times in abstracts data.
--------------------------------------------------------------------------------
['Very', 'Long', 'Baseline', 'Array'] appears [15587, 13581, 972, 2905] times in abstracts data.
['Very Long', 'Long Baseline', 'Baseline Array'] appears [532, 471, 305] times in abstracts data.
--------------------------------------------------------------------------------
['Vienna', 'Observatory'] appears [3, 3950] times in abstracts data.
['Vienna Observatory'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Vilnius', 'University', 'Observatory'] appears [6, 617, 3950] times in abstracts data.
['Vilnius University', 'University Observatory'] appears [0, 28] times in abstracts data.
--------------------------------------------------------------------------------
['Višnjan', 'Observatory'] appears [0, 3950] times in abstracts data.
['Višnjan Observatory'] appe

['Arctic Space', 'Space Weather', 'Weather Center'] appears [0, 160, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Yale', 'Student', 'Observatory'] appears [68, 87, 3950] times in abstracts data.
['Yale Student', 'Student Observatory'] appears [0, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Yangbajing', 'International', 'Cosmic-Ray', 'Observatory'] appears [4, 419, 773, 3950] times in abstracts data.
['Yangbajing International', 'International Cosmic-Ray', 'Cosmic-Ray Observatory'] appears [0, 0, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Yantra', 'Mandir'] appears [0, 0] times in abstracts data.
['Yantra Mandir'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Yerevan', 'Cosmic-Ray', 'Station'] appears [3, 773, 879] t

['Giant Metrewave', 'Metrewave Radio', 'Radio Telescope', 'Telescope (GMRT)'] appears [103, 103, 611, 73] times in abstracts data.
--------------------------------------------------------------------------------
['Ooty', 'Radio', 'Telescope', '(ORT)'] appears [16, 9524, 10808, 0] times in abstracts data.
['Ooty Radio', 'Radio Telescope', 'Telescope (ORT)'] appears [8, 611, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Gauribidanur', 'Radio', 'Observatory'] appears [5, 9524, 3950] times in abstracts data.
['Gauribidanur Radio', 'Radio Observatory'] appears [4, 98] times in abstracts data.
--------------------------------------------------------------------------------
['Nobeyama', 'radio', 'observatory'] appears [94, 9524, 3950] times in abstracts data.
['Nobeyama radio', 'radio observatory'] appears [41, 98] times in abstracts data.
--------------------------------------------------------------------------------
['Siberia

['Lovell Telescope'] appears [6] times in abstracts data.
--------------------------------------------------------------------------------
['Yevpatoria', 'RT-70', 'radio', 'telescope'] appears [0, 0, 9524, 10808] times in abstracts data.
['Yevpatoria RT-70', 'RT-70 radio', 'radio telescope'] appears [0, 0, 611] times in abstracts data.
--------------------------------------------------------------------------------
['RATAN-600'] appears [11] times in abstracts data.
[] appears [] times in abstracts data.
--------------------------------------------------------------------------------
['RT-64', '(TNA-1500)'] appears [0, 0] times in abstracts data.
['RT-64 (TNA-1500)'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['RT-64', '(TNA-1500)'] appears [0, 0] times in abstracts data.
['RT-64 (TNA-1500)'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['R

['Wurzburg v2.0', 'v2.0 LAB/OASU', 'LAB/OASU (Bordeaux', '(Bordeaux Observatory', 'Observatory radio', 'radio telescope)'] appears [0, 0, 0, 0, 6, 7] times in abstracts data.
--------------------------------------------------------------------------------
['European', 'VLBI', 'Network', '(EVN)'] appears [312, 436, 817, 6] times in abstracts data.
['European VLBI', 'VLBI Network', 'Network (EVN)'] appears [20, 22, 6] times in abstracts data.
--------------------------------------------------------------------------------
['Plateau', 'de', 'Bure', 'Interferometer'] appears [386, 62741, 61, 877] times in abstracts data.
['Plateau de', 'de Bure', 'Bure Interferometer'] appears [57, 57, 42] times in abstracts data.
--------------------------------------------------------------------------------
['Northern', 'Extended', 'Millimeter', 'Array'] appears [748, 4272, 1367, 2905] times in abstracts data.
['Northern Extended', 'Extended Millimeter', 'Millimeter Array'] appears [0, 0, 265] times in 

['Green', 'Bank', 'Interferometer', '(GBI)'] appears [522, 235, 877, 0] times in abstracts data.
['Green Bank', 'Bank Interferometer', 'Interferometer (GBI)'] appears [158, 5, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Green', 'Bank', 'Telescope', '(GBT)'] appears [522, 235, 10808, 24] times in abstracts data.
['Green Bank', 'Bank Telescope', 'Telescope (GBT)'] appears [158, 126, 24] times in abstracts data.
--------------------------------------------------------------------------------
['Green', 'Bank', '140', 'Foot', 'Telescope', '(140foot)'] appears [522, 235, 842, 370, 10808, 0] times in abstracts data.
['Green Bank', 'Bank 140', '140 Foot', 'Foot Telescope', 'Telescope (140foot)'] appears [158, 2, 9, 4, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Green', 'Bank', '20m', 'Telescope'] appears [522, 235, 50, 10808] times in abstracts data.
['Green Ban

['Baryon Acoustic', 'Acoustic Oscillations', 'Oscillations in', 'in Integrated', 'Integrated Neutral', 'Neutral Gas', 'Gas Observations', 'Observations (BINGO)'] appears [129, 141, 383, 16, 0, 345, 2, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Brazilian', 'Decimetric', 'Array', '(BDA)'] appears [1, 28, 2905, 0] times in abstracts data.
['Brazilian Decimetric', 'Decimetric Array', 'Array (BDA)'] appears [0, 0, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Cosmic', 'Background', 'Imager', '(CBI)'] appears [5786, 4430, 698, 21] times in abstracts data.
['Cosmic Background', 'Background Imager', 'Imager (CBI)'] appears [182, 27, 21] times in abstracts data.
--------------------------------------------------------------------------------
['Itapetinga', 'Radio', 'Observatory'] appears [0, 9524, 3950] times in abstracts data.
['Itapetinga Radio', 'Radio Observat

['Large', 'Latin', 'American', 'Millimeter', 'Array', '(LLAMA)'] appears [23305, 1813, 262, 1367, 2905, 0] times in abstracts data.
['Large Latin', 'Latin American', 'American Millimeter', 'Millimeter Array', 'Array (LLAMA)'] appears [0, 0, 0, 265, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Qitai', '110m', 'Radio', 'Telescope', '(QTT)'] appears [0, 2, 9524, 10808, 0] times in abstracts data.
['Qitai 110m', '110m Radio', 'Radio Telescope', 'Telescope (QTT)'] appears [0, 0, 611, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Square', 'Kilometer', 'Array', '(SKA-Phase1)'] appears [1022, 307, 2905, 0] times in abstracts data.
['Square Kilometer', 'Kilometer Array', 'Array (SKA-Phase1)'] appears [64, 57, 0] times in abstracts data.
--------------------------------------------------------------------------------
['30m', 'Sub-Millimeter', 'Telescope', '(TSMT)'] a

['THÉMIS Solar', 'Solar Telescope,', 'Telescope, Teide', 'Teide Observatory'] appears [0, 12, 0, 9] times in abstracts data.
--------------------------------------------------------------------------------
['Vacuum', 'Tower', 'Telescope', '(VTT),', 'Teide', 'Observatory'] appears [311, 52, 10808, 0, 45, 3950] times in abstracts data.
['Vacuum Tower', 'Tower Telescope', 'Telescope (VTT),', '(VTT), Teide', 'Teide Observatory'] appears [6, 11, 0, 0, 9] times in abstracts data.
--------------------------------------------------------------------------------
['Hida', 'Domeless', 'Solar', 'Telescope', '(ja)'] appears [23, 3, 10845, 10808, 0] times in abstracts data.
['Hida Domeless', 'Domeless Solar', 'Solar Telescope', 'Telescope (ja)'] appears [0, 3, 89, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Udaipur', 'Solar', 'ObservatoryMASTFull', 'Disk', 'H-alpha', 'TelescopeH-alpha', 'Spar', 'TelescopeCoudé', 'Telescope'] appears 

['Expanded', 'Owens', 'Valley', 'Solar', 'Array', '(EOVSA)'] appears [199, 101, 147, 10845, 2905, 0] times in abstracts data.
['Expanded Owens', 'Owens Valley', 'Valley Solar', 'Solar Array', 'Array (EOVSA)'] appears [1, 97, 5, 10, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Nobeyama', 'Radioheliograph', '(NoRH),', 'Nobeyama', 'Radio', 'Observatory'] appears [94, 34, 2, 94, 9524, 3950] times in abstracts data.
['Nobeyama Radioheliograph', 'Radioheliograph (NoRH),', '(NoRH), Nobeyama', 'Nobeyama Radio', 'Radio Observatory'] appears [15, 2, 0, 41, 98] times in abstracts data.
--------------------------------------------------------------------------------
['Nobeyama', 'Radio', 'Polarimeters,', 'Nobeyama', 'Radio', 'Observatory'] appears [94, 9524, 14, 94, 9524, 3950] times in abstracts data.
['Nobeyama Radio', 'Radio Polarimeters,', 'Polarimeters, Nobeyama', 'Nobeyama Radio', 'Radio Observatory'] appears [41, 0, 0, 41, 98

['Astronomical', 'Netherlands', 'Satellite', '(ANS)'] appears [1486, 33, 3148, 4] times in abstracts data.
['Astronomical Netherlands', 'Netherlands Satellite', 'Satellite (ANS)'] appears [4, 4, 4] times in abstracts data.
--------------------------------------------------------------------------------
['Ariel', 'V'] appears [56, 63826] times in abstracts data.
['Ariel V'] appears [11] times in abstracts data.
--------------------------------------------------------------------------------
['Aryabhata'] appears [0] times in abstracts data.
[] appears [] times in abstracts data.
--------------------------------------------------------------------------------
['Small', 'Astronomy', 'Satellite', '3', '(SAS-C)'] appears [12157, 1242, 3148, 39020, 0] times in abstracts data.
['Small Astronomy', 'Astronomy Satellite', 'Satellite 3', '3 (SAS-C)'] appears [5, 47, 0, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Cos-B'] appears [1

['Hitomi', '(Astro-H)'] appears [6, 0] times in abstracts data.
['Hitomi (Astro-H)'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Mikhailo', 'Lomonosov'] appears [0, 10] times in abstracts data.
['Mikhailo Lomonosov'] appears [0] times in abstracts data.
--------------------------------------------------------------------------------
['Neutron', 'Star', 'Interior', 'Composition', 'Explorer', '(NICER)'] appears [3892, 31462, 1164, 2368, 1333, 3] times in abstracts data.
['Neutron Star', 'Star Interior', 'Interior Composition', 'Composition Explorer', 'Explorer (NICER)'] appears [3370, 38, 21, 39, 3] times in abstracts data.
--------------------------------------------------------------------------------
['Hard', 'X-ray', 'Modulation', 'Telescope', '(HXMT)'] appears [3151, 11374, 985, 10808, 3] times in abstracts data.
['Hard X-ray', 'X-ray Modulation', 'Modulation Telescope', 'Telescope (HXMT)'] appears [1110, 30

['Hipparcos'] appears [465] times in abstracts data.
[] appears [] times in abstracts data.
--------------------------------------------------------------------------------
['Hubble', 'Space', 'Telescope'] appears [4804, 10148, 10808] times in abstracts data.
['Hubble Space', 'Space Telescope'] appears [3165, 4281] times in abstracts data.
--------------------------------------------------------------------------------
['MOST'] appears [17492] times in abstracts data.
[] appears [] times in abstracts data.
--------------------------------------------------------------------------------
['Swift', 'Gamma', 'Ray', 'Burst', 'Explorer'] appears [678, 4273, 18529, 7926, 1333] times in abstracts data.
['Swift Gamma', 'Gamma Ray', 'Ray Burst', 'Burst Explorer'] appears [30, 911, 2689, 7] times in abstracts data.
--------------------------------------------------------------------------------
['COROT'] appears [483] times in abstracts data.
[] appears [] times in abstracts data.
---------------

['SAMPEX'] appears [4] times in abstracts data.
[] appears [] times in abstracts data.
--------------------------------------------------------------------------------
['Alpha', 'Magnetic', 'Spectrometer', '01', '(AMS-01)'] appears [1500, 7933, 1267, 9024, 0] times in abstracts data.
['Alpha Magnetic', 'Magnetic Spectrometer', 'Spectrometer 01', '01 (AMS-01)'] appears [6, 12, 0, 0] times in abstracts data.
--------------------------------------------------------------------------------
['Payload', 'for', 'Antimatter', 'Matter', 'Exploration', 'and', 'Light-nuclei', 'Astrophysics', '(PAMELA)'] appears [74, 56668, 44, 7279, 295, 62845, 7, 1088, 0] times in abstracts data.
['Payload for', 'for Antimatter', 'Antimatter Matter', 'Matter Exploration', 'Exploration and', 'and Light-nuclei', 'Light-nuclei Astrophysics', 'Astrophysics (PAMELA)'] appears [9, 11, 7, 7, 21, 7, 7, 0] times in abstracts data.
--------------------------------------------------------------------------------
['IBEX'] a

In [7]:
def generate_bigrams(label):
    """Generate bigrams from a given label."""
    tokens = label.split()
    return [" ".join(tokens[i:i+2]) for i in range(len(tokens)-1)]

def get_indices(abstract_df, pattern):
    """Return indices of rows in abstract_df containing the pattern."""
    return set(abstract_df[abstract_df.str.contains(re.escape(pattern), case=False, na=False)].index.tolist())

# Collect indices of all rows that contain any bigram
all_bigram_indices = set()

for label in Labels['Labels']:
    bigrams = generate_bigrams(label)
    for bigram in bigrams:
        all_bigram_indices.update(get_indices(abstract_df, bigram))

# Determine indices of rows that don't contain any bigram
no_bigram_indices = set(abstract_df.index) - all_bigram_indices

# Display results
print(f"Number of rows that don't contain any bigram: {len(no_bigram_indices)}")
print(f"Indices of rows without any bigrams: {sorted(list(no_bigram_indices))}")

Number of rows that don't contain any bigram: 1507
Indices of rows without any bigrams: [39, 64, 65, 83, 101, 225, 235, 236, 266, 324, 342, 399, 453, 462, 470, 634, 642, 643, 698, 699, 735, 786, 789, 830, 1041, 1104, 1117, 1135, 1151, 1187, 1208, 1261, 1276, 1335, 1402, 1414, 1425, 1476, 1477, 1484, 1510, 1563, 1576, 1603, 1699, 1784, 1793, 1812, 1884, 1934, 1956, 1959, 2246, 2338, 2351, 2398, 2428, 2488, 2489, 2494, 2531, 2555, 2658, 2679, 2690, 2701, 2762, 2763, 3026, 3051, 3059, 3060, 3098, 3109, 3123, 3127, 3158, 3220, 3221, 3235, 3263, 3285, 3365, 3368, 3388, 3534, 3636, 3735, 3770, 3796, 3828, 3905, 3908, 3926, 3934, 3955, 3957, 3968, 4071, 4072, 4088, 4234, 4258, 4354, 4381, 4454, 4467, 4586, 4617, 4622, 4658, 4732, 4825, 4841, 4842, 4843, 4863, 4870, 4898, 4935, 5114, 5129, 5130, 5197, 5198, 5210, 5293, 5378, 5391, 5401, 5414, 5416, 5457, 5471, 5542, 5567, 5614, 5694, 5736, 5769, 5892, 6008, 6075, 6103, 6124, 6184, 6201, 6228, 6249, 6276, 6324, 6354, 6392, 6400, 6430, 6471, 651

# Full Text Data

In [8]:
# Count the occurrences of each label
label_counts = {}
for label in Labels['Labels']:
    escaped_label = re.escape(label)
    label_counts[label] = full_text.str.contains(escaped_label, case=False, na=False).sum()

# Results
for label, count in label_counts.items():
    print(f"'{label}' appears {count} times.")

'Abastumani Astrophysical Observatory' appears 3 times.
'Abu Reyhan-e Birooni Observatory' appears 0 times.
'Adirondack Public Observatory' appears 0 times.
'Adolphson Observatory' appears 0 times.
'Airdrie Public Observatory' appears 0 times.
'Aker Observatory' appears 0 times.
'Aldershot Observatory' appears 0 times.
'Algonquin Radio Observatory' appears 0 times.
'Allegheny Observatory' appears 75 times.
'Ametlla de Mar Observatory' appears 0 times.
'AMiBA' appears 2 times.
'Anderson Mesa Station' appears 1 times.
'Angell Hall Observatory' appears 0 times.
'Ankara University Observatory' appears 0 times.
'Antarctic Muon and Neutrino Detector Array' appears 0 times.
'Antarctic Submillimeter Telescope and Remote Observatory' appears 0 times.
'ANTARES' appears 73 times.
'Apache-Sitgreaves Observatory' appears 0 times.
'Apache Point Observatory' appears 2 times.
'Apatity Cosmic-Ray Station' appears 0 times.
'Apollo Observatory' appears 0 times.
'Arcetri Observatory' appears 0 times.
'Are

In [None]:
# Process each label and count occurrences of its tokens and bigrams
for idx, label in enumerate(Labels['Labels']):
    # Token counts
    tokens = label.split()  # Split the label into tokens
    token_counts = [full_text.str.contains(re.escape(token), case=False, na=False).sum() for token in tokens]
    print(f"{tokens} appears {token_counts} times in full text data.")
    
    # Bigram counts
    bigrams = generate_bigrams(label)
    bigram_counts = [full_text.str.contains(re.escape(bigram), case=False, na=False).sum() for bigram in bigrams]
    print(f"{bigrams} appears {bigram_counts} times in full text data.")
    print('-' * 80) 

['Abastumani', 'Astrophysical', 'Observatory'] appears [9, 901, 3881] times in full text data.
['Abastumani Astrophysical', 'Astrophysical Observatory'] appears [3, 179] times in full text data.
--------------------------------------------------------------------------------
['Abu', 'Reyhan-e', 'Birooni', 'Observatory'] appears [6795, 0, 0, 3881] times in full text data.
['Abu Reyhan-e', 'Reyhan-e Birooni', 'Birooni Observatory'] appears [0, 0, 0] times in full text data.
--------------------------------------------------------------------------------
['Adirondack', 'Public', 'Observatory'] appears [97, 15272, 3881] times in full text data.
['Adirondack Public', 'Public Observatory'] appears [0, 2] times in full text data.
--------------------------------------------------------------------------------
['Adolphson', 'Observatory'] appears [0, 3881] times in full text data.
['Adolphson Observatory'] appears [0] times in full text data.
---------------------------------------------------

['Astronomy', 'Tower', 'of', 'the', 'Sorbonne'] appears [2145, 1862, 24853, 24856, 208] times in full text data.
['Astronomy Tower', 'Tower of', 'of the', 'the Sorbonne'] appears [0, 158, 24800, 47] times in full text data.
--------------------------------------------------------------------------------
['Astrophysical', 'Institute', 'Potsdam'] appears [901, 12296, 628] times in full text data.
['Astrophysical Institute', 'Institute Potsdam'] appears [6, 3] times in full text data.
--------------------------------------------------------------------------------
['Atacama', 'Pathfinder', 'Experiment', '(APEX)'] appears [75, 391, 15373, 16] times in full text data.
['Atacama Pathfinder', 'Pathfinder Experiment', 'Experiment (APEX)'] appears [0, 0, 9] times in full text data.
--------------------------------------------------------------------------------
['Australia', 'Telescope', 'Compact', 'Array'] appears [3321, 1771, 2321, 2538] times in full text data.
['Australia Telescope', 'Teles

['Bohyunsan', 'Optical', 'Astronomy', 'Observatory', '(BOAO)'] appears [0, 7021, 2145, 3881, 0] times in full text data.
['Bohyunsan Optical', 'Optical Astronomy', 'Astronomy Observatory', 'Observatory (BOAO)'] appears [0, 11, 15, 0] times in full text data.
--------------------------------------------------------------------------------
['BOOTES'] appears [11] times in full text data.
[] appears [] times in full text data.
--------------------------------------------------------------------------------
['Bordeaux', 'Observatory'] appears [182, 3881] times in full text data.
['Bordeaux Observatory'] appears [1] times in full text data.
--------------------------------------------------------------------------------
['Bosscha', 'Observatory'] appears [2, 3881] times in full text data.
['Bosscha Observatory'] appears [0] times in full text data.
--------------------------------------------------------------------------------
['Boswell', 'Observatory', '(defunct)'] appears [34, 3881, 0] t

['Celestial Observatory'] appears [0] times in full text data.
--------------------------------------------------------------------------------
['Centennial', 'Observatory'] appears [577, 3881] times in full text data.
['Centennial Observatory'] appears [0] times in full text data.
--------------------------------------------------------------------------------
['Center', 'for', 'Astrophysics', '|', 'Harvard', '&', 'Smithsonian'] appears [12720, 24844, 863, 6340, 1823, 10103, 675] times in full text data.
['Center for', 'for Astrophysics', 'Astrophysics |', '| Harvard', 'Harvard &', '& Smithsonian'] appears [3210, 114, 1, 1, 0, 0] times in full text data.
--------------------------------------------------------------------------------
['Cerro', 'Armazones', 'Observatory'] appears [157, 3, 3881] times in full text data.
['Cerro Armazones', 'Armazones Observatory'] appears [2, 0] times in full text data.
--------------------------------------------------------------------------------
['C

In [None]:
def get_indices(full_text, pattern):
    """Return indices of rows in full_text containing the pattern."""
    return set(full_text[full_text.str.contains(re.escape(pattern), case=False, na=False)].index.tolist())

# Collect indices of all rows that contain any bigram
all_bigram_indices = set()

for label in Labels['Labels']:
    bigrams = generate_bigrams(label)
    for bigram in bigrams:
        all_bigram_indices.update(get_indices(full_text, bigram))

# Determine indices of rows that don't contain any bigram
no_bigram_indices = set(full_text.index) - all_bigram_indices

# Display results
print(f"Number of rows that don't contain any bigram: {len(no_bigram_indices)}")
print(f"Indices of rows without any bigrams: {sorted(list(no_bigram_indices))}")

# Labeling Task