In [89]:
from astropy.constants import c
from astropy.units import Quantity
import astropy.units as U
from astropy.cosmology import FlatLambdaCDM
from astropy.io import fits
import math
import pyvo
import numpy as np 
import pandas as pd
import os 
import sys
import random
from math import pi
import matplotlib.pyplot as plt
import seaborn as sns
import textwrap

master_path = os.path.dirname(os.getcwd())
service_url = "https://almascience.eso.org/tap"
service = pyvo.dal.TAPService(service_url)


In [90]:
def estimate_alma_beam_size(central_frequency_ghz, max_baseline_km, return_value=True):
  """
  Estimates the beam size of the Atacama Large Millimeter/submillimeter Array (ALMA) in arcseconds.

  This function provides an approximation based on the theoretical relationship between
  observing frequency and maximum baseline. The formula used is:
  beam_size = (speed_of_light / central_frequency) / max_baseline * (180 / pi) * 3600 arcseconds
  [km]/[s] * [s] / [km] = [radians] * [arcsec /radian] * [arcseconds/degree]

  Args:
      central_frequency_ghz: Central frequency of the observing band in GHz (float).
      max_baseline_km: Maximum baseline of the antenna array in kilometers (float).

  Returns:
      Estimated beam size in arcseconds (float).

  Raises:
      ValueError: If either input argument is non-positive.
  """

  # Input validation
  if central_frequency_ghz <= 0 or max_baseline_km <= 0:
    raise ValueError("Central frequency and maximum baseline must be positive values.")
    
  if type(central_frequency_ghz) != Quantity:
    central_frequency_ghz = central_frequency_ghz * U.GHz
  if type(max_baseline_km) != Quantity:
    max_baseline_km = max_baseline_km * U.km

  # Speed of light in meters per second
  light_speed = c.to(U.m / U.s).value

  # Convert frequency to Hz
  central_frequency_hz = central_frequency_ghz.to(U.Hz).value

  # Convert baseline to meters
  max_baseline_meters = max_baseline_km.to(U.m).value


  # Theoretical estimate of beam size (radians)
  theta_radians = (light_speed / central_frequency_hz) / max_baseline_meters

  # Convert theta from radians to arcseconds
  beam_size_arcsec = theta_radians * (180 / math.pi) * 3600 * U.arcsec
  if return_value == True:
    return beam_size_arcsec.value
  else:
    return beam_size_arcsec


In [91]:
def get_fov_from_band(band, antenna_diameter: int = 12, return_value=True):
    """
    This function returns the field of view of an ALMA band in arcseconds
    input: 
        band number (int): the band number of the ALMA band, between 1 and 10
        antenna_diameter (int): the diameter of the antenna in meters
    output:
        fov (astropy unit): the field of view in arcseconds

    """
    light_speed = c.to(U.m / U.s).value
    if band == 1:
        central_freq = 43 * U.GHz  
    elif band == 2:
        central_freq = 67 * U.GHz
    elif band == 3:
        central_freq = 100 * U.GHz
    elif band == 4:
        central_freq = 150 * U.GHz
    elif band == 5:
        central_freq = 217 * U.GHz
    elif band == 6:
        central_freq = 250 * U.GHz
    elif band == 7:
        central_freq = 353 * U.GHz
    elif band == 8:
        central_freq = 545 * U.GHz
    elif band == 9:
        central_freq = 650 * U.GHz    
    elif band == 10:
        central_freq = 868.5 * U.GHz
    central_freq = central_freq.to(U.Hz).value
    central_freq_s = 1 / central_freq
    wavelength = light_speed * central_freq_s
    # this is the field of view in Radians
    fov = 1.22 * wavelength / antenna_diameter
    # fov in arcsec
    fov = fov * (180 / math.pi) * 3600 * U.arcsec
    if return_value == True:
        return fov.value
    else:
        return fov

In [92]:
def get_band_central_freq(band):
    """
    Takes as input the band number and returns its central frequency in GHz
    """
    if band == 1:
        return 38
    elif band == 2:
        return 78.5
    elif band == 3:
        return 100
    elif band == 4:
        return  143
    elif band == 5:
        return  217
    elif band == 6:
        return 250
    elif band == 7:
        return 353
    elif band == 8:
        return 545
    elif band == 9:
        return 650
    elif band == 10:
        return 850

In [93]:
def get_max_baseline_from_antenna_array(antenna_array, master_path):
    antenna_coordinates = pd.read_csv(os.path.join(master_path, 'antenna_config', 'antenna_coordinates.csv'))
    obs_antennas = antenna_array.split(' ')
    obs_antennas = [antenna.split(':')[0] for antenna in obs_antennas]
    obs_coordinates = antenna_coordinates[antenna_coordinates['name'].isin(obs_antennas)]
    max_baseline = 2 * np.max(np.sqrt(obs_coordinates['x'].values**2 + obs_coordinates['y'].values**2 + obs_coordinates['z'].values**2)) / 1000
    return max_baseline

In [94]:
# def plot_science_keywords_distributions(service, master_path):
#     plot_dir = os.path.join(master_path, "plots")

#         # Check if plot directory exists
#     if not os.path.exists(plot_dir):
#         os.makedirs(plot_dir)
#     else:
#         # Check if plot files already exist
#         existing_plots = [f for f in os.listdir(plot_dir) if f.endswith('.png')]
#         expected_plots = ['science_vs_bands.png', 'science_vs_int_time.png', 'science_vs_source_freq.png',
#                           'science_vs_FoV.png', 'science_vs_beam_size.png', 'science_vs_total_time.png']
#         if all(plot_file in existing_plots for plot_file in expected_plots):
#             print("Plots already exist. Exiting.")
#             return
#         else:
#             print("Some plots are missing. Generating missing plots.")
#         # Identify missing plots
#     missing_plots = [plot for plot in expected_plots if plot not in existing_plots]

#     # Query only for variables associated with missing plots
#     query_variables = set()
#     for missing_plot in missing_plots:
#         if missing_plot == 'science_vs_bands.png':
#             query_variables.update(['science_keyword', 'band_list'])
#         elif missing_plot == 'science_vs_int_time.png':
#             query_variables.update(['science_keyword', 't_resolution'])
#         elif missing_plot == 'science_vs_source_freq.png':
#             query_variables.update(['science_keyword', 'frequency'])
#         elif missing_plot == 'science_vs_FoV.png':
#             query_variables.update(['science_keyword', 'band_list'])
#         elif missing_plot == 'science_vs_beam_size.png':
#             query_variables.update(['science_keyword', 'band_list', 'antenna_arrays'])
#         elif missing_plot == 'science_vs_total_time.png':
#             query_variables.update(['science_keyword', 't_max'])

#     query = f"""  
#             SELECT {', '.join(query_variables)}, member_ous_uid
#             FROM ivoa.obscore  
#             WHERE science_observation = 'T'
#             AND is_mosaic = 'F'
#             """
    
#     custom_palette = sns.color_palette("tab20")
#     sns.set_palette(custom_palette)
#     db = service.search(query).to_table().to_pandas()
#     db = db.drop_duplicates(subset='member_ous_uid')

#     # Splitting the science keywords at commas
#     db['science_keyword'] = db['science_keyword'].str.split(',')
#     db['science_keyword'] = db['science_keyword'].apply(lambda x: [y.strip() for y in x])
#     db = db.explode('science_keyword')
#     db = db.drop(db[db['science_keyword'] == ''].index)
#     db = db.drop(db[db['science_keyword'] == 'Exoplanets'].index)
#     db = db.drop(db[db['science_keyword'] == 'Galaxy structure &evolution'].index)
#     db = db.drop(db[db['science_keyword'] == 'Evolved stars: Shaping/physical structure'].index)

#     db['band_list'] = db['band_list'].str.split(' ')
#     db['band_list'] = db['band_list'].apply(lambda x: [y.strip() for y in x])
#     db = db.explode('band_list')
#     db['max_baseline'] = db['antenna_arrays'].apply(lambda x: get_max_baseline_from_antenna_array(x, master_path))
#     db['central_freq'] = db['band_list'].apply(lambda x: get_band_central_freq(int(x)))
#     db['fov'] = db['band_list'].apply(lambda x: get_fov_from_band(int(x)))
#     db['beam_size'] = db[['central_freq', 'max_baseline']].apply(lambda x: estimate_alma_beam_size(*x), axis=1)

#     # Exploding to have one row for each combination of science keyword and band
#     #db = db.explode(['science_keyword', 'band_list', 'frequency', 't_resolution', 't_max', 'max_baseline', 'central_freq', 'fov', 'beam_size'])

#     db = db[db['t_resolution'] <= 3e4]
#     frequency_bins = np.arange(db['frequency'].min(), db['frequency'].max(), 50)  # 50 GHz bins
#     db['frequency_bin'] = pd.cut(db['frequency'], bins=frequency_bins)
#     time_bins = np.arange(db['t_resolution'].min(), db['t_resolution'].max(), 1000)  # 1000 second bins
#     db['time_bin'] = pd.cut(db['t_resolution'], bins=time_bins)
#     fov_bins = np.arange(db['fov'].min(), db['fov'].max(), 10)  #  10 arcsec bins
#     db['fov_bins'] = pd.cut(db['fov'], bins=fov_bins)
#     beam_size_bins = np.arange(db['beam_size'].min(), db['beam_size'].max(), 0.1)  # 0.1 arcsec bins
#     db['beam_bins'] = pd.cut(db['beam_size'], bins=beam_size_bins)
#     total_time_bins = np.arange(db['t_max'].min(), db['t_max'].max(), 500)  # 500 seconds bins
#     db['Ttime_bins'] = pd.cut(db['t_max'], bins=total_time_bins)

#     db_sk_b = db.groupby(['science_keyword', 'band_list']).size().unstack(fill_value=0)
#     db_sk_f = db.groupby(['science_keyword', 'frequency_bin']).size().unstack(fill_value=0)
#     db_sk_t = db.groupby(['science_keyword', 'time_bin']).size().unstack(fill_value=0)
#     db_sk_fov = db.groupby(['science_keyword', 'fov_bins']).size().unstack(fill_value=0)
#     db_sk_bs = db.groupby(['science_keyword', 'beam_bins']).size().unstack(fill_value=0)
#     db_sk_Tt = db.groupby(['science_keyword', 'Ttime_bins']).size().unstack(fill_value=0)
    
#     plt.rcParams["figure.figsize"] = (14,18)
#     db_sk_b.plot(kind='barh', stacked=True, color=custom_palette)
#     plt.title('Science Keywords vs. ALMA Bands')
#     plt.xlabel('Counts')
#     plt.ylabel('Science Keywords')
#     plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left',title='ALMA Bands')
#     plt.savefig(os.path.join(plot_dir, 'science_vs_bands.png'))
#     plt.close()

#     plt.rcParams["figure.figsize"] = (14,18)
#     db_sk_t.plot(kind='barh', stacked=True)
#     plt.title('Science Keywords vs. Integration Time')
#     plt.xlabel('Counts')
#     plt.ylabel('Science Keywords')
#     plt.legend(title='Integration Time', loc='upper left', bbox_to_anchor=(1.01, 1))
#     plt.savefig(os.path.join(plot_dir, 'science_vs_int_time.png'))
#     plt.close()

#     plt.rcParams["figure.figsize"] = (14,18)
#     db_sk_f.plot(kind='barh', stacked=True, color=custom_palette)
#     plt.title('Science Keywords vs. Source Frequency')
#     plt.xlabel('Counts')
#     plt.ylabel('Science Keywords')
#     plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left',title='Frequency')
#     plt.savefig(os.path.join(plot_dir, 'science_vs_source_freq.png')) 
#     plt.close()

#     plt.rcParams["figure.figsize"] = (14,18)
#     db_sk_fov.plot(kind='barh', stacked=True, color=custom_palette)
#     plt.title('Science Keywords vs. FoV')
#     plt.xlabel('Counts')
#     plt.ylabel('Science Keywords')
#     plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left',title='FoV')
#     plt.savefig(os.path.join(plot_dir, 'science_vs_FoV.png'))
#     plt.close()

#     plt.rcParams["figure.figsize"] = (14,18)
#     db_sk_bs.plot(kind='barh', stacked=True, color=custom_palette)
#     plt.title('Science Keywords vs. beams_size')
#     plt.xlabel('Counts')
#     plt.ylabel('Science Keywords')
#     plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left',title='Beams Size')
#     plt.savefig(os.path.join(plot_dir, 'science_vs_beam_size.png'))
#     plt.close()

#     plt.rcParams["figure.figsize"] = (14,18)
#     db_sk_Tt.plot(kind='barh', stacked=True, color=custom_palette)
#     plt.title('Science Keywords vs. Total Time')
#     plt.xlabel('Counts')
#     plt.ylabel('Science Keywords')
#     plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left',title='Total Time')
#     plt.savefig(os.path.join(plot_dir, 'science_vs_total_time.png'))
#     plt.close()

    

# plot = plot_science_keywords_distributions(service, master_path)

In [98]:
def plot_science_keywords_distributions(service, master_path):
    plot_dir = os.path.join(master_path, "plots")

        # Check if plot directory exists
    if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)
        existing_plots = []  # Initialize as empty list if plot directory doesn't exist
    else:
        # Check if plot files already exist
        existing_plots = [f for f in os.listdir(plot_dir) if f.endswith('.png')]

    expected_plots = ['science_vs_bands.png', 'science_vs_int_time.png', 'science_vs_source_freq.png',
                      'science_vs_FoV.png', 'science_vs_beam_size.png', 'science_vs_total_time.png']

    if all(plot_file in existing_plots for plot_file in expected_plots):
        print("Plots already exist. Exiting.")
        return
    else:
        print("Some plots are missing. Generating missing plots.")
        # Identify missing plots
    missing_plots = [plot for plot in expected_plots if plot not in existing_plots]

    # Query only for variables associated with missing plots
    query_variables = set()
    for missing_plot in missing_plots:
        if missing_plot == 'science_vs_bands.png':
            query_variables.update(['science_keyword', 'band_list'])
        elif missing_plot == 'science_vs_int_time.png':
            query_variables.update(['science_keyword', 't_resolution'])
        elif missing_plot == 'science_vs_source_freq.png':
            query_variables.update(['science_keyword', 'frequency'])
        elif missing_plot == 'science_vs_FoV.png':
            query_variables.update(['science_keyword', 'band_list'])
        elif missing_plot == 'science_vs_beam_size.png':
            query_variables.update(['science_keyword', 'band_list', 'antenna_arrays'])
        elif missing_plot == 'science_vs_total_time.png':
            query_variables.update(['science_keyword', 't_max'])

    query = f"""  
            SELECT {', '.join(query_variables)}, member_ous_uid
            FROM ivoa.obscore  
            WHERE science_observation = 'T'
            AND is_mosaic = 'F'
            """
    
    custom_palette = sns.color_palette("tab20")
    sns.set_palette(custom_palette)
    db = service.search(query).to_table().to_pandas()
    db = db.drop_duplicates(subset='member_ous_uid')

    # Splitting the science keywords at commas
    db['science_keyword'] = db['science_keyword'].str.split(',')
    db['science_keyword'] = db['science_keyword'].apply(lambda x: [y.strip() for y in x])
    db = db.explode('science_keyword')
    db = db.drop(db[db['science_keyword'] == ''].index)
    db = db.drop(db[db['science_keyword'] == 'Exoplanets'].index)
    db = db.drop(db[db['science_keyword'] == 'Galaxy structure &evolution'].index)
    db = db.drop(db[db['science_keyword'] == 'Evolved stars: Shaping/physical structure'].index)
    short_keyword = {
        'Solar system - Trans-Neptunian Objects (TNOs)' : 'Solar System - TNOs',
        'Photon-Dominated Regions (PDR)/X-Ray Dominated Regions (XDR)': 'Photon/X-Ray Domanited Regions',
        'Luminous and Ultra-Luminous Infra-Red Galaxies (LIRG & ULIRG)': 'LIRG & ULIRG',
        'Cosmic Microwave Background (CMB)/Sunyaev-Zel\'dovich Effect (SZE)': 'CMB/Sunyaev-Zel\'dovich Effect',
        'Active Galactic Nuclei (AGN)/Quasars (QSO)': 'AGN/QSO',
        'Inter-Stellar Medium (ISM)/Molecular clouds': 'ISM & Molecular Clouds',
    }
    
    db['science_keyword'] = db['science_keyword'].replace(short_keyword)

    for missing_plot in missing_plots:
        if missing_plot == 'science_vs_bands.png':
            db['band_list'] = db['band_list'].str.split(' ')
            db['band_list'] = db['band_list'].apply(lambda x: [y.strip() for y in x])
            db = db.explode('band_list')

            db_sk_b = db.groupby(['science_keyword', 'band_list']).size().unstack(fill_value=0)

            plt.rcParams["figure.figsize"] = (26,18)
            db_sk_b.plot(kind='barh', stacked=True, color=custom_palette)
            plt.title('Science Keywords vs. ALMA Bands')
            plt.xlabel('Counts')
            plt.ylabel('Science Keywords')
            plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left',title='ALMA Bands')
            plt.savefig(os.path.join(plot_dir, 'science_vs_bands.png'))
            plt.close()

        elif missing_plot == 'science_vs_int_time.png':
            db = db[db['t_resolution'] <= 3e4]
            time_bins = np.arange(db['t_resolution'].min(), db['t_resolution'].max(), 1000)  # 1000 second bins
            db['time_bin'] = pd.cut(db['t_resolution'], bins=time_bins)

            db_sk_t = db.groupby(['science_keyword', 'time_bin']).size().unstack(fill_value=0)

            plt.rcParams["figure.figsize"] = (26,18)
            db_sk_t.plot(kind='barh', stacked=True)
            plt.title('Science Keywords vs. Integration Time')
            plt.xlabel('Counts')
            plt.ylabel('Science Keywords')
            plt.legend(title='Integration Time', loc='upper left', bbox_to_anchor=(1.01, 1))
            plt.savefig(os.path.join(plot_dir, 'science_vs_int_time.png'))
            plt.close()

        elif missing_plot == 'science_vs_source_freq.png':
            frequency_bins = np.arange(db['frequency'].min(), db['frequency'].max(), 50)  # 50 GHz bins
            db['frequency_bin'] = pd.cut(db['frequency'], bins=frequency_bins)

            db_sk_f = db.groupby(['science_keyword', 'frequency_bin']).size().unstack(fill_value=0)

            plt.rcParams["figure.figsize"] = (26,18)
            db_sk_f.plot(kind='barh', stacked=True, color=custom_palette)
            plt.title('Science Keywords vs. Source Frequency')
            plt.xlabel('Counts')
            plt.ylabel('Science Keywords')
            plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left',title='Frequency')
            plt.savefig(os.path.join(plot_dir, 'science_vs_source_freq.png')) 
            plt.close()

        elif missing_plot == 'science_vs_FoV.png':
            db['band_list'] = db['band_list'].str.split(' ')
            db['band_list'] = db['band_list'].apply(lambda x: [y.strip() for y in x])
            db = db.explode('band_list')
            db['fov'] = db['band_list'].apply(lambda x: get_fov_from_band(int(x)))
            fov_bins = np.arange(db['fov'].min(), db['fov'].max(), 10)  #  10 arcsec bins
            db['fov_bins'] = pd.cut(db['fov'], bins=fov_bins)

            db_sk_fov = db.groupby(['science_keyword', 'fov_bins']).size().unstack(fill_value=0)

            plt.rcParams["figure.figsize"] = (26,18)
            db_sk_fov.plot(kind='barh', stacked=True, color=custom_palette)
            plt.title('Science Keywords vs. FoV')
            plt.xlabel('Counts')
            plt.ylabel('Science Keywords')
            plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left',title='FoV')
            plt.savefig(os.path.join(plot_dir, 'science_vs_FoV.png'))
            plt.close()

        elif missing_plot == 'science_vs_beam_size.png':
            db['band_list'] = db['band_list'].str.split(' ')
            db['band_list'] = db['band_list'].apply(lambda x: [y.strip() for y in x])
            db = db.explode('band_list')
            db['max_baseline'] = db['antenna_arrays'].apply(lambda x: get_max_baseline_from_antenna_array(x, master_path))
            db['central_freq'] = db['band_list'].apply(lambda x: get_band_central_freq(int(x)))
            db['beam_size'] = db[['central_freq', 'max_baseline']].apply(lambda x: estimate_alma_beam_size(*x), axis=1)
            beam_size_bins = np.arange(db['beam_size'].min(), db['beam_size'].max(), 0.1)  # 0.1 arcsec bins
            db['beam_bins'] = pd.cut(db['beam_size'], bins=beam_size_bins)

            db_sk_bs = db.groupby(['science_keyword', 'beam_bins']).size().unstack(fill_value=0)

            plt.rcParams["figure.figsize"] = (26,18)
            db_sk_bs.plot(kind='barh', stacked=True, color=custom_palette)
            plt.title('Science Keywords vs. beams_size')
            plt.xlabel('Counts')
            plt.ylabel('Science Keywords')
            plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left',title='Beams Size')
            plt.savefig(os.path.join(plot_dir, 'science_vs_beam_size.png'))
            plt.close()

        elif missing_plot == 'science_vs_total_time.png':
            total_time_bins = np.arange(db['t_max'].min(), db['t_max'].max(), 500)  # 500 seconds bins
            db['Ttime_bins'] = pd.cut(db['t_max'], bins=total_time_bins)
            
            db_sk_Tt = db.groupby(['science_keyword', 'Ttime_bins']).size().unstack(fill_value=0)
            
            plt.rcParams["figure.figsize"] = (26,18)
            db_sk_Tt.plot(kind='barh', stacked=True, color=custom_palette)
            plt.title('Science Keywords vs. Total Time')
            plt.xlabel('Counts')
            plt.ylabel('Science Keywords')
            plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left',title='Total Time')
            plt.savefig(os.path.join(plot_dir, 'science_vs_total_time.png'))
            plt.close()
    

plot = plot_science_keywords_distributions(service, master_path)

Some plots are missing. Generating missing plots.


  db_sk_t = db.groupby(['science_keyword', 'time_bin']).size().unstack(fill_value=0)
  db_sk_f = db.groupby(['science_keyword', 'frequency_bin']).size().unstack(fill_value=0)
  db_sk_fov = db.groupby(['science_keyword', 'fov_bins']).size().unstack(fill_value=0)
  db_sk_bs = db.groupby(['science_keyword', 'beam_bins']).size().unstack(fill_value=0)
  db_sk_Tt = db.groupby(['science_keyword', 'Ttime_bins']).size().unstack(fill_value=0)
