In [1]:
# Start small and save your time.

In [2]:
# import necessary packages

import pandas as pd # for working with tables
pd.options.mode.chained_assignment = None  # default = 'warn'

import os # for finding file directory with os.getcwd()
from datetime import datetime

# for calculations
import numpy as np 
import math

import matplotlib.pyplot as plt # for plotting

# for plotting histogram with percentages
from matplotlib.ticker import PercentFormatter
import matplotlib.ticker as mtick
from matplotlib.ticker import FormatStrFormatter

from PIL import Image # for converting plots to black-and-white
from IPython.display import HTML

# from ipynb.fs.full.MyFunctions_00 import *

In [3]:
# import dataframes from local .csv files

def read_data_old(file_string):
    """Load one .csv table from the Dropbox "Data" directory.

    The sub-folder is chosen from the file name: names containing 'VOID'
    are read from "Voids", everything else from "Walls".

    Arguments:
        file_string: file name without the '.csv' extension.

    Returns:
        DataFrame parsed by pandas.read_csv.
    """
    mac = True  # change depending on Windows/Mac operating system

    # pick the sub-folder from the file name
    folder = 'Voids' if 'VOID' in file_string else 'Walls'

    if not mac:
        return pd.read_csv('\\Users\\smara\\Dropbox\\AstroSummer22\\Data\\'
                           + folder + '\\' + file_string + '.csv')

    return pd.read_csv('/Users/anisharadhey/Dropbox/voids_Anish/Data/'
                       + folder + '/' + file_string + '.csv')

In [4]:
# import dataframes from local .csv files from a Mac computer

def read_data(file_string):
    """Load a .csv table from the local AstroLocal22 folder.

    Arguments:
        file_string: file name without the '.csv' extension.

    Returns:
        DataFrame parsed by pandas.read_csv. A 'read <name>' line is
        printed once the file has been loaded.
    """
    csv_path = '/Users/anisharadhey/AstroLocal22/' + file_string + '.csv'
    table = pd.read_csv(csv_path)
    print('read ' + file_string)
    return table

def read_data_db(file_string):
    """Load a .csv table from the Dropbox "Code/Files" folder.

    Arguments:
        file_string: file name without the '.csv' extension.

    Returns:
        DataFrame parsed by pandas.read_csv. A 'read <name>' line is
        printed once the file has been loaded.
    """
    csv_path = '/Users/anisharadhey/Dropbox/voids_Anish/Code/Files/' + file_string + '.csv'
    table = pd.read_csv(csv_path)
    print('read ' + file_string)
    return table

In [5]:
def read_data_code(file_string):
    """Load a .csv table from the Dropbox "Code" folder.

    Arguments:
        file_string: file name without the '.csv' extension.

    Returns:
        DataFrame parsed by pandas.read_csv. A 'read <name>' line is
        printed once the file has been loaded.
    """
    csv_path = '/Users/anisharadhey/Dropbox/voids_Anish/Code/' + file_string + '.csv'
    table = pd.read_csv(csv_path)
    print('read ' + file_string)
    return table

In [6]:
def combine(NEO_full, ME_full):
    """Stack the NEOWISE and MEP rows of one group into a single table.

    Arguments:
        NEO_full: NEOWISE dataframe; needs the columns 'w1snr', 'w2snr',
            'object_tag_01', 'w1mpro', 'w2mpro' and 'mjd'.
        ME_full: MEP dataframe; needs 'w1mpro_ep', 'w2mpro_ep',
            'w1sigmpro_ep', 'w2sigmpro_ep', 'object_tag_01' and 'mjd'.
            Side effect: the columns 'w1snr_est' and 'w2snr_est' are
            added to this dataframe in place.

    Returns:
        Dataframe with the columns 'object_tag', 'w1mpro', 'w2mpro' and
        'mjd': the NEOWISE rows first, then the MEP rows, with a fresh
        integer index.
    """
    snr_floor = 5  # rows below this value in either band are dropped

    neo_keep = (NEO_full['w1snr'] >= snr_floor) & (NEO_full['w2snr'] >= snr_floor)

    # MEP has no SNR column, so one is estimated per band.
    # NOTE(review): the estimate is magnitude / magnitude-uncertainty, not
    # 1 / uncertainty -- confirm this is the intended SNR proxy.
    for band in ('w1', 'w2'):
        ME_full[band + 'snr_est'] = ME_full[band + 'mpro_ep'] / ME_full[band + 'sigmpro_ep']

    me_keep = (ME_full['w1snr_est'] >= snr_floor) & (ME_full['w2snr_est'] >= snr_floor)

    # keep only the shared columns and give them the same names in both tables
    neo_part = NEO_full.loc[neo_keep, ["object_tag_01", "w1mpro", "w2mpro", "mjd"]].rename(
        columns = {"object_tag_01": "object_tag"})

    me_part = ME_full.loc[me_keep, ["object_tag_01", "w1mpro_ep", "w2mpro_ep", "mjd"]].rename(
        columns = {"object_tag_01": "object_tag", "w1mpro_ep": "w1mpro", "w2mpro_ep": "w2mpro"})

    return pd.concat([neo_part, me_part], ignore_index = True)

In [7]:
def combine_new(NEO_full, ME_full):
    """Stack the NEOWISE and MEP rows of one group, tagged by 'nsaid_01'.

    Arguments:
        NEO_full: NEOWISE dataframe; needs the columns 'w1snr', 'w2snr',
            'nsaid_01', 'w1mpro', 'w2mpro' and 'mjd'.
        ME_full: MEP dataframe; needs 'w1mpro_ep', 'w2mpro_ep',
            'w1sigmpro_ep', 'w2sigmpro_ep', 'nsaid_01' and 'mjd'.
            Side effect (kept from the original): the columns 'w1snr_est'
            and 'w2snr_est' are added to this dataframe in place.

    Returns:
        Dataframe with the columns 'nsaid_01', 'w1mpro', 'w2mpro', 'mjd'
        and 'object_tag' (a copy of 'nsaid_01'): the NEOWISE rows first,
        then the MEP rows, with a fresh integer index.
    """
    snr_min = 5  # rows below this value in either band are dropped

    NEO_filt = NEO_full[(NEO_full['w1snr'] >= snr_min) &
                        (NEO_full['w2snr'] >= snr_min)]

    # MEP has no SNR column, so one is estimated per band.
    # NOTE(review): the estimate is magnitude / magnitude-uncertainty, not
    # 1 / uncertainty -- confirm this is the intended SNR proxy.
    ME_full['w1snr_est'] = ME_full.w1mpro_ep / ME_full.w1sigmpro_ep
    ME_full['w2snr_est'] = ME_full.w2mpro_ep / ME_full.w2sigmpro_ep

    ME_filt = ME_full[(ME_full['w1snr_est'] >= snr_min) &
                      (ME_full['w2snr_est'] >= snr_min)]

    # keep only the shared columns and give them the same names in both tables;
    # .copy() makes the column assignment below act on an independent frame
    NEO_cols = NEO_filt[["nsaid_01", "w1mpro", "w2mpro", "mjd"]].copy()
    NEO_cols['object_tag'] = NEO_cols['nsaid_01']

    ME_cols = ME_filt[["nsaid_01", "w1mpro_ep", "w2mpro_ep", "mjd"]].rename(
        columns = {"w1mpro_ep": "w1mpro", "w2mpro_ep": "w2mpro"})
    # the tag must come from the MEP rows themselves: taking it from NEO_cols
    # aligns on the row index of a different table and yields wrong/NaN tags
    ME_cols['object_tag'] = ME_cols['nsaid_01']

    full = pd.concat([NEO_cols, ME_cols], ignore_index = True)

    return(full)

In [8]:
def expand_cols(full, tag_num):
    """Select one object's rows and add time-binning and colour columns.

    Arguments:
        full: combined dataframe with the columns 'object_tag', 'mjd',
            'w1mpro' and 'w2mpro'.
        tag_num: object tag whose rows are kept.

    Returns:
        A new dataframe holding only the rows with that tag, plus:
            'mjd_zero'   - mjd minus the object's earliest mjd
            'W1-W2'      - w1mpro - w2mpro
            'bin_num'    - index of the 10-day bin; bin 0 holds
                           mjd_zero <= 10, bin k holds 10k < mjd_zero <= 10(k+1)
            'mjd_binned' - centre of that bin (5, 15, 25, ...)
        When no row matches the tag the maximum is NaN, so the int()
        conversion used to size the bins fails.
    """
    rows = full.loc[full['object_tag'] == tag_num].copy()

    # shift the time axis so the first observation of this object is day 0
    rows['mjd_zero'] = rows['mjd'] - rows['mjd'].min()

    rows['W1-W2'] = rows.w1mpro - rows.w2mpro

    # right-inclusive bin edges at 10, 20, ... reaching past the last observation
    # (resource: https://stackoverflow.com/questions/6163334/binning-data-in-python-with-scipy-numpy)
    elapsed = rows['mjd_zero']
    edges = np.arange(start = 10, stop = int(elapsed.max()) + 11, step = 10)
    rows['bin_num'] = np.digitize(elapsed.to_numpy(), edges, right = True)

    # every observation is plotted at the centre of its bin
    centres = np.arange(start = 5, stop = edges.max() + 10, step = 10)
    rows['mjd_binned'] = centres[rows['bin_num']]

    return rows

In [9]:
def save_binned(full_tag, tag_num):
    """Average one object's measurements inside each filled time bin.

    Arguments:
        full_tag: dataframe for a single object with the columns
            'mjd_binned', 'w1mpro', 'w2mpro' and 'W1-W2'.
        tag_num: object tag written into every output row.

    Returns:
        Dataframe with one row per distinct 'mjd_binned' value (ascending)
        and the columns 'object_tag', 'mjd_binned', 'mean_W1', 'mean_W2',
        'mean_color', 'std_W1', 'std_W2', 'std_color'. The std columns are
        population standard deviations (np.std). An input without rows
        gives an empty table with the same columns.
    """
    columns = ['object_tag', 'mjd_binned',
               'mean_W1', 'mean_W2', 'mean_color',
               'std_W1', 'std_W2', 'std_color']

    # Rows are collected first and the frame is built once: growing a frame
    # with .loc row by row re-allocates on every step and coerces each row to
    # one common dtype, which turned the integer bin centre into a float.
    records = []
    for centre, in_bin in full_tag.groupby('mjd_binned'):
        records.append({
            'object_tag': tag_num,
            'mjd_binned': int(centre),
            # the means are the light-curve points for this bin ...
            'mean_W1': in_bin['w1mpro'].mean(),
            'mean_W2': in_bin['w2mpro'].mean(),
            'mean_color': in_bin['W1-W2'].mean(),
            # ... and the scatter inside the bin is used as their error bar
            'std_W1': np.std(in_bin['w1mpro']),
            'std_W2': np.std(in_bin['w2mpro']),
            'std_color': np.std(in_bin['W1-W2']),
        })

    return pd.DataFrame(records, columns = columns)

In [10]:
def get_err(x, y):
    """Format sqrt(1/x + 1/y) as an error suffix.

    Arguments:
        x, y: the dividend and divisor of a quotient; both must be non-zero.

    Returns:
        The string " ± <value>", where the value is sqrt(1/x + 1/y)
        rounded to 3 decimals.
    """
    relative = math.sqrt((1 / x) + (1 / y))
    return f" ± {round(relative, 3)}"

In [11]:
def get_err_num(x, y):
    """Return sqrt(1/x + 1/y) as a number.

    Arguments:
        x, y: the dividend and divisor of a quotient; both must be non-zero.

    Returns:
        The unrounded float sqrt(1/x + 1/y).
    """
    inverse_sum = (1.0 / x) + (1.0 / y)
    return math.sqrt(inverse_sum)

In [12]:
def outlier_table(ultimate_mean, sig):
    """Drop bins whose mean W1 or W2 lies more than `sig` sigma from the object's mean.

    Only objects with more than 3 bins are processed; objects with 3 or
    fewer bins are left out of the result entirely.

    Arguments:
        ultimate_mean: dataframe of binned means with at least the columns
            'object_tag', 'mean_W1' and 'mean_W2'.
        sig: half-width of the accepted interval, in units of the
            per-object population standard deviation (np.std).

    Returns:
        Dataframe of the surviving rows (outliers removed) with a fresh
        integer index. Columns are 'object_tag', 'mjd_binned', 'mean_W1',
        'mean_W2', 'mean_color', 'std_W1', 'std_W2', 'std_color', followed
        by any further columns of `ultimate_mean`. A progress line is
        printed every 5000 objects.
    """
    base_columns = ['object_tag',
                    'mjd_binned',
                    'mean_W1',
                    'mean_W2',
                    'mean_color',
                    'std_W1',
                    'std_W2',
                    'std_color']

    # Surviving rows are collected and concatenated once at the end:
    # concatenating onto an initially empty frame inside the loop copies the
    # whole result on every pass and depends on how pandas treats empty
    # entries when it picks result dtypes.
    kept = []

    # groupby visits the tags in sorted order, one pass over the table
    for count, (_, ultimate_t) in enumerate(ultimate_mean.groupby('object_tag'), start = 1):

        if (len(ultimate_t) > 3):

            W1_mean = ultimate_t['mean_W1'].mean()
            W2_mean = ultimate_t['mean_W2'].mean()

            W1_std = np.std(ultimate_t['mean_W1'])
            W2_std = np.std(ultimate_t['mean_W2'])

            outliers_removed = ultimate_t[
                (ultimate_t['mean_W1'] <= W1_mean + (sig * W1_std)) &
                (ultimate_t['mean_W1'] >= W1_mean - (sig * W1_std)) &
                (ultimate_t['mean_W2'] <= W2_mean + (sig * W2_std)) &
                (ultimate_t['mean_W2'] >= W2_mean - (sig * W2_std))
            ]

            if (len(outliers_removed) > 0):
                kept.append(outliers_removed)

        if(count % 5000 == 0):
            print(str(count) + ' galaxies')

    if not kept:
        return pd.DataFrame(columns = base_columns)

    result = pd.concat(kept, ignore_index = True)

    # standard columns first, any extra input columns after them
    extra_columns = [c for c in result.columns if c not in base_columns]
    return result.reindex(columns = base_columns + extra_columns)

In [13]:
def outlier_table_switched(ultimate_mean, sig):
    """Drop outlying bins first, then keep objects that still have more than 3 bins.

    For every object, bins whose mean W1 or W2 lies more than `sig`
    population standard deviations (np.std) from that object's mean are
    removed. The object is kept only if more than 3 bins survive, so the
    size cut is applied after clipping.

    Arguments:
        ultimate_mean: dataframe of binned means with at least the columns
            'object_tag', 'mean_W1' and 'mean_W2'.
        sig: half-width of the accepted interval, in standard deviations.

    Returns:
        Dataframe of the surviving rows with a fresh integer index. Columns
        are 'object_tag', 'mjd_binned', 'mean_W1', 'mean_W2', 'mean_color',
        'std_W1', 'std_W2', 'std_color', followed by any further columns of
        `ultimate_mean`. A progress line is printed every 5000 objects.
    """
    base_columns = ['object_tag',
                    'mjd_binned',
                    'mean_W1',
                    'mean_W2',
                    'mean_color',
                    'std_W1',
                    'std_W2',
                    'std_color']

    # Surviving rows are collected and concatenated once at the end:
    # concatenating onto an initially empty frame inside the loop copies the
    # whole result on every pass and depends on how pandas treats empty
    # entries when it picks result dtypes.
    kept = []

    # groupby visits the tags in sorted order, one pass over the table
    for count, (_, ultimate_t) in enumerate(ultimate_mean.groupby('object_tag'), start = 1):

        W1_mean = ultimate_t['mean_W1'].mean()
        W2_mean = ultimate_t['mean_W2'].mean()

        W1_std = np.std(ultimate_t['mean_W1'])
        W2_std = np.std(ultimate_t['mean_W2'])

        outliers_removed = ultimate_t[
            (ultimate_t['mean_W1'] <= W1_mean + (sig * W1_std)) &
            (ultimate_t['mean_W1'] >= W1_mean - (sig * W1_std)) &
            (ultimate_t['mean_W2'] <= W2_mean + (sig * W2_std)) &
            (ultimate_t['mean_W2'] >= W2_mean - (sig * W2_std))
        ]

        # the minimum-size requirement is checked on what is left after clipping
        if (len(outliers_removed) > 3):
            kept.append(outliers_removed)

        if(count % 5000 == 0):
            print(str(count) + ' galaxies')

    if not kept:
        return pd.DataFrame(columns = base_columns)

    result = pd.concat(kept, ignore_index = True)

    # standard columns first, any extra input columns after them
    extra_columns = [c for c in result.columns if c not in base_columns]
    return result.reindex(columns = base_columns + extra_columns)

In [14]:
def light_curve(t, full, save):
    """Draw the W1, W2 and W1-W2 light curves of one object as three stacked panels.

    Arguments:
        t: object tag of the object to plot.
        full: combined dataframe passed on to expand_cols().
        save: when equal to True the figure is also written to
            'light-curve-<t>.pdf' in the working directory.

    The individual measurements are drawn as faint grey points at their
    bin centres; the per-bin means from save_binned() are drawn on top as
    blue squares with the per-bin standard deviation as error bar.
    NOTE(review): the font size is written to plt.rcParams after the
    figure has been built, so it applies to later figures -- confirm intent.
    """
    full_tag = expand_cols(full, t)
    mean = save_binned(full_tag, t)

    # three vertically stacked panels without any gap between them
    fig, axs = plt.subplots(3, 1,
                            figsize = (16, 10),
                            sharex = 'col',
                            sharey = 'row')
    plt.subplots_adjust(wspace = 0, hspace = 0)

    fig.suptitle('WISE light curve of void galaxy (object tag = ' + str(t) + ")", fontsize = 'x-large')

    # per panel: raw measurement column, bin-mean column, bin-std column, y label
    panels = [
        ('w1mpro', 'mean_W1', 'std_W1', 'W1 (mag)'),
        ('w2mpro', 'mean_W2', 'std_W2', 'W2 (mag)'),
        ('W1-W2', 'mean_color', 'std_color', 'W1 - W2 (mag)'),
    ]

    for panel, (raw_col, mean_col, std_col, _) in zip(axs, panels):
        # individual measurements
        panel.scatter(full_tag['mjd_binned'],
                      full_tag[raw_col],
                      s = 15,
                      c = 'dimgray',
                      marker = "o",
                      alpha = 0.3)

        # bin means on top, at the same binned x-values
        panel.scatter(mean['mjd_binned'],
                      mean[mean_col],
                      s = 65,
                      c = 'dodgerblue',
                      marker = "s")

        # error bars from the standard deviations in the mean table
        panel.errorbar(mean['mjd_binned'],
                       mean[mean_col],
                       yerr = mean[std_col],
                       c = "dodgerblue",
                       ecolor = "dodgerblue",
                       capsize = 4,
                       fmt = "o")

    # axis limits follow the range of the plotted data
    axs[0].set_xlim([mean['mjd_binned'].min() - 100, mean['mjd_binned'].max() + 100])
    for panel, (raw_col, _, _, _) in zip(axs, panels):
        panel.set_ylim([full_tag[raw_col].min() - 0.1, full_tag[raw_col].max() + 0.1])

    # only label the outermost axes
    for panel in axs.flat:
        panel.label_outer()

    for panel, (_, _, _, y_label) in zip(axs, panels):
        panel.set_ylabel(y_label, fontsize = 'large')

    # the x label states the mjd that was subtracted to start at zero
    axs[2].set_xlabel('MJD - ' + str(full_tag['mjd'].min()), fontsize = 'large')

    plt.rcParams.update({'font.size': 18})

    # one output file per object tag
    if(save == True):
        fig.savefig('light-curve-' + str(t) + '.pdf', dpi = 300)

    plt.show()

In [15]:
def sub_curve(t, full, g, ax1, ax2, ax3, all_Pr):
    """Draw one object's W1, W2 and W1-W2 light curves onto three existing axes.

    Arguments:
        t: object tag of the object to plot.
        full: combined dataframe passed on to expand_cols().
        g: group label; not used by the current body.
        ax1, ax2, ax3: axes receiving the W1, W2 and W1-W2 panels.
        all_Pr: dataframe with the columns 'object_tag' and 'Pr'; the 'Pr'
            entry for `t` is shown in the title of ax1.

    The individual measurements are drawn as faint grey points at their
    bin centres; the per-bin means from save_binned() are drawn on top as
    blue squares with the per-bin standard deviation as error bar.
    """
    full_tag = expand_cols(full, t)
    mean = save_binned(full_tag, t)

    r_text = all_Pr.loc[all_Pr['object_tag'] == t, 'Pr'].to_string(index = False)
    ax1.set_title("(r = " + r_text + ")",
                  fontsize = '16')

    # per panel: target axes, raw measurement column, bin-mean column, bin-std column
    panels = (
        (ax1, 'w1mpro', 'mean_W1', 'std_W1'),
        (ax2, 'w2mpro', 'mean_W2', 'std_W2'),
        (ax3, 'W1-W2', 'mean_color', 'std_color'),
    )

    for panel, raw_col, mean_col, std_col in panels:
        # individual measurements
        panel.scatter(full_tag['mjd_binned'],
                      full_tag[raw_col],
                      s = 15,
                      c = 'dimgray',
                      marker = "o",
                      alpha = 0.3)

        # bin means on top, at the same binned x-values
        panel.scatter(mean['mjd_binned'],
                      mean[mean_col],
                      s = 65,
                      c = 'dodgerblue',
                      marker = "s")

        # error bars from the standard deviations in the mean table
        panel.errorbar(mean['mjd_binned'],
                       mean[mean_col],
                       yerr = mean[std_col],
                       c = "dodgerblue",
                       ecolor = "dodgerblue",
                       capsize = 4,
                       fmt = "o")

    # axis limits follow the range of the plotted data
    ax1.set_xlim([mean['mjd_binned'].min() - 100, mean['mjd_binned'].max() + 100])
    for panel, raw_col, _, _ in panels:
        panel.set_ylim([full_tag[raw_col].min() - 0.1, full_tag[raw_col].max() + 0.1])
        # two decimals on every magnitude tick
        panel.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

    # the x label states the mjd that was subtracted to start at zero
    ax3.set_xlabel('MJD - ' + str(full_tag['mjd'].min()), fontsize = 'large')

In [16]:
def curve_mosaic(title_fig, num_v, num_w, voids_full, walls_full, all_Pr, save,):
    """Plot a void object (left column) next to a wall object (right column).

    Arguments:
        title_fig: figure title; not used by the current body.
        num_v, num_w: object tags of the void and the wall object.
        voids_full, walls_full: combined dataframes of the two groups.
        all_Pr: dataframe with 'object_tag' and 'Pr', passed to sub_curve().
        save: when equal to True the figure is also written as a .png into
            the Dropbox "Figures" folder.
    """
    fig, axs = plt.subplots(3, 2, figsize = (24, 10), sharex = 'col')

    # one column of three panels per object
    sub_curve(num_v, voids_full, "void", *axs[:, 0], all_Pr)
    sub_curve(num_w, walls_full, "wall", *axs[:, 1], all_Pr)

    # y labels only on the left column
    for row, y_label in enumerate(('W1 (mag)', 'W2 (mag)', 'W1 - W2 (mag)')):
        axs[row, 0].set_ylabel(y_label, fontsize = '16')

    plt.subplots_adjust(wspace = 0.07, hspace = 0)

    if(save == True):
        out_path = ('/Users/anisharadhey/Dropbox/voids_Anish/Figures/curve-mosaic'
                    + str(num_v) + '_' + str(num_w) + '.png')
        fig.savefig(out_path, dpi = 600)

    plt.show()

In [17]:
def curve_mosaic_svrsef(title_fig, num_v, num_w, voids_full, walls_full, all_Pr, save,):
    """Smaller variant of the two-column mosaic that draws only the wall object.

    Arguments:
        title_fig: figure title; not used by the current body.
        num_v: object tag of the void object; only used in the file name.
        num_w: object tag of the wall object drawn in the right column.
        voids_full: not used by the current body.
        walls_full: combined dataframe of the wall group.
        all_Pr: dataframe with 'object_tag' and 'Pr', passed to sub_curve().
        save: when equal to True the figure is also written as a .png into
            the Dropbox "Figures" folder.
    """
    fig, axs = plt.subplots(3, 2, figsize = (12, 5), sharex = 'col')

    # NOTE(review): only the right (wall) column is drawn; the left column
    # stays empty apart from its y labels -- confirm this is intended.
    sub_curve(num_w, walls_full, "wall", *axs[:, 1], all_Pr)

    for row, y_label in enumerate(('W1 (mag)', 'W2 (mag)', 'W1 - W2 (mag)')):
        axs[row, 0].set_ylabel(y_label, fontsize = 18)

    plt.subplots_adjust(wspace = 0.07, hspace = 0)

    plt.rcParams.update({'font.size': 16})

    if(save == True):
        out_path = ('/Users/anisharadhey/Dropbox/voids_Anish/Figures/curve-mosaic'
                    + str(num_v) + '_' + str(num_w) + '.png')
        fig.savefig(out_path,
                    bbox_inches = "tight",
                    dpi = 600)

    plt.show()

In [18]:
def save_diff(full, tag_num):
    """Summarise one object's binned light curve before and after day 1000.

    Arguments:
        full: combined dataframe passed on to expand_cols().
        tag_num: object tag of the object to summarise.

    Returns:
        One-row dataframe holding the object tag, the number of bins, the
        mean and population standard deviation (np.std) of the bin means
        in W1 and W2 over all bins, the same statistics for the bins with
        mjd_binned < 1000 ('ME') and mjd_binned >= 1000 ('NEO'), and the
        ME - NEO differences of the averages ('diff_W1', 'diff_W2').
    """
    mean = save_binned(expand_cols(full, tag_num), tag_num)

    def _avg_std(table, band):
        # average and scatter of the bin means in one band
        values = table['mean_' + band]
        return values.mean(), np.std(values)

    first_tag = mean['object_tag'][0]
    bin_count = len(mean)

    # split the bins at day 1000 into the two epochs
    early = mean[mean['mjd_binned'] < 1000]
    late = mean[mean['mjd_binned'] >= 1000]

    avg_W1_ME, std_W1_ME = _avg_std(early, 'W1')
    avg_W1_NEO, std_W1_NEO = _avg_std(late, 'W1')
    avg_W2_ME, std_W2_ME = _avg_std(early, 'W2')
    avg_W2_NEO, std_W2_NEO = _avg_std(late, 'W2')

    avg_W1_all, std_W1_all = _avg_std(mean, 'W1')
    avg_W2_all, std_W2_all = _avg_std(mean, 'W2')

    # single output row; the key order fixes the column order
    row = {
        'object_tag': first_tag,
        'num_bins': bin_count,

        'diff_W1': avg_W1_ME - avg_W1_NEO,
        'diff_W2': avg_W2_ME - avg_W2_NEO,

        'avg_W1': avg_W1_all,
        'std_w1': std_W1_all,

        'avg_w2': avg_W2_all,
        'std_w2': std_W2_all,

        'avg_W1_ME': avg_W1_ME,
        'avg_W1_NEO': avg_W1_NEO,
        'std_W1_ME': std_W1_ME,
        'std_W1_NEO': std_W1_NEO,

        'avg_W2_ME': avg_W2_ME,
        'avg_W2_NEO': avg_W2_NEO,
        'std_W2_ME': std_W2_ME,
        'std_W2_NEO': std_W2_NEO,
    }

    return pd.DataFrame(data = {key: [value] for key, value in row.items()})

In [19]:
def Pearson_table(ultimate_mean):
    """Compute the W1-W2 Pearson correlation coefficient for every object.

    Arguments:
        ultimate_mean: ultimate mean table of one group, with the columns
            'object_tag', 'mean_W1' and 'mean_W2'.

    Returns:
        Dataframe with one row per object tag (ascending) and the columns
        'object_tag', 'num_obj' (number of rows the coefficient is based
        on) and 'Pr' (the coefficient). A progress line is printed every
        5000 objects.
    """
    # Rows are collected first and the frame is built once: growing a frame
    # with .loc row by row re-allocates on every step and coerces each row to
    # one common dtype, which turned the tag and the row count into floats.
    records = []

    for count, (t, Pr_t) in enumerate(ultimate_mean.groupby('object_tag'), start = 1):

        W1 = Pr_t['mean_W1'].to_numpy()
        W2 = Pr_t['mean_W2'].to_numpy()

        # corrcoef returns a 2 x 2 matrix whose diagonal is the correlation of
        # each variable with itself (1); the off-diagonal entry is the
        # correlation between W1 and W2
        Pr = np.corrcoef(x = W2, y = W1, rowvar = False)[0][1]

        records.append({'object_tag': t,
                        'num_obj': len(Pr_t),
                        'Pr': Pr})

        if(count % 5000 == 0):
            print(str(count) + ' galaxies')

    return pd.DataFrame(records, columns = ['object_tag', 'num_obj', 'Pr'])

In [20]:
def percent_AGN_table(ultimate_mean, color_threshold = 0.8):
    """Compute, per object, the percentage of bins at or above a W1-W2 colour threshold.

    Arguments:
        ultimate_mean: ultimate mean table of one group, with the columns
            'object_tag' and 'mean_color'.
        color_threshold: bins with mean_color >= this value are counted;
            defaults to 0.8.

    Returns:
        Dataframe with one row per object tag (ascending) and the columns
        'object_tag', 'num_obj' (number of bins of the object),
        'percentAGN' (percentage of counted bins, rounded to 3 decimals)
        and 'error' (from get_err_percent_num). A progress line is printed
        every 5000 objects.
    """
    # Rows are collected first and the frame is built once: growing a frame
    # with .loc row by row re-allocates on every step and coerces each row to
    # one common dtype.
    records = []

    for count, (t, pAGN_t) in enumerate(ultimate_mean.groupby('object_tag'), start = 1):

        n_obj = len(pAGN_t)
        n_red = len(pAGN_t[(pAGN_t['mean_color'] >= color_threshold)])

        records.append({'object_tag': t,
                        'num_obj': n_obj,
                        'percentAGN': round(n_red / n_obj * 100, 3),
                        'error': get_err_percent_num(n_red, n_obj)})

        if(count % 5000 == 0):
            print(str(count) + ' galaxies')

    return pd.DataFrame(records, columns = ['object_tag', 'num_obj', 'percentAGN', 'error'])

In [21]:
def plot_mean(n, column_voids, column_walls, ax):
    """Mark the mean and mean ± std of the void and wall samples on one panel.

    Arguments:
        n: index of the panel inside `ax`.
        column_voids, column_walls: the void and wall values; .mean() and
            np.std() (population standard deviation) are taken of each.
        ax: indexable collection of axes.

    For each sample a solid vertical line is drawn at the mean and dashed
    lines at mean ± std (blue for voids, dimgray for walls), and two text
    lines with the rounded values are written at fixed data coordinates.
    """
    panel = ax[n]

    # The wall lines are labelled 'mean wall': they used to reuse the void
    # labels, so a legend showed two pairs of identical 'mean void' entries.
    samples = ((column_voids, 'blue', 'void'),
               (column_walls, 'dimgray', 'wall'))

    for column, color, group in samples:
        centre = column.mean()
        spread = np.std(column)

        panel.axvline(x = centre,
                      color = color,
                      linewidth = 2.5,
                      label = 'mean ' + group)
        panel.axvline(x = centre + spread,
                      color = color,
                      linestyle = 'dashed',
                      linewidth = 2,
                      label = 'mean ' + group + ' ± std')
        # the lower bound gets no label so the legend has one '± std' entry
        panel.axvline(x = centre - spread,
                      color = color,
                      linestyle = 'dashed',
                      linewidth = 2)

    panel.text(-0.5, -0.0075, 'mean voids = ' + str(round(column_voids.mean(), 2)) + ' ± ' +
               str(round(np.std(column_voids), 2)), ha = 'left')
    panel.text(-0.5, -0.0095, 'mean walls = ' + str(round(column_walls.mean(), 2)) + ' ± ' +
               str(round(np.std(column_walls), 2)), ha = 'left')

In [22]:
def plot_mean_color(n, column_voids, column_walls, color_v, color_w, ax, label):
    """Mark the mean and mean ± std of the void and wall samples in chosen colours.

    Arguments:
        n: index of the panel inside `ax`.
        column_voids, column_walls: the void and wall values; .mean() and
            np.std() (population standard deviation) are taken of each.
        color_v, color_w: line colours for the void and wall sample.
        ax: indexable collection of axes.
        label: when truthy, two text lines with the rounded values are
            written at fixed data coordinates.

    For each sample a solid vertical line is drawn at the mean and dashed
    lines at mean ± std.
    """
    panel = ax[n]

    # The wall lines are labelled 'mean wall': they used to reuse the void
    # labels, so a legend showed two pairs of identical 'mean void' entries.
    samples = ((column_voids, color_v, 'void'),
               (column_walls, color_w, 'wall'))

    for column, color, group in samples:
        centre = column.mean()
        spread = np.std(column)

        panel.axvline(x = centre,
                      color = color,
                      linewidth = 2.5,
                      label = 'mean ' + group)
        panel.axvline(x = centre + spread,
                      color = color,
                      linestyle = 'dashed',
                      linewidth = 2,
                      label = 'mean ' + group + ' ± std')
        # the lower bound gets no label so the legend has one '± std' entry
        panel.axvline(x = centre - spread,
                      color = color,
                      linestyle = 'dashed',
                      linewidth = 2)

    if(label):
        panel.text(-0.5, -0.0075, 'mean voids = ' + str(round(column_voids.mean(), 2)) + ' ± ' +
                   str(round(np.std(column_voids), 2)), ha = 'left')
        panel.text(-0.5, -0.0095, 'mean walls = ' + str(round(column_walls.mean(), 2)) + ' ± ' +
                   str(round(np.std(column_walls), 2)), ha = 'left')

In [23]:
# Filter out non-plotted values
def filter_walls(sample, x_low, x_high, y_low, y_high):
    """Keep the rows of a wall sample that fall inside the plotted window.

    Arguments:
        sample: dataframe with the columns 'M_r' and 'z'.
        x_low, x_high: inclusive limits on 'z'.
        y_low, y_high: inclusive limits on 'M_r'.

    Returns:
        The rows with y_low <= M_r <= y_high and x_low <= z <= x_high.
    """
    inside_y = sample['M_r'].between(y_low, y_high)
    inside_x = sample['z'].between(x_low, x_high)

    return sample[inside_y & inside_x]

In [24]:
def filter_voids(sample, x_low, x_high, y_low, y_high):
    """Keep the rows of a void sample that fall inside the plotted window.

    Arguments:
        sample: dataframe with the columns 'M_r_NYU' and 'z'.
        x_low, x_high: inclusive limits on 'z'.
        y_low, y_high: inclusive limits on 'M_r_NYU'.

    Returns:
        The rows with y_low <= M_r_NYU <= y_high and x_low <= z <= x_high.
    """
    inside_y = sample['M_r_NYU'].between(y_low, y_high)
    inside_x = sample['z'].between(x_low, x_high)

    return sample[inside_y & inside_x]

In [25]:
# Filter out non-plotted values
def filter_new(sample, x_low, x_high, y_low, y_high):
    """Keep the rows of a sample that fall inside the plotted window.

    Arguments:
        sample: dataframe with the columns 'rabsmag_NSA' and 'Z'.
        x_low, x_high: inclusive limits on 'Z'.
        y_low, y_high: inclusive limits on 'rabsmag_NSA'.

    Returns:
        The rows with y_low <= rabsmag_NSA <= y_high and x_low <= Z <= x_high.
    """
    inside_y = sample['rabsmag_NSA'].between(y_low, y_high)
    inside_x = sample['Z'].between(x_low, x_high)

    return sample[inside_y & inside_x]

In [26]:
# filter out rows where no WISE object was found

def filter_colors(data):
    """Drop rows without a position and rows with a low SNR.

    Arguments:
        data: dataframe with the columns 'ra', 'dec', 'w1snr' and 'w2snr'.

    Returns:
        The rows whose 'ra' and 'dec' are both present and whose 'w1snr'
        and 'w2snr' are both at least 5.
    """
    located = data.dropna(subset = ['ra', 'dec'])

    strong_signal = (located['w1snr'] >= 5) & (located['w2snr'] >= 5)

    return located[strong_signal]

In [27]:
def merge_nona(df, colors):
    """Left-join colours onto a table and drop the rows left without a colour.

    Arguments:
        df: dataframe with an 'object_tag' column.
        colors: dataframe with the columns 'object_tag' and 'W1-W2'.

    Returns:
        The merged rows whose 'W1-W2' value is present. The fraction of
        rows lost relative to len(df) is printed, rounded to 6 decimals;
        for an empty `df` the fraction is reported as 0.0.
    """
    df_merged = df.merge(colors, how = 'left', on = 'object_tag')
    df_nona = df_merged.dropna(subset = ['W1-W2'])

    og_len = len(df)
    filt_len = len(df_nona)

    # an empty input has nothing to lose; dividing by its length would raise
    if og_len == 0:
        fraction_lost = 0.0
    else:
        fraction_lost = round(1 - (filt_len / og_len), 6)

    print("fraction lost = " + str(fraction_lost))

    return(df_nona)

In [28]:
def get_err_frac(x, y):
    """Format the error of the fraction x / y as " ± <value>".

    Arguments:
        x, y: the dividend and divisor of the fraction.

    Returns:
        " ± " followed by sqrt(1/x + 1/y) * (x/y) rounded to 2 decimals,
        or " ± ???" when x or y is zero.
    """
    # no error can be computed from an empty count
    if x == 0 or y == 0:
        placeholder = '???'
        return " ± " + placeholder

    ratio = x / y
    num_error = math.sqrt((1 / x) + (1 / y)) * ratio
    return " ± " + str(round(num_error, 2))

In [29]:
def get_err_percent(x, y):
    """Format the error of the percentage 100 * x / y as " ± <value>".

    Arguments:
        x, y: the dividend and divisor of the fraction.

    Returns:
        " ± " followed by 100 * sqrt(1/x + 1/y) * (x/y) rounded to 2
        decimals. An empty string is returned when x or y is zero, or
        when x / y equals 1.
    """
    if x == 0 or y == 0:
        return ""

    ratio = x / y
    if ratio == 1.0:
        return ""

    num_error = math.sqrt((1 / x) + (1 / y)) * ratio
    return " ± " + str(round(num_error * 100, 2))

In [30]:
def get_err_percent_num(x, y):
    """Return the error of the percentage 100 * x / y as a number.

    Arguments:
        x, y: the dividend and divisor of the fraction.

    Returns:
        100 * sqrt(1/x + 1/y) * (x/y) rounded to 3 decimals. 0.0 is
        returned when x or y is zero, or when x / y equals 1.
    """
    if x == 0 or y == 0:
        return 0.0

    ratio = x / y
    if ratio == 1.0:
        return 0.0

    num_error = math.sqrt((1 / x) + (1 / y)) * ratio
    return round(num_error * 100, 3)

In [31]:
# ERROR BOUND MAY BE INCORRECT
def get_fraction(sample, column, compare, r_threshold):
    # purpose: fraction of the rows of `sample` whose `column` value passes a threshold comparison
    # arguments: sample, a table; column, the name of the column to test;
    #            compare, one of '>', '>=', '<', '<='; r_threshold, the value compared against
    # returns: a string with the fraction rounded to 3 decimals followed by the text from get_err()
    #          (the fraction is reported as nan for an empty sample)
    # raises: ValueError for an unsupported `compare` string
    # NOTE(review): get_err() is not defined in this part of the notebook - confirm it exists
    #               (get_err_frac() takes the same two arguments)

    values = sample[column]

    if(compare == '>'):
        selected = values > r_threshold
    elif(compare == '>='):
        selected = values >= r_threshold
    elif(compare == '<'):
        selected = values < r_threshold
    elif(compare == '<='):
        selected = values <= r_threshold
    else:
        raise ValueError("unsupported comparison '" + str(compare) + "': use '>', '>=', '<' or '<='")

    # count the selection once and reuse it for both the fraction and its error
    count = int(selected.sum())
    total = len(sample)

    # an empty sample has no defined fraction; avoid dividing by zero
    frac = count / total if total > 0 else float('nan')
    err = get_err(count, total)

    return(str(round(frac, 3)) + err)

In [32]:
def get_percent(sample, column, compare, r_threshold, parenthesis):
    # purpose: percentage of the rows of `sample` whose `column` value passes a threshold comparison
    # arguments: sample, a table; column, the name of the column to test;
    #            compare, one of '>', '>=', '<', '<='; r_threshold, the value compared against;
    #            parenthesis, True to wrap the result as " (...)"
    # returns: a string with the percentage rounded to 2 decimals followed by the text from
    #          get_err_percent() (the percentage is reported as nan for an empty sample)
    # raises: ValueError for an unsupported `compare` string

    values = sample[column]

    if(compare == '>'):
        selected = values > r_threshold
    elif(compare == '>='):
        selected = values >= r_threshold
    elif(compare == '<'):
        selected = values < r_threshold
    elif(compare == '<='):
        selected = values <= r_threshold
    else:
        raise ValueError("unsupported comparison '" + str(compare) + "': use '>', '>=', '<' or '<='")

    # count the selection once and reuse it for both the percentage and its error
    count = int(selected.sum())
    total = len(sample)

    # an empty sample (e.g. no rows above a cut) has no defined percentage; avoid dividing by zero
    frac = count / total if total > 0 else float('nan')
    err = get_err_percent(count, total)

    text = str(round(frac * 100, 2)) + err

    if(parenthesis):
        return(" (" + text + ")")

    return(text)

In [33]:
def get_count(sample, column, compare, r_threshold):
    # purpose: number of rows of `sample` whose `column` value passes a threshold comparison
    # arguments: sample, a table; column, the name of the column to test;
    #            compare, one of '>', '>=', '<', '<='; r_threshold, the value compared against
    # returns: the count as a string
    # raises: ValueError for an unsupported `compare` string

    values = sample[column]

    if(compare == '>'):
        selected = values > r_threshold
    elif(compare == '>='):
        selected = values >= r_threshold
    elif(compare == '<'):
        selected = values < r_threshold
    elif(compare == '<='):
        selected = values <= r_threshold
    else:
        raise ValueError("unsupported comparison '" + str(compare) + "': use '>', '>=', '<' or '<='")

    # summing the boolean mask counts the rows that pass
    count = int(selected.sum())

    return(str(count))

In [34]:
def get_mean(column):
    # purpose: summarize a column as "mean ± standard deviation"
    # arguments: a column of numbers (anything with .mean() that np.std() accepts)
    # returns: a string with the mean and np.std() of the column, each rounded to 3 decimals

    center = round(column.mean(), 3)
    spread = round(np.std(column), 3)

    return(" ± ".join([str(center), str(spread)]))

In [35]:
def timestamp():
    # returns: the current local date as a string in month-day-year form, e.g. "03-09-24"
    today = datetime.now()
    return today.strftime("%m-%d-%y")

In [36]:
def table_1(voids_parent, walls_parent, title):
    # purpose: display an HTML summary table comparing void and wall galaxies,
    #          for the parent samples and for the subsets with 'Pr' above 0.50, 0.75 and 0.85
    # arguments: voids_parent and walls_parent, tables with 'Pr' and 'W1-W2' columns;
    #            title, text inserted into every row label
    # returns: nothing (the table is shown with display())

    title = ' ' + title
    cuts = [0.50, 0.75, 0.85]

    # select every subset up front: all void subsets first, then all wall subsets
    void_subsets = [voids_parent[(voids_parent['Pr'] > cut)] for cut in cuts]
    wall_subsets = [walls_parent[(walls_parent['Pr'] > cut)] for cut in cuts]

    def summary_row(label, group, parent, cut):
        # one table row: statistics of `group`, with its size relative to `parent` above `cut`
        return [label,
                len(group),
                get_percent(parent, 'Pr', '>', cut, False),
                get_mean(group['W1-W2']),
                get_percent(group, 'W1-W2', '<', 0.5, False),
                get_count(group, 'W1-W2', '>=', 0.80) + get_percent(group, 'W1-W2', '>=', 0.80, True),
               ]

    spacer = [' '] * 6

    # the parent rows use a cut of -100 for the "percent of parent" column
    rows = [summary_row('(Parent)' + title + ' voids', voids_parent, voids_parent, -100),
            summary_row('(Parent)' + title + ' walls', walls_parent, walls_parent, -100)]

    for cut, void_group, wall_group in zip(cuts, void_subsets, wall_subsets):
        cut_label = ' (r > ' + format(cut, '.2f') + ')'
        rows.append(spacer)
        rows.append(summary_row('Variable' + title + ' voids' + cut_label, void_group, voids_parent, cut))
        rows.append(summary_row('Variable' + title + ' walls' + cut_label, wall_group, walls_parent, cut))

    percent_table = pd.DataFrame(
        rows,
        columns = ['Galaxy group',
                   'Total count',
                   'Percent of parent',
                   'Mean W1 - W2',
                   'W1 - W2 < 0.5',
                   'W1 - W2 >= 0.8'
                  ]
            )

    display(HTML(percent_table.to_html(index = False)))

In [37]:
def get_coronal_count(sample, coronal, new):
    # purpose: count the rows of `sample` that have a match in the `coronal` table
    # arguments: sample, a table; coronal, the table matched against;
    #            new, True to match on 'PLATE', 'FIBERID', 'MJD' (match marked by 'RA_C'),
    #            False to match on 'plate', 'fiber', 'mjd' (match marked by 'RA')
    # returns: a string "count (percent ± error)", the percentage being relative to len(sample)
    #          (the percentage is reported as nan for an empty sample)

    if(new):
        keys, marker = ['PLATE', 'FIBERID', 'MJD'], 'RA_C'
    else:
        keys, marker = ['plate', 'fiber', 'mjd'], 'RA'

    # left merge, then keep only the rows where the marker column was filled in
    merge = sample.merge(coronal, how = 'left', on = keys)
    merge = merge.dropna(subset = [marker])

    matched = len(merge)
    total = len(sample)

    # an empty sample has no defined percentage; avoid dividing by zero
    frac = matched / total if total > 0 else float('nan')
    err = get_err_percent(matched, total)

    return(str(matched) + " (" + str(round(frac * 100, 2)) + err + ")")
    

In [38]:
def table_1_coronal(voids_parent, walls_parent, title, coronal_short, new):
    # purpose: display the same summary table as table_1 with an extra "With coronal lines" column
    # arguments: voids_parent and walls_parent, tables with 'Pr' and 'W1-W2' columns;
    #            title, text inserted into every row label;
    #            coronal_short and new, passed through to get_coronal_count()
    # returns: nothing (the table is shown with display())

    title = ' ' + title
    cuts = [0.50, 0.75, 0.85]

    # select every subset up front: all void subsets first, then all wall subsets
    void_subsets = [voids_parent[(voids_parent['Pr'] > cut)] for cut in cuts]
    wall_subsets = [walls_parent[(walls_parent['Pr'] > cut)] for cut in cuts]

    def summary_row(label, group, parent, cut):
        # one table row: statistics of `group`, with its size relative to `parent` above `cut`
        return [label,
                len(group),
                get_percent(parent, 'Pr', '>', cut, False),
                get_mean(group['W1-W2']),
                get_percent(group, 'W1-W2', '<', 0.5, False),
                get_count(group, 'W1-W2', '>=', 0.80) + get_percent(group, 'W1-W2', '>=', 0.80, True),
                get_coronal_count(group, coronal_short, new)
               ]

    spacer = [' '] * 7

    # the parent rows use a cut of -100 for the "percent of parent" column
    rows = [summary_row('(Parent)' + title + ' voids', voids_parent, voids_parent, -100),
            summary_row('(Parent)' + title + ' walls', walls_parent, walls_parent, -100)]

    for cut, void_group, wall_group in zip(cuts, void_subsets, wall_subsets):
        cut_label = ' (r > ' + format(cut, '.2f') + ')'
        rows.append(spacer)
        rows.append(summary_row('Variable' + title + ' voids' + cut_label, void_group, voids_parent, cut))
        rows.append(summary_row('Variable' + title + ' walls' + cut_label, wall_group, walls_parent, cut))

    percent_table = pd.DataFrame(
        rows,
        columns = ['Galaxy group',
                   'Total count',
                   'Percent of parent',
                   'Mean W1 - W2',
                   'W1 - W2 < 0.5',
                   'W1 - W2 >= 0.8',
                   'With coronal lines'
                  ]
            )

    display(HTML(percent_table.to_html(index = False)))

In [39]:
def filter_rows_CC(data):
    # purpose: keep only rows that have a position and pass the signal-to-noise cut in all four bands
    # arguments: table from read_data() with 'ra', 'dec' and 'w1snr' ... 'w4snr' columns
    # returns: new table without the rows containing "NA" for ra or dec,
    #          where w1snr and w2snr are at least 5 and w3snr and w4snr are at least 3

    has_position = data.dropna(subset = ['ra', 'dec'])

    # build each signal-to-noise mask separately, then combine them
    w1_detected = has_position['w1snr'] >= 5
    w2_detected = has_position['w2snr'] >= 5
    w3_detected = has_position['w3snr'] >= 3
    w4_detected = has_position['w4snr'] >= 3

    return has_position[w1_detected & w2_detected & w3_detected & w4_detected]

In [40]:
def get_mean_sm(sample, sm, to_print, new, MPA_tag):
    # purpose: "mean ± spread" of the stellar mass of the galaxies in `sample`
    # arguments: sample, a table; sm, a table with 'MPA_tag' and 'stellar_mass' columns;
    #            to_print, True to report how many rows have a stellar mass (only when some lack one);
    #            new, True when `sample` must first be merged with MPA_tag on 'PLATE', 'FIBERID', 'MJD';
    #            MPA_tag, the table used for that first merge
    # returns: the string from get_mean() for the 'stellar_mass' values that are not NA

    # when new is False the sample is merged on 'MPA_tag' as it is
    tagged = sample.merge(MPA_tag, how = 'left', on = ['PLATE', 'FIBERID', 'MJD']) if new else sample

    with_sm = tagged.merge(sm, how = 'left', on = ['MPA_tag'])
    has_mass = with_sm.dropna(subset = ['stellar_mass'])

    n_merged = len(with_sm)
    n_with_mass = len(has_mass)

    some_missing = (n_merged != n_with_mass)

    if (to_print & some_missing):

        print("galaxies with stellar mass: " + str(n_with_mass) + " out of " + str(n_merged))

    return(get_mean(has_mass['stellar_mass']))

In [41]:
def table_1_sm(voids_parent, walls_parent, title, coronal_short, sm, new, tag):
    # purpose: display the same summary table as table_1_coronal with an extra "Mean stellar mass" column
    # arguments: voids_parent and walls_parent, tables with 'Pr' and 'W1-W2' columns;
    #            title, text inserted into every row label;
    #            coronal_short and new, passed through to get_coronal_count();
    #            sm, new and tag, passed through to get_mean_sm()
    # returns: nothing (the table is shown with display())

    title = ' ' + title
    cuts = [0.50, 0.75, 0.85]

    # select every subset up front: all void subsets first, then all wall subsets
    void_subsets = [voids_parent[(voids_parent['Pr'] > cut)] for cut in cuts]
    wall_subsets = [walls_parent[(walls_parent['Pr'] > cut)] for cut in cuts]

    def summary_row(label, group, parent, cut, announce):
        # one table row: statistics of `group`, with its size relative to `parent` above `cut`;
        # `announce` is the to_print flag of get_mean_sm()
        return [label,
                len(group),
                get_percent(parent, 'Pr', '>', cut, False),
                get_mean(group['W1-W2']),
                get_percent(group, 'W1-W2', '<', 0.5, False),
                get_count(group, 'W1-W2', '>=', 0.80) + get_percent(group, 'W1-W2', '>=', 0.80, True),
                get_coronal_count(group, coronal_short, new),
                get_mean_sm(group, sm, announce, new, tag)
               ]

    spacer = [' '] * 8

    # the parent rows use a cut of -100 and are the only ones that print the stellar mass coverage
    rows = [summary_row('(Parent)' + title + ' voids', voids_parent, voids_parent, -100, True),
            summary_row('(Parent)' + title + ' walls', walls_parent, walls_parent, -100, True)]

    for cut, void_group, wall_group in zip(cuts, void_subsets, wall_subsets):
        cut_label = ' (r > ' + format(cut, '.2f') + ')'
        rows.append(spacer)
        rows.append(summary_row('Variable' + title + ' voids' + cut_label, void_group, voids_parent, cut, False))
        rows.append(summary_row('Variable' + title + ' walls' + cut_label, wall_group, walls_parent, cut, False))

    percent_table = pd.DataFrame(
        rows,
        columns = ['Galaxy group',
                   'Total count',
                   'Percent of parent',
                   'Mean W1 - W2',
                   'W1 - W2 < 0.5',
                   'W1 - W2 >= 0.8',
                   'With coronal lines',
                   'Mean stellar mass'
                  ]
            )

    display(HTML(percent_table.to_html(index = False)))

In [42]:
def drop_na_W3(sample):
    # purpose: remove the rows that have no 'W2-W3' value
    # arguments: table with a 'W2-W3' column
    # returns: new table containing only the rows where 'W2-W3' is not NA
    # side effect: prints the row count before and after, followed by an empty line

    print(len(sample))
    with_w2w3 = sample[sample['W2-W3'].notna()]
    print(len(with_w2w3))
    print('')

    return with_w2w3

In [43]:
def drop_r85(sample):
    # purpose: keep only the rows with 'Pr' strictly above 0.85
    # arguments: table with a 'Pr' column
    # returns: new table containing only the rows where Pr > 0.85
    # side effect: prints the row count before and after, followed by an empty line

    print(len(sample))
    above_cut = sample.loc[sample['Pr'] > 0.85]
    print(len(above_cut))
    print('')

    return above_cut

In [44]:
def keep_r85(sample):
    # purpose: keep only the rows with 'Pr' at or below 0.85
    # arguments: table with a 'Pr' column
    # returns: new table containing only the rows where Pr <= 0.85
    # side effect: prints the row count before and after, followed by an empty line

    print(len(sample))
    at_or_below_cut = sample.loc[sample['Pr'] <= 0.85]
    print(len(at_or_below_cut))
    print('')

    return at_or_below_cut

In [45]:
def read_data_newdata(file_string, data_dir = '/Users/anisharadhey/Dropbox/voids_Anish/Data/'):
    # purpose: load one .csv file from the data folder
    # arguments: file_string, the filename without the '.csv' extension (may include a subfolder);
    #            data_dir, the folder to read from (defaults to the original Dropbox location,
    #            so existing calls are unchanged; pass another folder to run on a different computer)
    # returns: table containing the data from the corresponding .csv file
    # side effect: prints 'read ' followed by file_string

    file = pd.read_csv(os.path.join(data_dir, file_string + '.csv'))
    print('read ' + file_string)

    return(file)

In [46]:
def table_1_svrsef(voids_parent, walls_parent, title):
    # purpose: display and return a compact summary table comparing void and wall galaxies,
    #          for the parent samples and for the subsets with 'Pr' above 0.75 and 0.85
    # arguments: voids_parent and walls_parent, tables with 'Pr' and 'W1-W2' columns;
    #            title, text inserted into every row label
    # returns: the summary table (it is also shown with display())

    title = ' ' + title
    cuts = [0.75, 0.85]

    # select every subset up front: all void subsets first, then all wall subsets
    void_subsets = [voids_parent[(voids_parent['Pr'] > cut)] for cut in cuts]
    wall_subsets = [walls_parent[(walls_parent['Pr'] > cut)] for cut in cuts]

    def color_cells(group):
        # the two W1-W2 columns, each formatted as "count (percent ± error)"
        return [get_count(group, 'W1-W2', '<', 0.5) + get_percent(group, 'W1-W2', '<', 0.5, True),
                get_count(group, 'W1-W2', '>=', 0.80) + get_percent(group, 'W1-W2', '>=', 0.80, True)]

    # the parent rows show the plain count, without a percentage
    rows = [['(Parent)' + title + ' voids', str(len(voids_parent))] + color_cells(voids_parent),
            ['(Parent)' + title + ' walls', str(len(walls_parent))] + color_cells(walls_parent)]

    for cut, void_group, wall_group in zip(cuts, void_subsets, wall_subsets):
        cut_label = ' (r > ' + format(cut, '.2f') + ')'
        for kind, group, parent in ((' voids', void_group, voids_parent),
                                    (' walls', wall_group, walls_parent)):
            size_cell = str(len(group)) + ' (' + get_percent(parent, 'Pr', '>', cut, False) + ')'
            rows.append(['Variable' + title + kind + cut_label, size_cell] + color_cells(group))

    percent_table = pd.DataFrame(
        rows,
        columns = ['Galaxy group',
                   'Count (% of parent)',
                   'W1 - W2 < 0.5',
                   'W1 - W2 >= 0.8'
                  ]
            )

    display(HTML(percent_table.to_html(index = False)))
    return percent_table

In [47]:
def table_1_pAGN(voids_parent, walls_parent, title, thresh):
    # purpose: display an HTML summary table comparing void and wall galaxies, including the number
    #          with 'percentAGN' at or above `thresh`, for the parent samples and for the subsets
    #          with 'Pr' above 0.75 and 0.85
    # arguments: voids_parent and walls_parent, tables with 'Pr', 'W1-W2' and 'percentAGN' columns;
    #            title, text inserted into every row label;
    #            thresh, the 'percentAGN' threshold (also shown in the last column header)
    # returns: nothing (the table is shown with display())

    title = ' ' + title
    cuts = [0.75, 0.85]

    # select every subset up front: all void subsets first, then all wall subsets
    void_subsets = [voids_parent[(voids_parent['Pr'] > cut)] for cut in cuts]
    wall_subsets = [walls_parent[(walls_parent['Pr'] > cut)] for cut in cuts]

    pAGN_threshold = thresh

    def stat_cells(group):
        # the three statistics columns shared by every row
        return [get_percent(group, 'W1-W2', '<', 0.5, False),
                get_count(group, 'W1-W2', '>=', 0.80) + get_percent(group, 'W1-W2', '>=', 0.80, True),
                get_count(group, 'percentAGN', '>=', pAGN_threshold) + get_percent(group, 'percentAGN', '>=', pAGN_threshold, True)]

    # the parent rows show the plain count, without a percentage
    rows = [['(Parent)' + title + ' voids', str(len(voids_parent))] + stat_cells(voids_parent),
            ['(Parent)' + title + ' walls', str(len(walls_parent))] + stat_cells(walls_parent)]

    for cut, void_group, wall_group in zip(cuts, void_subsets, wall_subsets):
        cut_label = ' (r > ' + format(cut, '.2f') + ')'
        # each subset is paired with its OWN parent: the wall rows previously took their
        # "% of parent" from voids_parent, so they displayed the void percentage
        for kind, group, parent in ((' voids', void_group, voids_parent),
                                    (' walls', wall_group, walls_parent)):
            size_cell = str(len(group)) + ' (' + get_percent(parent, 'Pr', '>', cut, False) + ')'
            rows.append(['Variable' + title + kind + cut_label, size_cell] + stat_cells(group))

    percent_table = pd.DataFrame(
        rows,
        columns = ['Galaxy group',
                   'Count (% of parent)',
                   'W1 - W2 < 0.5',
                   'W1 - W2 >= 0.8',
                   '%AGN >= ' + str(thresh) + '%'
                  ]
            )

    display(HTML(percent_table.to_html(index = False)))

In [48]:
# end