In [106]:
import cupy as cp
import pandas as pd
import numpy as np
from numba import jit

# Disable if running on GPU
from numba import jit, config
config.DISABLE_JIT = True
import cProfile
import sys
sys.path.append("../gaia_tools/")
import transformation_constants
import transformation_functions
import data_analysis
import covariance_generation as cov

In [67]:
print('Grabbing needed columns')
icrs_data = pd.read_csv('/home/svenpoder/DATA/Gaia_2MASS Data_DR2/gaia_rv_data_bayes.csv', nrows = 10)
print('Importing DR3')
path = '/home/svenpoder/DATA/Gaia_DR3/GaiaDR3_RV_RGB_fidelity.csv'
gaia_dr3 = pd.read_csv(path)
icrs_data = gaia_dr3[icrs_data.columns]

Grabbing needed columns
Importing DR3


In [68]:
## TRANSFORMATION CONSTANTS
v_sun = transformation_constants.V_SUN

r_0 = 8277.0
z_0 = 25.0

v_sun[0][0] = 11.1
v_sun[1][0] = 251.5*(r_0/8277)
v_sun[2][0] = 8.59*(r_0/8277)


## APPLY INITIAL CUT
galcen_data = data_analysis.get_transformed_data(icrs_data,
                                       include_cylindrical = True,
                                       z_0 = z_0,
                                       r_0 = r_0,
                                       v_sun = v_sun,
                                       debug = True,
                                       is_bayes = True,
                                       is_source_included = True)

galcen_data = galcen_data[(galcen_data.r < 15000) & (galcen_data.r > 5000)]
galcen_data = galcen_data[(galcen_data.z < 200) & (galcen_data.z > -200)]
galcen_data.reset_index(inplace=True, drop=True)

## DECLARE FINAL INPUT DATA
icrs_data = icrs_data.merge(galcen_data, on='source_id')[icrs_data.columns]
print("Final size of sample {}".format(galcen_data.shape))

Starting galactocentric transformation loop over all data points.. 
Time elapsed for data coordinate transformation: 2.6086947309959214 sec
Final size of sample (1694972, 11)


In [69]:
# Generate covariance matrices for INPUT
C_icrs = cov.generate_covmat(icrs_data)

In [70]:
# Declare backend
NUMPY_LIB = cp
dtype = cp.float32

In [71]:
# Export INPUT to GPU with needed columns
trans_needed_columns = ['source_id', 'ra', 'dec', 'r_est', 'pmra', 'pmdec', 'radial_velocity',]
icrs_data = NUMPY_LIB.asarray(icrs_data[trans_needed_columns], dtype=cp.float32)
C_icrs = NUMPY_LIB.asarray(C_icrs, dtype=cp.float32)

In [72]:
def transform_all(icrs_data, C_icrs, r_0, z_0, v_sun):

    r_0 = r_0
    z_0 = z_0
    v_sun = v_sun

    galcen_data = transformation_functions.get_transformed_data(icrs_data,
                                        include_cylindrical = True,
                                        z_0 = z_0,
                                        r_0 = r_0,
                                        v_sun = v_sun,
                                        debug = True,
                                        is_bayes = True,
                                        is_source_included = True, 
                                        NUMPY_LIB = NUMPY_LIB,
                                        dtype = dtype)

    # ["ra", "dec","r_est","pmra","pmdec","radial_velocity"] -> [:,1::]
    galactocentric_cov = cov.transform_cov_matrix(C = C_icrs, 
                                        df = icrs_data[:,1::],
                                        coordinate_system = 'Cartesian',
                                        z_0 = z_0,
                                        r_0 = r_0,
                                        is_bayes = True,
                                        NUMPY_LIB = NUMPY_LIB,
                                        dtype = dtype)

    # ["x", "y","r","phi","v_r","v_phi"] -> [0,1,6,7,8,9]
    cyl_cov_gpu = cov.transform_cov_matrix(C = galactocentric_cov, 
                                        df = galcen_data[:,[0,1,6,7,8,9]],
                                        coordinate_system = 'Cylindrical',
                                        z_0 = z_0,
                                        r_0 = r_0,
                                        is_bayes = False,
                                        NUMPY_LIB = NUMPY_LIB,
                                        dtype = dtype)

    sig_vphi = NUMPY_LIB.array([cyl_cov_gpu[:,4,4]])
    sig_vr = NUMPY_LIB.array([cyl_cov_gpu[:,3,3]])
    source_id = NUMPY_LIB.array([icrs_data[:,0]])
    galcen_data = NUMPY_LIB.concatenate(([galcen_data, sig_vphi.T, sig_vr.T, source_id.T]), axis=1)

    final_data_columns = ['x', 'y', 'z', 'v_x', 'v_y', 'v_z', 'r', 'phi', 'v_r', 'v_phi',
                'sig_vphi', 'sig_vr', 'source_id']
    
    galcen_data = pd.DataFrame(galcen_data.get(), columns=final_data_columns)

    return galcen_data

In [105]:
r0_range = [x for x in np.linspace(7800, 8500, 4)]
r0_range.append(8277)
r0_range.sort()
r0_range
print('Old Binning Scheme')
print('\n')
for r_0 in r0_range:
    galcen_data = transform_all(icrs_data, C_icrs, r_0, z_0, v_sun)
    bin_collection = data_analysis.get_collapsed_bins(data = galcen_data,
                                                        theta = (0, 1),
                                                        BL_r_min = 5000,
                                                        BL_r_max = 15000,
                                                        BL_z_min = -200,
                                                        BL_z_max = 200,
                                                        N_bins = (10, 1),
                                                        r_drift = False,
                                                        debug = False)


    print('R0 = {}'.format(r_0))
    print('Total number of stars across bins: {}'.format(i))
    print('Bins of increasing r --->')                                                 
    i = 0
    for bin in bin_collection.bins:
        n_bin = len(bin.data)
        print('| {}'.format(n_bin), end=" ")
        i += n_bin
    print('\n')
    

Old Binning Scheme


R0 = 7800.0
Total number of stars across bins: 1694405
Bins of increasing r --->
| 421896 | 473551 | 301729 | 181148 | 127575 | 66097 | 24010 | 7420 | 2294 | 512 

R0 = 8033.333333333333
Total number of stars across bins: 1606232
Bins of increasing r --->
| 361744 | 497503 | 336750 | 198459 | 138791 | 80062 | 30642 | 9699 | 2939 | 829 

R0 = 8266.666666666666
Total number of stars across bins: 1657418
Bins of increasing r --->
| 289906 | 505365 | 378622 | 220442 | 148362 | 94167 | 38877 | 12873 | 3780 | 1223 

R0 = 8277
Total number of stars across bins: 1693617
Bins of increasing r --->
| 286717 | 505320 | 380277 | 221674 | 148818 | 94771 | 39324 | 12991 | 3829 | 1250 

R0 = 8500.0
Total number of stars across bins: 1694971
Bins of increasing r --->
| 194393 | 487644 | 421778 | 251837 | 157863 | 108469 | 49047 | 16710 | 4938 | 1583 



In [102]:
r0_range = [x for x in np.linspace(7800, 8500, 4)]
r0_range.append(8277)
r0_range.sort()
r0_range

[7800.0, 8033.333333333333, 8266.666666666666, 8277, 8500.0]

In [104]:
r0_range = [x for x in np.linspace(7800, 8500, 4)]
r0_range.append(8277)
r0_range.sort()

print('New Binning Scheme')
print('\n')
for r_0 in r0_range:
    galcen_data = transform_all(icrs_data, C_icrs, r_0, z_0, v_sun)

    galcen_data['r_orig'] = galcen_data.r
    galcen_data['r'] = galcen_data.r/r_0 

    r_min = 5000/8277
    r_max = 15000/8277


    bin_collection = data_analysis.get_collapsed_bins(data = galcen_data,
                                                        theta = (0, 1),
                                                        BL_r_min = r_min,
                                                        BL_r_max = r_max,
                                                        BL_z_min = -200,
                                                        BL_z_max = 200,
                                                        N_bins = (10, 1),
                                                        r_drift = False,
                                                        debug = False)
    print('R0 = {}'.format(r_0))
    print('Total number of stars across bins: {}'.format(i))                                                 
    print('Bins of increasing r/r_0 --->')
    i = 0
    for bin in bin_collection.bins:
        n_bin = len(bin.data)
        print('| {}'.format(n_bin), end=" ")
        i += n_bin
    print('\n')
    

New Binning Scheme


R0 = 7800.0
Total number of stars across bins: 1694959
Bins of increasing r/r_0 --->
| 315339 | 474126 | 352513 | 215600 | 144123 | 97850 | 45352 | 16419 | 5230 | 1787 

R0 = 8033.333333333333
Total number of stars across bins: 1668339
Bins of increasing r/r_0 --->
| 302118 | 489972 | 365885 | 218616 | 146431 | 96449 | 42272 | 14651 | 4517 | 1501 

R0 = 8266.666666666666
Total number of stars across bins: 1682412
Bins of increasing r/r_0 --->
| 287317 | 504700 | 379694 | 221508 | 148754 | 94813 | 39449 | 13068 | 3864 | 1262 

R0 = 8277
Total number of stars across bins: 1694429
Bins of increasing r/r_0 --->
| 286717 | 505320 | 380277 | 221674 | 148818 | 94771 | 39324 | 12991 | 3829 | 1250 

R0 = 8500.0
Total number of stars across bins: 1694971
Bins of increasing r/r_0 --->
| 261846 | 518004 | 393706 | 224622 | 150564 | 93132 | 36700 | 11592 | 3317 | 922 



In [130]:
from scipy import stats
from BinCollection import BinCollection
from Bin import Bin

In [117]:
for i in range(100):
    bin_collection = data_analysis.get_collapsed_bins(data = galcen_data,
                                                        theta = (0, 1),
                                                        BL_r_min = 5000,
                                                        BL_r_max = 15000,
                                                        BL_z_min = -200,
                                                        BL_z_max = 200,
                                                        N_bins = (10, 1),
                                                        r_drift = False,
                                                        debug = False)

In [127]:
def get_collapsed_bins(data, theta, BL_r_min, BL_r_max, BL_z_min, BL_z_max, N_bins = (10, 10), r_drift = False, debug=False):


    # This assertion doesnt make sense, fix it later
    assert len(data.shape) > 0, "No data!"

    if not 'r' or 'phi' in data.index:
        print("No cylindrical coordinates found!")
        return

    # r and z parameters of points loaded into Series
    r = data.r
    z = data.z

    # Velocity projections of points: NOT NEEDED
    c = data.v_phi

    # Calling the actual binning function
    H, xedges, yedges, binnumber = stats.binned_statistic_2d(r, z, values = c, range = [[BL_r_min, BL_r_max], [BL_z_min, BL_z_max]], bins=N_bins, statistic='mean')

    print(H.shape)

    # Create a meshgrid from the vertices: X, Y -> R, Z
    XX, YY = np.meshgrid(xedges, yedges)

    # Assign a binnumber for each data entry
    data['Bin_index'] = binnumber

    # Instantiate a BinCollection object
    bin_collection = BinCollection(data, N_bins, XX, YY, YY, mode='r-z')

    # Generate the bins with respective r-z boundaries
    bin_collection.GenerateBins()

    
    return bin_collection

In [129]:
bin_collection = get_collapsed_bins(data = galcen_data,
                                    theta = (0, 1),
                                    BL_r_min = 5000,
                                    BL_r_max = 15000,
                                    BL_z_min = -200,
                                    BL_z_max = 200,
                                    N_bins = (10, 1),
                                    r_drift = False,
                                    debug = False)

(10, 1)


In [133]:
galcen_data.to_numpy()[:,1]

array([  8.08716774,   8.99738312,   8.74559402, ..., 203.26690674,
       195.06918335, 208.01971436])

In [135]:

r_array = (galcen_data.r).to_numpy()
z_array = (galcen_data.z).to_numpy()
data = galcen_data.to_numpy()

# Set the number of bins for each dimension
nbins = (10, 10)

# Use numpy's histogram2d function to bin the data
hist, xedges, yedges = np.histogram2d(r_array, z_array, bins=nbins)

# Initialize an empty list to store the Bin objects
bins = []
bin_index = 0
# Iterate through the bins and create a Bin object for each one
for i in range(hist.shape[0]):
    for j in range(hist.shape[1]):


        # Create a data subset for the current bin by selecting all points within the bin's boundaries
        data_subset = data[(r_array >= xedges[i]) & (r_array < xedges[i+1]) &
                           (z_array >= yedges[j]) & (z_array < yedges[j+1])]

        data_subset["Bin_index"] = bin_index
        # Create a Bin object for the current data subset and append it to the list
        bins.append(Bin(data_subset))
        bin_index+=1

# Now you have a list of Bin objects, one for each bin in the histogram


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [113]:
for i in range(50):
    H, xedges, yedges, binnumber = stats.binned_statistic_2d(r, 
                                                            z, 
                                                            values = c, 
                                                            range = [[BL_r_min, BL_r_max], [BL_z_min, BL_z_max]], 
                                                            bins=N_bins, 
                                                            statistic='mean')

ValueError: setting an array element with a sequence.