In [1]:
import pandas as pd
from pathlib import Path
import time
from os import path
import seaborn as sns

from preprocessing.exoplanets_gaia_crossmatch import gaia_exoplanets_cross, transform_to_cart
from preprocessing.download_gaia import GaiaDataset
from preprocessing.calc_density import get_densities
from models.gaussian_mixture import remove_outliers, gaussian_mixture
from models.classification import random_forest as rfc
from models.regression import random_forest as rfr
from models.regression import RadialVelocityRegression, ann
from sklearn.metrics import r2_score
from graphs.mixture_fit import best_fit_mixture

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle
import os
import shutil

Created TAP+ (v1.2.1) - Connection:
	Host: gea.esac.esa.int
	Use HTTPS: True
	Port: 443
	SSL Port: 443
Created TAP+ (v1.2.1) - Connection:
	Host: geadata.esac.esa.int
	Use HTTPS: True
	Port: 443
	SSL Port: 443


In [2]:
# Working directories shared by the cells below (relative to the notebook's working directory).
# Crossmatch tables: passed to the crossmatch helpers and read back by calculate_densities.
crossmatch_dir = "data/crossmatch/dr3"
# Pickled density results written by calculate_densities and read by fit_gaussian_mixture.
densities_dir = "data/densities/dr3"
# Per-file feature CSVs written by fit_gaussian_mixture.
classification_dir = "data/classification/dr3"
# Created below but not referenced by any other visible cell — presumably where the
# Gaia download lands; TODO confirm against GaiaDataset.
datasets_dir = "data/initial_datasets"

In [None]:
# Create every working directory up front. os.makedirs also creates missing parent
# directories (os.mkdir raises FileNotFoundError for nested paths such as
# "data/crossmatch/dr3" on a fresh checkout), and exist_ok=True makes the cell
# safe to re-run without a separate existence check.
os.makedirs(crossmatch_dir, exist_ok=True)
os.makedirs(densities_dir, exist_ok=True)
os.makedirs(classification_dir, exist_ok=True)
os.makedirs(datasets_dir, exist_ok=True)

# 1. Download Gaia dataset

In [5]:
# Fetch the Gaia table through the project's GaiaDataset helper; the recorded output
# shows an astroquery TAP query finishing. Presumably this produces the CSV that the
# crossmatch step reads — TODO confirm the output location against GaiaDataset.
GaiaDataset().get_gaia()

OK
INFO: Query finished. [astroquery.utils.tap.core]


# 2. Crossmatch NASA Exoplanet dataset with Gaia dataset

In [32]:
def exoplanet_gaia_crossmatch(crossmatch_dir, transform_type="6d", table_name="gaiaedr3", save_spherical=True):
    """
    Crossmatch the exoplanet table with a Gaia table, then transform the result to Cartesian coordinates.

    :param: crossmatch_dir: Directory handed to both crossmatch helpers.
    :param: transform_type: Coordinate transformation setting forwarded to ``transform_to_cart`` (e.g. "6d").
    :param: table_name: Name of the Gaia table; the file ``<table_name>.csv`` is the one crossmatched.
    :param: save_spherical: Forwarded to ``gaia_exoplanets_cross`` (save the spherical values).
    When calling this function repeatedly it is advised to enable it only once to save time.

    :return: None. Any output is produced by the two helpers (presumably as files in
    crossmatch_dir — confirm against their implementation).
    """
    gaia_csv = f"{table_name}.csv"

    # Step 1: crossmatch and keep the resulting table in memory.
    crossmatched = gaia_exoplanets_cross(
        gaia_csv,
        crossmatch_dir,
        save_gaia_id=True,
        return_data=True,
        save_spherical=save_spherical,
    )

    # Step 2: convert the crossmatched table using the requested setting.
    transform_to_cart(crossmatched, table_name, crossmatch_dir, setting=transform_type)

In [8]:
# Run the crossmatch once per coordinate setting. Spherical values are only saved on
# the first (6D) pass; the 5D passes skip that step.
for transform_type, save_spherical in (("6d", True), ("5d_drop_rv", False), ("5d_drop_vz", False)):
    exoplanet_gaia_crossmatch(crossmatch_dir, transform_type=transform_type, save_spherical=save_spherical)

# 3. Calculate phase space density for neighbours of exoplanet hosts

In [3]:
def calculate_densities(star_labels_filename, dataset_filename, crossmatch_dir, densities_dir, exoplanets_only=True, start=0, stop=1000, step=1, run_on_gpu=False):
    """
    Calculate phase space density for a given set of stars and pickle the results.

    Two files are written to densities_dir: ``densities_<name>.data`` and
    ``dropped_densities_<name>.data``, holding the two values returned by ``get_densities``.

    :param: star_labels_filename: Name of the CSV file (inside crossmatch_dir) containing star labels;
    it must provide the "source_id" and "Host" columns.
    :param: dataset_filename: Name of the CSV file (inside crossmatch_dir) containing coordinates of the stars.
    The output name is derived from this file name, which is expected to look like
    ``<table>_<kind>_..._<setting>.csv``.
    :param: crossmatch_dir: Directory both input files are read from.
    :param: densities_dir: Directory the pickled results are written to.
    :param: exoplanets_only: Compute density only for the stars with a non-empty "Host" label.
    When True, start/stop/step are ignored and every such host is processed.
    :param: start: First label index to process (used only when exoplanets_only is False).
    :param: stop: Stop label index (used only when exoplanets_only is False).
    :param: step: Label index stride (used only when exoplanets_only is False).
    :param: run_on_gpu: Use GPU accelerated pipeline.

    :return: None.
    """
    # `np.object` was removed in NumPy 1.24 (AttributeError); the builtin `object` is the
    # exact equivalent and works on every NumPy version.
    # NOTE(review): Gaia source_ids are 64-bit integers and float64 cannot represent all of
    # them exactly; the dtype is kept because downstream code may rely on floats — confirm.
    labels_file = pd.read_csv(
        os.path.join(crossmatch_dir, star_labels_filename),
        dtype={"source_id": np.float64, "Host": object},
    )
    gaia = pd.read_csv(os.path.join(crossmatch_dir, dataset_filename))

    # The coordinate setting is sliced from the end of the file name: two characters
    # ("6d") for six-column tables, otherwise ten characters (e.g. "5d_drop_vz").
    filename_parts = dataset_filename.split("_")
    if gaia.shape[1] == 6:
        setting = dataset_filename[-6:-4]
    else:
        setting = dataset_filename[-14:-4]
    name = f"{filename_parts[0]}_{setting}"

    if exoplanets_only:
        # Process every labelled host, overriding any caller-supplied range.
        labels = labels_file["Host"].dropna()
        start = 0
        stop = labels.shape[0]
        step = 1
        name = f"{name}_only-{filename_parts[1]}s"
    else:
        labels = labels_file["source_id"]
        name = f"{name}_{start}_{stop}"

    densities, dropped = get_densities(labels.to_numpy(), gaia.to_numpy(), start=start, stop=stop, step=step, run_on_gpu=run_on_gpu)

    with open(os.path.join(densities_dir, f"densities_{name}.data"), "wb") as f:
        pickle.dump(densities, f)
    with open(os.path.join(densities_dir, f"dropped_densities_{name}.data"), "wb") as f:
        pickle.dump(dropped, f)

In [4]:
# Number of stars handled per calculate_densities call.
n_stars = 10
for fname in os.listdir(crossmatch_dir):
    # Only the 6D Cartesian crossmatch tables are processed here.
    if "cartesian_6d" not in fname:
        continue
    # Ten consecutive batches of n_stars stars each, timed individually.
    for i in range(10):
        start, stop, step = i * n_stars, (i + 1) * n_stars, 1
        tstart = time.perf_counter()
        calculate_densities(
            "gaiaedr3_star_labels.csv",
            fname,
            crossmatch_dir,
            densities_dir,
            exoplanets_only=False,
            start=start,
            stop=stop,
            step=step,
            run_on_gpu=True,
        )
        tend = time.perf_counter()
        print(f"{fname} completed in: {str(tend-tstart)}")

gaiaedr3_exoplanet_cross_cartesian_6d.csv completed in: 14.817765488000077
gaiaedr3_exoplanet_cross_cartesian_6d.csv completed in: 13.269045785999879


KeyboardInterrupt: 

In [17]:
# NOTE(review): leftover inspection cell. No visible cell assigns a top-level `gaia`
# (the name only appears as a local inside functions), so unless it is defined in a
# cell not shown here this raises NameError on Restart & Run All — consider removing.
gaia

36

# 4. Fit a Gaussian mixture model to predict whether the target star belongs to the overdensity or the underdensity group. Return the scores and attributes of the model.

In [33]:
def fit_gaussian_mixture(file_name, classification_dir, densities_dir, show_graph=False, save_graph=False):
    """
    Fit a two-component Gaussian mixture to every entry of a pickled density file and
    save the resulting scores as ``features_<file stem>.csv`` in classification_dir.

    :param: file_name: Name of the density file inside densities_dir; its extension is
    dropped and ``<stem>.data`` is the file that is loaded.
    :param: classification_dir: Directory the features CSV is written to.
    :param: densities_dir: Directory containing the pickled density file.
    :param: show_graph: Forwarded to ``best_fit_mixture`` for every entry.
    :param: save_graph: Forwarded to ``best_fit_mixture``; when True the directory
    ``figures/<stem>`` is recreated from scratch first.

    :return: None.
    """
    sigma = 2  # threshold handed to remove_outliers
    fig_dir = None
    # splitext strips only the final extension, so a stem that itself contains dots
    # still resolves to the right "<stem>.data" file.
    file_name = os.path.splitext(file_name)[0]
    if save_graph:
        fig_dir = file_name
        fig_path = os.path.join("figures", fig_dir)
        # Start from an empty figure directory so stale plots never mix with new ones.
        if os.path.isdir(fig_path):
            shutil.rmtree(fig_path)
        # makedirs also creates "figures" itself when it does not exist yet.
        os.makedirs(fig_path)

    # NOTE(security): pickle.load executes arbitrary code from the file; only load
    # density files produced by this pipeline.
    with open(os.path.join(densities_dir, f"{file_name}.data"), "rb") as f:
        densities = pickle.load(f)

    draw_graph = show_graph or save_graph
    results = []
    for entry in densities:
        # Layout used here: entry[0] is the star label, entry[1] the host density and
        # entry[4] the sample the mixture is fitted to (other fields are not read).
        host_label = entry[0]

        # Compute log10 of the host density and expand dimensions for further use
        target = np.expand_dims(np.log10(entry[1]), axis=0).T

        # Remove outliers outside sigma
        data = remove_outliers(entry[4], sigma=sigma)

        # Apply gaussian mixture model to the data
        model, scores = gaussian_mixture(data, [target], components=2, scores_only=False)

        # One output row: label, sample size, log-density, then the model scores.
        results.append([host_label, entry[4].shape[0], target[0], *scores])

        if draw_graph:
            # Numeric labels are rendered without decimals. isinstance (rather than an
            # exact type comparison) keeps str subclasses such as np.str_ on the string path.
            host_name = host_label if isinstance(host_label, str) else f"{host_label:.0f}"
            best_fit_mixture(model, data, [host_name, target], fig_dir, show_graph, save_graph)

    df = pd.DataFrame(results, columns=["Host", "n_stars", "density", "Plow", "Phigh", "mean_low", "mean_high", "cov_low",
                                        "cov_high", "aic", "bic"])
    df.to_csv(os.path.join(classification_dir, f"features_{file_name}.csv"))

In [34]:
# Fit the mixture model to every density pickle in densities_dir.
for fname in os.listdir(densities_dir):
    # Skip the companion "dropped_densities_*" files and anything that is not a
    # ".data" pickle (hidden files, checkpoint folders), which would fail on load.
    if "dropped" in fname or not fname.endswith(".data"):
        continue
    # Exoplanet-only and ranged density files are processed identically, so the former
    # if/else with two identical branches is collapsed into a single call.
    fit_gaussian_mixture(fname, classification_dir, densities_dir, show_graph=False, save_graph=False)
        