In [1]:
import pandas as pd
from models.classification import random_forest as rfc
from models.classification import get_scores, perm_importance, data_split
from models.regression import random_forest as rfr
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import pickle
import os
from pathlib import Path

In [2]:
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open("data/densities/dr3/densities_gaiaedr3_6d_100000-stars.data", "rb") as f:
    df = pickle.load(f)
# Summarise element 4 of every record (presumably the density samples of one
# host - TODO confirm) by its mean, standard deviation, minimum and maximum.
features = [(i[4].mean(), i[4].std(), i[4].min(), i[4].max()) for i in df]
feat = pd.DataFrame(features, columns=["mean", "std", "min", "max"])

In [3]:
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open("data/densities/dr3/densities_gaiaedr3_5d_drop_rv_100000-stars.data", "rb") as f:
    df = pickle.load(f)
# Same summary statistics as for `feat`, computed on element 4 of every record
# of the 5d_drop_rv file; `feat1` is read later by classification_task.
features = [(i[4].mean(), i[4].std(), i[4].min(), i[4].max()) for i in df]
feat1 = pd.DataFrame(features, columns=["mean", "std", "min", "max"])

In [3]:
def labels(row):
    """Turn one row's ``Plow`` / ``Phigh`` values into a class label string.

    Returns ``'0'`` when ``row['Plow']`` is at least 0.84, otherwise ``'2'``
    when ``row['Phigh']`` is at least 0.84, otherwise ``'1'``.  ``Phigh`` is
    only looked up when the ``Plow`` test fails.
    """
    threshold = 0.84
    if row['Plow'] >= threshold:
        return '0'
    return '2' if row['Phigh'] >= threshold else '1'

In [4]:
def classification_task(ground_truth_file, training_data_file):
    """Fit the imported ``rfc`` classifier and log its scores to a results CSV.

    Class labels are derived from ``ground_truth_file`` via ``labels``.  The
    predictors are the ``density`` column of ``training_data_file`` plus the
    ``mean``/``std``/``min``/``max`` columns of the module-level ``feat1``
    frame (index-aligned, so ``feat1`` must already be defined and share the
    training frame's index).

    Parameters
    ----------
    ground_truth_file : str
        CSV file name inside ``data/classification/dr3``; must contain the
        ``Plow`` and ``Phigh`` columns read by ``labels``.
    training_data_file : str
        CSV file name inside ``data/classification/dr3``; must contain a
        ``density`` column.  It also determines the results file name.

    Side effects
    ------------
    Prints the first five predictor rows and the predictor column names, and
    writes one score row to ``results/classification/<results file>``: the
    file is created with a header on first use and appended to afterwards.
    """
    results_dir = "results/classification"
    features_dir = "data/classification/dr3"
    # NOTE(review): no separator is inserted after "results" (e.g.
    # "results5d_drop_rv_100000-stars.csv"); kept unchanged so existing result
    # files keep their names.
    results_file = "results" + "_".join(training_data_file.rsplit("_", 4)[1:])
    results_path = os.path.join(results_dir, results_file)

    gt = pd.read_csv(os.path.join(features_dir, ground_truth_file), index_col=0)
    train = pd.read_csv(os.path.join(features_dir, training_data_file), index_col=0)

    # Labels always come from the ground-truth file; assigning through the
    # training frame aligns them with the training rows by index.
    gt["class"] = gt.apply(labels, axis=1)
    train["class"] = gt["class"]
    label = train["class"]

    # Copy the slice so the added columns land in an independent frame rather
    # than in a view of the frame read from disk.
    stat_columns = ["mean", "std", "min", "max"]
    train = train[["density"]].copy()
    train[stat_columns] = feat1[stat_columns]
    print(train[:5])

    print(train.columns)
    x_train, x_test, y_train, y_test = data_split(train, label, 0.2)
    model, _ = rfc(x_train, y_train)
    scores = get_scores(model, train.columns.to_list(), x_test, y_test)

    columns = ["Classifier", "Features", "Accuracy", "Precision", "Recall", "F1-score"]
    df = pd.DataFrame([scores], columns=columns)

    # Check for the file where it is actually written (inside results_dir) so
    # that repeated runs append a row instead of overwriting earlier results.
    os.makedirs(results_dir, exist_ok=True)
    append = Path(results_path).is_file()
    df.to_csv(results_path, mode="a" if append else "w", header=not append, index=False)

In [7]:
# Labels are derived from the 6d file; the `density` predictor is read from
# the 5d_drop_rv file.
classification_task("features_densities_gaiaedr3_6d_100000-stars.csv", "features_densities_gaiaedr3_5d_drop_rv_100000-stars.csv")

    density       mean         std           min          max
0 -1.050389  11.925268  136.713327  1.959479e-06  3036.752994
1  0.158247   1.284580    1.742365  5.760859e-05    36.437481
2 -1.508119   1.134064    0.880591  9.529604e-05     6.549403
3 -0.396675   1.599838    1.736066  3.350778e-10    13.453237
4 -0.108797   1.631239    1.790833  2.987316e-10    17.195434
Index(['density', 'mean', 'std', 'min', 'max'], dtype='object')
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      1259
           1       0.77      0.78      0.77       886
           2       0.95      0.96      0.96      2674

    accuracy                           0.92      4819
   macro avg       0.89      0.89      0.89      4819
weighted avg       0.92      0.92      0.92      4819

[[1156  101    2]
 [  77  690  119]
 [   2  105 2567]]


In [73]:
# Labels and the `density` predictor both come from the 6d file.
# NOTE(review): the recorded output lists only 'density' as a feature, while
# the current classification_task also adds mean/std/min/max from feat1 -
# presumably the function was edited after this cell ran; re-run to confirm.
classification_task("features_densities_gaiaedr3_6d_100000-stars.csv", "features_densities_gaiaedr3_6d_100000-stars.csv")

Index(['density'], dtype='object')
              precision    recall  f1-score   support

           0       0.93      0.92      0.93      1259
           1       0.76      0.76      0.76       886
           2       0.95      0.95      0.95      2674

    accuracy                           0.91      4819
   macro avg       0.88      0.88      0.88      4819
weighted avg       0.91      0.91      0.91      4819

[[1164   91    4]
 [  85  672  129]
 [   6  119 2549]]


In [98]:
# Labels are derived from the 6d file; the `density` predictor is read from
# the 5d_drop_rv file.
# NOTE(review): the same call appears in other cells with slightly different
# scores - presumably separate runs of a non-seeded model; confirm.
classification_task("features_densities_gaiaedr3_6d_100000-stars.csv", "features_densities_gaiaedr3_5d_drop_rv_100000-stars.csv")

Index(['density', 'mean', 'std', 'min', 'max'], dtype='object')
              precision    recall  f1-score   support

           0       0.94      0.91      0.92      1259
           1       0.78      0.76      0.77       886
           2       0.95      0.97      0.96      2674

    accuracy                           0.91      4819
   macro avg       0.89      0.88      0.88      4819
weighted avg       0.91      0.91      0.91      4819

[[1148  109    2]
 [  77  673  136]
 [   1   85 2588]]


In [80]:
# Labels are derived from the 6d file; the `density` predictor is read from
# the 5d_drop_rv file.
# NOTE(review): the recorded output lists only 'density' as a feature, while
# the current classification_task also adds mean/std/min/max from feat1 -
# presumably the function was edited after this cell ran; re-run to confirm.
classification_task("features_densities_gaiaedr3_6d_100000-stars.csv", "features_densities_gaiaedr3_5d_drop_rv_100000-stars.csv")

Index(['density'], dtype='object')
              precision    recall  f1-score   support

           0       0.87      0.89      0.88      1259
           1       0.64      0.60      0.62       886
           2       0.92      0.93      0.93      2674

    accuracy                           0.86      4819
   macro avg       0.81      0.81      0.81      4819
weighted avg       0.86      0.86      0.86      4819

[[1123  128    8]
 [ 152  535  199]
 [  16  171 2487]]


In [25]:
def regression_task(ground_truth_file, training_data_file):
    """Hand predictors and the ``Phigh`` target to the imported ``rfr`` helper.

    Both file names are resolved inside ``data/classification/dr3``.  The
    target is the ``Phigh`` column of ``ground_truth_file``; the predictors
    are whatever columns of ``training_data_file`` remain after the
    bookkeeping and model-fit columns listed below are dropped.  The
    remaining column names are printed before ``rfr`` is called.  Nothing is
    returned.
    """
    features_dir = "data/classification/dr3"
    dropped_columns = [
        "Host", "Plow", "Phigh", "n_stars",
        "mean_low", "cov_low", "aic", "bic", "mean_high", "cov_high",
    ]

    gt_path = os.path.join(features_dir, ground_truth_file)
    target = pd.read_csv(gt_path, index_col=0)["Phigh"]

    train_path = os.path.join(features_dir, training_data_file)
    predictors = pd.read_csv(train_path, index_col=0).drop(columns=dropped_columns)

    print(predictors.columns)
    rfr(predictors, target)

In [29]:
# The Phigh target is read from the 6d file; predictors come from the
# 5d_drop_vz file.
regression_task("features_densities_gaiaedr3_6d_100000-stars.csv", "features_densities_gaiaedr3_5d_drop_vz_100000-stars.csv")

Index(['density'], dtype='object')
5828     9.812128e-01
20715    8.634325e-10
5694     8.387193e-02
8594     8.161835e-01
485      7.070679e-01
12120    5.297075e-07
22187    9.862072e-01
14493    9.941497e-01
2454     9.712318e-01
9511     8.233033e-01
23454    7.189230e-07
18168    9.848136e-01
554      9.656294e-01
12948    8.752159e-01
21462    9.459122e-01
21714    2.469882e-01
23182    7.986667e-01
11564    9.903173e-01
7866     3.832875e-03
2313     2.211817e-03
Name: Phigh, dtype: float64
[9.48266969e-01 2.76065013e-06 4.93805605e-01 6.40358679e-01
 8.16661959e-01 4.15277063e-04 9.82623964e-01 9.85132273e-01
 9.83296694e-01 4.87283089e-01 1.80654105e-05 9.94386616e-01
 1.88996357e-01 9.04968654e-01 9.52650612e-01 3.00631551e-01
 8.31837348e-01 9.81904161e-01 2.07279743e-03 3.53004040e-04]
('R2 score for Phigh: ', 0.9152341866573337)
