In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from astropy.visualization import HistEqStretch, ImageNormalize
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegressionCV
from astropy.table import Table

from tqdm import tqdm

import xgboost as xgb

from agabpylib.plotting.plotstyles import useagab, apply_tufte
from agabpylib.stats.robuststats import rse
from reflspectratools import (
    plot_spectra_collection,
    load_dr3_data,
    create_ml_training_classes,
    extract_spectra,
    prep_classifier_inputs,
)

# Configure plotting through agabpylib's `useagab` helper, passing the font
# size and the line / axis-line widths used for this notebook's figures.
useagab(fontsize=22, linewidths=3, axislinewidths=2)

In [2]:
import os

# Build the data path with os.path.join instead of a hand-written Windows
# literal: ".\data\..." relies on the invalid escape sequences "\d" and "\D"
# (a SyntaxWarning on recent Python) and only resolves on Windows.
dr3_spectra_path = os.path.join(".", "data", "DR3ReflectanceSpectra.fits")

# Load the reflectance spectra (helper from reflspectratools) without clipping.
ssospectra = load_dr3_data(dr3_spectra_path, clipspectra=False)

print(ssospectra.head())
cwd = os.getcwd()
print(cwd)

# Number of distinct asteroids and the sorted wavelength values in the table.
n_asteroids = ssospectra.number_mp.unique().size
wavelengths = np.sort(ssospectra["wavelength"].unique())

# Class definitions for training (helper from reflspectratools).
ssotypes = create_ml_training_classes(ssospectra, filter=True)

# The same FITS file read directly as a flat pandas table.
ssoparams = Table.read(dr3_spectra_path).to_pandas()

print(ssotypes['C'])

# NOTE(review): kept disabled as in the original; presumably only needed when
# the string columns of `ssoparams` come back as bytes -- confirm before use.
#ssoparams["denomination"] = ssoparams.denomination.str.decode("utf-8")
#ssoparams["parent_name"] = ssoparams.parent_name.str.decode("utf-8")

# Split the spectra into labelled training data and the spectra still to be
# classified (helper from reflspectratools).
training_spectra, training_labels, data_to_be_classified = prep_classifier_inputs(
    ssospectra, ssotypes
)

    source_id          solution_id  number_mp denomination  nb_samples  \
0 -4284967286  4167557769573408785          1        ceres          16   
1 -4284967286  4167557769573408785          1        ceres          16   
2 -4284967286  4167557769573408785          1        ceres          16   
3 -4284967286  4167557769573408785          1        ceres          16   
4 -4284967286  4167557769573408785          1        ceres          16   

   num_of_spectra  reflectance_spectrum  reflectance_spectrum_err  wavelength  \
0              19              0.875178                  0.000489       374.0   
1              19              0.935736                  0.000367       418.0   
2              19              0.956022                  0.000360       462.0   
3              19              0.984316                  0.000379       506.0   
4              19              1.000000                  0.000396       550.0   

   reflectance_spectrum_flag  reflectance_spectrum_normalized  
0   

In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,roc_auc_score,make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# No generator is supplied: `rng=None` is handed straight to
# prep_classifier_inputs.
rrng = None
(training_spectra,
 training_labels,
 data_to_be_classified) = prep_classifier_inputs(ssospectra, ssotypes, rng=rrng)

# Hold out part of the labelled spectra for testing, stratifying on the labels
# so that both subsets keep the same class balance.
x_train, x_test, y_train, y_test = train_test_split(
    training_spectra,
    training_labels,
    stratify=training_labels,
    random_state=22,
)
print(*map(len, (x_train, x_test, y_train, y_test)))

# Sanity check on the stratification: the two printed fractions should be
# nearly equal.
for split_labels in (y_train, y_test):
    print(sum(split_labels) / len(split_labels))



663 221 663 221
0.3559577677224736
0.3574660633484163


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the exhaustive search below. Every key has three
# candidate values, so the grid holds 3**8 = 6561 combinations, each of which
# is fitted once per cross-validation fold -- expect a long run time.
rs_param_grid = {
    # max_depth: maximum depth of each tree
    'max_depth': [3, 4, 5],
    # alpha: L1 regularisation term on the weights
    'alpha': [.5, .75, 1],
    # scale_pos_weight: relative weight of the positive class
    'scale_pos_weight': [1, 2, 3],
    # lambda: L2 regularisation term on the weights
    'lambda': [0, 1, 2],
    # gamma: minimum loss reduction required to make a further split
    'gamma': [0, 0.1, 0.2],
    # subsample: fraction of the training rows sampled for each tree
    'subsample': [.25, 0.5, .75],
    # learning_rate: boosting step size
    'learning_rate': [0.3, 0.4, 0.5],
    # n_estimators: number of boosting rounds
    'n_estimators': [8, 10, 12],

    # Recorded output of an earlier run (it lists only five of the keys above,
    # so it presumably predates the current grid):
    # Best parameters found:  {'alpha': 0.75, 'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 12, 'subsample': 0.5}
    # Best accuracy found:  0.924481658692185
}

# Instantiate the XGBoost classifier whose hyper-parameters are searched.
variable_model = xgb.XGBClassifier(objective='binary:logistic',
                                   seed=22,
                                   colsample_bytree=0.5,
                                   eval_metric='error')

# Exhaustive grid search (GridSearchCV, not a randomised search) with the
# default cross-validation and the estimator's default score.
random_searched_model = GridSearchCV(estimator=variable_model,
                                     param_grid=rs_param_grid,
                                     verbose=1)

# The search cross-validates on the training split only. The held-out test
# split is deliberately not passed as an eval_set: without early stopping it
# would not change the fitted models, it would only print a per-round log for
# every one of the thousands of fits.
random_searched_model.fit(x_train, y_train)

# Print the best parameters and highest accuracy
print("Best parameters found: ", random_searched_model.best_params_)
print("Best accuracy found: ", random_searched_model.best_score_)

# Use the parameters this search actually selected. The previous hard-coded
# dict ('n_estimators': 1) disagreed with the recorded best result
# ('n_estimators': 12) and omitted the other searched keys.
best_params = dict(random_searched_model.best_params_)

In [None]:
# Minimum predicted class-1 ("C") probability for an unlabelled spectrum to be
# kept as a C-type candidate.
C_TYPE_PROB_THRESHOLD = 0.52

# Final classifier built from the tuned hyper-parameters. `eval_set` and
# `verbose` are arguments of fit(), not of the constructor (where they would
# be forwarded to the booster as unknown parameters), so they are passed there.
optimized_model = xgb.XGBClassifier(objective='binary:logistic',
                                    seed=22,
                                    colsample_bytree=0.5,
                                    eval_metric="aucpr",
                                    early_stopping_rounds=10,
                                    **best_params)

# NOTE(review): early stopping monitors the same split that is used for the
# test confusion matrix below, so the reported test performance is optimistic;
# consider carving a separate validation split out of the training data.
optimized_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=True)
y_pred = optimized_model.predict(x_test)

# Confusion matrices for the labelled data: training split and test split.
cm = confusion_matrix(y_test, y_pred, labels=optimized_model.classes_)
cmtrain = confusion_matrix(y_train, optimized_model.predict(x_train), labels=optimized_model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Other', 'C'])
traindisp = ConfusionMatrixDisplay(confusion_matrix=cmtrain, display_labels=['Other', 'C'])
traindisp.plot()
traindisp.ax_.set_title("Training set")
disp.plot()
disp.ax_.set_title("Test set")
plt.show()

# Draw a tree of the fitted model on a wide canvas.
plt.rcParams['figure.figsize'] = (20.0, 8)
xgb.plot_tree(optimized_model)

# Classify the unlabelled spectra; column 1 of predict_proba is the
# probability of class 1.
improved_pred = optimized_model.predict(data_to_be_classified)
improved_prob = optimized_model.predict_proba(data_to_be_classified)[:, 1]
print(improved_prob)

# Demote predictions whose probability falls below the stricter threshold
# (boolean-mask form of the former element-by-element while loop).
improved_pred[improved_prob < C_TYPE_PROB_THRESHOLD] = 0

improved_ctypes = data_to_be_classified[improved_pred == 1]

print(improved_ctypes)

# Overlay every selected spectrum; the very low alpha lets the density of
# overlapping curves show up.
plt.figure(figsize=(10, 6))
plt.ylim(0.5, 2)
plt.xlim(0, 16)
for spectrum in improved_ctypes:
    plt.plot(spectrum, color='cyan', alpha=0.002)
plt.xlabel("Spectral sample index")
plt.ylabel("Reflectance")
plt.title("Spectra classified as C-type")

plt.show()