In [None]:
import warnings

# Suppress only the warnings whose text starts with the message below; every
# other warning is still reported.
# NOTE(review): presumably raised when a 2-D label column is passed to an
# estimator's fit() -- confirm it is still needed now that labels are ravelled.
warnings.filterwarnings("ignore", message="A column-vector y was passed when a 1d array was expected")


from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import time
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

import plotly.graph_objects as go
from plotly.graph_objs import *


data_dir = "Data"

# Components stored on disk for every galaxy, one .npy file per component.
_GALAXY_PARTS = ('Arms', 'CenterGlob', 'Stars', 'tooClose')


def _load_galaxy(prefix, directory=data_dir):
    """Load every component array of one galaxy into a dict.

    Reads ``<directory>/<prefix>_<part>.npy`` for each part listed in
    ``_GALAXY_PARTS``, in that order.

    Args:
        prefix: Galaxy name used as the file-name prefix, e.g. ``'GFFA'``.
        directory: Folder that contains the ``.npy`` files.

    Returns:
        Dict mapping each part name to the array returned by ``np.load``.
    """
    return {
        part: np.load('{0:s}/{1:s}_{2:s}.npy'.format(directory, prefix, part))
        for part in _GALAXY_PARTS
    }


XenoSupermanGalaxy = _load_galaxy('XenoSupermanGalaxy')
GFFA = _load_galaxy('GFFA')

# Flat per-component names, kept so code that refers to them keeps working.
XenoSupermanGalaxy_Arms = XenoSupermanGalaxy['Arms']
XenoSupermanGalaxy_CenterGlob = XenoSupermanGalaxy['CenterGlob']
XenoSupermanGalaxy_Stars = XenoSupermanGalaxy['Stars']
XenoSupermanGalaxy_tooClose = XenoSupermanGalaxy['tooClose']

GFFA_Arms = GFFA['Arms']
GFFA_CenterGlob = GFFA['CenterGlob']
GFFA_Stars = GFFA['Stars']
GFFA_tooClose = GFFA['tooClose']

print(GFFA['Stars'].shape,  XenoSupermanGalaxy['Stars'].shape)

plt.style.use('dark_background')

# Balanced dataset: the same number of stars is taken from each galaxy, limited
# by the smaller of the two star arrays.
TrainingSize = min(len(GFFA['Stars']), len(XenoSupermanGalaxy['Stars']))

# Both galaxies stacked together per component, GFFA rows first.
collision = {
    part: np.vstack((GFFA[part].copy(), XenoSupermanGalaxy[part].copy()))
    for part in ('Arms', 'CenterGlob', 'Stars')
}
print(collision['Stars'].shape)

# Row indices of the stars to keep, drawn without replacement
# (XenoSupermanGalaxy is sampled first, then GFFA).
n_xeno_stars = len(XenoSupermanGalaxy['Stars'])
XenoIndex = np.random.choice(n_xeno_stars, TrainingSize, replace=False)
n_gffa_stars = len(GFFA['Stars'])
GFFAIndex = np.random.choice(n_gffa_stars, TrainingSize, replace=False)

# Stack the sampled stars: GFFA first, XenoSupermanGalaxy second.
gffa_sample = GFFA['Stars'][GFFAIndex]
xeno_sample = XenoSupermanGalaxy['Stars'][XenoIndex]
trainGalaxy = np.vstack((gffa_sample, xeno_sample))

# One label for each item in the combined set, in the same order as the rows of
# trainGalaxy: class 0 is GFFA (first half), class 1 is XenoSupermanGalaxy.
y = [0] * TrainingSize + [1] * TrainingSize

# Roberto's add { ################################################################
# x / y / z coordinates of every star in trainGalaxy as plain lists of Python
# ints; integers are selected to make the html files smaller.
# A single NumPy cast replaces a Python-level loop over every star. Like int(),
# astype() truncates toward zero (assumes all coordinates are finite).
all_coords_x, all_coords_y, all_coords_z = (
    trainGalaxy[:, :3].astype(np.int64).T.tolist()
)
# } Roberto's add ################################################################

# Number of neighbours for the k-NN classifier (also echoed in each run report).
K = 3

# Classifiers compared in the sweep, keyed by the name used in the reports.
myModels = {
    'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=K),
    'RandomForestClassifier': RandomForestClassifier(n_jobs=2, random_state=0),
}

# Fractions of the data used for training, swept one by one. The train/test
# split itself is made inside the sweep loop, once per fraction.
# NOTE: this rebinds TrainingSize, which until here held the number of stars
# sampled from each galaxy.
TrainingSize = [.001, .01, .03, .05, .1, .2, .5, .8]
# TrainingSize = [.001, .05, .1, .5, .8]

# Best run seen so far and its score; filled in by the sweep loop.
bestScore = {}
hi = 0

# Roberto's add { ################################################################
# Layout shared by every figure: grey-level 20 paper and grey-level 90 plot area.
# NOTE(review): the same number is also substituted for the alpha channel of the
# rgba() string -- confirm that is intended.
layout = Layout(
    paper_bgcolor='rgba({0:d},{0:d},{0:d},{0:d})'.format(20),
    plot_bgcolor='rgba({0:d},{0:d},{0:d},{0:d})'.format(90),
)
small = 1.2  # marker size of the predicted-star traces
name_legend = ['GFFA', 'Xeno']
# Class label of each galaxy (the key was misspelled 'GGFA').
galaxy_id = {'GFFA': 0, 'Xeno': 1}

method_to_plot = 'RandomForestClassifier'  # as an exercise try also KNeighborsClassifier

# Output-file prefix and legend title of every classifier that can be plotted.
_PLOT_LABELS = {
    'RandomForestClassifier': ('RnFo', 'Random Forest Classifier'),
    'KNeighborsClassifier': ('KNei', 'K-Neighbors Classifier'),
}
if method_to_plot not in _PLOT_LABELS:
    # Fail here with a clear message instead of a NameError on f_prefix /
    # lgnd_title much later, when the first figure is written.
    raise ValueError(
        'Unknown method_to_plot {0!r}; expected one of {1}'.format(
            method_to_plot, sorted(_PLOT_LABELS)))
f_prefix, lgnd_title = _PLOT_LABELS[method_to_plot]

def PLOT_TRACE(fig, x_test, glx_name):
    """Add a 3-D scatter trace of the test stars predicted as one galaxy.

    Besides its arguments, this function reads the module-level names
    ``y_pred`` (predicted labels), ``tsz`` and ``roc`` (both shown in the
    legend entry) and ``small`` (marker size), so those must already be set
    for the current run when it is called.

    Args:
        fig: Figure that receives the trace; it is modified in place.
        x_test: 2-D array-like of star positions. Row ``i`` is matched with
            ``y_pred[i]``; the first three columns are used as x, y and z.
        glx_name: ``'GFFA'`` (class 0) or ``'Xeno'`` (class 1).

    Returns:
        The same ``fig`` object, for call chaining.

    Raises:
        ValueError: If ``glx_name`` is not one of the known galaxy names.
    """
    class_of = {'GFFA': 0, 'Xeno': 1}
    if glx_name not in class_of:
        raise ValueError(
            'Unknown galaxy name {0!r}; expected one of {1}'.format(
                glx_name, sorted(class_of)))
    glx_id = class_of[glx_name]

    predicted = np.asarray(y_pred)
    # Only rows that have a prediction are considered.
    stars = np.asarray(x_test)[:len(predicted)]
    selected = stars[predicted == glx_id]
    # Truncate to integers (as int() would) and split into plain Python lists.
    coord_x, coord_y, coord_z = selected[:, :3].astype(np.int64).T.tolist()

    # NOTE(review): tsz is passed to train_test_split as train_size, so the
    # "% for testing" wording in the legend may be misleading -- confirm.
    fig.add_trace(go.Scatter3d(x=coord_x, y=coord_y, z=coord_z,
                               mode='markers',
                               name='Predicted as {0:s} w/ {1:0.1f}% for testing->Acc={2:0.1f}%'
                               .format(glx_name, 100 * tsz, 100 * roc),
                               marker=dict(size=small)))
    return fig
# } Roberto's add ################################################################

# Styling applied to each of the three axes of every figure's 3-D scene.
axis_style = dict(backgroundcolor="rgba(0, 0, 0,0)", gridcolor="white",
                  showbackground=True, zerolinecolor="white")

# Sweep over the training fractions: split, fit every model, report its score,
# plot the predictions of the selected model and remember the best run.
for tsz in TrainingSize:
    # New random split for this training fraction.
    x_train, x_test, y_train, y_test = train_test_split(
        trainGalaxy, np.array(y), train_size=tsz)
    y_train, y_test = y_train.ravel(), y_test.ravel()

    # Roberto's add { ############################################################
    # One figure per training fraction; every star is drawn first, in white.
    fig = go.Figure(layout=layout)
    all_stars_trace = go.Scatter3d(x=all_coords_x, y=all_coords_y, z=all_coords_z,
                                   mode='markers',
                                   name='All stars - initial data',
                                   marker=dict(size=1.5,  color='white'))
    fig.add_trace(all_stars_trace)
    # } Roberto's add ############################################################

    for name, model in myModels.items():
        start = time.time()

        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)

        print('Results of {} classification'.format(name))
        print('  K: ', K)
        print('  Training size: ', tsz)
        print('  y_train.shape: ', y_train.shape)
        # NOTE(review): roc_auc_score receives the hard class predictions here,
        # not probabilities or decision scores -- confirm this is intended.
        roc = roc_auc_score(y_test, y_pred)
        print('  roc_auc_score: {:4.1f}'.format(100 * roc))
        elapsed = time.time() - start
        print('  Time: {:5.1f} sec\n'.format(elapsed))

        # Roberto's add { ########################################################
        # PLOT_TRACE reads the current y_pred, tsz and roc from module scope.
        if name == method_to_plot:
            for galaxy in ('GFFA', 'Xeno'):
                fig = PLOT_TRACE(fig, x_test, galaxy)
        # } Roberto's add ########################################################

        # Keep the run with the highest score over all models and fractions.
        if roc > hi:
            hi = roc
            # NOTE(review): precision is stored as a percentage while recall
            # and roc are stored as fractions -- confirm the intended units.
            best_confusion = confusion_matrix(y_test, y_pred)
            best_precision = 100 * precision_score(y_test, y_pred, average='binary')
            best_recall = recall_score(y_test, y_pred, average='binary')
            bestScore = {'name': name,
                         'roc': roc,
                         'trainingSize': tsz,
                         'confusionMatrix': best_confusion,
                         'precision': best_precision,
                         'recall': best_recall}

    # Roberto's add { ############################################################
    # Style the scene and legend, then save this fraction's figure as html.
    fig.update_layout(
        scene={axis: dict(axis_style) for axis in ('xaxis', 'yaxis', 'zaxis')},
        legend=dict(bgcolor='grey', font=dict(size=20), title=lgnd_title),
    )
    html_name = "{0:s}_{1:02.1f}.html".format(f_prefix, tsz*100)
    fig.write_html(html_name)
    # } Roberto's add ############################################################

# Report the run with the highest roc_auc_score found by the sweep.
for label, key in (
        ('bestScore: name', 'name'),
        ('bestScore: confusion Matrix', 'confusionMatrix'),
        ('bestScore: precision', 'precision'),
        ('bestScore: recall', 'recall'),
        ('bestScore: roc', 'roc'),
):
    print(label, bestScore[key])