# Classification and Distribution of M-Type Asteroids
## Introduction to Data Science (MATH 4100/COMP 5360) Final Project
## Matt Storie and Ian Wixom
---
## Part 0: Setup and Reading in Data

In [1]:
#Import and Setup
import pandas as pd
import scipy as sc
import numpy as np
import seaborn as sns
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import scale

#%matplotlib notebook
import matplotlib.pyplot as plt 
plt.style.use('ggplot')
%matplotlib inline  
plt.rcParams['figure.figsize'] = (10, 6) 

#Custom Functions for use later
#Increases print output for one printing
def print_all(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')


In [44]:
#Imports are large so I am keeping them in their own cell for now
astro1 = pd.read_csv('C:/Users/matts/ds/Classification-and-Analysis-of-M-type-Asteroids/split_astro_ds_1.csv', low_memory = False)
astro2 = pd.read_csv('C:/Users/matts/ds/Classification-and-Analysis-of-M-type-Asteroids/split_astro_ds_2.csv', low_memory = False)

In [45]:
# Data Frame Formatting
# The Second file did not have all of the same columns, so we are adding them now
# and filling missing values with NA
astro2.columns = ['a', 'e', 'i', 'om', 'w', 'q', 'ad', 'per_y', 'n_obs_used', 'H',
       'diameter', 'extent', 'albedo', 'rot_per']
astro2['GM'] = np.nan
astro2['BV'] = np.nan
astro2['UB'] = np.nan
astro2['IR'] = np.nan
astro2['spec_B'] = np.nan
astro2['spec_T'] = np.nan
astro = pd.concat([astro1, astro2])

## Part 1 Classification of Unclassified Asteroids
Before we can explore the full set, we are going to need to train a model to classify the unclassified asteroids of the set before exploring the full set. 

In [55]:
# Now that the frames are combined we can perform exploratory analysis
# print(astro.head())

# Examining the number of unclassified asteroids of the dataset
tot_unclassified = len(astro[(astro['spec_T'].isnull()) & (astro['spec_B'].isnull())])
print("There are",len(astro),"total observed objects in the dataset")
print((len(astro)-tot_unclassified), "have been classified, while",tot_unclassified,"remain unclassified.")

# Examing distribution of the classified asteroids
print("---\nSMASS Value Counts")
print(astro['spec_B'].value_counts())
print("---\nTholen Value Counts")
print_all(astro['spec_T'].value_counts())

# astro.head()

# From this we can see that there are some other M class asteroids in combinded classes,
# However, including these only increases the sample from 37 to 41, and includes ambigous
# and/or noisy cases, which would not benefit our model


# astro['diameter'].describe()
# mtype = astro.loc[astro['spec_T'] == "M"]
# mtype['albedo'].describe()
# astro['spec_B'].unique()
# uniques = astro['spec_T'].unique()
# for x in uniques:
#     print(type(x))
#     print(x)


# metal = astro.loc[(astro['spec_T'].str.contains('M')) | (astro['spec_B'].str.contains('X'))]
# metal.shape
# metal = astro.loc[(astro['spec_T'] == "M") | (astro['spec_B'] == 'X') |
#                  (astro['spec_B'] == 'Xc') | (astro['spec_B'] == 'Xe')]
# metal.shape

# unknowns = astro.loc[(astro['spec_T'].isnull()) & (astro['spec_B'].isnull())]
# print(unknowns.shape)

# astro['spec_T'].unique()
# print(astro.shape)
# print(metal.shape)
# astro[astro['spec_T'] == 'nan']
# print(*astro['spec_T'].value_counts())

# astro['n_obs_used'].describe()

There are 1048573 total observed objects in the dataset
1742 have been classified, while 1046831 remain unclassified.
---
SMASS Value Counts
S      348
C      139
Ch     135
X      112
Xc      59
B       56
Sl      47
Sq      44
Xk      39
V       35
L       33
K       31
Cb      30
Sa      29
Xe      24
Cgh     15
T       13
Sk      12
A       12
Ld      11
Sr      11
Cg       9
D        8
R        4
O        1
U        1
Name: spec_B, dtype: int64
---
Tholen Value Counts
S         311
C         137
X          51
M          37
D          33
P          33
F          27
XC         22
CX         21
C:         10
E          10
CP         10
G           9
B           8
FC          7
SU          7
PC          6
T           6
CU          5
CX:         5
A           5
ST          5
XFU         5
DU          4
CSU         4
FXU:        4
CG          4
CF          4
CPF         3
DX          3
BU          3
MU          3
GC          3
DP          3
XD          3
CD          3
I           3
XDC 

Now that we have an idea of the distribution, lets subset the metallic identified asteroids, an equal amount of non-metallic asteroids, and use them to build and test a classification model

### Note: Revisit later and see if we can just use the entire classified set as the training. Do the metallic and non-metallic need to be equal?

In [80]:
# Subsetting metal asteroids and getting an equal amount of classified, nonmetal asteroids
classified = astro.loc[(((astro['spec_T'].notnull()) | (astro['spec_B'].notnull())) & astro['albedo'].notnull())]
metal = classified.loc[(classified['spec_T'] == "M") | (classified['spec_B'] == 'X') | 
                   (classified['spec_B'] == 'S') | (classified['spec_T'] == 'S')]
nonmetal = classified.loc[(classified['spec_T'] != "M") & (classified['spec_B'] != 'X') &
                 (classified['spec_B'] != 'S') & (classified['spec_T'] != 'S')]

# ((astro['spec_T'].notnull()) | (astro['spec_B'].notnull()) & astro['albedo'].notnull())
# (astro['spec_T'].notnull()) | (astro['spec_B'].notnull())


# Sampling from the nonmetal dataset equally
nonmetal_sample = nonmetal.sample(n=(len(metal)))
metal.insert(20, "m_id", '1')
pd.to_numeric(metal["m_id"])
nonmetal_sample.insert(20, "m_id", '0')
pd.to_numeric(nonmetal_sample["m_id"])
training_set = pd.concat([metal, nonmetal_sample])



In [83]:
# X.head()
# Creating the model
y_var = training_set['m_id'].to_numpy()
np.reshape(y_var, (1, -1))
X_var = training_set['albedo'].to_numpy()
X_var = scale(X_var)

## Below this point is not currently working.
x_train, x_test, y_train, y_test = train_test_split(X_var, y_var, random_state=1, test_size=0.8)

x_train = np.reshape(x_train, ((len(x_train), 1)))
# y_train = np.reshape(y_train, ((len(y_train), 1)))
x_test = np.reshape(x_test, ((len(x_test), 1)))
# y_test = np.reshape(y_test, ((len(y_test), 1)))

svm_train = svm.SVC(kernel="rbf", C=100, gamma=0.01)
svm_train.fit(x_train, y_train)
test_pred = svm_train.predict(x_test)
print(metrics.confusion_matrix(y_true = y_test, y_pred = test_pred))
print(metrics.accuracy_score(y_true = y_test, y_pred = test_pred))

[[395 147]
 [ 89 459]]
0.7834862385321101
