In [None]:
#### From the NASA Kepler exoplanet webpage (https://exoplanetarchive.ipac.caltech.edu/docs/API_kepcandidate_columns.html)
We are looking at data from the Kepler spacecraft which was the first 
space mission dedicated to the search for Earth-sized and smaller planets in the habitable zone of other stars in 
our neighborhood of the galaxy. Kepler was a special-purpose spacecraft that precisely measured the light variations 
from thousands of distant stars, looking for planetary transits. When a planet passes in front of its parent star, as 
seen from our solar system, it blocks a small fraction of the light from that star; this is known as a transit.

>  Kepler Object of Interest (KOI). A KOI is a target identified by the Kepler Project that displays at least 
>  one transit-like sequence within Kepler time-series photometry that appears to be of astrophysical origin and 
>  initially consistent with a planetary transit hypothesis.

> OUTPUT or "target" of analysis
* koi_disposition = category describing whether KOI is a candidate, confirmed planet, false 
positive, or not dispositioned 

> FLAGS are set when transit is not planetary in nature
>* koi_fpflag_nt = not transit like flag.  A KOI whose light curve is not consistent with that of a transiting planet. 
This includes, but is not limited to, instrumental artifacts, non-eclipsing variable stars, and spurious (very low SNR) detections.

>* koi_fpflag_ss = stellar eclipse flag.  A KOI that is observed to have a significant secondary event, transit shape, or 
out-of-eclipse variability, which indicates that the transit-like event is most likely caused by an eclipsing binary.

>* koi_fpflag_co = centroid offset flag. The source of the signal is from a nearby star.

>* koi_fpflag_ec = ephemeris match indicates contamination. The KOI shares the same period and epoch as another object and 
is judged to be the result of flux contamination in the aperture or electronic crosstalk.

> TIMES associated with the transit
>* koi_period = orbital period. The interval between consecutive planetary transits.

>* koi_time0bk = transit epoch. The time corresponding to the center of the first detected transit in Barycentric Julian Day (BJD) minus a constant offset of 2,454,833.0 days. 
The offset corresponds to 12:00 on Jan 1, 2009 UTC. Other columns have error for this parameter

> PHYSICAL parameters of star being transitted
>* koi_steff = stellar effective temperature. The photospheric temperature of the star.

>* koi_slogg = stellar surface gravity. The base-10 logarithm of the acceleration due to gravity at the surface of the star.

>* koi_srad = stellar radius. The photospheric radius of the star.

> ORBITAL geometry and location
>* ra = right ascension
>* dec = declination
>* koi_kepmap = Kepler-band

#### HOUSEKEEPING
##### Update sklearn to prevent version mismatches
pip install sklearn --upgrade
##### Install joblib. This will be used to save your model. Restart your kernel after installing 
pip install joblib

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

### Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


#### From the column definitions, 
We note that the parameters which most likely indicate a exoplanet will involve size of anomaly and periodicity.  
Therefore, we choose the initial set of features to include the TIME, PHYSICAL, and ORBITAL parameters.  
Also, include all of the flags that indicate some type of contamination.

In [73]:
# Set features. This will also be used as your x values.
selected_features = df[['koi_fpflag_nt','koi_fpflag_ss','koi_fpflag_co','koi_fpflag_ec','koi_period', 'koi_time0bk', 'koi_slogg', 'koi_steff', 'koi_srad', 'koi_kepmag', 'koi_disposition']]

In [75]:
selected_features.tail()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_slogg,koi_steff,koi_srad,koi_kepmag,koi_disposition
6986,0,0,0,1,8.589871,132.0161,4.296,5638,1.088,14.478,FALSE POSITIVE
6987,0,1,1,0,0.527699,131.705093,4.529,5638,0.903,14.082,FALSE POSITIVE
6988,0,0,0,0,1.739849,133.00127,4.444,6119,1.031,14.757,CANDIDATE
6989,0,0,1,0,0.681402,132.18175,4.447,6173,1.041,15.385,FALSE POSITIVE
6990,0,0,1,1,4.856035,135.9933,4.385,6469,1.193,14.826,FALSE POSITIVE


#### Create a Train Test Split
Note there are 6990 records in the data set from the cell above.
Use `koi_disposition` for the y ("target") values

In [76]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# from tensorflow.keras.utils import to_categorical

#assign x and y values
X = selected_features
X = selected_features.drop("koi_disposition", axis=1)
y = selected_features["koi_disposition"]

feature_names = X.columns

#split training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=63, stratify=y)

In [77]:
X_train.tail()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_slogg,koi_steff,koi_srad,koi_kepmag
1092,0,0,0,0,24.757765,142.06826,4.487,6106,0.966,14.49
3784,1,0,0,0,0.867721,132.161646,4.186,5510,1.276,16.09
4823,0,1,0,0,2.174915,131.93954,4.143,6792,1.636,14.281
5592,0,1,0,0,0.540246,131.5521,4.249,7296,1.498,13.681
2614,0,0,0,0,5.374933,170.93379,4.037,6167,1.689,13.294


### Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [78]:
### Scale your data
from sklearn.preprocessing import StandardScaler

X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Support vector machines (SVMs) 
A set of supervised learning methods used for classification, 
regression and outliers detection.

In [79]:
### Train the Model

from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)
predictions = model.predict(X_test)


In [80]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.7875262254434484
Testing Data Score: 0.7911899313501144


### Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [81]:
### Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0005, 0.0001, 0.005, .001]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [82]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.784, total=   0.1s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.789, total=   0.1s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.786, total=   0.1s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.784, total=   0.1s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.789, total=   0.1s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.786, total=   0.1s
[CV] C=1, gamma=0.005 ................................................
[CV] ...........

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:    4.1s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 5, 10, 50],
                         'gamma': [0.0005, 0.0001, 0.005, 0.001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [83]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 50, 'gamma': 0.0005}
0.789624260919321


In [84]:
# Calculate classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test_scaled)
print(classification_report(y_test, predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.84      0.20      0.33       422
     CONFIRMED       0.56      0.94      0.70       450
FALSE POSITIVE       0.98      1.00      0.99       876

      accuracy                           0.79      1748
     macro avg       0.79      0.71      0.67      1748
  weighted avg       0.84      0.79      0.76      1748



# Save the Model

In [85]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'Model1_Colton.sav'
joblib.dump(model, filename)

['Model1_Colton.sav']