In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade


Requirement already up-to-date: sklearn in /Applications/anaconda3/lib/python3.7/site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib




In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Load the exoplanet table and clean it in a single chained expression.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')  # remove columns whose values are all null
    .dropna()                           # then remove every row that still has a null
)
df.head()


Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [3]:
# Number of rows left after cleaning.
df.shape[0]


6991

In [4]:
# How many rows fall into each target class.
disposition = df["koi_disposition"]
disposition.value_counts()


FALSE POSITIVE    3504
CONFIRMED         1800
CANDIDATE         1687
Name: koi_disposition, dtype: int64

# Select your features (columns)

In [5]:
# Columns used as model inputs; the resulting frame is also the X values.
feature_columns = [
    'koi_fpflag_nt',
    'koi_fpflag_ss',
    'koi_fpflag_co',
    'koi_fpflag_ec',
    'koi_period',
    'koi_time0bk',
    'koi_slogg',
    'koi_srad',
    'ra',
    'dec',
    'koi_kepmag',
]
selected_features = df[feature_columns]

# Create a Train Test Split

Use `koi_disposition` for the y values

In [6]:
# Inputs are the selected feature columns; the target is the disposition label.
X, y = selected_features, df["koi_disposition"]
print(X.shape, y.shape)


(6991, 11) (6991,)


In [7]:
from sklearn.model_selection import train_test_split

# Split into train/test sets. `stratify=y` stratifies the split on the labels
# and `random_state=1` fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)


In [8]:
# Preview the first five training rows.
X_train.head(n=5)


Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_slogg,koi_srad,ra,dec,koi_kepmag
4002,0,0,1,0,99.673478,219.33483,4.777,0.492,293.05801,45.248821,15.801
4246,0,1,0,0,0.592244,131.654831,4.664,0.591,290.28094,45.46426,15.653
548,0,1,1,0,9.991625,137.447816,4.338,1.096,301.04239,45.022888,14.039
3953,0,1,0,0,178.41299,218.225235,4.346,1.148,288.32785,38.627621,13.944
2362,0,0,0,0,45.294223,138.678725,4.347,1.044,285.67938,50.241299,10.961


In [9]:
# Number of rows in the training split.
X_train.shape[0]


5243

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [22]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical


In [23]:
# Fit the min-max scaler on the training features only.
X_scaler = MinMaxScaler()
X_scaler = X_scaler.fit(X_train)  # fit() hands back the fitted scaler


In [24]:
# Apply the fitted X_scaler to the training and the testing features.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)


In [None]:
# Step 1: Label-encode the targets. The encoder is fitted on the training
# labels and then applied to both splits.
label_encoder = LabelEncoder().fit(y_train)
encoded_y_train, encoded_y_test = (
    label_encoder.transform(labels) for labels in (y_train, y_test)
)


# Train the Model



In [25]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel; all other settings are defaults.
svc_settings = {'kernel': 'linear'}
model = SVC(**svc_settings)


In [26]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters for the grid search over `model`.
# (A wider gamma grid used to be assigned here and was immediately overwritten;
# that dead assignment has been removed.)
# NOTE(review): the scikit-learn SVC docs describe `gamma` as the coefficient for
# the 'rbf', 'poly' and 'sigmoid' kernels, so it presumably has no effect on a
# linear-kernel SVC. It is kept so the search matches the recorded run — confirm
# before widening it.
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001]}
grid = GridSearchCV(model, param_grid, verbose=3)


In [44]:
# Run the grid search on the scaled training features and the string labels.
grid.fit(X=X_train_scaled, y=y_train)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.807, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.784, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.6s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.789, total=   0.3s
[CV] C=5, gamma=0.0001 ...............................................
[CV] ................... C=5, gamma=0.0001, score=0.807, total=   0.3s
[CV] C=5, gamma=0.0001 ...............................................
[CV] ................... C=5, gamma=0.0001, score=0.781, total=   0.2s
[CV] C=5, gamma=0.0001 ...............................................
[CV] ................... C=5, gamma=0.0001, score=0.788, total=   0.3s
[CV] C=10, gamma=0.0001 ..............................................
[CV] .................. C=10, gamma=0.0001, score=0.807, total=   0.2s
[CV] C=10, gamma=0.0001 ..............................................
[CV] .................. C=10, gamma=0.0001, score=0.782, total=   0.2s
[CV] C=10, gamma=0.0001 ..............................................
[CV] .................. C=10, gamma=0.0001, score=0.787, total=   0.2s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    2.4s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [45]:
# Parameter combination selected by the grid search.
best_params = grid.best_params_
print(best_params)


{'C': 1, 'gamma': 0.0001}


In [46]:
# Cross-validation score of the selected parameter combination.
best_score = grid.best_score_
print(best_score)


0.793438870875453


In [47]:
# Predict on the scaled test set, then show the first ten predictions
# followed by the first ten true labels.
predictions = grid.predict(X_test_scaled)
for sample in (predictions, y_test):
    print(sample[:10])

['CONFIRMED' 'FALSE POSITIVE' 'FALSE POSITIVE' 'CANDIDATE'
 'FALSE POSITIVE' 'CANDIDATE' 'FALSE POSITIVE' 'FALSE POSITIVE'
 'CANDIDATE' 'FALSE POSITIVE']
1981         CANDIDATE
5609    FALSE POSITIVE
532     FALSE POSITIVE
6558         CANDIDATE
1249    FALSE POSITIVE
237          CONFIRMED
3247    FALSE POSITIVE
6859    FALSE POSITIVE
1687         CONFIRMED
1143         CONFIRMED
Name: koi_disposition, dtype: object


In [48]:
# Score of the tuned SVC on each split.
train_score = grid.score(X_train_scaled, y_train)
test_score = grid.score(X_test_scaled, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")


Training Data Score: 0.7921037573908067
Testing Data Score: 0.7877574370709383


# Hyperparameter Tuning

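Use `GridSearchCV` to tune the model's parameters (done for the SVC in the section above). The cells below train a Keras neural network on one-hot-encoded labels for comparison.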

In [49]:
# Step 1: Label-encode the targets (encoder fitted on the training labels).
label_encoder = LabelEncoder().fit(y_train)
encoded_y_train, encoded_y_test = map(label_encoder.transform, (y_train, y_test))

In [50]:
# Step 2: Turn the integer-encoded labels into one-hot vectors.
y_train_categorical, y_test_categorical = (
    to_categorical(encoded) for encoded in (encoded_y_train, encoded_y_test)
)

In [53]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Feed-forward network: 11 inputs -> two 10-unit ReLU layers -> 3-way softmax.
# Note that this rebinds `model`, which previously held the SVC.
model = Sequential([
    Dense(units=10, activation='relu', input_dim=11),
    Dense(units=10, activation='relu'),
    Dense(units=3, activation='softmax'),
])

In [54]:
# Configure training: Adam optimizer, categorical cross-entropy, accuracy metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Train for 100 epochs on the scaled features and one-hot labels, silently.
model.fit(
    x=X_train_scaled,
    y=y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=0,
)

<tensorflow.python.keras.callbacks.History at 0x1c35507690>

In [55]:
# Evaluate on the held-out test set; evaluate() yields the loss and the accuracy.
evaluation = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

1748/1748 - 0s - loss: 0.3749 - acc: 0.8015
Normal Neural Network - Loss: 0.3748872145089732, Accuracy: 0.801487386226654


In [65]:
# `Sequential.predict_classes` is deprecated and has been removed from newer
# tf.keras releases. For this softmax output the predicted class is the index
# of the largest probability, so take the arg-max over the last axis instead.
class_probabilities = model.predict(X_test_scaled)
predictions = class_probabilities.argmax(axis=-1)
# Show the first twenty integer class predictions next to the true labels.
print(predictions[:20])
print(y_test[:20])

[0 2 2 0 2 0 2 2 0 2 2 2 0 2 1 0 2 2 2 2]
1981         CANDIDATE
5609    FALSE POSITIVE
532     FALSE POSITIVE
6558         CANDIDATE
1249    FALSE POSITIVE
237          CONFIRMED
3247    FALSE POSITIVE
6859    FALSE POSITIVE
1687         CONFIRMED
1143         CONFIRMED
6956    FALSE POSITIVE
3078    FALSE POSITIVE
3945         CONFIRMED
4304    FALSE POSITIVE
2493         CONFIRMED
3618         CANDIDATE
281     FALSE POSITIVE
5131    FALSE POSITIVE
4016    FALSE POSITIVE
647     FALSE POSITIVE
Name: koi_disposition, dtype: object


In [63]:
# Map the integer predictions back to their string labels. Despite its name,
# `encoded_results` holds the decoded labels.
encoded_results = label_encoder.inverse_transform(predictions)
for sample in (encoded_results, y_test):
    print(sample[:20])

['CANDIDATE' 'FALSE POSITIVE' 'FALSE POSITIVE' 'CANDIDATE'
 'FALSE POSITIVE' 'CANDIDATE' 'FALSE POSITIVE' 'FALSE POSITIVE'
 'CANDIDATE' 'FALSE POSITIVE' 'FALSE POSITIVE' 'FALSE POSITIVE'
 'CANDIDATE' 'FALSE POSITIVE' 'CONFIRMED' 'CANDIDATE' 'FALSE POSITIVE'
 'FALSE POSITIVE' 'FALSE POSITIVE' 'FALSE POSITIVE']
1981         CANDIDATE
5609    FALSE POSITIVE
532     FALSE POSITIVE
6558         CANDIDATE
1249    FALSE POSITIVE
237          CONFIRMED
3247    FALSE POSITIVE
6859    FALSE POSITIVE
1687         CONFIRMED
1143         CONFIRMED
6956    FALSE POSITIVE
3078    FALSE POSITIVE
3945         CONFIRMED
4304    FALSE POSITIVE
2493         CONFIRMED
3618         CANDIDATE
281     FALSE POSITIVE
5131    FALSE POSITIVE
4016    FALSE POSITIVE
647     FALSE POSITIVE
Name: koi_disposition, dtype: object


# Save the Model

In [68]:
# Template for saving a model with joblib: replace "your_name" with your name
# and "your_model" with your model variable, and be sure to turn the file in to BCS.
# If joblib fails to import, try running the install command in terminal/git-bash.
# The joblib calls below are left commented out; the following cell saves the
# Keras model with `model.save(...)` instead.
# import joblib
# filename = 'DeepModel.sav'
# joblib.dump(model, filename)

In [67]:
# Persist the trained Keras network to an HDF5 file.
model_path = "DeepModel.h5"
model.save(model_path)

In [69]:
# Build a logistic-regression classifier with default settings.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
# Bare expression so the notebook displays the estimator's parameters.
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [70]:
# Fit the logistic regression on the scaled training features and string labels.
classifier.fit(X=X_train_scaled, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [71]:
# Score of the logistic regression on each split.
train_score = classifier.score(X_train_scaled, y_train)
test_score = classifier.score(X_test_scaled, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.7947739843600992
Testing Data Score: 0.801487414187643


In [72]:
# Tabulate the predicted label next to the true label for each test row.
predictions = classifier.predict(X_test_scaled)
comparison = {"Prediction": predictions, "Actual": y_test}
pd.DataFrame(comparison)

Unnamed: 0,Prediction,Actual
1981,CONFIRMED,CANDIDATE
5609,FALSE POSITIVE,FALSE POSITIVE
532,FALSE POSITIVE,FALSE POSITIVE
6558,CANDIDATE,CANDIDATE
1249,FALSE POSITIVE,FALSE POSITIVE
...,...,...
2516,CONFIRMED,CONFIRMED
322,FALSE POSITIVE,FALSE POSITIVE
1154,CANDIDATE,CONFIRMED
1696,CONFIRMED,CONFIRMED
