In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in c:\users\esc\anaconda3\envs\pythondata\lib\site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Read the CSV and Perform Basic Data Cleaning

In [4]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [5]:
# Set features. This will also be used as your x values.
selected_features = df[['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_prad', 'koi_teq', 'koi_insol', 'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_slogg', 'koi_srad', 'ra', 'dec', 'koi_kepmag']]
# selected_features = df[['koi_model_snr', 'koi_fpflag_nt', 'koi_fpflag_co', 'koi_fpflag_ss', 'koi_prad', 'koi_depth']]
selected_features.head()
X = selected_features
y = df[['koi_disposition']]
print(X.shape, y.shape)

(6991, 20) (6991, 1)


# Create a Train Test Split

Use `koi_disposition` for the y values

In [6]:
from sklearn.model_selection import train_test_split

# y = pd.get_dummies(y)
# X = pd.get_dummies(X)
# X = pd.get_dummies(X).iloc[:,1:]
# y = pd.get_dummies(y).iloc[:,1:]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [7]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
6122,0,0,0,0,6.768901,133.07724,0.15,3.616,123.1,1.24,1017,253.3,10.8,1,5737,4.327,1.125,294.40472,39.351681,14.725
6370,0,1,0,1,0.733726,132.02005,0.291,2.309,114.6,0.86,1867,2891.64,13.8,1,5855,4.578,0.797,284.50391,42.46386,15.77
2879,1,0,0,0,7.652707,134.46038,0.97,79.8969,641.1,3.21,989,226.81,254.3,1,6328,4.481,0.963,295.50211,38.98354,13.099
107,0,0,0,0,7.953547,174.66224,0.3,2.6312,875.4,2.25,696,55.37,38.4,1,4768,4.536,0.779,291.15878,40.750271,15.66
29,0,0,0,0,4.959319,172.258529,0.831,2.22739,9802.0,12.21,1103,349.4,696.5,1,5712,4.359,1.082,292.16705,48.727589,15.263


In [8]:
y_train.head()

Unnamed: 0,koi_disposition
6122,CANDIDATE
6370,FALSE POSITIVE
2879,FALSE POSITIVE
107,CONFIRMED
29,CANDIDATE


In [9]:
print(X_train.shape, y_train.shape)

(5243, 20) (5243, 1)


In [10]:
print(X_test.shape, y_test.shape)

(1748, 20) (1748, 1)


# Pre-processing
Scale the data using the MinMaxScaler and perform some feature selection

In [11]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)
# y_scaler = MinMaxScaler().fit(y_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
# y_train_scaled = y_scaler.transform(y_train)
# y_test_scaled = y_scaler.transform(y_test)

# y = y.values.reshape(-1, 1)

# DECISION TREE

In [12]:
# DECISION TREE
from sklearn import tree
# Create and score a decision tree classifier
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_scaled, y_train)

In [13]:
#DECISION TREE TRAIN THE MODEL
print('Train score: {}'.format(clf.score(X_train_scaled, y_train)))
print('Test score: {}'.format(clf.score(X_test_scaled, y_test)))

Train score: 1.0
Test score: 0.8449656750572082


# RANDOM FOREST

In [14]:
# RANDOM FOREST
from sklearn.ensemble import RandomForestClassifier
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train_scaled, y_train)

  """


In [15]:
#RANDOM FOREST TRAIN THE MODEL
print(f"Training Data Score: {rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.9113272311212814


In [16]:
#RANDOM FOREST FEATURE IMPORTANCE
# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_
importances

array([0.13010644, 0.10885433, 0.12137076, 0.04829452, 0.04294141,
       0.02892556, 0.04080376, 0.03037223, 0.05010331, 0.07437994,
       0.0327567 , 0.02728082, 0.12947306, 0.00838538, 0.02262576,
       0.01925957, 0.02012501, 0.0217392 , 0.02118592, 0.0210163 ])

In [17]:
#RANDOM FOREST SORT THE FEATURES
# We can sort the features by their importance
sorted(zip(rf.feature_importances_, X_train), reverse=True)

[(0.13010644246799238, 'koi_fpflag_nt'),
 (0.12947305822558616, 'koi_model_snr'),
 (0.12137076483942182, 'koi_fpflag_co'),
 (0.10885432945053107, 'koi_fpflag_ss'),
 (0.07437994288228675, 'koi_prad'),
 (0.05010330670778221, 'koi_depth'),
 (0.048294522212882024, 'koi_fpflag_ec'),
 (0.04294141342326237, 'koi_period'),
 (0.040803762560003315, 'koi_impact'),
 (0.03275670376829341, 'koi_teq'),
 (0.030372230246813375, 'koi_duration'),
 (0.02892555556793805, 'koi_time0bk'),
 (0.027280820147504856, 'koi_insol'),
 (0.02262576317658833, 'koi_steff'),
 (0.021739200130348718, 'ra'),
 (0.021185924134375762, 'dec'),
 (0.021016297339216426, 'koi_kepmag'),
 (0.02012501095563887, 'koi_srad'),
 (0.019259569973899333, 'koi_slogg'),
 (0.008385381789634728, 'koi_tce_plnt_num')]

# LOGISTIC REGRESSION

In [18]:
#LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
#LOGISTIC REGRESSION
classifier.fit(X_train_scaled, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
#LOGISTIC REGRESSION
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.8199504100705702
Testing Data Score: 0.8009153318077803


# BEST MODEL FIT: RANDOM FOREST

In [21]:
print(f"Training Data Score: {rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.9113272311212814


In [22]:
# Tried resetting the X features based on sort features by their importance, keeping those > or = 0.05; reset X features to new set.
# However, that model did not perform as well as the original 20 features selected for X; reset X features to original features
# (0.13135475534809135, 'koi_model_snr'),
#  (0.12834398898359337, 'koi_fpflag_nt'),
#  (0.12696540961328096, 'koi_fpflag_co'),
#  (0.08828266624057494, 'koi_fpflag_ss'),
#  (0.08478735056027449, 'koi_prad'),
#  (0.054379039195254796, 'koi_depth')

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [23]:
# Create the GridSearchCV model

In [24]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [25]:
# Train the model with GridSearch

In [26]:
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [100, 1000]}
              #'gamma': [0.01, 0.05, 0.1, 0.5]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [27]:
# Fit the model using the grid search estimator. 
# This will take the SVC model and try each combination of parameters
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] n_estimators=100 ................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=100, score=0.913, total=   1.1s
[CV] n_estimators=100 ................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=100, score=0.896, total=   1.1s
[CV] n_estimators=100 ................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.1s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=100, score=0.892, total=   1.1s
[CV] n_estimators=100 ................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=100, score=0.887, total=   1.1s
[CV] n_estimators=100 ................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=100, score=0.893, total=   1.1s
[CV] n_estimators=1000 ...............................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ................... n_estimators=1000, score=0.910, total=  10.7s
[CV] n_estimators=1000 ...............................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ................... n_estimators=1000, score=0.899, total=  10.6s
[CV] n_estimators=1000 ...............................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ................... n_estimators=1000, score=0.889, total=  10.8s
[CV] n_estimators=1000 ...............................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ................... n_estimators=1000, score=0.882, total=  10.5s
[CV] n_estimators=1000 ...............................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ................... n_estimators=1000, score=0.891, total=  10.7s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   58.8s finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              ra

In [28]:
print(grid.best_params_)
print(grid.best_score_)

{'n_estimators': 100}
0.8964309884368247


In [29]:
predictions = grid.predict(X_test_scaled)

In [30]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=["CANDIDATE", "CONFIRMED", "FALSE POSITIVE"]))

                precision    recall  f1-score   support

     CANDIDATE       0.83      0.79      0.81       411
     CONFIRMED       0.83      0.85      0.84       484
FALSE POSITIVE       0.98      1.00      0.99       853

      accuracy                           0.91      1748
     macro avg       0.88      0.88      0.88      1748
  weighted avg       0.90      0.91      0.90      1748



# Save the Model

In [31]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'Williams.sav'

# joblib.dump(your_model, filename)
joblib.dump(grid, filename)

['Williams.sav']