In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in c:\users\klsis\anaconda3\envs\penn_flask\lib\site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [4]:
df = pd.read_csv("Data/exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [5]:
# Set features. This will also be used as your x values.
selected_features = df[['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 
                        'koi_period','koi_period_err1', 'koi_period_err2', 
                        'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2',
                        'koi_impact', 'koi_impact_err1', 'koi_impact_err2',
                        'koi_duration', 'koi_duration_err1', 'koi_duration_err2', 
                        'koi_depth', 'koi_depth_err1','koi_depth_err2', 
                        'koi_prad', 'koi_prad_err1', 'koi_prad_err2', 'koi_teq', 
                        'koi_insol','koi_insol_err1', 'koi_insol_err2', 'koi_model_snr', 
                        'koi_steff', 'koi_steff_err1','koi_steff_err2', 
                        'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2', 
                        'koi_srad', 'koi_srad_err1', 'koi_srad_err2','ra', 'dec', 'koi_kepmag']]
selected_features.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Create a Train Test Split

Use `koi_disposition` for the y values

In [6]:
y = df["koi_disposition"]
X = selected_features

In [7]:
y.value_counts()

FALSE POSITIVE    3504
CONFIRMED         1800
CANDIDATE         1687
Name: koi_disposition, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [9]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
6080,1,0,0,0,12.496435,0.0002213,-0.0002213,132.0358,0.0143,-0.0143,...,-286,3.805,0.39,-0.13,2.73,0.535,-1.248,289.2308,44.412483,13.054
3001,0,0,0,0,11.615625,0.0001528,-0.0001528,131.96843,0.00823,-0.00823,...,-72,4.083,0.368,-0.092,1.453,0.218,-0.51,293.52756,41.111439,15.162
570,0,1,0,0,10.980246,6.93e-07,-6.93e-07,137.137607,5.3e-05,-5.3e-05,...,-159,4.462,0.098,-0.182,0.897,0.238,-0.119,282.79764,43.578129,14.212
4897,1,0,0,0,466.90824,0.01194,-0.01194,136.3731,0.019,-0.019,...,-146,4.456,0.102,-0.361,0.867,0.448,-0.103,297.65436,43.178551,15.202
625,0,1,1,1,1.061933,1.25e-06,-1.25e-06,133.850441,0.000978,-0.000978,...,-167,3.975,0.259,-0.111,1.851,0.383,-0.575,288.90253,44.632992,12.953


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [10]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler

# Create a StandardScater model and fit it to the training data
X_scaler = MinMaxScaler().fit(X_train)

In [11]:
# Transform the training and testing data using the X_scaler and y_scaler models
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model

In [12]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000)
rf = rf.fit(X_train_scaled, y_train)
predictions = rf.predict(X_test_scaled)

print(f"Training Data Score: {rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.9004576659038902


In [13]:
importances = rf.feature_importances_
importances

array([0.09415135, 0.06680473, 0.11120584, 0.03617965, 0.02273197,
       0.01693594, 0.01775835, 0.01378599, 0.02265995, 0.0222599 ,
       0.01802123, 0.01092676, 0.0110487 , 0.02466263, 0.03115496,
       0.03410905, 0.02029085, 0.01456613, 0.01394701, 0.050289  ,
       0.03360872, 0.0295425 , 0.01606174, 0.01291777, 0.01757275,
       0.01366165, 0.05709972, 0.00965563, 0.03334691, 0.02819966,
       0.00963864, 0.00892608, 0.0105224 , 0.00953583, 0.01153729,
       0.00873927, 0.01363734, 0.01075575, 0.01155035])

In [14]:
# We can sort the features by their importance
sorted(zip(rf.feature_importances_, selected_features), reverse=True)

[(0.11120584047222325, 'koi_fpflag_co'),
 (0.09415135007417084, 'koi_fpflag_nt'),
 (0.06680472546323404, 'koi_fpflag_ss'),
 (0.0570997208452053, 'koi_model_snr'),
 (0.05028900420418592, 'koi_prad'),
 (0.03617964757603287, 'koi_fpflag_ec'),
 (0.03410905446839247, 'koi_duration_err2'),
 (0.03360872107512779, 'koi_prad_err1'),
 (0.03334690578643789, 'koi_steff_err1'),
 (0.03115496408823798, 'koi_duration_err1'),
 (0.02954250233972939, 'koi_prad_err2'),
 (0.028199656207777413, 'koi_steff_err2'),
 (0.02466262834885104, 'koi_duration'),
 (0.022731974682198997, 'koi_period'),
 (0.02265995432118005, 'koi_time0bk_err1'),
 (0.022259902780666526, 'koi_time0bk_err2'),
 (0.020290846889706924, 'koi_depth'),
 (0.01802122590924649, 'koi_impact'),
 (0.01775835049929565, 'koi_period_err2'),
 (0.017572751354810727, 'koi_insol_err1'),
 (0.016935943622924825, 'koi_period_err1'),
 (0.0160617419944138, 'koi_teq'),
 (0.014566133334438207, 'koi_depth_err1'),
 (0.013947012598477675, 'koi_depth_err2'),
 (0.01378

In [15]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=['CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE']))

                precision    recall  f1-score   support

     CONFIRMED       0.83      0.76      0.79       422
FALSE POSITIVE       0.82      0.84      0.83       450
     CANDIDATE       0.97      1.00      0.99       876

      accuracy                           0.90      1748
     macro avg       0.87      0.87      0.87      1748
  weighted avg       0.90      0.90      0.90      1748



# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [16]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'criterion': ['gini', 'entropy'],
              'n_estimators': [100, 500, 1000]}
grid = GridSearchCV(rf, param_grid, verbose=3)

In [17]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] criterion=gini, n_estimators=100 ................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .... criterion=gini, n_estimators=100, score=0.886, total=   1.2s
[CV] criterion=gini, n_estimators=100 ................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


[CV] .... criterion=gini, n_estimators=100, score=0.887, total=   1.2s
[CV] criterion=gini, n_estimators=100 ................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.4s remaining:    0.0s


[CV] .... criterion=gini, n_estimators=100, score=0.895, total=   1.2s
[CV] criterion=gini, n_estimators=100 ................................
[CV] .... criterion=gini, n_estimators=100, score=0.893, total=   1.3s
[CV] criterion=gini, n_estimators=100 ................................
[CV] .... criterion=gini, n_estimators=100, score=0.874, total=   1.3s
[CV] criterion=gini, n_estimators=500 ................................
[CV] .... criterion=gini, n_estimators=500, score=0.888, total=   6.2s
[CV] criterion=gini, n_estimators=500 ................................
[CV] .... criterion=gini, n_estimators=500, score=0.888, total=   6.3s
[CV] criterion=gini, n_estimators=500 ................................
[CV] .... criterion=gini, n_estimators=500, score=0.899, total=   6.5s
[CV] criterion=gini, n_estimators=500 ................................
[CV] .... criterion=gini, n_estimators=500, score=0.893, total=   6.4s
[CV] criterion=gini, n_estimators=500 ................................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  4.3min finished


GridSearchCV(estimator=RandomForestClassifier(n_estimators=1000),
             param_grid={'criterion': ['gini', 'entropy'],
                         'n_estimators': [100, 500, 1000]},
             verbose=3)

In [18]:
print(grid.best_params_)
print(grid.best_score_)

{'criterion': 'entropy', 'n_estimators': 1000}
0.8912817732627948


In [19]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'rf_model.sav'
joblib.dump(rf, filename)

['rf_model.sav']