In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in c:\users\dinks\anaconda3\lib\site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [4]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

Three categories are present within the koi_disposition column: CANDICATE, CONFIRMED, and FALSE POSITIVE. Candidate pertains to objects of interest that have potential for being an exoplanet, but they have not yet been confirmed or rejected. Therefore, there is an intrinsic level of uncertainty associated with each candidates, which would impact the accuracy of the prediction model. For this reason, candidates are being removed from the training dataset.

In [7]:
# create a dataframe with only confirmed and false positive objects
cond = (df['koi_disposition']=='CONFIRMED')| (df['koi_disposition']=='FALSE POSITIVE')
df = df.loc[cond, :]
df['koi_disposition'].value_counts()

FALSE POSITIVE    3504
CONFIRMED         1800
Name: koi_disposition, dtype: int64

In [8]:
# Set features. This will also be used as your x values.
selected_features = df[['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
                        'koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration',
                        'koi_depth', 'koi_prad', 'koi_teq', 'koi_insol', 'koi_model_snr',
                        'koi_steff', 'koi_slogg', 'koi_srad']]

# Create a Train Test Split

Use `koi_disposition` for the y values

In [9]:
X = selected_features
y = df['koi_disposition']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_slogg,koi_srad
979,0,1,1,1,41.077297,379.12848,0.03,5.525,1216.8,3.25,523,17.74,28.8,5665,4.381,0.938
6201,0,1,0,0,13.63948,144.531439,1.033,3.12422,81103.0,44.0,680,50.51,664.1,5461,4.517,0.846
5648,0,1,0,0,14.854416,136.559783,0.57,5.13753,77410.0,29.77,798,95.83,2822.1,6214,4.444,1.022
229,0,0,0,0,10.681695,131.55082,0.646,3.605,307.8,2.3,897,152.75,21.9,5553,4.221,1.25
1302,0,0,0,0,12.509458,136.99217,0.626,3.479,155.3,1.54,875,138.54,16.0,5949,4.303,1.184


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [11]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
 # Create a random forest classifier
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf = rf.fit(X_train_scaled, y_train)

# Train the Model



In [13]:
print(f"Training Data Score: {rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.9849170437405732


In [14]:
# Evaluate the importance of each feature and sort them
importances = rf.feature_importances_
sorted(zip(importances, selected_features), reverse=True)

[(0.1921944131144145, 'koi_fpflag_co'),
 (0.16120662189651447, 'koi_fpflag_nt'),
 (0.15302596077687397, 'koi_fpflag_ss'),
 (0.11380443564132466, 'koi_prad'),
 (0.08376730366761583, 'koi_model_snr'),
 (0.06044538667612113, 'koi_fpflag_ec'),
 (0.05110784331230748, 'koi_period'),
 (0.03957155883999202, 'koi_teq'),
 (0.03156622024031409, 'koi_impact'),
 (0.029429457371056105, 'koi_insol'),
 (0.029125319015354152, 'koi_depth'),
 (0.017916417370721065, 'koi_time0bk'),
 (0.014441771030817969, 'koi_duration'),
 (0.008071940862685789, 'koi_steff'),
 (0.007398139109210857, 'koi_slogg'),
 (0.00692721107467603, 'koi_srad')]

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

A 98% accuracy using the test data is already very high, yet the following section evaluates whether such accuracy can be improved by means of hyperparameter tuning.

In [15]:
# Create the GridSearchCV model with coarse hyperparameters
new_rf = RandomForestClassifier()
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [100,200, 500, 5000],
             'max_depth':[1,2,4,6,8],
             'min_samples_leaf':[0.05,0.1,0.2]}
grid = GridSearchCV(estimator = new_rf, param_grid = param_grid, verbose=3, n_jobs=-1)

In [16]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   32.6s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 10.7min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              ra

In [17]:
print(grid.best_params_)
print(grid.best_score_)

{'max_depth': 2, 'min_samples_leaf': 0.05, 'n_estimators': 100}
0.9771252488859391


Hyperparameter tuning in this case does not have a significant effect on the accuracy of the original model, which was run using default settings.

Let's evaluate if a classification gradient boosting algorithm such as AdaBoost may have positive effect.

In [18]:
# Implement AdaBoost classifier
from sklearn.ensemble import AdaBoostClassifier

random_tree = RandomForestClassifier()

ada_boost = AdaBoostClassifier(base_estimator = random_tree, n_estimators =100)

In [19]:
# Fit the AdaBoostClassifier to the scaled training dataset
ada_boost.fit(X_train_scaled, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=RandomForestClassifier(bootstrap=True,
                                                         ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=None,
                                                         max_features='auto',
                                                         max_leaf_nodes=None,
                                                         max_samples=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                       

In [20]:
# Extract the accuracy score from the classifier
print(f"Training Data Score: {ada_boost.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {ada_boost.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.9856711915535445


A very slight increase in the accuracy score occurs after applying the AdaBoost algorithm, from 0.9849 with default parameters to 0.9856 with AdaBoost.

# Create a Confusion Matrix for Each Model

In [21]:
 # Make predictions with the untuned model
untuned_prediction = rf.predict(X_test_scaled)

In [22]:
 # Calculate classification report of untuned model
from sklearn.metrics import classification_report
print(classification_report(y_test, untuned_prediction))

                precision    recall  f1-score   support

     CONFIRMED       1.00      0.96      0.98       451
FALSE POSITIVE       0.98      1.00      0.99       875

      accuracy                           0.98      1326
     macro avg       0.99      0.98      0.98      1326
  weighted avg       0.99      0.98      0.98      1326



In [23]:
 # Make predictions with the hypertuned model
predictions = grid.predict(X_test_scaled)

In [24]:
 # Calculate classification report
print(classification_report(y_test, predictions))

                precision    recall  f1-score   support

     CONFIRMED       0.97      0.86      0.91       451
FALSE POSITIVE       0.93      0.99      0.96       875

      accuracy                           0.94      1326
     macro avg       0.95      0.92      0.93      1326
  weighted avg       0.94      0.94      0.94      1326



In [25]:
 # Make predictions with the ada_boost model
ada_prediction = ada_boost.predict(X_test_scaled)

In [26]:
# And print classification report for the ada_boost model
print(classification_report(y_test, ada_prediction))

                precision    recall  f1-score   support

     CONFIRMED       1.00      0.96      0.98       451
FALSE POSITIVE       0.98      1.00      0.99       875

      accuracy                           0.99      1326
     macro avg       0.99      0.98      0.98      1326
  weighted avg       0.99      0.99      0.99      1326



# Save the Model

In [27]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'jonathan_antia_randomforest.sav'
joblib.dump(ada_boost, filename)

['jonathan_antia_randomforest.sav']