In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import joblib
import warnings 
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, r2_score, max_error
from sklearn.ensemble import RandomForestClassifier



# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("data/exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [None]:
df.describe().T

In [None]:
sns.heatmap(df.corr(), cmap='RdYlGn')

In [None]:
selected_features = df[["koi_period","koi_period_err1","koi_period_err2","koi_time0bk","koi_time0bk_err1",
                     "koi_steff_err2","koi_slogg","koi_slogg_err1","koi_slogg_err2","koi_srad",
                     "koi_srad_err1","koi_srad_err2","ra","dec","koi_kepmag"]]

In [None]:
selected_features.corr()

In [None]:
sns.heatmap(selected_features.corr(), cmap='RdYlGn')

In [None]:
# Check for null values
df.info(verbose=True)

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
sns.countplot(df['koi_disposition'])

In [None]:
# Set features. This will also be used as your x values.


# Create a Train Test Split

Use `koi_disposition` for the y values

In [3]:
# create features
X = df.drop("koi_disposition", axis=1)
feature_names = X.columns
# create labels
y = df["koi_disposition"]

print("Shape: ", X.shape, y.shape)


Shape:  (6991, 40) (6991,)


In [4]:
# split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
X_train.head()

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [5]:
# scale the data
X_scaler = MinMaxScaler().fit(X_train)


In [6]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


# Train the Model



In [7]:
# create a Gaussian Classifier with default n_esimators value
model = RandomForestClassifier(n_estimators = 100)

# train the model
model = model.fit(X_train_scaled, y_train)

print(f"Training Data Score: {model.score(X_train_scaled, y_train):.4f}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test):.4f}")


Training Data Score: 1.0000
Testing Data Score: 0.8890


In [8]:
# predict test data set
y_pred = model.predict(X_test_scaled)


In [9]:
# check performance of model with classification report
print(classification_report(y_test, y_pred))


                precision    recall  f1-score   support

     CANDIDATE       0.82      0.73      0.77       422
     CONFIRMED       0.79      0.83      0.81       450
FALSE POSITIVE       0.97      1.00      0.98       876

      accuracy                           0.89      1748
     macro avg       0.86      0.85      0.85      1748
  weighted avg       0.89      0.89      0.89      1748



In [10]:
confusion_matrix(y_test,y_pred)
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)


Predicted,CANDIDATE,CONFIRMED,FALSE POSITIVE,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CANDIDATE,309,99,14,422
CONFIRMED,65,373,12,450
FALSE POSITIVE,3,1,872,876
All,377,473,898,1748


In [11]:
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.8890160183066361


In [None]:
importances = model.feature_importances_
feature_imp = pd.Series(model.feature_importances_, index = feature_names).sort_values(ascending=False)


In [None]:
# Creating a bar plot
plt.figure(figsize=(10,12))
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.legend()
plt.show()

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [12]:
# example of grid searching key hyperparametres for KNeighborsClassifier
max_features = []

# set range 1 to half the number of input features ie 20
for i in range(1,20):
    max_features.append(i)

# define grid search
param_grid = {'max_features': max_features, 
              'n_estimators': [200, 210, 220],
              'max_depth': [20, 25, 30]
             }


In [13]:
grid = GridSearchCV(model, param_grid, n_jobs = -1, verbose = 3)


In [14]:
grid.get_params(deep=True)

{'cv': None,
 'error_score': nan,
 'estimator__bootstrap': True,
 'estimator__ccp_alpha': 0.0,
 'estimator__class_weight': None,
 'estimator__criterion': 'gini',
 'estimator__max_depth': None,
 'estimator__max_features': 'auto',
 'estimator__max_leaf_nodes': None,
 'estimator__max_samples': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_impurity_split': None,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__n_estimators': 100,
 'estimator__n_jobs': None,
 'estimator__oob_score': False,
 'estimator__random_state': None,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': RandomForestClassifier(),
 'iid': 'deprecated',
 'n_jobs': -1,
 'param_grid': {'max_features': [1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19],
  'n_estimators': [200, 210, 220],
  'max_depth': [20, 25, 30]},
 'pre_dispatch': '2*n

In [15]:
# Train the model with GridSearch (cross validate to determine hyper parameter values for best accuracy)
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 171 candidates, totalling 855 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed: 20.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 41.5min
[Parallel(n_jobs=-1)]: Done 855 out of 855 | elapsed: 45.6min finished


GridSearchCV(estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [20, 25, 30],
                         'max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                          13, 14, 15, 16, 17, 18, 19],
                         'n_estimators': [200, 210, 220]},
             verbose=3)

In [16]:
print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)

{'max_depth': 30, 'max_features': 15, 'n_estimators': 200}
0.9067350584708083
RandomForestClassifier(max_depth=30, max_features=15, n_estimators=200)


In [17]:
print(f"Training Data Score: {grid.score(X_train_scaled, y_train):.4f}")
print(f"Testing Data Score: {grid.score(X_test_scaled, y_test):.4f}")

Training Data Score: 1.0000
Testing Data Score: 0.8947


In [20]:
print(grid.cv_results_)

{'mean_fit_time': array([  2.02159367,   2.30044699,   2.31480823,   2.57311807,
         3.19026799,   3.59119558,   3.92131391,   4.90927038,
         5.1284852 ,   4.86598945,   5.17695422,   5.35168948,
         5.81046152,   6.40187936,   6.15553927,   6.73638616,
         6.28080368,   7.82766833,   7.59867949,   7.39283004,
         7.15486631,   7.30695505,   8.50379171,   8.41928568,
         7.96869059,   8.4647644 ,   9.40784197,   8.47114711,
         9.11382871,   9.8937428 ,  10.15883417,  12.59771204,
        12.46067801,  10.79553061,  11.06381025,  12.58374939,
        15.61963058,  12.15429792,  12.72138128,  13.84058881,
        14.7553412 ,  14.08274045,  12.39186239,  15.45225863,
        15.40917344,  13.30222831,  13.91758747,  16.75100489,
        17.46203556,  16.4999651 ,  17.28476162,  18.51858315,
        15.71377902,  17.59215598,  18.51548629,  18.91867871,
        18.66590805,   2.36148529,   2.67005968,   3.08834062,
         4.17942367,   6.73849578,   

In [21]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test_scaled)

In [22]:
# check performance of model with classification report
print(classification_report(y_test, predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.82      0.76      0.79       422
     CONFIRMED       0.79      0.82      0.81       450
FALSE POSITIVE       0.98      1.00      0.99       876

      accuracy                           0.89      1748
     macro avg       0.87      0.86      0.86      1748
  weighted avg       0.89      0.89      0.89      1748



In [None]:
confusion_matrix(y_test, predictions)
pd.crosstab(y_test, predictions, rownames=['True'], colnames=['Predicted'], margins=True)

# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash

filename = 'your_name.sav'
joblib.dump(your_model, filename)