In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in c:\users\jack pan\anaconda3\lib\site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Load the exoplanet table, then clean it in one pass:
#   1. drop columns in which every value is null
#   2. drop any remaining row that contains a null
df = (
    pd.read_csv("data/exoplanet_data.csv")
    .dropna(axis="columns", how="all")
    .dropna()
)
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [22]:
# y values: the disposition label of every row (a pandas Series of strings)
target = df.loc[:, "koi_disposition"]
target

0            CONFIRMED
1       FALSE POSITIVE
2       FALSE POSITIVE
3            CONFIRMED
4            CONFIRMED
             ...      
6986    FALSE POSITIVE
6987    FALSE POSITIVE
6988         CANDIDATE
6989    FALSE POSITIVE
6990    FALSE POSITIVE
Name: koi_disposition, Length: 6991, dtype: object

In [24]:
# The class labels are strings, not numerical values, so we can't use them
# directly as the target when fitting our classifier.
# To fix this, we must convert each class label to a numerical value.

# LabelEncoder maps each distinct label to an integer code.

from sklearn.preprocessing import LabelEncoder

# Step 0: Reformat data
# `data` (the whole frame as an array) is kept so that any code relying on it
# still works.
data = df.values
# Select the label column by name rather than by position, so the result does
# not depend on 'koi_disposition' being the first column of the CSV.
y = df["koi_disposition"].values
y


array(['CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE', ..., 'CANDIDATE',
       'FALSE POSITIVE', 'FALSE POSITIVE'], dtype=object)

In [26]:
# Step 1: Label-encode data set
# Learn the label -> integer mapping and apply it in a single call.
# Resulting codes: candidate = 0, confirmed = 1, false positive = 2
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(y)
encoded_y

array([1, 2, 2, ..., 0, 2, 2])

In [27]:
# Column labels of the cleaned frame, shown for reference when choosing features.
# NOTE: this Index still contains the target column 'koi_disposition', so it is
# not a pure list of model features.
feature_names = df.columns
feature_names

Index(['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec',
       'koi_kepmag'],
      dtype='object')

In [28]:
# Set features. This will also be used as your x values.
# The list keeps every measurement column of the cleaned frame and leaves out
# the target ('koi_disposition') and all *_err1 / *_err2 uncertainty columns.
feature_columns = [
    'koi_period',
    'koi_time0bk',
    'koi_slogg',
    'koi_srad',
    'koi_impact',
    'koi_fpflag_nt',
    'koi_fpflag_ss',
    'koi_fpflag_co',
    'koi_fpflag_ec',
    'koi_duration',
    'koi_depth',
    'koi_prad',
    'koi_teq',
    'koi_insol',
    'koi_model_snr',
    'koi_tce_plnt_num',
    'koi_steff',
    'ra',
    'dec',
    'koi_kepmag',
]
selected_features = df[feature_columns]

selected_features.head()

Unnamed: 0,koi_period,koi_time0bk,koi_slogg,koi_srad,koi_impact,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,ra,dec,koi_kepmag
0,54.418383,162.51384,4.467,0.927,0.586,0,0,0,0,4.507,874.8,2.83,443,9.11,25.8,2,5455,291.93423,48.141651,15.347
1,19.89914,175.850252,4.544,0.868,0.969,0,1,0,0,1.7822,10829.0,14.6,638,39.3,76.3,1,5853,297.00482,48.134129,15.436
2,1.736952,170.307565,4.564,0.791,1.276,0,1,0,0,2.40641,8079.2,33.46,1395,891.96,505.6,1,5805,285.53461,48.28521,15.597
3,2.525592,171.59555,4.438,1.046,0.701,0,0,0,0,1.6545,603.3,2.75,1406,926.16,40.9,1,6031,288.75488,48.2262,15.509
4,4.134435,172.97937,4.486,0.972,0.762,0,0,0,0,3.1402,686.0,2.77,1160,427.65,40.2,2,6046,296.28613,48.22467,15.714


# Create a Train Test Split

Use `koi_disposition` for the y values

In [29]:
from sklearn.model_selection import train_test_split

# Split features and encoded labels into train and test partitions.
# random_state is fixed so the split is reproducible between runs.
split_parts = train_test_split(selected_features, encoded_y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [30]:
# Preview the first rows of the training features (row labels are the original,
# shuffled frame indices).
X_train.head()

Unnamed: 0,koi_period,koi_time0bk,koi_slogg,koi_srad,koi_impact,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,ra,dec,koi_kepmag
6122,6.768901,133.07724,4.327,1.125,0.15,0,0,0,0,3.616,123.1,1.24,1017,253.3,10.8,1,5737,294.40472,39.351681,14.725
6370,0.733726,132.02005,4.578,0.797,0.291,0,1,0,1,2.309,114.6,0.86,1867,2891.64,13.8,1,5855,284.50391,42.46386,15.77
2879,7.652707,134.46038,4.481,0.963,0.97,1,0,0,0,79.8969,641.1,3.21,989,226.81,254.3,1,6328,295.50211,38.98354,13.099
107,7.953547,174.66224,4.536,0.779,0.3,0,0,0,0,2.6312,875.4,2.25,696,55.37,38.4,1,4768,291.15878,40.750271,15.66
29,4.959319,172.258529,4.359,1.082,0.831,0,0,0,0,2.22739,9802.0,12.21,1103,349.4,696.5,1,5712,292.16705,48.727589,15.263


In [31]:
# Preview the encoded training labels (integer class codes from label_encoder).
y_train

array([0, 2, 2, ..., 2, 2, 2])

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [32]:
# Models trained with gradient-based solvers benefit from normalized inputs,
# which help the optimizer converge.
# MinMaxScaler is used here; StandardScaler is an alternative when the features
# are expected to be normally distributed.
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training features only, then apply that same fitted
# scaler to both partitions so no test-set statistics leak into training.
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model



In [39]:
# Create a logistic regression classifier (default hyperparameters) and fit it
# to the scaled training data. This is the untuned baseline model.

from sklearn.linear_model import LogisticRegression

model2 = LogisticRegression()
model2.fit(X_train_scaled, y_train)

LogisticRegression()

In [40]:
# Score the baseline model on both partitions, then report the two values.
train_score = model2.score(X_train_scaled, y_train)
test_score = model2.score(X_test_scaled, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.8199504100705702
Testing Data Score: 0.8009153318077803


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

GridSearchCV is a class in scikit-learn's `model_selection` module. It loops through every combination of the predefined hyperparameter values and fits your estimator (model) on your training set for each one, scoring each combination with cross-validation. In the end, you can select the best-performing parameters from the listed hyperparameters.

In [48]:
# List the names of the hyperparameters that can be tuned on this estimator;
# these are the valid keys for the grid-search parameter grid.
model2.get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])

In [59]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

# Candidate values to try: 4 values of C x 4 values of max_iter = 16 combinations.
param_grid = dict(
    C=[1, 5, 10, 50],
    max_iter=[250, 500, 1000, 5000],
)

# verbose=3 logs the score of every individual fit.
grid = GridSearchCV(estimator=model2, param_grid=param_grid, verbose=3)

In [60]:
# Train the model with GridSearch
# Run the grid search on the scaled training data: every parameter combination
# in param_grid is fitted and scored with cross-validation.
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] C=1, max_iter=250 ...............................................
[CV] ................... C=1, max_iter=250, score=0.827, total=   0.1s
[CV] C=1, max_iter=250 ...............................................
[CV] ................... C=1, max_iter=250, score=0.813, total=   0.1s
[CV] C=1, max_iter=250 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV] ................... C=1, max_iter=250, score=0.818, total=   0.1s
[CV] C=1, max_iter=250 ...............................................
[CV] ................... C=1, max_iter=250, score=0.806, total=   0.1s
[CV] C=1, max_iter=250 ...............................................
[CV] ................... C=1, max_iter=250, score=0.818, total=   0.1s
[CV] C=1, max_iter=500 ...............................................
[CV] ................... C=1, max_iter=500, score=0.827, total=   0.1s
[CV] C=1, max_iter=500 ...............................................
[CV] ................... C=1, max_iter=500, score=0.813, total=   0.1s
[CV] C=1, max_iter=500 ...............................................
[CV] ................... C=1, max_iter=500, score=0.818, total=   0.1s
[CV] C=1, max_iter=500 ...............................................
[CV] ................... C=1, max_iter=500, score=0.806, total=   0.1s
[CV] C=1, max_iter=500 ...............................................
[CV] .

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] .................. C=50, max_iter=250, score=0.846, total=   0.3s
[CV] C=50, max_iter=250 ..............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] .................. C=50, max_iter=250, score=0.819, total=   0.3s
[CV] C=50, max_iter=250 ..............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] .................. C=50, max_iter=250, score=0.816, total=   0.3s
[CV] C=50, max_iter=250 ..............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] .................. C=50, max_iter=250, score=0.807, total=   0.3s
[CV] C=50, max_iter=250 ..............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] .................. C=50, max_iter=250, score=0.832, total=   0.3s
[CV] C=50, max_iter=500 ..............................................
[CV] .................. C=50, max_iter=500, score=0.846, total=   0.4s
[CV] C=50, max_iter=500 ..............................................
[CV] .................. C=50, max_iter=500, score=0.819, total=   0.4s
[CV] C=50, max_iter=500 ..............................................
[CV] .................. C=50, max_iter=500, score=0.815, total=   0.4s
[CV] C=50, max_iter=500 ..............................................
[CV] .................. C=50, max_iter=500, score=0.807, total=   0.4s
[CV] C=50, max_iter=500 ..............................................
[CV] .................. C=50, max_iter=500, score=0.832, total=   0.4s
[CV] C=50, max_iter=1000 .............................................
[CV] ................. C=50, max_iter=1000, score=0.846, total=   0.4s
[CV] C=50, max_iter=1000 .............................................
[CV] .

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:   17.2s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [1, 5, 10, 50],
                         'max_iter': [250, 500, 1000, 5000]},
             verbose=3)

In [62]:
# List the best parameters for this dataset
print(grid.best_params_)
# List the best mean cross-validated score for this dataset. This is the
# classifier's default score (accuracy), not an R2 value.
print(grid.best_score_)

{'C': 50, 'max_iter': 250}
0.8239541111491133


In [66]:
# Make predictions with the hypertuned model.
# The test features must be scaled with the same X_scaler used for training;
# the result is an array of encoded class labels.
predictions = grid.predict(X_test_scaled)
predictions

array([2, 0, 2, ..., 0, 0, 0])

In [67]:
# Calculate classification report
from sklearn.metrics import classification_report

# Display names listed in encoded-label order:
# candidate = 0, confirmed = 1, false positive = 2
class_names = ["Candidate", "Confirmed", "False Positive"]
report_text = classification_report(y_test, predictions, target_names=class_names)
print(report_text)

                precision    recall  f1-score   support

     Candidate       0.61      0.63      0.62       411
     Confirmed       0.68      0.65      0.66       484
False Positive       0.98      1.00      0.99       853

      accuracy                           0.81      1748
     macro avg       0.76      0.76      0.76      1748
  weighted avg       0.81      0.81      0.81      1748



# Save the Model

In [71]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib

# Persist the tuned estimator chosen by the grid search. GridSearchCV fits
# clones of model2, so model2 itself is still the untuned baseline from its
# earlier fit; grid.best_estimator_ is the LogisticRegression refit with
# grid.best_params_.
# NOTE: whoever loads this file must scale inputs with X_scaler and decode
# outputs with label_encoder; neither object is stored in the file.
filename = 'logreg.sav'
joblib.dump(grid.best_estimator_, filename)

['logreg.sav']