In [36]:
#import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
import joblib
from sklearn.linear_model import LogisticRegression

In [10]:
# read csv and drop null values
data_df = pd.read_csv("exoplanet_data.csv")

#drop the null values in both rows and columns
data_df = data_df.dropna()
data_df = data_df.dropna(axis='columns', how = 'all')

In [15]:
# split the data into train and test
X = data_df.drop(columns=["koi_disposition"])
y = data_df["koi_disposition"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [18]:
# scale the data
X_scale = MinMaxScaler().fit(X_train)
X_train_scale = X_scale.transform(X_train)
X_test_scale = X_scale.transform(X_test)

# SVM

In [42]:
# train the model using SVC
svc = SVC(kernel="linear")
svc.fit(X_train_scale, y_train)

print(f"SVC Training Data Score: {svc.score(X_train_scale, y_train)}")
print(f"SVC Testing Data Score: {svc.score(X_test_scale, y_test)}")

SVC Training Data Score: 0.8373068853709709
SVC Testing Data Score: 0.8558352402745996


In [53]:
# tune parameters 
svm_para = {'C': [1, 5, 10],
       'gamma': [0.0001, 0.001, 0.01]}
svm_grid = GridSearchCV(svc, svm_para, verbose = 3)

svm_grid.fit(X_train_scale, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.848, total=   0.2s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.839, total=   0.2s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.827, total=   0.2s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.820, total=   0.2s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.832, total=   0.2s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.848, total=   0.2s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.839, total=   0.2s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.827, total=   0.2s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.820, total=   0.2s
[CV] C=1, gamma=0.001 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    8.2s finished


GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=3)

In [58]:
# print out the best svm parameters and score
print(f"SVM best parameters: {svm_grid.best_params_}")
print(f"SVM best score: {svm_grid.best_score_}")

SVM best parameters: {'C': 10, 'gamma': 0.0001}
SVM best score: 0.8661053056709772


In [59]:
# export 
svm_model = 'svm_model.sav'
joblib.dump(svm_grid, svm_model)

['svm_model.sav']

# Logistic Regression

In [45]:
# train the model using Linear Regression
LR = LogisticRegression(max_iter=1000)
LR.fit(X_train_scale, y_train)

print(f"Linear Regression Training Data Score: {LR.score(X_train_scale, y_train)}")
print(f"Linear Regression Testing Data Score: {LR.score(X_test_scale, y_test)}")

Linear Regression Training Data Score: 0.8464619492656876
Linear Regression Testing Data Score: 0.8638443935926774


In [49]:
# tune parameters 
LR_param = {'C': [1, 5, 10],
              'penalty': ["l1", "l2"]}
LR_model = LogisticRegression(solver='liblinear')
LR_grid = GridSearchCV(LR_model, LR_param, verbose=3)
LR_grid.fit(X_train_scale, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] C=1, penalty=l1 .................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ..................... C=1, penalty=l1, score=0.873, total=   0.3s
[CV] C=1, penalty=l1 .................................................
[CV] ..................... C=1, penalty=l1, score=0.868, total=   0.2s
[CV] C=1, penalty=l1 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV] ..................... C=1, penalty=l1, score=0.864, total=   0.3s
[CV] C=1, penalty=l1 .................................................
[CV] ..................... C=1, penalty=l1, score=0.853, total=   0.4s
[CV] C=1, penalty=l1 .................................................
[CV] ..................... C=1, penalty=l1, score=0.862, total=   0.3s
[CV] C=1, penalty=l2 .................................................
[CV] ..................... C=1, penalty=l2, score=0.852, total=   0.0s
[CV] C=1, penalty=l2 .................................................
[CV] ..................... C=1, penalty=l2, score=0.833, total=   0.0s
[CV] C=1, penalty=l2 .................................................
[CV] ..................... C=1, penalty=l2, score=0.825, total=   0.0s
[CV] C=1, penalty=l2 .................................................
[CV] ..................... C=1, penalty=l2, score=0.825, total=   0.1s
[CV] C=1, penalty=l2 .................................................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   22.2s finished


Linear Regression best parameters: {LR_grid.best_params_}
Linear Regression best parameters: {LR_grid.best_score_}
{'C': 5, 'penalty': 'l1'}
0.8752601532539168


In [57]:
# print out the best parameter and score
print(f"Linear Regression best parameter: {LR_grid.best_params_}")
print(f"Linear Regression best score: {LR_grid.best_score_}")

Linear Regression best parameter: {'C': 5, 'penalty': 'l1'}
Linear Regression best score: 0.8752601532539168


In [60]:
# export
LR_model = 'LR_model.sav'
joblib.dump(LR_grid, LR_model)

['LR_model.sav']