In [0]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE
import random as r
import sys
r.seed(300)


In [8]:
exoData=pd.read_csv('dataset/simpleImputedMiceRf')

#These are the row_ids of exoplanets that are known to be habitable
habitableRows=[151, 152, 153, 1604, 2155, 2223, 2882, 3133, 3606, 3716, 3742,3743, 3744]

#Create a Series that uses 1 or 0 to indicate whether a corresponding record in the exoData
#Store row ids and habitable series and remove for scaling all data to values between 0 and 1
habitable=exoData.row_id.isin(habitableRows).replace(True,1).rename('habitable')
row_id=exoData.row_id
data=exoData.drop('row_id',axis=1)
scaledData=pd.DataFrame(MinMaxScaler().fit_transform(data),columns=data.columns)

#KEEP 98% of variance, get rid of other components using PCA
#join the data columns, row_id and habitable column 
#into the variable preprocessed and shuffle
preprocessed=pd.concat([row_id,scaledData,habitable],axis=1)
preprocessed=shuffle(preprocessed,random_state=100).reset_index()
preprocessed=preprocessed.drop('index',axis=1)
print('Done')

Done


In [12]:
#List of columns to be used for training
#it will be all columns in preprocessed except for "habitable" and "row_id"
trainCols=[x for x in preprocessed.columns if x not in ['habitable','row_id']]
validate=[]
Hcopy=habitableRows.copy()

#numHidden sets the number of habitable planets to use for validation of the model
numHidden=round(len(Hcopy)/2)

#Randomly select "numHidden" number of habitable exoplanets and
#add their row_id to validation set 
print("Hiding ",numHidden," habitable(",end="")
for i in range(numHidden): 
    randNum=r.randint(0,len(Hcopy)-1)
    validate.append(Hcopy[randNum])
    print(Hcopy[randNum],end=",")
    del Hcopy[randNum]
print("\b )")

#Add row_id of non-habitable planets to the validation set till its length becomes 100
while len(validate)<100:
    temp=r.randint(0,3500)
    if temp not in habitableRows:
        validate.append(temp)

#Take all columns of the planets whose row_id is in "validate" variable(in the validation set) and
#store it in "validate" variable
validate=preprocessed[preprocessed.row_id.isin(validate)]

#Store the planets that are not in the validation set in the training set
trainingSet=preprocessed[~preprocessed.row_id.isin(validate.row_id)]

#Store the training features in X and target feature(habitable or not) in y 
X=trainingSet[trainCols]
y=trainingSet.habitable

#the SMOTE library mutates existing data to creating more data
#Here we use SMOTE to increase the number of habitable planets in the training and validation data
smote = SMOTE(ratio='minority')
X_sm, y_sm = smote.fit_sample(X, y)
validateX,validateY=smote.fit_sample(validate[trainCols],validate.habitable)

Hiding  6  habitable(1604,3742,151,2882,3606,153, )


In [13]:
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import balanced_accuracy_score,roc_auc_score

Bestscores=[-1,{}]
n_neighbours=[1,2,5,7]
algorithm=['auto','ball_tree','kd_tree','brute']
leaf_size=[10,20,30,40]
p=[1,2,5]
num=0
for n in n_neighbours:
    for a in algorithm:
        for l in leaf_size:
            for P in p:
                test_k=KNeighborsClassifier(n_neighbors=n,algorithm=a,leaf_size=l,p=P)
                test_k.fit(X_sm,y_sm)
                y_preds=test_k.predict(validateX)
                num+=1
                print("Done with",test_k.get_params(),"\n",num)
                currScore=balanced_accuracy_score(validateY,y_preds)
                if(currScore>Bestscores[0]):
                    Bestscores[0]=currScore
                    Bestscores[1]=test_k.get_params()
Bestscores

Done with {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 1, 'p': 1, 'weights': 'uniform'} 
 1
Done with {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 1, 'p': 2, 'weights': 'uniform'} 
 2
Done with {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 1, 'p': 5, 'weights': 'uniform'} 
 3
Done with {'algorithm': 'auto', 'leaf_size': 20, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 1, 'p': 1, 'weights': 'uniform'} 
 4
Done with {'algorithm': 'auto', 'leaf_size': 20, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 1, 'p': 2, 'weights': 'uniform'} 
 5
Done with {'algorithm': 'auto', 'leaf_size': 20, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 1, 'p': 5, 'weights': 'uniform'} 
 6
Done with {'algorithm'

[0.8783783783783784,
 {'algorithm': 'auto',
  'leaf_size': 10,
  'metric': 'minkowski',
  'metric_params': None,
  'n_jobs': None,
  'n_neighbors': 5,
  'p': 1,
  'weights': 'uniform'}]

In [14]:
final_k=KNeighborsClassifier(n_neighbors=5,algorithm='auto',leaf_size=10,p=1,n_jobs=-1)
final_k.fit(X_sm,y_sm)
balanced_accuracy_score(validateY,final_k.predict(validateX))

0.8783783783783784

In [15]:
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import balanced_accuracy_score,roc_auc_score
# for q in range()
for n in range(len(trainCols)):
    g=trainCols.copy()
    del g[n]    
    X=Preprocessed[g]
    y=Preprocessed.habitable
    smote = SMOTE(ratio='minority')
    X_sm, y_sm = smote.fit_sample(X, y)
    validateX,validateY=smote.fit_sample(validate[g],validate.habitable)
    final_k=KNeighborsClassifier(n_neighbors=5,algorithm='auto',leaf_size=10,p=1,n_jobs=-1)
    final_k.fit(X_sm,y_sm)
#     print(len(trainCols))    
    print(trainCols[n],balanced_accuracy_score(validateY,final_k.predict(validateX)))

pl_controvflag 0.8648648648648649
pl_pnum 0.8851351351351351
pl_orbper 0.9054054054054055
pl_orbsmax 0.9054054054054055
pl_radj 0.8986486486486487
pl_ttvflag 0.9121621621621622
pl_kepflag 0.9121621621621622
pl_k2flag 0.8918918918918919
ra 0.9459459459459459
dec 0.8986486486486487
st_dist 0.9189189189189189
st_optmag 0.9256756756756757
gaia_gmag 0.8851351351351351
st_teff 0.8783783783783784
st_mass 0.8918918918918919
st_rad 0.9189189189189189
pl_tranflag 0.9189189189189189
pl_rvflag 0.8716216216216216
pl_imgflag 0.8918918918918919
pl_astflag 0.9121621621621622
pl_omflag 0.8918918918918919
pl_cbflag 0.8986486486486487
pl_angsep 0.9189189189189189
pl_rade 0.9324324324324325
pl_rads 0.9459459459459459
pl_trandur 0.8918918918918919
pl_tranmid 0.8918918918918919
pl_ratror 0.9054054054054055
pl_mnum 0.9054054054054055
pl_st_npar 0.9256756756756757
pl_st_nref 0.8648648648648649
st_rah 0.9054054054054055
st_glon 0.9054054054054055
st_glat 0.8716216216216216
st_elon 0.8716216216216216
st_elat 0.