In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import balanced_accuracy_score
from catboost import CatBoostClassifier
import math as m
from imblearn.over_sampling import SMOTE
import random as r
import sys

r.seed(300)




In [3]:
exoData=pd.read_csv('../simpleImputedMiceRf')
#These are the row_ids of exoplanets that are known to be habitable
habitableRows=[151, 152, 153, 1604, 2155, 2223, 2882, 3133, 3606, 3716, 3742,3743, 3744]

#Creates a Series that uses 1 or 0 to indicate whether a corresponding record in the exoData
habitable=exoData.row_id.isin(habitableRows).replace(True,1).rename('habitable')

#Store row ids series and remove for scaling all data to values between 0 and 1
row_id=exoData.row_id
data=exoData.drop('row_id',axis=1)
scaledData=pd.DataFrame(StandardScaler().fit_transform(data),columns=data.columns)

#join the scaled data columns, row_id and habitable column,into the variable preprocessed and shuffle
preprocessed=pd.concat([row_id,scaledData,habitable],axis=1)
preprocessed=shuffle(preprocessed,random_state=100).reset_index()
preprocessed=preprocessed.drop('index',axis=1)   
print('Done')

Done


In [4]:
#List of columns to be used for training
#it will be all columns in preprocessed except for "habitable" and "row_id"
trainCols=[x for x in preprocessed.columns if x not in ['habitable','row_id']]
validate=[]
Hcopy=habitableRows.copy()

#numHidden sets the number of habitable planets to use for validation of the model
numHidden=round(len(Hcopy)/2)

#Randomly select "numHidden" number of habitable exoplanets and
#add their row_id to validation set 
print("Hiding ",numHidden," habitable(",end="")
for i in range(numHidden): 
    randNum=r.randint(0,len(Hcopy)-1)
    validate.append(Hcopy[randNum])
    print(Hcopy[randNum],end=",")
    del Hcopy[randNum]
print("\b )")

#Add row_id of non-habitable planets to the validation set till its length becomes 100
while len(validate)<100:
    temp=r.randint(0,3500)
    if temp not in habitableRows:
        validate.append(temp)

#Take all columns of the planets whose row_id is in "validate" variable(in the validation set) and
#store it in "validate" variable
validate=preprocessed[preprocessed.row_id.isin(validate)]

#Store the planets that are not in the validation set in the training set
trainingSet=preprocessed[~preprocessed.row_id.isin(validate.row_id)]

#Store the training features in X and target feature(habitable or not) in y 
X=trainingSet[trainCols]
y=trainingSet.habitable

#the SMOTE library mutates existing data to creating more data
#Here we use SMOTE to increase the number of habitable planets in the training and validation data
smote = SMOTE(ratio='minority')
X_sm, y_sm = smote.fit_sample(X, y)
validateX,validateY=smote.fit_sample(validate[trainCols],validate.habitable)

Hiding  6  habitable(3716,2223,2882,151,3743,3606, )


In [0]:

Bestscores=[-1,{}]
n_iterations=[10,100,500,1000,2000]
LR=[0.1,0.01,0.001,0.00001]
n_estimators=[10,50,100,200]
max_features=[1,0.5,0.1,0.01]
depth=[2,3,5,7]
num=0
loss_function=['Logloss','CrossEntropy','MultiClass', 'MultiClassOneVsAll' ]
for n in n_estimators:
    for d in depth:
        for lr in LR:
            for ls in loss_function:
                        test_model=CatBoostClassifier(iterations=n,learning_rate=lr,depth=d,loss_function=ls,logging_level='Silent',thread_count=-1)
                        test_model.fit(X_sm,y_sm)
                        y_preds=test_model.predict(validateX)
                        num+=1
                        print("Done with",test_model.get_params(),"\n",num)
                        currScore=balanced_accuracy_score(validateY,y_preds)
                        if(currScore>Bestscores[0]):
                            Bestscores[0]=currScore
                            Bestscores[1]=test_model.get_params()
Bestscores

Done with {'logging_level': 'Silent', 'loss_function': 'Logloss', 'depth': 2, 'learning_rate': 0.1, 'iterations': 10} 
 1
Done with {'logging_level': 'Silent', 'loss_function': 'CrossEntropy', 'depth': 2, 'learning_rate': 0.1, 'iterations': 10} 
 2
Done with {'logging_level': 'Silent', 'loss_function': 'MultiClass', 'depth': 2, 'learning_rate': 0.1, 'iterations': 10} 
 3
Done with {'logging_level': 'Silent', 'loss_function': 'MultiClassOneVsAll', 'depth': 2, 'learning_rate': 0.1, 'iterations': 10} 
 4
Done with {'logging_level': 'Silent', 'loss_function': 'Logloss', 'depth': 2, 'learning_rate': 0.01, 'iterations': 10} 
 5
Done with {'logging_level': 'Silent', 'loss_function': 'CrossEntropy', 'depth': 2, 'learning_rate': 0.01, 'iterations': 10} 
 6
Done with {'logging_level': 'Silent', 'loss_function': 'MultiClass', 'depth': 2, 'learning_rate': 0.01, 'iterations': 10} 
 7
Done with {'logging_level': 'Silent', 'loss_function': 'MultiClassOneVsAll', 'depth': 2, 'learning_rate': 0.01, 'ite

[0.8246753246753247,
 {'depth': 3,
  'iterations': 10,
  'learning_rate': 0.1,
  'logging_level': 'Silent',
  'loss_function': 'MultiClass'}]

In [22]:
final=CatBoostClassifier(depth=3,iterations=10,learning_rate=0.1,logging_level='Silent',loss_function='MultiClass')
final.fit(X_sm,y_sm)
#Get Feature importance from best model
feats=final.get_feature_importance(type='PredictionValuesChange')
feats=list(X.columns)
for value in range(len(a)):
    print(feats[value],a[value])


pl_controvflag 0.0
pl_pnum 0.0
pl_orbper 3.1598259123680057
pl_orbsmax 0.0
pl_radj 0.0
pl_ttvflag 0.0
pl_kepflag 0.0
pl_k2flag 0.0
ra 1.2818512570677536
dec 0.0
st_dist 0.0
st_optmag 1.3134362370487753
gaia_gmag 1.1029854204385408
st_teff 0.0
st_mass 4.646859774253571
st_rad 0.0
pl_tranflag 0.0
pl_rvflag 0.0
pl_imgflag 0.0
pl_astflag 0.0
pl_omflag 0.0
pl_cbflag 0.0
pl_angsep 0.0
pl_rade 0.0
pl_rads 0.0
pl_trandur 0.0
pl_tranmid 0.0
pl_ratror 0.0
pl_mnum 0.0
pl_st_npar 0.22932228758691564
pl_st_nref 0.0
st_rah 0.0
st_glon 0.0
st_glat 0.0
st_elon 2.4495670104719447
st_elat 0.0
gaia_plx 0.0
gaia_dist 2.726371700115607
st_pmra 0.0
st_pmdec 0.3936826722614326
st_pm 0.0
gaia_pmra 60.60460491047441
gaia_pmdec 0.0
gaia_pm 20.809147141735924
st_logg 0.0
st_metfe 0.0
st_j 0.0
st_h 0.0
st_k 0.0
st_wise1 0.0
st_wise2 0.0
st_wise3 0.0
st_wise4 0.0
st_jmh2 0.6134555463443907
st_hmk2 0.0
st_jmk2 0.6688901298327542
