In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.svm import SVC
import random as r
from imblearn.over_sampling import SMOTE
r.seed(300)




In [3]:
exoData=pd.read_csv('../simpleImputedMiceRf')
#These are the row_ids of exoplanets that are known to be habitable
habitableRows=[151, 152, 153, 1604, 2155, 2223, 2882, 3133, 3606, 3716, 3742,3743, 3744]

#Creates a Series that uses 1 or 0 to indicate whether a corresponding record in the exoData
habitable=exoData.row_id.isin(habitableRows).replace(True,1).rename('habitable')

#Store row ids series and remove for scaling all data to values between 0 and 1
row_id=exoData.row_id
data=exoData.drop('row_id',axis=1)
scaledData=pd.DataFrame(StandardScaler().fit_transform(data),columns=data.columns)

#join the scaled data columns, row_id and habitable column,into the variable preprocessed and shuffle
preprocessed=pd.concat([row_id,scaledData,habitable],axis=1)
preprocessed=shuffle(preprocessed,random_state=100).reset_index()
preprocessed=preprocessed.drop('index',axis=1)   
print('Done')

Done


In [4]:
#List of columns to be used for training
#it will be all columns in preprocessed except for "habitable" and "row_id"
trainCols=[x for x in preprocessed.columns if x not in ['habitable','row_id']]
validate=[]
Hcopy=habitableRows.copy()

#numHidden sets the number of habitable planets to use for validation of the model
numHidden=round(len(Hcopy)/2)

#Randomly select "numHidden" number of habitable exoplanets and
#add their row_id to validation set 
print("Hiding ",numHidden," habitable(",end="")
for i in range(numHidden): 
    randNum=r.randint(0,len(Hcopy)-1)
    validate.append(Hcopy[randNum])
    print(Hcopy[randNum],end=",")
    del Hcopy[randNum]
print("\b )")

#Add row_id of non-habitable planets to the validation set till its length becomes 100
while len(validate)<100:
    temp=r.randint(0,3500)
    if temp not in habitableRows:
        validate.append(temp)

#Take all columns of the planets whose row_id is in "validate" variable(in the validation set) and
#store it in "validate" variable
validate=preprocessed[preprocessed.row_id.isin(validate)]

#Store the planets that are not in the validation set in the training set
trainingSet=preprocessed[~preprocessed.row_id.isin(validate.row_id)]

#Store the training features in X and target feature(habitable or not) in y 
X=trainingSet[trainCols]
y=trainingSet.habitable

#the SMOTE library mutates existing data to creating more data
#Here we use SMOTE to increase the number of habitable planets in the training and validation data
smote = SMOTE(ratio='minority')
X_sm, y_sm = smote.fit_sample(X, y)
validateX,validateY=smote.fit_sample(validate[trainCols],validate.habitable)

Hiding  6  habitable(3716,2223,2882,151,3743,3606, )


In [5]:
#SVC TEST
#Find the optimal hyperparameters for SVC using the following scoring meassures
from sklearn.metrics import balanced_accuracy_score

# final_model=SVC(C=0.001,coef0=0,decision_function_shape='ovo',gamma='auto',kernel='linear',max_iter=-1,shrinking=True,tol=0.001,verbose=False,random_state=n)
#Parameters to check:
coef=[0,0.1,0.001,0.01]
degree=[1,2,3]
C =[1,0.1,0.01,0.001,0.00001]
shrink=[True,False]
decision_shape=['ovo','ovr']
tol=[1,0.1,0.01,0.001,0.0001]

Bestscores={balanced_accuracy_score:['balanced_accuracy_score',-1,{}]}
n=0
#Training the data on some probable hyperparameters
#storing the best parameters in the Bestscores dictionary
for c in C:
    for dec in decision_shape:
        for d in degree:
            for co in coef:
                for t in tol:
                    for s in shrink:
                        n+=1
                        testSvc=SVC(C=c,coef0=co,tol=t,kernel='linear',degree=d,gamma='auto',shrinking=s,decision_function_shape=dec)
                        testSvc.fit(X_sm,y_sm)
                        y_preds=testSvc.predict(validateX)
                        for score_method in Bestscores:
                            currScore=score_method(validateY,y_preds)
                            if(currScore>Bestscores[score_method][1]):
                                Bestscores[score_method][1]=currScore
                                Bestscores[score_method][2]=testSvc.get_params()

Bestscores

{<function sklearn.metrics.classification.balanced_accuracy_score>: ['balanced_accuracy_score',
  0.8506493506493507,
  {'C': 0.001,
   'cache_size': 200,
   'class_weight': None,
   'coef0': 0,
   'decision_function_shape': 'ovo',
   'degree': 1,
   'gamma': 'auto',
   'kernel': 'linear',
   'max_iter': -1,
   'probability': False,
   'random_state': None,
   'shrinking': True,
   'tol': 1,
   'verbose': False}]}


# Getting Feature importance:
### I use the feature weights as the importance as larger the absolute value of the weight, larger would be its impact on the result. Perceptron has a .coef_ attribute to get feature weights.

In [7]:
final_model=SVC(C=0.001,coef0=0,decision_function_shape='ovo',gamma='auto',kernel='linear',max_iter=-1,shrinking=True,tol=1,verbose=False,random_state=n)
final_model.fit(X_sm,y_sm)
#Getting Feature weights from best model
a=final_model.coef_
for n in range(len(a[0])):
    print(list(X.columns)[n],a[0][n])

pl_controvflag 0.07001099056395808
pl_pnum -0.02271914062498304
pl_orbper -0.00915420179401016
pl_orbsmax -0.018308040349642143
pl_radj -0.014465643559427284
pl_ttvflag -0.033993633189637315
pl_kepflag 0.026899269714568873
pl_k2flag -0.0026217271088529535
ra 0.051236979812831346
dec 0.04648636304278767
st_dist -0.014897876820458353
st_optmag 0.040040429386226685
gaia_gmag 0.01918893773572667
st_teff -0.011234710081077015
st_mass 0.012175526413982722
st_rad 0.0062777373625333595
pl_tranflag -0.01771308660466241
pl_rvflag -0.04511724232967508
pl_imgflag -0.02683271735560755
pl_astflag 9.107298248878237e-18
pl_omflag -0.045123472212485874
pl_cbflag -0.015921808193265646
pl_angsep -0.02000683208459723
pl_rade -0.012020116459188577
pl_rads -0.013791093769181719
pl_trandur 0.024604397586547422
pl_tranmid -0.01640388859078081
pl_ratror -0.013291728671301671
pl_mnum 0.0
pl_st_npar 0.053795313432173246
pl_st_nref 0.029202086777772175
st_rah 0.05123697933974705
st_glon 0.015267349833853356
st_gl