In [1]:
# Clone the entire repo.
!git clone -l -s git://github.com/NeilBotelho/YSP-Exoplanets.git cloned-repo
%cd cloned-repo
!ls

fatal: destination path 'cloned-repo' already exists and is not an empty directory.
/content/cloned-repo
allFeatures.txt  datasetmaker.py    hmisc.R	    nonCategorical
badrecords	 exoplanets.csv     importance	    scikitImputer.py
cloned-repo	 expsOfSelected     imputation.R    technical.py
corr.py		 FixMiceImputed.py  investigate.py  Testimputing.R
dataset		 getexp.py	    missforest.R    unimputedDataset


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.linear_model import Perceptron
from sklearn.metrics import balanced_accuracy_score,roc_auc_score
import math as m
from matplotlib import pyplot as plt
from imblearn.over_sampling import SMOTE
import random as r
import sys
r.seed(300)




In [3]:
exoData=pd.read_csv('dataset/simpleImputedMiceRf')
#These are the row_ids of exoplanets that are known to be habitable
habitableRows=[151, 152, 153, 1604, 2155, 2223, 2882, 3133, 3606, 3716, 3742,3743, 3744]

#Creates a Series that uses 1 or 0 to indicate whether a corresponding record in the exoData
habitable=exoData.row_id.isin(habitableRows).replace(True,1).rename('habitable')

#Store row ids series and remove for scaling all data to values between 0 and 1
row_id=exoData.row_id
data=exoData.drop('row_id',axis=1)
scaledData=pd.DataFrame(StandardScaler().fit_transform(data),columns=data.columns)

#join the scaled data columns, row_id and habitable column,into the variable preprocessed and shuffle
preprocessed=pd.concat([row_id,scaledData,habitable],axis=1)
preprocessed=shuffle(preprocessed,random_state=100).reset_index()
preprocessed=preprocessed.drop('index',axis=1)   
print('Done')

Done


In [4]:
#List of columns to be used for training
#it will be all columns in preprocessed except for "habitable" and "row_id"
trainCols=[x for x in preprocessed.columns if x not in ['habitable','row_id']]
validate=[]
Hcopy=habitableRows.copy()

#numHidden sets the number of habitable planets to use for validation of the model
numHidden=round(len(Hcopy)/2)

#Randomly select "numHidden" number of habitable exoplanets and
#add their row_id to validation set 
print("Hiding ",numHidden," habitable(",end="")
for i in range(numHidden): 
    randNum=r.randint(0,len(Hcopy)-1)
    validate.append(Hcopy[randNum])
    print(Hcopy[randNum],end=",")
    del Hcopy[randNum]
print("\b )")

#Add row_id of non-habitable planets to the validation set till its length becomes 100
while len(validate)<100:
    temp=r.randint(0,3500)
    if temp not in habitableRows:
        validate.append(temp)

#Take all columns of the planets whose row_id is in "validate" variable(in the validation set) and
#store it in "validate" variable
validate=preprocessed[preprocessed.row_id.isin(validate)]

#Store the planets that are not in the validation set in the training set
trainingSet=preprocessed[~preprocessed.row_id.isin(validate.row_id)]

#Store the training features in X and target feature(habitable or not) in y 
X=trainingSet[trainCols]
y=trainingSet.habitable

#the SMOTE library mutates existing data to creating more data
#Here we use SMOTE to increase the number of habitable planets in the training and validation data
smote = SMOTE(ratio='minority')
X_sm, y_sm = smote.fit_sample(X, y)
validateX,validateY=smote.fit_sample(validate[trainCols],validate.habitable)

Hiding  6  habitable(3716,2223,2882,151,3743,3606, )


In [5]:

Bestscores=[-1,{}]
penalty=[None, 'l2' , 'elasticnet']
alpha=[ 1,0.1,0.01, 0.001]
max_iter=[500,1000,2000]
tol=[0.00001,0.0001,0.001,0.01]
n_iter_no_change=[10,13,20]
eta0=[1,10,20,50,100,200]
num=0
for p in penalty:
    for a in alpha:
        for m in max_iter:
            for t in tol:
                for e in eta0:
                    for n in n_iter_no_change:
                            testPercep= Perceptron(penalty=p,alpha=a,tol=t,n_iter_no_change=n,max_iter=m,n_jobs=-1,early_stopping=True,eta0=e)
                            testPercep.fit(X_sm,y_sm)
                            y_preds=testPercep.predict(validateX)
                            num+=1
                            print("Done with",testPercep.get_params(),"\n",num)
                            currScore=balanced_accuracy_score(validateY,y_preds)
                            if(currScore>Bestscores[0]):
                                Bestscores[0]=currScore
                                Bestscores[1]=testPercep.get_params()
Bestscores

Done with {'alpha': 1, 'class_weight': None, 'early_stopping': True, 'eta0': 1, 'fit_intercept': True, 'max_iter': 500, 'n_iter_no_change': 10, 'n_jobs': -1, 'penalty': None, 'random_state': 0, 'shuffle': True, 'tol': 1e-05, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False} 
 1
Done with {'alpha': 1, 'class_weight': None, 'early_stopping': True, 'eta0': 1, 'fit_intercept': True, 'max_iter': 500, 'n_iter_no_change': 13, 'n_jobs': -1, 'penalty': None, 'random_state': 0, 'shuffle': True, 'tol': 1e-05, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False} 
 2
Done with {'alpha': 1, 'class_weight': None, 'early_stopping': True, 'eta0': 1, 'fit_intercept': True, 'max_iter': 500, 'n_iter_no_change': 20, 'n_jobs': -1, 'penalty': None, 'random_state': 0, 'shuffle': True, 'tol': 1e-05, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False} 
 3
Done with {'alpha': 1, 'class_weight': None, 'early_stopping': True, 'eta0': 10, 'fit_intercept': True, 'max_iter': 500, 'n_

[0.8311688311688312,
 {'alpha': 0.001,
  'class_weight': None,
  'early_stopping': True,
  'eta0': 1,
  'fit_intercept': True,
  'max_iter': 500,
  'n_iter_no_change': 20,
  'n_jobs': -1,
  'penalty': 'l2',
  'random_state': 0,
  'shuffle': True,
  'tol': 1e-05,
  'validation_fraction': 0.1,
  'verbose': 0,
  'warm_start': False}]

In [6]:
finalPercep=Perceptron(alpha=0.001,eta0=1,n_iter_no_change=20,max_iter=500,n_jobs=-1,penalty='l2',tol=1e-05)
finalPercep.fit(X_sm,y_sm)
wghts=finalPercep.coef_
for n in range(len(wghts[0])):
    print(list(X.columns)[n],wghts[0][n])

pl_controvflag 17.070387719349135
pl_pnum -2.399796575590618
pl_orbper -4.383012790485503
pl_orbsmax -1.6559364820263864
pl_radj -3.1632199880006646
pl_ttvflag -7.34493293825067
pl_kepflag 3.8720300387413484
pl_k2flag -2.3792510540317466
ra 5.2222164250460175
dec 5.109426240197718
st_dist -1.2721630741846914
st_optmag -0.1456844668704719
gaia_gmag 0.008054577772769958
st_teff -2.8418867363226927
st_mass 4.885303411531236
st_rad -14.066377977843501
pl_tranflag -4.734399620434479
pl_rvflag -5.671279727008409
pl_imgflag -7.738074186725737
pl_astflag -0.05135005679350824
pl_omflag -5.2563338040538055
pl_cbflag -0.18644107422447925
pl_angsep -0.9550207492611127
pl_rade -2.6668130887472814
pl_rads -2.7835787343509732
pl_trandur 2.456394906752452
pl_tranmid -1.7792734867290998
pl_ratror -2.594998714561675
pl_mnum 0.0
pl_st_npar 0.31439388313155925
pl_st_nref 4.315569794093457
st_rah 5.222216394680186
st_glon 1.4414712976139457
st_glat 2.2627783247199122
st_elon 3.7877506170782733
st_elat 4.37