In [0]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import math as m
from matplotlib import pyplot as plt
from imblearn.over_sampling import SMOTE
import random as r
r.seed(300)



In [0]:
# Clone the entire repo.
!git clone -l -s git://github.com/NeilBotelho/YSP-Exoplanets.git cloned-repo
%cd cloned-repo
!ls

Cloning into 'cloned-repo'...
remote: Enumerating objects: 151, done.[K
remote: Counting objects:   0% (1/151)[Kremote: Counting objects:   1% (2/151)[Kremote: Counting objects:   2% (4/151)[Kremote: Counting objects:   3% (5/151)[Kremote: Counting objects:   4% (7/151)[Kremote: Counting objects:   5% (8/151)[Kremote: Counting objects:   6% (10/151)[Kremote: Counting objects:   7% (11/151)[Kremote: Counting objects:   8% (13/151)[Kremote: Counting objects:   9% (14/151)[Kremote: Counting objects:  10% (16/151)[Kremote: Counting objects:  11% (17/151)[Kremote: Counting objects:  12% (19/151)[Kremote: Counting objects:  13% (20/151)[Kremote: Counting objects:  14% (22/151)[Kremote: Counting objects:  15% (23/151)[Kremote: Counting objects:  16% (25/151)[Kremote: Counting objects:  17% (26/151)[Kremote: Counting objects:  18% (28/151)[Kremote: Counting objects:  19% (29/151)[Kremote: Counting objects:  20% (31/151)[Kremote: Counting objects:  21%

In [0]:
exoData=pd.read_csv('dataset/simpleImputedMiceRf')
#These are the row_ids of exoplanets that are known to be habitable
habitableRows=[151, 152, 153, 1604, 2155, 2223, 2882, 3133, 3606, 3716, 3742,3743, 3744]

#Create a Series that uses 1 or 0 to indicate whether a corresponding record in the exoData
#Store row ids and habitable series and remove for scaling all data to values between 0 and 1
habitable=exoData.row_id.isin(habitableRows).replace(True,1).rename('habitable')
row_id=exoData.row_id
data=exoData.drop('row_id',axis=1)
scaledData=pd.DataFrame(StandardScaler().fit_transform(data),columns=data.columns)

#KEEP 98% of variance, get rid of other components using PCA
pca = PCA().fit(scaledData)
numComponents=0
for n in np.cumsum(pca.explained_variance_ratio_):
    if n<0.980:
        numComponents+=1
pca=PCA(n_components=numComponents).fit_transform(scaledData)

#join the data columns, row_id and habitable column 
#into the variable preprocessed and shuffle
preprocessed=pd.concat([row_id,scaledData,habitable],axis=1)
preprocessed=shuffle(preprocessed,random_state=100).reset_index()
preprocessed=preprocessed.drop('index',axis=1)
print('Done')

Done


In [0]:
trainCols=[x for x in preprocessed.columns if x not in ['habitable','row_id']]
#Get an idea for what SMOTE oversampling is doing.
#The graph is a representation of the data in 2 dimensions and 
#the axis do not represent anything in particular
validate=[]
Hcopy=habitableRows.copy()
numHidden=round(len(Hcopy)/2)
print(")Hiding ",numHidden," habitable(",end="")
for i in range(numHidden): 
    randNum=r.randint(0,len(Hcopy)-1)
    validate.append(Hcopy[randNum])
    print(Hcopy[randNum],end=",")
    del Hcopy[randNum]
print("\b )")
for n in range(100):
  validate.append(50+n)

validate=preprocessed[preprocessed.row_id.isin(validate)]
Preprocessed=preprocessed[~preprocessed.row_id.isin(validate.row_id)]
X=Preprocessed[trainCols]
y=Preprocessed.habitable
smote = SMOTE(ratio='minority')
X_sm, y_sm = smote.fit_sample(X, y)
validateX,validateY=smote.fit_sample(validate[trainCols],validate.habitable)
trainX, testX,trainY,testY=train_test_split(X_sm,y_sm)

)Hiding  6  habitable(3716,2223,2882,151,3743,3606, )


In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score,roc_auc_score
Bestscores={balanced_accuracy_score:['balanced_accuracy_score',-1,{}],
                            roc_auc_score:['roc_auc_score',-1,{}],
            
                                           
           }
estimators=[x for x in range(0,12)]
criterion=['gini','entropy']
for e in estimators:
  for c in criterion:
    testForest=RandomForestClassifier(n_estimators=5,criterion=c,n_jobs=-1)
    testForest.fit(X_sm,y_sm)
    y_preds=testForest.predict(validateX)
    print("Done with",testForest.get_params())
    for score_method in Bestscores:
        currScore=score_method(validateY,y_preds)
        if(currScore>Bestscores[score_method][1]):
            Bestscores[score_method][1]=currScore
            Bestscores[score_method][2]=testForest.get_params()
                    

Done with {'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 5, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
Done with {'bootstrap': True, 'class_weight': None, 'criterion': 'entropy', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 5, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
Done with {'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 

In [0]:
#Result of the test
for n in Bestscores:
    print(Bestscores[n][0]," (",Bestscores[n][1],"):\n",Bestscores[n][2],"\n\n")

balanced_accuracy_score  ( 0.7586206896551724 ):
 {'bootstrap': True, 'class_weight': None, 'criterion': 'entropy', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 5, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False} 


roc_auc_score  ( 0.7586206896551724 ):
 {'bootstrap': True, 'class_weight': None, 'criterion': 'entropy', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 5, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False} 




In [0]:
finalForest=RandomForestClassifier(n_estimators=5,criterion='entropy')
finalForest.fit(X_sm,y_sm)
preds=finalForest.predict(validateX)
balanced_accuracy_score(validateY,preds)


0.7413793103448276

In [0]:
a=finalForest.feature_importances_
for n in range(len(a)):
    print(list(X.columns)[n],a[n])

pl_controvflag 0.06327046208475358
pl_pnum 0.0017311129327239757
pl_orbper 0.00938026639705469
pl_orbsmax 0.01268747661847062
pl_radj 0.004996008112419692
pl_ttvflag 0.0006144932974894915
pl_kepflag 0.0
pl_k2flag 0.03755102301537202
ra 0.006835079598596127
dec 0.0
st_dist 0.025716571250098453
st_optmag 0.013364648027455666
gaia_gmag 0.003315216217356095
st_teff 0.0003358747655568993
st_mass 0.0021826323117662273
st_rad 0.02691622392727403
pl_tranflag 0.0008837003357811503
pl_rvflag 0.0
pl_imgflag 0.0
pl_astflag 0.0
pl_omflag 0.0
pl_cbflag 0.0
pl_angsep 0.0
pl_rade 0.015966477849135473
pl_rads 0.018679479477606006
pl_trandur 0.013090394042392334
pl_tranmid 0.0
pl_ratror 0.02380265328948833
pl_mnum 0.0
pl_st_npar 0.0038334538656769625
pl_st_nref 0.0041081883080457075
st_rah 0.03506071597019257
st_glon 0.006535009737745931
st_glat 0.007891048135392582
st_elon 0.02082440258668159
st_elat 0.002929686049013726
gaia_plx 0.0029992315869606535
gaia_dist 0.0147647619694036
st_pmra 0.032139046584