In [0]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE
import random as r
r.seed(300)

In [43]:
exoData=pd.read_csv('../simpleImputedMiceRf')
#These are the row_ids of exoplanets that are known to be habitable
habitableRows=[151, 152, 153, 1604, 2155, 2223, 2882, 3133, 3606, 3716, 3742,3743, 3744]

#Create a Series that uses 1 or 0 to indicate whether a corresponding record in the exoData
#Store row ids and habitable series and remove for scaling all data to values between 0 and 1
habitable=exoData.row_id.isin(habitableRows).replace(True,1).rename('habitable')
row_id=exoData.row_id
data=exoData.drop('row_id',axis=1)
scaledData=pd.DataFrame(StandardScaler().fit_transform(data),columns=data.columns)

#KEEP 98% of variance, get rid of other components using PCA
pca = PCA().fit(scaledData)
numComponents=0
for n in np.cumsum(pca.explained_variance_ratio_):
    if n<0.980:
        numComponents+=1
pca=PCA(n_components=numComponents).fit_transform(scaledData)

#join the data columns, row_id and habitable column 
#into the variable preprocessed and shuffle
preprocessed=pd.concat([row_id,scaledData,habitable],axis=1)
preprocessed=shuffle(preprocessed,random_state=100).reset_index()
preprocessed=preprocessed.drop('index',axis=1)
print('Done')

Done


In [44]:
trainCols=[x for x in preprocessed.columns if x not in ['habitable','row_id']]
#Get an idea for what SMOTE oversampling is doing.
#The graph is a representation of the data in 2 dimensions and 
#the axis do not represent anything in particular
validate=[]
Hcopy=habitableRows.copy()
numHidden=round(len(Hcopy)/2)
print(")Hiding ",numHidden," habitable(",end="")
for i in range(numHidden): 
    randNum=r.randint(0,len(Hcopy)-1)
    validate.append(Hcopy[randNum])
    print(Hcopy[randNum],end=",")
    del Hcopy[randNum]
print("\b )")
for n in range(100):
  validate.append(50+n)

validate=preprocessed[preprocessed.row_id.isin(validate)]
Preprocessed=preprocessed[~preprocessed.row_id.isin(validate.row_id)]
X=Preprocessed[trainCols]
y=Preprocessed.habitable
smote = SMOTE(ratio='minority')
X_sm, y_sm = smote.fit_sample(X, y)
validateX,validateY=smote.fit_sample(validate[trainCols],validate.habitable)
trainX, testX,trainY,testY=train_test_split(X_sm,y_sm)

)Hiding  6  habitable(3716,2223,2882,151,3743,3606, )


In [52]:
Bestscores={balanced_accuracy_score:['balanced_accuracy_score',-1,{}]}
estimators=[x for x in range(1,100)]
criterion=['gini','entropy']
max_depth=[6,7,8,9]
for e in estimators:
  for c in criterion:
    for m in max_depth:
        testForest=RandomForestClassifier(n_estimators=e,max_depth=m,criterion=c,n_jobs=-1)
        testForest.fit(X_sm,y_sm)
        y_preds=testForest.predict(validateX)
        print("Done with",testForest.get_params())
        for score_method in Bestscores:
            currScore=score_method(validateY,y_preds)
            if(currScore>Bestscores[score_method][1]):
                Bestscores[score_method][1]=currScore
                Bestscores[score_method][2]=testForest.get_params()
Bestscores

Done with {'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': 6, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 1, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
Done with {'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': 7, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 1, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
Done with {'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples

{<function sklearn.metrics.classification.balanced_accuracy_score>: ['balanced_accuracy_score',
  0.8103448275862069,
  {'bootstrap': True,
   'class_weight': None,
   'criterion': 'entropy',
   'max_depth': 9,
   'max_features': 'auto',
   'max_leaf_nodes': None,
   'min_impurity_decrease': 0.0,
   'min_impurity_split': None,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'min_weight_fraction_leaf': 0.0,
   'n_estimators': 3,
   'n_jobs': -1,
   'oob_score': False,
   'random_state': None,
   'verbose': 0,
   'warm_start': False}]}

In [53]:
finalForest=RandomForestClassifier(n_estimators=3,criterion='entropy',max_depth=9)
#Getting Feature weights from best model
finalForest.fit(X_sm,y_sm)
a=finalForest.feature_importances_
for n in range(len(a)):
    print(list(X.columns)[n],a[n])

pl_controvflag 0.016407830943765443
pl_pnum 0.0
pl_orbper 0.04785791684023528
pl_orbsmax 0.013474329425701793
pl_radj 0.0
pl_ttvflag 0.0
pl_kepflag 0.0
pl_k2flag 0.00267323557658314
ra 0.0
dec 0.0
st_dist 0.0
st_optmag 0.013752930318941122
gaia_gmag 0.0
st_teff 0.07898255694485688
st_mass 0.0
st_rad 0.0
pl_tranflag 0.0016203489258966065
pl_rvflag 0.0
pl_imgflag 0.0
pl_astflag 0.0
pl_omflag 0.0
pl_cbflag 0.0
pl_angsep 0.0
pl_rade 0.022519611661728983
pl_rads 0.03807083402292596
pl_trandur 0.020852753775769048
pl_tranmid 0.023176997415918884
pl_ratror 0.026762517889169214
pl_mnum 0.0
pl_st_npar 0.0
pl_st_nref 0.0
st_rah 0.006567362696220452
st_glon 0.011286414558524466
st_glat 0.12733635957378886
st_elon 0.036382948089122506
st_elat 0.0
gaia_plx 0.16160548423824025
gaia_dist 0.036436730704936075
st_pmra 0.06409868734023975
st_pmdec 0.0
st_pm 0.004300776199491736
gaia_pmra 0.08439690069090496
gaia_pmdec 0.0682663600405557
gaia_pm 0.0
st_logg 0.01116019637547909
st_metfe 0.0140215520028773