In [1]:
import pandas as pd
import random as r
from sklearn.metrics import balanced_accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

Using TensorFlow backend.


In [2]:
preprocessed=pd.read_csv("../PreprocessedDataset.csv")
habitableRows=list(preprocessed.rowid[preprocessed.habitable==True])

In [3]:
def prepareData():
    #List of columns to be used for training
    #it will be all columns in preprocessed except for "habitable" and "rowid"
    trainCols=[x for x in preprocessed.columns if x not in ['habitable','rowid']]
    validate=[]
    Hcopy=habitableRows.copy()
    #numHidden sets the number of habitable planets to use for validation of the model
    numHidden=round(len(Hcopy)/2)

    #Randomly select habitable exoplanets and 
    #add their row_id to validation set 
    print("Hiding ",numHidden," habitable(",end="")
    for i in range(numHidden): 
        randNum=r.randint(0,len(Hcopy)-1)  
        validate.append(Hcopy[randNum])
        print(Hcopy[randNum],end=",")
        del Hcopy[randNum]
    print("\b )")

    #Add row_id of non-habitable planets to the validation set till its length becomes 200
    while len(validate)<200:
        temp=r.randint(0,len(preprocessed)-1)
        if temp not in habitableRows and temp not in validate:
            validate.append(temp)

    #Take all columns of the planets whose row_id is in "validate" variable(in the validation set) and
    #store it in "validate" variable
    validate=preprocessed[preprocessed.rowid.isin(validate)]

    #Store the planets that are not in the validation set in the training set
    trainingSet=preprocessed[~preprocessed.rowid.isin(validate.rowid)]

    #Store the training features in X and target feature(habitable or not) in y 
    X=trainingSet[trainCols]
    y=trainingSet.habitable

    #the SMOTE library mutates existing data to creating more data
    #Here we use SMOTE to increase the number of habitable planets in the training and validation data
    smote = SMOTE(ratio='minority')
    X_sm, y_sm = smote.fit_sample(X, y)
    validateX,validateY=smote.fit_sample(validate[trainCols],validate.habitable)
    return X_sm,y_sm,validateX,validateY

# Effect of Random Seed on model
 Does changing the random seed during training but keeping the same dataset substantially change the outcome for this model?

In [4]:
RandomSeed=42
X_sm,y_sm,validateX,validateY=prepareData()
estimators=[x for x in range(1,100)]
criterion=['gini','entropy']
max_depth=[6,7,8,9]

for testNumber in range(5):
    print("*"*10,end="\n\n")
    print("TEST NUMBER",testNumber+1,"Random Seed =",RandomSeed)
    r.seed(RandomSeed)
    RandomSeed=RandomSeed+10
    Bestscores=[0,0]
    for e in estimators:
        for c in criterion:
            for m in max_depth:
                    model=RandomForestClassifier(n_estimators=e,max_depth=m,criterion=c,n_jobs=-1)
                    model.fit(X_sm,y_sm)
                    y_preds=model.predict(validateX)
                    currScore=balanced_accuracy_score(validateY,y_preds)
                    if(currScore>Bestscores[0]):
                        Bestscores[0]=currScore
                        Bestscores[1]=model.get_params()
    print("BEST SCORE:",str(Bestscores[0])+"\n"+"PARAMS:",Bestscores[1])

Hiding  24  habitable(3716,3233,2542,3115,2902,3133,3744,152,2223,3132,2547,2189,2503,986,1205,163,3606,1137,1227,2135,703,153,2541,2882, )
**********

TEST NUMBER 1 Random Seed = 42
BEST SCORE: 0.9090909090909092
PARAMS: {'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': 6, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 7, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
**********

TEST NUMBER 2 Random Seed = 52
BEST SCORE: 0.9204545454545454
PARAMS: {'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': 6, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 5, 'n_jobs': -1, 'oob_score': False, 'r

# Effect of changing random seed when preparing data on model

In [5]:
RandomSeed=42
estimators=[x for x in range(1,100)]
criterion=['gini','entropy']
max_depth=[6,7,8,9]

for testNumber in range(5):
    print("*"*10,end="\n\n")
    print("TEST NUMBER",testNumber+1,"Random Seed =",RandomSeed)
    r.seed(RandomSeed)
    RandomSeed=RandomSeed+10
    X_sm,y_sm,validateX,validateY=prepareData()
    Bestscores=[0,0]
    for e in estimators:
        for c in criterion:
            for m in max_depth:
                    model=RandomForestClassifier(n_estimators=e,max_depth=m,criterion=c,n_jobs=-1)
                    model.fit(X_sm,y_sm)
                    y_preds=model.predict(validateX)
                    currScore=balanced_accuracy_score(validateY,y_preds)
                    if(currScore>Bestscores[0]):
                        Bestscores[0]=currScore
                        Bestscores[1]=model.get_params()
    print("BEST SCORE:",str(Bestscores[0])+"\n"+"PARAMS:",Bestscores[1])

**********

TEST NUMBER 1 Random Seed = 42
Hiding  24  habitable(3233,163,117,2031,2014,1845,1137,703,3716,153,3922,2883,130,128,1205,2156,2223,151,2542,1604,3133,3115,3741,2880, )
BEST SCORE: 0.9772727272727273
PARAMS: {'bootstrap': True, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 6, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 15, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
**********

TEST NUMBER 2 Random Seed = 52
Hiding  24  habitable(2014,130,3962,2882,2547,2189,2503,128,1137,1227,2883,1604,2541,3233,114,152,3132,3742,117,1424,163,1205,1845,2542, )
BEST SCORE: 0.9005681818181818
PARAMS: {'bootstrap': True, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 6, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_spli