In [1]:
import pandas as pd
import random as r
from sklearn.metrics import balanced_accuracy_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split


Using TensorFlow backend.


In [2]:
preprocessed=pd.read_csv("../PreprocessedDataset.csv")
habitableRows=list(preprocessed.rowid[preprocessed.habitable==True])

In [3]:
def prepareData():
    #List of columns to be used for training
    #it will be all columns in preprocessed except for "habitable" and "rowid"
    trainCols=[x for x in preprocessed.columns if x not in ['habitable','rowid']]
    validate=[]
    Hcopy=habitableRows.copy()
    #numHidden sets the number of habitable planets to use for validation of the model
    numHidden=round(len(Hcopy)/2)

    #Randomly select habitable exoplanets and 
    #add their row_id to validation set 
    print("Hiding ",numHidden," habitable(",end="")
    for i in range(numHidden): 
        randNum=r.randint(0,len(Hcopy)-1)  
        validate.append(Hcopy[randNum])
        print(Hcopy[randNum],end=",")
        del Hcopy[randNum]
    print("\b )")

    #Add row_id of non-habitable planets to the validation set till its length becomes 200
    while len(validate)<200:
        temp=r.randint(0,len(preprocessed)-1)
        if temp not in habitableRows and temp not in validate:
            validate.append(temp)

    #Take all columns of the planets whose row_id is in "validate" variable(in the validation set) and
    #store it in "validate" variable
    validate=preprocessed[preprocessed.rowid.isin(validate)]

    #Store the planets that are not in the validation set in the training set
    trainingSet=preprocessed[~preprocessed.rowid.isin(validate.rowid)]

    #Store the training features in X and target feature(habitable or not) in y 
    X=trainingSet[trainCols]
    y=trainingSet.habitable

    #the SMOTE library mutates existing data to creating more data
    #Here we use SMOTE to increase the number of habitable planets in the training and validation data
    smote = SMOTE(ratio='minority')
    X_sm, y_sm = smote.fit_sample(X, y)
    trainX, testX,trainY,testY=train_test_split(X_sm,y_sm)
    validateX,validateY=smote.fit_sample(validate[trainCols],validate.habitable)
    return trainX, testX,trainY,testY,validateX,validateY

## XGBoostClassifier

### Effect of Random Seed on model
 Does changing the random seed during training but keeping the same dataset substantially change the outcome for this model?

In [4]:
RandomSeed=42
Bestscores=[0,0]
trainX, testX,trainY,testY,validateX,validateY=prepareData()

max_depth=[3,5,7,10]
learning_rate=[1,0.1,0.01,0.001]
n_estimators=[10,50,100,150,200]
early_stopping_rounds=[3,5,10,20]
booster=["gbtree","dart"]
n_jobs=-1
loss_function=['Logloss','CrossEntropy','MultiClass', 'MultiClassOneVsAll' ]

for testNumber in range(5):
    print("*"*10,end="\n\n")
    print("TEST NUMBER",testNumber+1,"Random Seed =",RandomSeed)
    Bestscores=[0,0]
    r.seed(RandomSeed)
    RandomSeed=RandomSeed+10
    for l in learning_rate:
        for n in n_estimators:
            for b in [0,1]:
                for m in max_depth:
                    for early in early_stopping_rounds:
                        model=XGBClassifier(verbosity=0,max_depth=m,learning_rate=l,booster=booster[int(b)])
                        model.fit(trainX,trainY,eval_set=[(testX,testY)],early_stopping_rounds=early,verbose=False)
                        y_preds=model.predict(validateX)
                        currScore=balanced_accuracy_score(validateY,y_preds)
                        if(currScore>Bestscores[0]):
                            Bestscores[0]=currScore
                            Bestscores[1]=model.get_params()
    print("BEST SCORE:",str(Bestscores[0])+"\n"+"PARAMS:",Bestscores[1])


Hiding  24  habitable(128,2189,2316,2547,986,117,1424,1845,3115,3606,2097,2441,1205,3962,2031,153,2882,152,3743,1147,2902,2156,2883,2155, )
**********

TEST NUMBER 1 Random Seed = 42
BEST SCORE: 0.9261363636363635
PARAMS: {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 1, 'missing': None, 'n_estimators': 100, 'n_jobs': 1, 'nthread': None, 'objective': 'binary:logistic', 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': None, 'subsample': 1, 'verbosity': 0}
**********

TEST NUMBER 2 Random Seed = 52
BEST SCORE: 0.9261363636363635
PARAMS: {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 1, 'missing': None, 'n_estimators': 100, 'n_jobs': 

### Effect of changing random seed when preparing data on model

In [5]:
RandomSeed=42
Bestscores=[0,0]
forCalc=[]
max_depth=[3,5,7,10]
learning_rate=[1,0.1,0.01,0.001]
n_estimators=[10,50,100,150,200]
early_stopping_rounds=[3,5,10,20]
booster=["gbtree","dart"]
n_jobs=-1
loss_function=['Logloss','CrossEntropy','MultiClass', 'MultiClassOneVsAll' ]

for testNumber in range(5):
    print("*"*10,end="\n\n")
    print("TEST NUMBER",testNumber+1,"Random Seed =",RandomSeed)
    Bestscores=[0,0]
    r.seed(RandomSeed)
    RandomSeed=RandomSeed+10
    trainX, testX,trainY,testY,validateX,validateY=prepareData()
    for l in learning_rate:
        for n in n_estimators:
            for b in [0,1]:
                for m in max_depth:
                    for early in early_stopping_rounds:
                        model=XGBClassifier(verbosity=0,max_depth=m,learning_rate=l,booster=booster[int(b)])
                        model.fit(trainX,trainY,eval_set=[(testX,testY)],early_stopping_rounds=early,verbose=False)
                        y_preds=model.predict(validateX)
                        currScore=balanced_accuracy_score(validateY,y_preds)
                        if(currScore>Bestscores[0]):
                            Bestscores[0]=currScore
                            Bestscores[1]=model.get_params()
    print("BEST SCORE:",str(Bestscores[0])+"\n"+"PARAMS:",Bestscores[1])
    forCalc.append(Bestscores[0])

**********

TEST NUMBER 1 Random Seed = 42
Hiding  24  habitable(3233,163,117,2031,2014,1845,1137,703,3716,153,3922,2883,130,128,1205,2156,2223,151,2542,1604,3133,3115,3741,2880, )
BEST SCORE: 0.9772727272727273
PARAMS: {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 1, 'missing': None, 'n_estimators': 100, 'n_jobs': 1, 'nthread': None, 'objective': 'binary:logistic', 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': None, 'subsample': 1, 'verbosity': 0}
**********

TEST NUMBER 2 Random Seed = 52
Hiding  24  habitable(2014,130,3962,2882,2547,2189,2503,128,1137,1227,2883,1604,2541,3233,114,152,3132,3742,117,1424,163,1205,1845,2542, )
BEST SCORE: 0.9346590909090909
PARAMS: {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'gam

## KNeighborsClassifier

### Effect of Random Seed on model
 Does changing the random seed during training but keeping the same dataset substantially change the outcome for this model?

In [4]:
RandomSeed=42
X_sm,y_sm,validateX,validateY=prepareData()
n_neighbours=[1,2,5,7]
algorithm=['auto','ball_tree','kd_tree','brute']
leaf_size=[10,20,30,40]
p=[1,2,5]
for testNumber in range(5):
    print("*"*10,end="\n\n")
    print("TEST NUMBER",testNumber+1,"Random Seed =",RandomSeed)
    r.seed(RandomSeed)
    RandomSeed=RandomSeed+10
    Bestscores=[0,0]
    for n in n_neighbours:
        for a in algorithm:
            for l in leaf_size:
                for P in p:
                    model=KNeighborsClassifier(n_neighbors=n,algorithm=a,leaf_size=l,p=P)
                    model.fit(X_sm,y_sm)
                    y_preds=model.predict(validateX)
                    currScore=balanced_accuracy_score(validateY,y_preds)
                    if(currScore>Bestscores[0]):
                        Bestscores[0]=currScore
                        Bestscores[1]=model.get_params()
    print("BEST SCORE:",str(Bestscores[0])+"\n"+"PARAMS:",Bestscores[1])

Hiding  24  habitable(2883,3716,3132,1604,2882,2129,2156,3741,3742,1845,151,1137,2155,2189,2316,1147,3115,1205,2014,130,1227,114,2547,2223, )
**********

TEST NUMBER 1 Random Seed = 42
BEST SCORE: 0.8039772727272727
PARAMS: {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 7, 'p': 5, 'weights': 'uniform'}
**********

TEST NUMBER 2 Random Seed = 52
BEST SCORE: 0.8039772727272727
PARAMS: {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 7, 'p': 5, 'weights': 'uniform'}
**********

TEST NUMBER 3 Random Seed = 62
BEST SCORE: 0.8039772727272727
PARAMS: {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 7, 'p': 5, 'weights': 'uniform'}
**********

TEST NUMBER 4 Random Seed = 72
BEST SCORE: 0.8039772727272727
PARAMS: {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'minkowski', 'metric_params': Non

### Effect of changing random seed when preparing data on model

In [5]:
RandomSeed=42
n_neighbours=[1,2,5,7]
algorithm=['auto','ball_tree','kd_tree','brute']
leaf_size=[10,20,30,40]
p=[1,2,5]
for testNumber in range(5):
    print("*"*10,end="\n\n")
    print("TEST NUMBER",testNumber+1,"Random Seed =",RandomSeed)
    r.seed(RandomSeed)
    X_sm,y_sm,validateX,validateY=prepareData()
    RandomSeed=RandomSeed+10
    Bestscores=[0,0]
    for n in n_neighbours:
        for a in algorithm:
            for l in leaf_size:
                for P in p:
                    model=KNeighborsClassifier(n_neighbors=n,algorithm=a,leaf_size=l,p=P)
                    model.fit(X_sm,y_sm)
                    y_preds=model.predict(validateX)
                    currScore=balanced_accuracy_score(validateY,y_preds)
                    if(currScore>Bestscores[0]):
                        Bestscores[0]=currScore
                        Bestscores[1]=model.get_params()
    print("BEST SCORE:",str(Bestscores[0])+"\n"+"PARAMS:",Bestscores[1])

**********

TEST NUMBER 1 Random Seed = 42
Hiding  24  habitable(3233,163,117,2031,2014,1845,1137,703,3716,153,3922,2883,130,128,1205,2156,2223,151,2542,1604,3133,3115,3741,2880, )
BEST SCORE: 0.7869318181818181
PARAMS: {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 7, 'p': 5, 'weights': 'uniform'}
**********

TEST NUMBER 2 Random Seed = 52
Hiding  24  habitable(2014,130,3962,2882,2547,2189,2503,128,1137,1227,2883,1604,2541,3233,114,152,3132,3742,117,1424,163,1205,1845,2542, )
BEST SCORE: 0.7471590909090909
PARAMS: {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 7, 'p': 2, 'weights': 'uniform'}
**********

TEST NUMBER 3 Random Seed = 62
Hiding  24  habitable(2902,1147,151,2014,2547,2135,3962,2223,1205,3233,1424,1227,114,2156,1845,986,128,3744,2441,2541,117,3741,2129,3115, )
BEST SCORE: 0.7244318181818181
PARAMS: {'algorithm': 'auto', 'leaf_size': 1

## Perceptron

### Effect of Random Seed on model
 Does changing the random seed during training but keeping the same dataset substantially change the outcome for this model?

In [4]:
RandomSeed=42
X_sm,y_sm,validateX,validateY=prepareData()
penalty=[None, 'l2' , 'elasticnet']
alpha=[ 1,0.1,0.01, 0.001]
max_iter=[500,1000,2000]
tol=[0.00001,0.0001,0.001,0.01]
n_iter_no_change=[10,13,20]
eta0=[1,10,20,50,100,200]
for testNumber in range(5):
    print("*"*10,end="\n\n")
    print("TEST NUMBER",testNumber+1,"Random Seed =",RandomSeed)
    r.seed(RandomSeed)
    RandomSeed=RandomSeed+10
    Bestscores=[0,0]
    for p in penalty:
        for a in alpha:
            for m in max_iter:
                for t in tol:
                    for e in eta0:
                        for n in n_iter_no_change:
                                model= Perceptron(penalty=p,alpha=a,tol=t,n_iter_no_change=n,max_iter=m,n_jobs=-1,early_stopping=True,eta0=e)
                                model.fit(X_sm,y_sm)
                                y_preds=model.predict(validateX)
                                currScore=balanced_accuracy_score(validateY,y_preds)
                                if(currScore>Bestscores[0]):
                                    Bestscores[0]=currScore
                                    Bestscores[1]=model.get_params()
    print("BEST SCORE:",str(Bestscores[0])+"\n"+"PARAMS:",Bestscores[1])

Hiding  24  habitable(2129,128,1227,3742,3606,151,3115,1845,2316,3744,1205,1424,2156,2547,3132,2902,2031,3133,2882,1604,2155,2541,986,163, )
**********

TEST NUMBER 1 Random Seed = 42
BEST SCORE: 0.8835227272727273
PARAMS: {'alpha': 0.001, 'class_weight': None, 'early_stopping': True, 'eta0': 1, 'fit_intercept': True, 'max_iter': 500, 'n_iter_no_change': 10, 'n_jobs': -1, 'penalty': 'l2', 'random_state': 0, 'shuffle': True, 'tol': 1e-05, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
**********

TEST NUMBER 2 Random Seed = 52
BEST SCORE: 0.8835227272727273
PARAMS: {'alpha': 0.001, 'class_weight': None, 'early_stopping': True, 'eta0': 1, 'fit_intercept': True, 'max_iter': 500, 'n_iter_no_change': 10, 'n_jobs': -1, 'penalty': 'l2', 'random_state': 0, 'shuffle': True, 'tol': 1e-05, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
**********

TEST NUMBER 3 Random Seed = 62
BEST SCORE: 0.8835227272727273
PARAMS: {'alpha': 0.001, 'class_weight': None, 'early_st

### Effect of changing random seed when preparing data on model

In [5]:
RandomSeed=42
penalty=[None, 'l2' , 'elasticnet']
alpha=[ 1,0.1,0.01, 0.001]
max_iter=[500,1000,2000]
tol=[0.00001,0.0001,0.001,0.01]
n_iter_no_change=[10,13,20]
eta0=[1,10,20,50,100,200]
for testNumber in range(5):
    print("*"*10,end="\n\n")
    print("TEST NUMBER",testNumber,"Random Seed =",RandomSeed)
    r.seed(RandomSeed)
    RandomSeed=RandomSeed+10
    X_sm,y_sm,validateX,validateY=prepareData()
    Bestscores=[0,0]
    for p in penalty:
        for a in alpha:
            for m in max_iter:
                for t in tol:
                    for e in eta0:
                        for n in n_iter_no_change:
                                model= Perceptron(penalty=p,alpha=a,tol=t,n_iter_no_change=n,max_iter=m,n_jobs=-1,early_stopping=True,eta0=e)
                                model.fit(X_sm,y_sm)
                                y_preds=model.predict(validateX)
                                currScore=balanced_accuracy_score(validateY,y_preds)
                                if(currScore>Bestscores[0]):
                                    Bestscores[0]=currScore
                                    Bestscores[1]=model.get_params()
    print("BEST SCORE:",str(Bestscores[0])+"\n"+"PARAMS:",Bestscores[1])

**********

TEST NUMBER 0 Random Seed = 42
Hiding  24  habitable(3233,163,117,2031,2014,1845,1137,703,3716,153,3922,2883,130,128,1205,2156,2223,151,2542,1604,3133,3115,3741,2880, )
BEST SCORE: 0.8607954545454546
PARAMS: {'alpha': 0.001, 'class_weight': None, 'early_stopping': True, 'eta0': 50, 'fit_intercept': True, 'max_iter': 500, 'n_iter_no_change': 13, 'n_jobs': -1, 'penalty': 'l2', 'random_state': 0, 'shuffle': True, 'tol': 1e-05, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
**********

TEST NUMBER 1 Random Seed = 52
Hiding  24  habitable(2014,130,3962,2882,2547,2189,2503,128,1137,1227,2883,1604,2541,3233,114,152,3132,3742,117,1424,163,1205,1845,2542, )
BEST SCORE: 0.84375
PARAMS: {'alpha': 0.01, 'class_weight': None, 'early_stopping': True, 'eta0': 1, 'fit_intercept': True, 'max_iter': 500, 'n_iter_no_change': 13, 'n_jobs': -1, 'penalty': 'l2', 'random_state': 0, 'shuffle': True, 'tol': 1e-05, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
****

## SVC

### Effect of Random Seed on model
 Does changing the random seed during training but keeping the same dataset substantially change the outcome for this model?

In [4]:
RandomSeed=42
X_sm,y_sm,validateX,validateY=prepareData()
coef=[0,0.1,0.01,0.001]
degree=[1,2,3]
C =[1,0.1,0.01,0.001]
shrink=[True,False]
decision_shape=['ovo','ovr']
tol=[1,0.1,0.01,0.001]

for testNumber in range(5):
    print("*"*10,end="\n\n")
    print("TEST NUMBER",testNumber+1,"Random Seed =",RandomSeed)
    Bestscores=[0,0]
    r.seed(RandomSeed)
    RandomSeed=RandomSeed+10
    for c in C:
        for dec in decision_shape:
            for d in degree:
                for co in coef:
                    for t in tol:
                        for s in shrink:
                            model=SVC(C=c,coef0=co,tol=t,kernel='linear',degree=d,gamma='auto',shrinking=s,decision_function_shape=dec)
                            model.fit(X_sm,y_sm)
                            y_preds=model.predict(validateX)
                            currScore=balanced_accuracy_score(validateY,y_preds)
                            if(currScore>Bestscores[0]):
                                Bestscores[0]=currScore
                                Bestscores[1]=model.get_params()
    print("BEST SCORE:",str(Bestscores[0])+"\n"+"PARAMS:",Bestscores[1])                            



Hiding  24  habitable(2135,2156,703,2503,3606,117,2155,2031,2547,128,1604,986,1205,3922,2223,2883,2014,3743,152,114,130,1227,2882,1137, )
**********

TEST NUMBER 1 Random Seed = 42
BEST SCORE: 0.7556818181818181
PARAMS: {'C': 0.001, 'cache_size': 200, 'class_weight': None, 'coef0': 0, 'decision_function_shape': 'ovo', 'degree': 1, 'gamma': 'auto', 'kernel': 'linear', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 1, 'verbose': False}
**********

TEST NUMBER 2 Random Seed = 52
BEST SCORE: 0.7556818181818181
PARAMS: {'C': 0.001, 'cache_size': 200, 'class_weight': None, 'coef0': 0, 'decision_function_shape': 'ovo', 'degree': 1, 'gamma': 'auto', 'kernel': 'linear', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 1, 'verbose': False}
**********

TEST NUMBER 3 Random Seed = 62
BEST SCORE: 0.7556818181818181
PARAMS: {'C': 0.001, 'cache_size': 200, 'class_weight': None, 'coef0': 0, 'decision_function_shape': 'ovo', '

### Effect of changing random seed when preparing data on model

In [5]:
RandomSeed=42
Bestscores=[0,0]
coef=[0,0.1,0.01,0.001]
degree=[1,2,3]
C =[1,0.1,0.01,0.001]
shrink=[True,False]
decision_shape=['ovo','ovr']
tol=[1,0.1,0.01,0.001]

for testNumber in range(5):
    print("*"*10,end="\n\n")
    print("TEST NUMBER",testNumber+1,"Random Seed =",RandomSeed)
    r.seed(RandomSeed)
    RandomSeed=RandomSeed+10
    X_sm,y_sm,validateX,validateY=prepareData()
    for c in C:
        for dec in decision_shape:
            for d in degree:
                for co in coef:
                    for t in tol:
                        for s in shrink:
                            model=SVC(C=c,coef0=co,tol=t,kernel='linear',degree=d,gamma='auto',shrinking=s,decision_function_shape=dec)
                            model.fit(X_sm,y_sm)
                            y_preds=model.predict(validateX)
                            currScore=balanced_accuracy_score(validateY,y_preds)
                            if(currScore>Bestscores[0]):
                                Bestscores[0]=currScore
                                Bestscores[1]=model.get_params()
    print("BEST SCORE:",str(Bestscores[0])+"\n"+"PARAMS:",Bestscores[1])                            



**********

TEST NUMBER 1 Random Seed = 42
Hiding  24  habitable(3233,163,117,2031,2014,1845,1137,703,3716,153,3922,2883,130,128,1205,2156,2223,151,2542,1604,3133,3115,3741,2880, )
BEST SCORE: 0.8238636363636364
PARAMS: {'C': 0.001, 'cache_size': 200, 'class_weight': None, 'coef0': 0, 'decision_function_shape': 'ovo', 'degree': 1, 'gamma': 'auto', 'kernel': 'linear', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': False, 'tol': 0.1, 'verbose': False}
**********

TEST NUMBER 2 Random Seed = 52
Hiding  24  habitable(2014,130,3962,2882,2547,2189,2503,128,1137,1227,2883,1604,2541,3233,114,152,3132,3742,117,1424,163,1205,1845,2542, )
BEST SCORE: 0.8238636363636364
PARAMS: {'C': 0.001, 'cache_size': 200, 'class_weight': None, 'coef0': 0, 'decision_function_shape': 'ovo', 'degree': 1, 'gamma': 'auto', 'kernel': 'linear', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': False, 'tol': 0.1, 'verbose': False}
**********

TEST NUMBER 3 Random Seed

## RandomForestClassifier

### Effect of Random Seed on model
 Does changing the random seed during training but keeping the same dataset substantially change the outcome for this model?

In [4]:
RandomSeed=42
X_sm,y_sm,validateX,validateY=prepareData()
estimators=[x for x in range(1,100)]
criterion=['gini','entropy']
max_depth=[6,7,8,9]

for testNumber in range(5):
    print("*"*10,end="\n\n")
    print("TEST NUMBER",testNumber+1,"Random Seed =",RandomSeed)
    r.seed(RandomSeed)
    RandomSeed=RandomSeed+10
    Bestscores=[0,0]
    for e in estimators:
        for c in criterion:
            for m in max_depth:
                    model=RandomForestClassifier(n_estimators=e,max_depth=m,criterion=c,n_jobs=-1)
                    model.fit(X_sm,y_sm)
                    y_preds=model.predict(validateX)
                    currScore=balanced_accuracy_score(validateY,y_preds)
                    if(currScore>Bestscores[0]):
                        Bestscores[0]=currScore
                        Bestscores[1]=model.get_params()
    print("BEST SCORE:",str(Bestscores[0])+"\n"+"PARAMS:",Bestscores[1])

Hiding  24  habitable(3716,3233,2542,3115,2902,3133,3744,152,2223,3132,2547,2189,2503,986,1205,163,3606,1137,1227,2135,703,153,2541,2882, )
**********

TEST NUMBER 1 Random Seed = 42
BEST SCORE: 0.9090909090909092
PARAMS: {'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': 6, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 7, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
**********

TEST NUMBER 2 Random Seed = 52
BEST SCORE: 0.9204545454545454
PARAMS: {'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': 6, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 5, 'n_jobs': -1, 'oob_score': False, 'r

### Effect of changing random seed when preparing data on model

In [5]:
RandomSeed=42
estimators=[x for x in range(1,100)]
criterion=['gini','entropy']
max_depth=[6,7,8,9]

for testNumber in range(5):
    print("*"*10,end="\n\n")
    print("TEST NUMBER",testNumber+1,"Random Seed =",RandomSeed)
    r.seed(RandomSeed)
    RandomSeed=RandomSeed+10
    X_sm,y_sm,validateX,validateY=prepareData()
    Bestscores=[0,0]
    for e in estimators:
        for c in criterion:
            for m in max_depth:
                    model=RandomForestClassifier(n_estimators=e,max_depth=m,criterion=c,n_jobs=-1)
                    model.fit(X_sm,y_sm)
                    y_preds=model.predict(validateX)
                    currScore=balanced_accuracy_score(validateY,y_preds)
                    if(currScore>Bestscores[0]):
                        Bestscores[0]=currScore
                        Bestscores[1]=model.get_params()
    print("BEST SCORE:",str(Bestscores[0])+"\n"+"PARAMS:",Bestscores[1])

**********

TEST NUMBER 1 Random Seed = 42
Hiding  24  habitable(3233,163,117,2031,2014,1845,1137,703,3716,153,3922,2883,130,128,1205,2156,2223,151,2542,1604,3133,3115,3741,2880, )
BEST SCORE: 0.9772727272727273
PARAMS: {'bootstrap': True, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 6, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 15, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
**********

TEST NUMBER 2 Random Seed = 52
Hiding  24  habitable(2014,130,3962,2882,2547,2189,2503,128,1137,1227,2883,1604,2541,3233,114,152,3132,3742,117,1424,163,1205,1845,2542, )
BEST SCORE: 0.9005681818181818
PARAMS: {'bootstrap': True, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 6, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_spli