In [1]:
import pandas as pd
import random as r
from sklearn.metrics import balanced_accuracy_score
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

Using TensorFlow backend.


In [2]:
preprocessed=pd.read_csv("../input/finalexoplanetsdataset/PreprocessedDataset.csv")
habitableRows=list(preprocessed.rowid[preprocessed.habitable==True])

In [3]:
import time
begin=time.time()
def prepareData():
    #List of columns to be used for training
    #it will be all columns in preprocessed except for "habitable" and "rowid"
    trainCols=[x for x in preprocessed.columns if x not in ['habitable','rowid']]
    validate=[]
    Hcopy=habitableRows.copy()
    #numHidden sets the number of habitable planets to use for validation of the model
    numHidden=round(len(Hcopy)/2)

    #Randomly select habitable exoplanets and 
    #add their row_id to validation set 
    print("Hiding ",numHidden," habitable(",end="")
    for i in range(numHidden): 
        randNum=r.randint(0,len(Hcopy)-1)  
        validate.append(Hcopy[randNum])
        print(Hcopy[randNum],end=",")
        del Hcopy[randNum]
    print("\b )")

    #Add row_id of non-habitable planets to the validation set till its length becomes 200
    while len(validate)<200:
        temp=r.randint(0,len(preprocessed)-1)
        if temp not in habitableRows and temp not in validate:
            validate.append(temp)

    #Take all columns of the planets whose row_id is in "validate" variable(in the validation set) and
    #store it in "validate" variable
    validate=preprocessed[preprocessed.rowid.isin(validate)]

    #Store the planets that are not in the validation set in the training set
    trainingSet=preprocessed[~preprocessed.rowid.isin(validate.rowid)]

    #Store the training features in X and target feature(habitable or not) in y 
    X=trainingSet[trainCols]
    y=trainingSet.habitable

    #the SMOTE library mutates existing data to creating more data
    #Here we use SMOTE to increase the number of habitable planets in the training and validation data
    smote = SMOTE(ratio='minority')
    X_sm, y_sm = smote.fit_sample(X, y)
    validateX,validateY=smote.fit_sample(validate[trainCols],validate.habitable)
    return X_sm,y_sm,validateX,validateY

# Effect of Random Seed on model
 Does changing the random seed during training but keeping the same dataset substantially change the outcome for this model?

In [4]:
RandomSeed=42
X_sm,y_sm,validateX,validateY=prepareData()
coef=[0,0.1,0.01,0.001]
degree=[1,2,3]
C =[1,0.1,0.01,0.001]
shrink=[True,False]
decision_shape=['ovo','ovr']
tol=[1,0.1,0.01,0.001]

for testNumber in range(5):
    print("*"*10,end="\n\n")
    print("TEST NUMBER",testNumber+1,"Random Seed =",RandomSeed)
    Bestscores=[0,0]
    r.seed(RandomSeed)
    RandomSeed=RandomSeed+10
    for c in C:
        for dec in decision_shape:
            for d in degree:
                for co in coef:
                    for t in tol:
                        for s in shrink:
                            model=SVC(C=c,coef0=co,tol=t,kernel='linear',degree=d,gamma='auto',shrinking=s,decision_function_shape=dec)
                            model.fit(X_sm,y_sm)
                            y_preds=model.predict(validateX)
                            currScore=balanced_accuracy_score(validateY,y_preds)
                            if(currScore>Bestscores[0]):
                                Bestscores[0]=currScore
                                Bestscores[1]=model.get_params()
    print("BEST SCORE:",str(Bestscores[0])+"\n"+"PARAMS:",Bestscores[1])                            



Hiding  24  habitable(2135,2156,703,2503,3606,117,2155,2031,2547,128,1604,986,1205,3922,2223,2883,2014,3743,152,114,130,1227,2882,1137, )
**********

TEST NUMBER 1 Random Seed = 42
BEST SCORE: 0.7556818181818181
PARAMS: {'C': 0.001, 'cache_size': 200, 'class_weight': None, 'coef0': 0, 'decision_function_shape': 'ovo', 'degree': 1, 'gamma': 'auto', 'kernel': 'linear', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 1, 'verbose': False}
**********

TEST NUMBER 2 Random Seed = 52
BEST SCORE: 0.7556818181818181
PARAMS: {'C': 0.001, 'cache_size': 200, 'class_weight': None, 'coef0': 0, 'decision_function_shape': 'ovo', 'degree': 1, 'gamma': 'auto', 'kernel': 'linear', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 1, 'verbose': False}
**********

TEST NUMBER 3 Random Seed = 62
BEST SCORE: 0.7556818181818181
PARAMS: {'C': 0.001, 'cache_size': 200, 'class_weight': None, 'coef0': 0, 'decision_function_shape': 'ovo', '

# Effect of changing random seed when preparing data on model

In [5]:
RandomSeed=42
Bestscores=[0,0]
coef=[0,0.1,0.01,0.001]
degree=[1,2,3]
C =[1,0.1,0.01,0.001]
shrink=[True,False]
decision_shape=['ovo','ovr']
tol=[1,0.1,0.01,0.001]

for testNumber in range(5):
    print("*"*10,end="\n\n")
    print("TEST NUMBER",testNumber+1,"Random Seed =",RandomSeed)
    r.seed(RandomSeed)
    RandomSeed=RandomSeed+10
    X_sm,y_sm,validateX,validateY=prepareData()
    for c in C:
        for dec in decision_shape:
            for d in degree:
                for co in coef:
                    for t in tol:
                        for s in shrink:
                            model=SVC(C=c,coef0=co,tol=t,kernel='linear',degree=d,gamma='auto',shrinking=s,decision_function_shape=dec)
                            model.fit(X_sm,y_sm)
                            y_preds=model.predict(validateX)
                            currScore=balanced_accuracy_score(validateY,y_preds)
                            if(currScore>Bestscores[0]):
                                Bestscores[0]=currScore
                                Bestscores[1]=model.get_params()
    print("BEST SCORE:",str(Bestscores[0])+"\n"+"PARAMS:",Bestscores[1])                            



**********

TEST NUMBER 1 Random Seed = 42
Hiding  24  habitable(3233,163,117,2031,2014,1845,1137,703,3716,153,3922,2883,130,128,1205,2156,2223,151,2542,1604,3133,3115,3741,2880, )
BEST SCORE: 0.8238636363636364
PARAMS: {'C': 0.001, 'cache_size': 200, 'class_weight': None, 'coef0': 0, 'decision_function_shape': 'ovo', 'degree': 1, 'gamma': 'auto', 'kernel': 'linear', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': False, 'tol': 0.1, 'verbose': False}
**********

TEST NUMBER 2 Random Seed = 52
Hiding  24  habitable(2014,130,3962,2882,2547,2189,2503,128,1137,1227,2883,1604,2541,3233,114,152,3132,3742,117,1424,163,1205,1845,2542, )
BEST SCORE: 0.8238636363636364
PARAMS: {'C': 0.001, 'cache_size': 200, 'class_weight': None, 'coef0': 0, 'decision_function_shape': 'ovo', 'degree': 1, 'gamma': 'auto', 'kernel': 'linear', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': False, 'tol': 0.1, 'verbose': False}
**********

TEST NUMBER 3 Random Seed

In [6]:
end=time.time()
print(begin-end)

-15550.479897975922
