In [None]:
# run in colab!

# Workaround for the "A UTF-8 locale is required" error: make the locale
# module always report UTF-8 as the preferred encoding.
import locale


def getpreferredencoding(do_setlocale=True):
    """Return "UTF-8" unconditionally; ``do_setlocale`` is accepted but ignored."""
    return "UTF-8"


# Monkey-patch the stdlib lookup with the fixed-answer replacement above.
locale.getpreferredencoding = getpreferredencoding

# !nvidia-smi

from google.colab import drive
drive.mount('/content/drive')

!pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12 cuml-cu12 cupy-cuda12x


from DataHandler import DataHandler
from SVCModelTrainer import SVCModelTrainer
from SVCModelTester import SVCModelTester


### SVM for photometric features

In [None]:
# Load the data used by the photometric-feature hyper-parameter searches.
# Without this, `data` is undefined at this point in the notebook and the
# trainer construction below raises NameError after Restart & Run All.
# NOTE(review): the feature file matches the final photometric model and the
# single-field subset (fields_list = ['W06']) mirrors the BCG / local-density
# searches further down — confirm this is the sample the search was run on.
data = DataHandler(validation_sample= False, features_txt= 'all_features.txt', balance= 'undersample', fields_list = ['W06'])
data.main()

# Candidate grids: one sub-grid per kernel (linear, polynomial, RBF).
search_param_distr  = [
    {"kernel": ["linear"], "C": [0.1, 1, 10]},
    {"kernel": ["poly"], "C": [0.1, 1, 10], 'degree' : [2,3,4], "gamma": ['scale', 'auto', 0.005, 0.1, 0.5]},
    {"kernel": ["rbf"], "C": [0.1, 1, 10], "gamma": ['scale', 'auto', 0.005, 0.1, 0.5]},
  ]

# Settings handed to params_search alongside the grid
# (presumably forwarded to the search class — confirm in SVCModelTrainer).
search_params = dict(
    cv = 2,
    n_jobs = -1,
    verbose = 10,
    scoring = ['f1'],
    refit = 'f1',
    error_score = 'raise',
)
search_class = "GridSearchCV"
# Label for this run; passed to both the trainer and params_search.
name = "search-1"

trainer = SVCModelTrainer(data = data, name = name)
trainer.params_search(search_param_distr = search_param_distr, search_params = search_params,
                      search_class = search_class, name = name)

In [None]:
# Second photometric search: RBF kernel only, over a wider range of C values.
search_param_distr = dict(
    kernel=["rbf"],
    C=[0.005, 0.01, 0.05, 0.1, 0.5, 1, 10],
    gamma=['auto', 0.005],
)

# Settings handed to params_search alongside the grid.
search_params = {
    "cv": 4,
    "n_jobs": -1,
    "verbose": 10,
    "scoring": ['f1'],
    "refit": 'f1',
    "error_score": 'raise',
}
search_class = "GridSearchCV"
# Label for this run; passed to both the trainer and params_search.
name = "search-2"

trainer = SVCModelTrainer(data=data, name=name)
trainer.params_search(
    search_param_distr=search_param_distr,
    search_params=search_params,
    search_class=search_class,
    name=name,
)

Fitting 4 folds for each of 14 candidates, totalling 56 fits
----------------------------------------------------------------------
Best model: score 0.6835157732035475
SVC()
Best model: not selected


In [None]:
# Hyper-parameters selected for the photometric-feature model.
params = dict(kernel="rbf", C=0.01, gamma='auto')

# Rebuild the data set from 'all_features.txt' (no fields_list restriction here).
data = DataHandler(
    validation_sample=False,
    features_txt='all_features.txt',
    balance='undersample',
)
data.main()

trainer = SVCModelTrainer(data=data, name='all_features')
# Training is disabled in this cell; load_model() is called instead
# (presumably it restores a previously saved model — confirm in SVCModelTrainer).
# To retrain, use: trainer.train_model(model_params = params)
trainer.load_model()

Training: 224399 members, 469500 non members
Testing: 96171 members, 201218 non members
----------------------------------------------------------------------
Training members after undersampling...: 224399 (50.00%)
Training non members after undersampling...: 224399 (50.00%)


In [None]:
# Save the model in Google Drive: copy the serialized .joblib file from
# saved_models/ into the mounted Drive folder.
!cp saved_models/all_features.joblib '/content/drive/MyDrive/Cluster-Membership-DATA/'

In [None]:
# Run the tester on the model held by the trainer, using the same data object.
tester = SVCModelTester(
    model=trainer.model,
    data=data,
    name='all_features',
)
tester.main()

### SVM with BCG redshift


In [None]:
# Data for the BCG-redshift hyper-parameter searches: features from
# 'all_features_bcg.txt', with fields_list limited to 'W06'.
data = DataHandler(
    validation_sample=False,
    features_txt='all_features_bcg.txt',
    balance='undersample',
    fields_list=['W06'],
)
data.main()

Training: 27361 members, 63746 non members
Testing: 11726 members, 27321 non members
----------------------------------------------------------------------
Training members after undersampling...: 27361 (50.00%)
Training non members after undersampling...: 27361 (50.00%)


In [None]:
# Candidate grids: one sub-grid per kernel (linear, polynomial, RBF).
search_param_distr = [
    dict(kernel=["linear"], C=[0.01, 0.1]),
    dict(kernel=["poly"], C=[0.01, 0.1], degree=[2, 3], gamma=['scale', 'auto', 0.005, 0.01]),
    dict(kernel=["rbf"], C=[0.01, 0.1], gamma=['scale', 'auto', 0.005, 0.01]),
]

# Settings handed to params_search alongside the grid.
search_params = {
    "cv": 2,
    "n_jobs": -1,
    "verbose": 100,
    "scoring": ['f1'],
    "refit": 'f1',
    "error_score": 'raise',
}
search_class = "GridSearchCV"
# Label for this run; passed to both the trainer and params_search.
name = "search-1"

trainer = SVCModelTrainer(data=data, name=name)
trainer.params_search(
    search_param_distr=search_param_distr,
    search_params=search_params,
    search_class=search_class,
    name=name,
)

Fitting 2 folds for each of 26 candidates, totalling 52 fits
----------------------------------------------------------------------
Best model: score 0.7659939184424148
SVC()


In [None]:
# RBF-only grid over C and gamma.
search_param_distr = dict(
    kernel=["rbf"],
    C=[0.05, 0.1, 0.5],
    gamma=['auto', 0.01, 0.05],
)

# Settings handed to params_search alongside the grid.
search_params = {
    "cv": 2,
    "n_jobs": -1,
    "verbose": 100,
    "scoring": ['f1'],
    "refit": 'f1',
    "error_score": 'raise',
}
search_class = "GridSearchCV"
# Label for this run; passed to both the trainer and params_search.
name = "search-2"

trainer = SVCModelTrainer(data=data, name=name)
trainer.params_search(
    search_param_distr=search_param_distr,
    search_params=search_params,
    search_class=search_class,
    name=name,
)

Fitting 2 folds for each of 9 candidates, totalling 18 fits
----------------------------------------------------------------------
Best model: score 0.8044779405471654
SVC()


In [None]:
# RBF-only grid, extending gamma to larger values.
search_param_distr = dict(
    kernel=["rbf"],
    C=[0.05, 0.3, 0.7],
    gamma=['auto', 0.05, 0.1, 0.5],
)

# Settings handed to params_search alongside the grid.
search_params = {
    "cv": 2,
    "n_jobs": -1,
    "verbose": 100,
    "scoring": ['f1'],
    "refit": 'f1',
    "error_score": 'raise',
}
search_class = "GridSearchCV"
# Label for this run; passed to both the trainer and params_search.
name = "search-3"

trainer = SVCModelTrainer(data=data, name=name)
trainer.params_search(
    search_param_distr=search_param_distr,
    search_params=search_params,
    search_class=search_class,
    name=name,
)

Fitting 2 folds for each of 12 candidates, totalling 24 fits
----------------------------------------------------------------------
Best model: score 0.8091026953214238
SVC()


In [None]:
# RBF-only grid over larger C values, with 4-fold cross-validation.
search_param_distr = dict(
    kernel=["rbf"],
    C=[1, 3, 5, 7, 10],
    gamma=[0.05, 0.1, 0.2],
)

# Settings handed to params_search alongside the grid.
search_params = {
    "cv": 4,
    "n_jobs": -1,
    "verbose": 100,
    "scoring": ['f1'],
    "refit": 'f1',
    "error_score": 'raise',
}
search_class = "GridSearchCV"
# Label for this run; passed to both the trainer and params_search.
name = "search-4"

trainer = SVCModelTrainer(data=data, name=name)
trainer.params_search(
    search_param_distr=search_param_distr,
    search_params=search_params,
    search_class=search_class,
    name=name,
)

Fitting 4 folds for each of 15 candidates, totalling 60 fits
----------------------------------------------------------------------
Best model: score 0.8255947912962573
SVC()


In [None]:
# Hyper-parameters selected for the BCG-redshift model.
params = dict(kernel="rbf", C=10, gamma=0.1)

# Rebuild the data set from 'all_features_bcg.txt' (no fields_list restriction here).
data = DataHandler(
    validation_sample=False,
    features_txt='all_features_bcg.txt',
    balance='undersample',
)
data.main()

# Train the model with the selected hyper-parameters.
trainer = SVCModelTrainer(data=data, name='all_features_bcg')
trainer.train_model(model_params=params)

Training: 224399 members, 469500 non members
Testing: 96171 members, 201218 non members
----------------------------------------------------------------------
Training members after undersampling...: 224399 (50.00%)
Training non members after undersampling...: 224399 (50.00%)


In [None]:
# Save the model in Google Drive: copy the serialized .joblib file from
# saved_models/ into the mounted Drive folder.
!cp saved_models/all_features_bcg.joblib '/content/drive/MyDrive/Cluster-Membership-DATA/'

In [None]:
# Run the tester on the model held by the trainer, using the same data object.
tester = SVCModelTester(
    model=trainer.model,
    data=data,
    name='all_features_bcg',
)
tester.main()

### SVM with local density

In [None]:
# Data for the local-density hyper-parameter searches: features from
# 'all_features_sigmas.txt', with fields_list limited to 'W06'.
data = DataHandler(
    validation_sample=False,
    features_txt='all_features_sigmas.txt',
    balance='undersample',
    fields_list=['W06'],
)
data.main()

Training: 27361 members, 63746 non members
Testing: 11726 members, 27321 non members
----------------------------------------------------------------------
Training members after undersampling...: 27361 (50.00%)
Training non members after undersampling...: 27361 (50.00%)


In [None]:
# Candidate grids: one sub-grid per kernel (linear, polynomial, RBF).
search_param_distr = [
    dict(kernel=["linear"], C=[0.1, 1, 5]),
    dict(kernel=["poly"], C=[0.1, 1, 5], degree=[2, 3], gamma=['scale', 'auto', 0.005, 0.1]),
    dict(kernel=["rbf"], C=[0.1, 1, 5], gamma=['scale', 'auto', 0.005, 0.1]),
]

# Settings handed to params_search alongside the grid.
search_params = {
    "cv": 2,
    "n_jobs": -1,
    "verbose": 10,
    "scoring": ['f1'],
    "refit": 'f1',
    "error_score": 'raise',
}
search_class = "GridSearchCV"
# Label for this run; passed to both the trainer and params_search.
name = "search-1"

trainer = SVCModelTrainer(data=data, name=name)
trainer.params_search(
    search_param_distr=search_param_distr,
    search_params=search_params,
    search_class=search_class,
    name=name,
)

Fitting 2 folds for each of 39 candidates, totalling 78 fits
----------------------------------------------------------------------
Best model: score 0.7996234022431391
SVC()


In [None]:
# RBF-only grid over C and gamma.
# This must be a plain dict, like the other single-grid searches in this
# notebook: a trailing comma after the closing brace would silently wrap it
# in a one-element tuple.
search_param_distr = {
    "kernel": ["rbf"],
    "C": [3, 5, 7, 10],
    "gamma": ['scale', 0.005, 0.01, 0.001],
}

# Settings handed to params_search alongside the grid.
search_params = dict(
    cv = 2,
    n_jobs = -1,
    verbose = 10,
    scoring = ['f1'],
    refit = 'f1',
    error_score = 'raise',
)
search_class = "GridSearchCV"
# Label for this run; passed to both the trainer and params_search.
name = "search-2"

# Build a trainer for the current data and run the configured parameter search.
trainer = SVCModelTrainer(data=data, name=name)
trainer.params_search(
    search_param_distr=search_param_distr,
    search_params=search_params,
    search_class=search_class,
    name=name,
)

Fitting 2 folds for each of 16 candidates, totalling 32 fits
----------------------------------------------------------------------
Best model: score 0.8066730738512473
SVC()


In [None]:
# RBF-only grid with smaller gamma values and 4-fold cross-validation.
# This must be a plain dict, like the other single-grid searches in this
# notebook: a trailing comma after the closing brace would silently wrap it
# in a one-element tuple.
search_param_distr = {
    "kernel": ["rbf"],
    "C": [3, 5, 7, 10, 15],
    "gamma": ['scale', 0.001, 0.0005],
}

# Settings handed to params_search alongside the grid.
search_params = dict(
    cv = 4,
    n_jobs = -1,
    verbose = 10,
    scoring = ['f1'],
    refit = 'f1',
    error_score = 'raise',
)
search_class = "GridSearchCV"
# Label for this run; passed to both the trainer and params_search.
name = "search-3"

# Build a trainer for the current data and run the configured parameter search.
trainer = SVCModelTrainer(data=data, name=name)
trainer.params_search(
    search_param_distr=search_param_distr,
    search_params=search_params,
    search_class=search_class,
    name=name,
)

Fitting 4 folds for each of 15 candidates, totalling 60 fits
----------------------------------------------------------------------
Best model: score 0.8167198859241869
SVC()


In [None]:
# Hyper-parameters selected for the local-density model.
params = dict(kernel="rbf", C=3, gamma=0.0005)

# Rebuild the data set from 'all_features_sigmas.txt' (no fields_list restriction here).
data = DataHandler(
    validation_sample=False,
    features_txt='all_features_sigmas.txt',
    balance='undersample',
)
data.main()

# Train the model with the selected hyper-parameters.
trainer = SVCModelTrainer(data=data, name='all_features_sigmas')
trainer.train_model(model_params=params)

Training: 224399 members, 469500 non members
Testing: 96171 members, 201218 non members
----------------------------------------------------------------------
Training members after undersampling...: 224399 (50.00%)
Training non members after undersampling...: 224399 (50.00%)


In [None]:
# Save the model in Google Drive: copy the serialized .joblib file from
# saved_models/ into the mounted Drive folder.
!cp saved_models/all_features_sigmas.joblib '/content/drive/MyDrive/Cluster-Membership-DATA/'

In [None]:
# Run the tester on the model held by the trainer, using the same data object.
tester = SVCModelTester(
    model=trainer.model,
    data=data,
    name='all_features_sigmas',
)
tester.main()