In [3]:
import numpy as np
from matplotlib import pyplot as plt
import matplotlib
from matplotlib import colors as mcolors
import pandas as pd
import sklearn as sk
import xgboost
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

#%matplotlib inline

***Load training dataset.***

In [2]:
import pandas as pd
dataset = pd.read_csv("training_set.csv")
dataset.head()


Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.4229,2,-544.810303,3.622952,1
1,615,59750.4306,1,-816.434326,5.55337,1
2,615,59750.4383,3,-471.385529,3.801213,1
3,615,59750.445,4,-388.984985,11.395031,1
4,615,59752.407,2,-681.858887,4.041204,1


In [3]:
meta_dataset = pd.read_csv("training_set_metadata.csv")
column_names = {6: "class_6", 15: "class_15", 16: "class_16", 42: "class_42", 52: "class_52", 53: "class_53",
                62: "class_62", 64: "class_64", 65: "class_65", 67: "class_67", 88: "class_88", 90: "class_90",
                92: "class_92", 95: "class_95"}
meta_dataset["target"] = list(map(lambda name: column_names[name], meta_dataset["target"]))
meta_dataset.head()

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,class_92
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,0.007,class_88
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,0.021,class_42
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,0.007,class_90
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,0.024,class_90


In [4]:
meta_dataset['in_our_galaxy'] = meta_dataset['distmod'].apply(np.isnan).astype(int)
meta_dataset.head()

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,in_our_galaxy
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,class_92,1
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,0.007,class_88,0
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,0.021,class_42,0
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,0.007,class_90,0
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,0.024,class_90,0


In [5]:
meta_dataset=meta_dataset.replace(np.nan, 0)

In [6]:
check_nan = meta_dataset.isnull().values.any()
print(check_nan)

False


In [7]:
meta_dataset.head()

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,in_our_galaxy
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,0.0,0.017,class_92,1
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,0.007,class_88,0
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,0.021,class_42,0
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,0.007,class_90,0
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,0.024,class_90,0


In [8]:
meta_dataset.shape

(7848, 13)

In [9]:
meta_dataset = pd.DataFrame(meta_dataset)

In [10]:
meta_dataset = meta_dataset.drop(['hostgal_specz'], axis=1)



In [11]:
meta_dataset.shape

(7848, 12)

In [12]:
training_dataset = pd.merge(dataset, meta_dataset) # with additional computed features


In [13]:
training_dataset.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,ra,decl,gal_l,gal_b,ddf,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,in_our_galaxy
0,615,59750.4229,2,-544.810303,3.622952,1,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,0.017,class_92,1
1,615,59750.4306,1,-816.434326,5.55337,1,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,0.017,class_92,1
2,615,59750.4383,3,-471.385529,3.801213,1,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,0.017,class_92,1
3,615,59750.445,4,-388.984985,11.395031,1,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,0.017,class_92,1
4,615,59752.407,2,-681.858887,4.041204,1,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,0.017,class_92,1


***Group objects by object_id and then by passband number***

In [14]:
gb = training_dataset.groupby('object_id')
object_id = np.linspace(1, len(set(training_dataset['object_id'])), len(set(training_dataset['object_id'])))

In [15]:
object_id

array([1.000e+00, 2.000e+00, 3.000e+00, ..., 7.846e+03, 7.847e+03,
       7.848e+03])

In [16]:
pass_flu = dict()
for x in gb.groups:
    pass_flu[x] = dict()
    length = len(gb.get_group(x)['mjd'])
    pass_gb = gb.get_group(x)[['mjd', 'passband', 'flux', 'target', 'in_our_galaxy']].groupby('passband')
    for i in pass_gb.groups:
        pass_flu[x][i] = pass_gb.get_group(i)[['mjd', 'flux', 'target', 'in_our_galaxy']].sort_values(by=['mjd'])
        # range of particular object measurements
        pass_flu[x]['range'] = length
        # range of particular passband measurements
        pass_flu[x]['range_{}'.format(i)] = len(pass_flu[x][i])

***Find maximum sample size per passband***

In [17]:
max_n_measurement_samples = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
for i in set(training_dataset['object_id']):
    if pass_flu[i]['range_0'] > max_n_measurement_samples[0]:
        max_n_measurement_samples[0] = pass_flu[i]['range_0']
    if pass_flu[i]['range_1'] > max_n_measurement_samples[1]:
        max_n_measurement_samples[1] = pass_flu[i]['range_1']
    if pass_flu[i]['range_2'] > max_n_measurement_samples[2]:
        max_n_measurement_samples[2] = pass_flu[i]['range_2']
    if pass_flu[i]['range_3'] > max_n_measurement_samples[3]:
        max_n_measurement_samples[3] = pass_flu[i]['range_3']
    if pass_flu[i]['range_4'] > max_n_measurement_samples[4]:
        max_n_measurement_samples[4] = pass_flu[i]['range_4']
    if pass_flu[i]['range_5'] > max_n_measurement_samples[5]:
        max_n_measurement_samples[5] = pass_flu[i]['range_5']
length = sum([value for key, value in max_n_measurement_samples.items()])

In [18]:
def concatention():
    """Concatenate each passband samples to the maximum value (need to be the same length around all objects,
    simple method fills missing values with zeros.)"""
    y = pd.concat([pass_flu[i][0][['mjd', 'flux']],
                   pd.DataFrame(0, index=np.arange(max_n_measurement_samples[0]-pass_flu[i]['range_0']), 
                                columns=['mjd', 'flux']),
                   pass_flu[i][1][['mjd', 'flux']],
                   pd.DataFrame(0, index=np.arange(max_n_measurement_samples[1]-pass_flu[i]['range_1']), 
                                columns=['mjd', 'flux']),
                   pass_flu[i][2][['mjd', 'flux']],
                   pd.DataFrame(0, index=np.arange(max_n_measurement_samples[2]-pass_flu[i]['range_2']), 
                                columns=['mjd', 'flux']),
                   pass_flu[i][3][['mjd', 'flux']],
                   pd.DataFrame(0, index=np.arange(max_n_measurement_samples[3]-pass_flu[i]['range_3']), 
                                columns=['mjd', 'flux']),
                   pass_flu[i][4][['mjd', 'flux']],
                   pd.DataFrame(0, index=np.arange(max_n_measurement_samples[4]-pass_flu[i]['range_4']), 
                                columns=['mjd', 'flux']),
                   pass_flu[i][5][['mjd', 'flux']],
                   pd.DataFrame(0, index=np.arange(max_n_measurement_samples[5]-pass_flu[i]['range_5']), 
                                columns=['mjd', 'flux']),
                  ])
    return y['flux']

***Cesium library usage***

In [19]:
import cesium
from cesium import featurize
features_to_use = ["amplitude",
                   "flux_percentile_ratio_mid20",
                   "flux_percentile_ratio_mid35",
                   "flux_percentile_ratio_mid50",
                   "flux_percentile_ratio_mid65",
                   "flux_percentile_ratio_mid80",
                   "percent_beyond_1_std",
                   "maximum",
                   "max_slope",
                   "median",
                   "median_absolute_deviation",
                   "percent_close_to_median",
                   "percent_difference_flux_percentile",
                   "minimum",
                   "percent_amplitude",
                   "skew",
                   "period_fast",
                   "qso_log_chi2_qsonu",
                   "qso_log_chi2nuNULL_chi2nu",
                   "std",
                   "stetson_j",
                   "stetson_k",
                   "weighted_average",
                   "all_times_nhist_numpeaks",
                   "all_times_nhist_peak1_bin",
                   "all_times_nhist_peak2_bin",
                   "all_times_nhist_peak3_bin",
                   "all_times_nhist_peak4_bin",
                   "all_times_nhist_peak_1_to_2",
                   "all_times_nhist_peak_1_to_3",
                   "all_times_nhist_peak_1_to_4",
                   "all_times_nhist_peak_2_to_3",
                   "all_times_nhist_peak_2_to_4",
                   "all_times_nhist_peak_3_to_4",
                   "all_times_nhist_peak_val",
                   "avg_double_to_single_step",
                   "avg_err",
                   "avgt",
                   "cad_probs_1",
                   "cad_probs_10",
                   "cad_probs_20",
                   "cad_probs_30",
                   "cad_probs_40",
                   "cad_probs_50",
                   "cad_probs_100",
                   "cad_probs_500",
                   "cad_probs_1000",
                   "cad_probs_5000",
                   "cad_probs_10000",
                   "cad_probs_50000",
                   "cad_probs_100000",
                   "cad_probs_500000",
                   "cad_probs_1000000",
                   "cad_probs_5000000",
                   "cad_probs_10000000",
                   "cads_avg",
                   "cads_med",
                   "cads_std",
                   "mean",
                   "med_double_to_single_step",
                   "med_err",
                   "n_epochs",
                   "std_double_to_single_step",
                   "std_err",
                   "total_time"]


***Compute new features by cesium***

In [20]:
gb = dataset.groupby('object_id')
fset_cesium = featurize.featurize_time_series(times=[gb.get_group(x)['mjd'].values for x in gb.groups],
                                              values=[gb.get_group(x)['flux'].values for x in gb.groups],
                                              errors=[gb.get_group(x)['flux_err'].values for x in gb.groups],
                                              features_to_use=features_to_use)

  return (cads[2:] + cads[:-2]) / (cads[1:-1] - cads[:-2])
  x = asanyarray(arr - arrmean)
  linear_scale_data = base ** (exponent * x)
  linear_scale_data = base ** (exponent * x)
  linear_scale_data = base ** (exponent * x)
  linear_scale_data = base ** (exponent * x)
  linear_scale_data = base ** (exponent * x)
  linear_scale_data = base ** (exponent * x)
  linear_scale_data = base ** (exponent * x)
  linear_scale_data = base ** (exponent * x)
  return (cads[2:] + cads[:-2]) / (cads[1:-1] - cads[:-2])
  x = asanyarray(arr - arrmean)
  linear_scale_data = base ** (exponent * x)
  linear_scale_data = base ** (exponent * x)
  return (y_95 - y_5) / y_50
  return (cads[2:] + cads[:-2]) / (cads[1:-1] - cads[:-2])
  x = asanyarray(arr - arrmean)
  linear_scale_data = base ** (exponent * x)
  diff_b_a = subtract(b, a)
  linear_scale_data = base ** (exponent * x)
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  linear_scale_data = base ** (exponent * x)
  diff_b_a = subtract(b, a)
 

In [21]:
fset_cesium.insert(loc=0, column='object_id', value=dataset.groupby('object_id').sum().index)

In [22]:
fset_cesium.columns = fset_cesium.columns.droplevel(1)

In [23]:
fset_cesium.head()

feature,object_id,amplitude,flux_percentile_ratio_mid20,flux_percentile_ratio_mid35,flux_percentile_ratio_mid50,flux_percentile_ratio_mid65,flux_percentile_ratio_mid80,percent_beyond_1_std,maximum,max_slope,...,cads_avg,cads_med,cads_std,mean,med_double_to_single_step,med_err,n_epochs,std_double_to_single_step,std_err,total_time
0,615,880.533203,,,,,,0.392045,660.626343,116805.166334,...,2.489431,0.0111,17.326857,-123.096998,0.006542,3.835268,352.0,,1.742267,873.7903
1,713,14.753032,0.001668,0.005248,0.018069,0.123541,0.254958,0.425714,14.770886,1580.576999,...,2.432148,0.011,18.975828,-1.423351,0.007325,1.998217,350.0,,1.50773,848.8198
2,730,33.234935,0.028363,0.053155,0.095119,0.152027,0.448602,0.154545,47.310059,3949.860787,...,2.595275,0.011,18.160894,2.267434,0.006387,1.990851,330.0,,1.718524,853.8455
3,745,118.144837,0.024878,0.048751,0.082448,0.147631,0.29117,0.082621,220.795212,11188.062767,...,2.43916,0.0111,16.745572,8.909206,0.006367,1.819875,351.0,,3.532281,853.706
4,1124,80.071971,0.009432,0.016523,0.025336,0.042001,0.120672,0.096591,143.600189,13330.938698,...,2.489431,0.0111,17.326857,7.145702,0.006542,2.214854,352.0,,1.931089,873.7903


In [24]:
fset_cesium.isnull().sum().head()

feature
object_id                        0
amplitude                        0
flux_percentile_ratio_mid20    362
flux_percentile_ratio_mid35    362
flux_percentile_ratio_mid50    362
dtype: int64

***Impute missing values with median***

In [25]:
fset_cesium=fset_cesium.replace(np.nan, -1)
fset_cesium=fset_cesium.replace(np.inf, -1)

In [26]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=-1, strategy='median')
x = imp.fit_transform(fset_cesium.values)
x = imp.fit_transform(x)

In [27]:
fset_cesium_imputed = pd.DataFrame(data=x, columns=fset_cesium.columns)

In [28]:
fset_cesium_imputed['object_id'] = fset_cesium_imputed['object_id'].apply(int)
fset_cesium_imputed.head()

feature,object_id,amplitude,flux_percentile_ratio_mid20,flux_percentile_ratio_mid35,flux_percentile_ratio_mid50,flux_percentile_ratio_mid65,flux_percentile_ratio_mid80,percent_beyond_1_std,maximum,max_slope,...,cads_avg,cads_med,cads_std,mean,med_double_to_single_step,med_err,n_epochs,std_double_to_single_step,std_err,total_time
0,615,880.533203,9.12479e-10,4.385178e-09,3.470046e-08,8.375927e-07,0.000408,0.392045,660.626343,116805.166334,...,2.489431,0.0111,17.326857,-123.096998,0.006542,3.835268,352.0,194.500737,1.742267,873.7903
1,713,14.753032,0.001668044,0.005248029,0.01806853,0.1235408,0.254958,0.425714,14.770886,1580.576999,...,2.432148,0.011,18.975828,-1.423351,0.007325,1.998217,350.0,194.500737,1.50773,848.8198
2,730,33.234935,0.02836346,0.05315546,0.09511855,0.1520266,0.448602,0.154545,47.310059,3949.860787,...,2.595275,0.011,18.160894,2.267434,0.006387,1.990851,330.0,194.500737,1.718524,853.8455
3,745,118.144837,0.02487798,0.04875097,0.08244846,0.1476312,0.29117,0.082621,220.795212,11188.062767,...,2.43916,0.0111,16.745572,8.909206,0.006367,1.819875,351.0,194.500737,3.532281,853.706
4,1124,80.071971,0.009431657,0.01652333,0.02533619,0.04200131,0.120672,0.096591,143.600189,13330.938698,...,2.489431,0.0111,17.326857,7.145702,0.006542,2.214854,352.0,194.500737,1.931089,873.7903


***Computing additional features (manual)***

In [29]:
for key, val in {'u': 0, 'g': 1, 'r': 2, 'i': 3, 'z': 4, 'y': 5}.items():
    dataset[key] = (dataset['passband']-val).apply(np.bool).apply(np.logical_not).apply(np.int)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dataset[key] = (dataset['passband']-val).apply(np.bool).apply(np.logical_not).apply(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dataset[key] = (dataset['passband']-val).apply(np.bool).apply(np.logical_not).apply(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dataset[key] = (dataset['passband']-val).apply(np.bool).apply(np.logical_not).apply(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dataset[key] = (dataset['passband']-val).apply(np.bool).apply(np.logical_not).apply(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dataset[key]

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,u,g,r,i,z,y
0,615,59750.4229,2,-544.810303,3.622952,1,0,0,1,0,0,0
1,615,59750.4306,1,-816.434326,5.55337,1,0,1,0,0,0,0
2,615,59750.4383,3,-471.385529,3.801213,1,0,0,0,1,0,0
3,615,59750.445,4,-388.984985,11.395031,1,0,0,0,0,1,0
4,615,59752.407,2,-681.858887,4.041204,1,0,0,1,0,0,0


In [30]:
import gc
gc.enable()
dataset['flux_ratio_sq'] = np.power(dataset['flux'] / dataset['flux_err'], 2.0)
dataset['flux_by_flux_ratio_sq'] = dataset['flux'] * dataset['flux_ratio_sq']

aggs = {
    'mjd': ['min', 'max', 'size'],
    'passband': ['min', 'max', 'mean', 'median', 'std'],
    'flux': ['min', 'max', 'mean', 'median', 'std','skew'],
    'flux_err': ['min', 'max', 'mean', 'median', 'std','skew'],
    'detected': ['mean'],
    'flux_ratio_sq':['sum','skew'],
    'flux_by_flux_ratio_sq':['sum','skew'],
    'r': ['sum', 'mean'],
    'g': ['sum', 'mean'],
    'i': ['sum', 'mean'],
    'u': ['sum', 'mean'],
    'z': ['sum', 'mean'],
    'y': ['sum', 'mean'],
}

agg_train = dataset.groupby('object_id').agg(aggs)
new_columns = [k + '_' + agg for k in aggs.keys() for agg in aggs[k]]
agg_train.columns = new_columns
agg_train['mjd_diff'] = agg_train['mjd_max'] - agg_train['mjd_min']
agg_train['flux_diff'] = agg_train['flux_max'] - agg_train['flux_min']
agg_train['flux_dif2'] = (agg_train['flux_max'] - agg_train['flux_min']) / agg_train['flux_mean']
agg_train['flux_w_mean'] = agg_train['flux_by_flux_ratio_sq_sum'] / agg_train['flux_ratio_sq_sum']
agg_train['flux_dif3'] = (agg_train['flux_max'] - agg_train['flux_min']) / agg_train['flux_w_mean']

del agg_train['mjd_max'], agg_train['mjd_min']
agg_train.head()

gc.collect()

0

In [31]:
agg_train = agg_train.reset_index()

In [32]:
agg_train.head()

Unnamed: 0,object_id,mjd_size,passband_min,passband_max,passband_mean,passband_median,passband_std,flux_min,flux_max,flux_mean,...,u_mean,z_sum,z_mean,y_sum,y_mean,mjd_diff,flux_diff,flux_dif2,flux_w_mean,flux_dif3
0,615,352,0,5,2.457386,2.0,1.720797,-1100.440063,660.626343,-123.096998,...,0.178977,58,0.164773,57,0.161932,873.7903,1761.066406,-14.306331,-327.742307,-5.373326
1,713,350,0,5,2.4,2.0,1.746056,-14.735178,14.770886,-1.423351,...,0.2,56,0.16,56,0.16,848.8198,29.506064,-20.730002,-4.884564,-6.040676
2,730,330,0,5,2.336364,2.0,1.75875,-19.159811,47.310059,2.267434,...,0.218182,51,0.154545,51,0.154545,853.8455,66.46987,29.315018,25.37311,2.619697
3,745,351,0,5,2.378917,2.0,1.747328,-15.494463,220.795212,8.909206,...,0.205128,56,0.159544,55,0.156695,853.706,236.289675,26.521968,152.835617,1.546038
4,1124,352,0,5,2.457386,2.0,1.720797,-16.543753,143.600189,7.145702,...,0.178977,58,0.164773,57,0.161932,873.7903,160.143942,22.411225,87.85639,1.822792


In [33]:
agg_train.shape

(7848, 41)

In [34]:
fset_cesium_imputed.shape

(7848, 66)

***Merging two training datasets (manually computed features and from cesium)***

In [35]:
mer_data = pd.merge(agg_train, fset_cesium_imputed)

In [36]:
mer_data.shape

(7848, 106)

In [37]:
mer_data.head()

Unnamed: 0,object_id,mjd_size,passband_min,passband_max,passband_mean,passband_median,passband_std,flux_min,flux_max,flux_mean,...,cads_avg,cads_med,cads_std,mean,med_double_to_single_step,med_err,n_epochs,std_double_to_single_step,std_err,total_time
0,615,352,0,5,2.457386,2.0,1.720797,-1100.440063,660.626343,-123.096998,...,2.489431,0.0111,17.326857,-123.096998,0.006542,3.835268,352.0,194.500737,1.742267,873.7903
1,713,350,0,5,2.4,2.0,1.746056,-14.735178,14.770886,-1.423351,...,2.432148,0.011,18.975828,-1.423351,0.007325,1.998217,350.0,194.500737,1.50773,848.8198
2,730,330,0,5,2.336364,2.0,1.75875,-19.159811,47.310059,2.267434,...,2.595275,0.011,18.160894,2.267434,0.006387,1.990851,330.0,194.500737,1.718524,853.8455
3,745,351,0,5,2.378917,2.0,1.747328,-15.494463,220.795212,8.909206,...,2.43916,0.0111,16.745572,8.909206,0.006367,1.819875,351.0,194.500737,3.532281,853.706
4,1124,352,0,5,2.457386,2.0,1.720797,-16.543753,143.600189,7.145702,...,2.489431,0.0111,17.326857,7.145702,0.006542,2.214854,352.0,194.500737,1.931089,873.7903


 ***Merg training dataset and meta_dataset***

In [38]:
training_dataset = pd.merge(mer_data, meta_dataset) # with additional computed features
training_dataset.head()

Unnamed: 0,object_id,mjd_size,passband_min,passband_max,passband_mean,passband_median,passband_std,flux_min,flux_max,flux_mean,...,decl,gal_l,gal_b,ddf,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,in_our_galaxy
0,615,352,0,5,2.457386,2.0,1.720797,-1100.440063,660.626343,-123.096998,...,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,0.017,class_92,1
1,713,350,0,5,2.4,2.0,1.746056,-14.735178,14.770886,-1.423351,...,-27.784405,223.525509,-54.460748,1,1.6267,0.2552,45.4063,0.007,class_88,0
2,730,330,0,5,2.336364,2.0,1.75875,-19.159811,47.310059,2.267434,...,-6.579593,170.455585,-61.548219,1,0.2262,0.0157,40.2561,0.021,class_42,0
3,745,351,0,5,2.378917,2.0,1.747328,-15.494463,220.795212,8.909206,...,-45.586655,328.254458,-68.969298,1,0.2813,1.1523,40.7951,0.007,class_90,0
4,1124,352,0,5,2.457386,2.0,1.720797,-16.543753,143.600189,7.145702,...,-63.823658,316.922299,-51.059403,1,0.2415,0.0176,40.4166,0.024,class_90,0


In [39]:
training_dataset.to_csv('training_dataset.csv', index=False)
training_dataset = pd.read_csv('training_dataset.csv', index_col=0)
training_dataset.head()

Unnamed: 0_level_0,mjd_size,passband_min,passband_max,passband_mean,passband_median,passband_std,flux_min,flux_max,flux_mean,flux_median,...,decl,gal_l,gal_b,ddf,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,in_our_galaxy
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
615,352,0,5,2.457386,2.0,1.720797,-1100.440063,660.626343,-123.096998,-89.477524,...,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,0.017,class_92,1
713,350,0,5,2.4,2.0,1.746056,-14.735178,14.770886,-1.423351,-0.873033,...,-27.784405,223.525509,-54.460748,1,1.6267,0.2552,45.4063,0.007,class_88,0
730,330,0,5,2.336364,2.0,1.75875,-19.159811,47.310059,2.267434,0.409172,...,-6.579593,170.455585,-61.548219,1,0.2262,0.0157,40.2561,0.021,class_42,0
745,351,0,5,2.378917,2.0,1.747328,-15.494463,220.795212,8.909206,1.035895,...,-45.586655,328.254458,-68.969298,1,0.2813,1.1523,40.7951,0.007,class_90,0
1124,352,0,5,2.457386,2.0,1.720797,-16.543753,143.600189,7.145702,1.141288,...,-63.823658,316.922299,-51.059403,1,0.2415,0.0176,40.4166,0.024,class_90,0


In [40]:
training_dataset = training_dataset.drop(['passband_min', 'passband_max', 'cad_probs_1', 'cad_probs_500000',
                                          'cad_probs_1000000', 'cad_probs_5000000', 'cad_probs_10000000'], axis=1)

In [41]:
from scipy.stats import boxcox
training_dataset['flux_std'] = boxcox(training_dataset['flux_std'], -0.5)
training_dataset['flux_err_min'] = boxcox(training_dataset['flux_err_min'], -0.5)
training_dataset['flux_err_max'] = boxcox(training_dataset['flux_err_max'], -0.5)
training_dataset['flux_err_mean'] = boxcox(training_dataset['flux_err_mean'], -0.5)
training_dataset['flux_err_median'] = boxcox(training_dataset['flux_err_median'], -0.5)
training_dataset['flux_err_std'] = boxcox(training_dataset['flux_err_std'], -0.5)
training_dataset['detected_mean'] = boxcox(training_dataset['detected_mean'], 0)
training_dataset['flux_ratio_sq_sum'] = boxcox(training_dataset['flux_ratio_sq_sum'], 0)
training_dataset['flux_diff'] = boxcox(training_dataset['flux_diff'], -0.5)
training_dataset['amplitude'] = boxcox(training_dataset['amplitude'], -0.5)
training_dataset['max_slope'] = boxcox(training_dataset['max_slope'], 0)
training_dataset['median_absolute_deviation'] = boxcox(training_dataset['median_absolute_deviation'], -0.5)
training_dataset['period_fast'] = boxcox(training_dataset['period_fast'], 0)
training_dataset['std'] = boxcox(training_dataset['std'], -0.5)
training_dataset['stetson_j'] = boxcox(training_dataset['stetson_j'], -0.5)
training_dataset['all_times_nhist_numpeaks'] = boxcox(training_dataset['all_times_nhist_numpeaks'], 0)
training_dataset['all_times_nhist_peak_1_to_2'] = boxcox(training_dataset['all_times_nhist_peak_1_to_2'], -0.5)
training_dataset['all_times_nhist_peak_1_to_3'] = boxcox(training_dataset['all_times_nhist_peak_1_to_3'], -1)
training_dataset['all_times_nhist_peak_1_to_4'] = boxcox(training_dataset['all_times_nhist_peak_1_to_4'], -1)
training_dataset['all_times_nhist_peak_2_to_3'] = boxcox(training_dataset['all_times_nhist_peak_2_to_3'], -1)
training_dataset['all_times_nhist_peak_2_to_4'] = boxcox(training_dataset['all_times_nhist_peak_2_to_4'], -1)
training_dataset['all_times_nhist_peak_3_to_4'] = boxcox(training_dataset['all_times_nhist_peak_3_to_4'], -1)
training_dataset['all_times_nhist_peak_val'] = boxcox(training_dataset['all_times_nhist_peak_val'], -0.5)
training_dataset['avg_err'] = boxcox(training_dataset['avg_err'], -0.5)
training_dataset['avgt'] = boxcox(training_dataset['avgt'], -1)
training_dataset['cads_avg'] = boxcox(training_dataset['cads_avg'], 0)
training_dataset['cads_std'] = boxcox(training_dataset['cads_std'], -1)
training_dataset['med_err'] = boxcox(training_dataset['med_err'], -0.5)
training_dataset['std_err'] = boxcox(training_dataset['std_err'], -0.5)
training_dataset['total_time'] = boxcox(training_dataset['total_time'], -1)
training_dataset['mwebv'] = boxcox(training_dataset['mwebv'], 0)

In [42]:
training_dataset.head()

Unnamed: 0_level_0,mjd_size,passband_mean,passband_median,passband_std,flux_min,flux_max,flux_mean,flux_median,flux_std,flux_skew,...,decl,gal_l,gal_b,ddf,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,in_our_galaxy
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
615,352,2.457386,2.0,1.720797,-1100.440063,660.626343,-123.096998,-89.477524,1.899255,-0.34954,...,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,-4.074542,class_92,1
713,350,2.4,2.0,1.746056,-14.735178,14.770886,-1.423351,-0.873033,1.213788,0.014989,...,-27.784405,223.525509,-54.460748,1,1.6267,0.2552,45.4063,-4.961845,class_88,0
730,330,2.336364,2.0,1.75875,-19.159811,47.310059,2.267434,0.409172,1.293874,3.177854,...,-6.579593,170.455585,-61.548219,1,0.2262,0.0157,40.2561,-3.863233,class_42,0
745,351,2.378917,2.0,1.747328,-15.494463,220.795212,8.909206,1.035895,1.619018,4.979826,...,-45.586655,328.254458,-68.969298,1,0.2813,1.1523,40.7951,-4.961845,class_90,0
1124,352,2.457386,2.0,1.720797,-16.543753,143.600189,7.145702,1.141288,1.553364,4.406298,...,-63.823658,316.922299,-51.059403,1,0.2415,0.0176,40.4166,-3.729701,class_90,0


In [43]:
training_dataset.to_csv('training_dataset_cleaned.csv')

In [44]:
training_dataset = pd.read_csv('training_dataset_cleaned.csv', index_col=0)
training_dataset.head()

Unnamed: 0_level_0,mjd_size,passband_mean,passband_median,passband_std,flux_min,flux_max,flux_mean,flux_median,flux_std,flux_skew,...,decl,gal_l,gal_b,ddf,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,in_our_galaxy
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
615,352,2.457386,2.0,1.720797,-1100.440063,660.626343,-123.096998,-89.477524,1.899255,-0.34954,...,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,-4.074542,class_92,1
713,350,2.4,2.0,1.746056,-14.735178,14.770886,-1.423351,-0.873033,1.213788,0.014989,...,-27.784405,223.525509,-54.460748,1,1.6267,0.2552,45.4063,-4.961845,class_88,0
730,330,2.336364,2.0,1.75875,-19.159811,47.310059,2.267434,0.409172,1.293874,3.177854,...,-6.579593,170.455585,-61.548219,1,0.2262,0.0157,40.2561,-3.863233,class_42,0
745,351,2.378917,2.0,1.747328,-15.494463,220.795212,8.909206,1.035895,1.619018,4.979826,...,-45.586655,328.254458,-68.969298,1,0.2813,1.1523,40.7951,-4.961845,class_90,0
1124,352,2.457386,2.0,1.720797,-16.543753,143.600189,7.145702,1.141288,1.553364,4.406298,...,-63.823658,316.922299,-51.059403,1,0.2415,0.0176,40.4166,-3.729701,class_90,0


In [45]:
Y = training_dataset['target']
Y.head()

object_id
615     class_92
713     class_88
730     class_42
745     class_90
1124    class_90
Name: target, dtype: object

In [46]:
X = training_dataset.drop(['target'], axis=1)
X.head()

(7848, 108)


In [53]:
training_dataset = pd.read_csv('training_dataset_cleaned.csv')
X = training_dataset.drop(['target', 'object_id'], axis=1)
Y = training_dataset['target']
print (X.shape)

(7848, 108)


In [54]:
training_dataset.head()

Unnamed: 0,object_id,mjd_size,passband_mean,passband_median,passband_std,flux_min,flux_max,flux_mean,flux_median,flux_std,...,decl,gal_l,gal_b,ddf,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,in_our_galaxy
0,615,352,2.457386,2.0,1.720797,-1100.440063,660.626343,-123.096998,-89.477524,1.899255,...,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,-4.074542,class_92,1
1,713,350,2.4,2.0,1.746056,-14.735178,14.770886,-1.423351,-0.873033,1.213788,...,-27.784405,223.525509,-54.460748,1,1.6267,0.2552,45.4063,-4.961845,class_88,0
2,730,330,2.336364,2.0,1.75875,-19.159811,47.310059,2.267434,0.409172,1.293874,...,-6.579593,170.455585,-61.548219,1,0.2262,0.0157,40.2561,-3.863233,class_42,0
3,745,351,2.378917,2.0,1.747328,-15.494463,220.795212,8.909206,1.035895,1.619018,...,-45.586655,328.254458,-68.969298,1,0.2813,1.1523,40.7951,-4.961845,class_90,0
4,1124,352,2.457386,2.0,1.720797,-16.543753,143.600189,7.145702,1.141288,1.553364,...,-63.823658,316.922299,-51.059403,1,0.2415,0.0176,40.4166,-3.729701,class_90,0
