# Feature extraction

In [1]:
import pandas as pd

df = pd.concat(
    (
        pd.read_csv('data/training_set_metadata.csv'),
        pd.read_csv('data/test_set_metadata.csv')
    ),
    ignore_index=True,
    sort=False
)
df['is_train'] = df['target'].notnull()
df['is_galactic'] = df['hostgal_photoz'] == 0

Polar coordinates.

In [2]:
df['rho'] = df.eval('sqrt(gal_l ** 2 + gal_b ** 2)')
df['phi'] = df.eval('arctan2(gal_b, gal_l)')

Photometric redshift.

In [3]:
df['hostgal_photoz_min'] = df.eval('hostgal_photoz - hostgal_photoz_err')
df['hostgal_photoz_max'] = df.eval('hostgal_photoz + hostgal_photoz_err')
df['hostgal_photoz_over_err'] = df.eval('hostgal_photoz / (hostgal_photoz_err + 1)')

Load light curve features.

In [5]:
import numpy as np

with pd.HDFStore('data/features.h5') as store:
    for key in store:
        df = df.join(store.get(key).astype(np.float32), on='object_id')

Build computed features.

In [6]:
#('u', 'g', 'r', 'i', 'z', 'y')

for i in range(6):
    
    df[f'flux_count_ratio_passband_{i}'] = df.eval(f'flux_count_above_mean_passband_{i} / flux_count_below_mean_passband_{i}')
    df[f'flux_mean_over_std_passband_{i}'] = df.eval(f'flux_mean_passband_{i} / flux_std_passband_{i}')

Interactions between passbands.

In [7]:
import itertools

for (i, j) in itertools.combinations(range(6), 2):
    for stat in ('mean', 'min', 'max', 'std', 'skew', 'kurtosis'):
        df[f'flux_{stat}_passband_{i}_over_{j}'] = df.eval(f'flux_{stat}_passband_{i} / (flux_{stat}_passband_{j} + 1)')

See what we got.

In [8]:
df.head()

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,...,flux_max_passband_3_over_5,flux_std_passband_3_over_5,flux_skew_passband_3_over_5,flux_kurtosis_passband_3_over_5,flux_mean_passband_4_over_5,flux_min_passband_4_over_5,flux_max_passband_4_over_5,flux_std_passband_4_over_5,flux_skew_passband_4_over_5,flux_kurtosis_passband_4_over_5
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,...,1.175504,1.134037,0.260496,2.445696,1.204624,1.000876,1.007293,0.986557,0.17782,2.611787
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,...,0.718432,0.79693,-0.137234,-5.631339,1.133581,0.930032,0.623169,0.791566,-0.052647,-5.864639
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,...,0.69493,0.571557,0.993181,1.55476,0.777834,0.321386,0.851996,0.747263,0.946817,1.326837
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,...,1.426188,1.292222,0.89948,1.248039,1.118192,1.675188,1.288533,1.222058,0.833502,1.046656
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,...,1.269258,1.170987,0.773668,0.913272,1.254454,1.677773,1.303589,1.19752,0.838255,1.06107


# Learning

In [9]:
to_drop = ['is_train',  'is_galactic', 'target', 'hostgal_specz']

train = df[df['is_train']].set_index('object_id')
test = df[~df['is_train']].set_index('object_id')

X_train = train.drop(columns=to_drop).astype(np.float32)
y_train = train['target'].apply(lambda x: f'class_{int(x)}').astype('category')
X_test = test.drop(columns=to_drop).astype(np.float32)

submission = pd.DataFrame(0.0, index=test.index, columns=y_train.cat.categories)
submission['class_99'] = 0.0

class_weights = {c: 1 for c in y_train.cat.categories}
class_weights['class_64'] = 2
class_weights['class_15'] = 2

In [10]:
assert len(X_train.columns) == len(X_test.columns)
assert len(X_train) == len(y_train)
assert len(X_test) == 3492890
assert len(submission) == 3492890

In [11]:
def make_metric(class_to_int):

    def metric(y_pred, y_true):
        """
        @author olivier https://www.kaggle.com/ogrellier
        multi logloss for PLAsTiCC challenge
        """
        class_weight = {
            class_to_int[k]: v
            for k, v in class_weights.items()
            if k in class_to_int
        }
        
        y_true = y_true.get_label()    
        y_p = y_pred.reshape(y_true.shape[0], len(class_weight), order='F')
        
        # Trasform y_true in dummies
        y_ohe = pd.get_dummies(y_true)
        # Normalize rows and limit y_preds to 1e-15, 1-1e-15
        y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)
        # Transform to log
        y_p_log = np.log(y_p)
        # Get the log for ones, .values is used to drop the index of DataFrames
        # Exclude class 99 for now, since there is no class99 in the training set
        # we gave a special process for that class
        y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)
        # Get the number of positives for each class
        nb_pos = y_ohe.sum(axis=0).values.astype(float)
        # Weight average and divide by the number of positives
        class_arr = np.array([class_weight[k] for k in sorted(class_weight.keys())])
        y_w = y_log_ones * class_arr / nb_pos

        loss = - np.sum(y_w) / np.sum(class_arr)
        return 'wloss', loss, False
    
    return metric

## Galactic objects

Select the galactic objects.

In [12]:
X_train_gal = X_train[train['is_galactic']]
y_train_gal = y_train[train['is_galactic']]
X_test_gal = X_test[test['is_galactic']]

class_to_int = {c: i for i, c in enumerate(y_train_gal.unique())}
int_to_class = {i: c for c, i in class_to_int.items()}

Train the model.

In [13]:
import lightgbm as lgbm
import numpy as np
from sklearn import model_selection


params = {
    'application': 'multiclass',
    'boosting_type': 'gbdt',
    'num_classes': y_train_gal.nunique(),
    'metric': 'None',
    'num_threads': -1,
    'num_leaves': 10,
    'max_bin': 127,
    'min_data_in_bin': 20,
    'min_data_in_leaf': 260,
    'min_sum_hessian_in_leaf': 5e-3,
    'learning_rate': 0.04,
    'feature_fraction': 0.85,
    'feature_fraction_seed': 42,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'bagging_seed': 42,
    'lambda_l1': 0,
    'lambda_l2': 0,
    'verbosity': 2,
}
        

cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
feature_importances = pd.DataFrame(index=X_train_gal.columns)
gal_fit_scores = np.zeros(cv.n_splits)
gal_val_scores = np.zeros(cv.n_splits)
submission.loc[X_test_gal.index, y_train_gal.unique()] = 0.0

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train_gal, y_train_gal)):
    
    X_fit = X_train_gal.iloc[fit_idx]
    y_fit = y_train_gal.iloc[fit_idx].map(class_to_int)
    w_fit = y_train_gal.iloc[fit_idx].map(class_weights)
    X_val = X_train_gal.iloc[val_idx]
    y_val = y_train_gal.iloc[val_idx].map(class_to_int)
    w_val = y_train_gal.iloc[val_idx].map(class_weights)
    
    # Train the model
    fit_set = lgbm.Dataset(X_fit, y_fit, weight=w_fit)
    val_set = lgbm.Dataset(X_val, y_val, reference=fit_set, weight=w_val)

    evals_result = {}
    model = lgbm.train(
        params=params,
        train_set=fit_set,
        feval=make_metric(class_to_int),
        num_boost_round=10000,
        valid_sets=(fit_set, val_set),
        valid_names=('fit', 'val'),
        verbose_eval=50,
        early_stopping_rounds=80,
        evals_result=evals_result
    )
    
    # Store the feature importances
    feature_importances[f'gain_{i}'] = model.feature_importance('gain')
    feature_importances[f'split_{i}'] = model.feature_importance('split')
    
    # Store the predictions
    y_pred = pd.DataFrame(model.predict(X_test_gal), index=X_test_gal.index)
    y_pred.columns = y_pred.columns.map(int_to_class)
    submission.loc[y_pred.index, y_pred.columns] += y_pred / cv.n_splits
    
    # Store the scores
    gal_fit_scores[i] = evals_result['fit']['wloss'][-1]
    gal_val_scores[i] = evals_result['val']['wloss'][-1]

print(f'- Train loss: {gal_fit_scores.mean():.3f} (±{gal_fit_scores.std():.3f})')
print(f'- Valid loss: {gal_val_scores.mean():.3f} (±{gal_val_scores.std():.3f})')

Training until validation scores don't improve for 80 rounds.
[50]	fit's wloss: 0.824854	val's wloss: 0.868656
[100]	fit's wloss: 0.403371	val's wloss: 0.471645
[150]	fit's wloss: 0.170337	val's wloss: 0.25727
[200]	fit's wloss: 0.0710578	val's wloss: 0.16936
[250]	fit's wloss: 0.0325495	val's wloss: 0.132927
[300]	fit's wloss: 0.0158261	val's wloss: 0.116374
[350]	fit's wloss: 0.00796941	val's wloss: 0.106815
[400]	fit's wloss: 0.00417002	val's wloss: 0.101456
[450]	fit's wloss: 0.00225003	val's wloss: 0.0983963
[500]	fit's wloss: 0.00120662	val's wloss: 0.0966447
[550]	fit's wloss: 0.000645259	val's wloss: 0.0970631
[600]	fit's wloss: 0.000351055	val's wloss: 0.0983851
Early stopping, best iteration is:
[536]	fit's wloss: 0.000772022	val's wloss: 0.096225
Training until validation scores don't improve for 80 rounds.
[50]	fit's wloss: 0.793672	val's wloss: 0.826629
[100]	fit's wloss: 0.373884	val's wloss: 0.419753
[150]	fit's wloss: 0.16079	val's wloss: 0.21174
[200]	fit's wloss: 0.07

- Train loss: 0.001 (±0.001)
- Valid loss: 0.072 (±0.052)

In [14]:
feature_importances.sort_values('gain_0', ascending=False).head(10)

Unnamed: 0,gain_0,split_0,gain_1,split_1,gain_2,split_2,gain_3,split_3,gain_4,split_4
flux_skew,10205.492773,311,10149.096927,286,8513.401388,380,8682.475305,187,9563.748186,336
flux_min_detected_1,9146.135095,327,8422.813827,268,8181.59577,235,8890.173519,157,9107.290286,293
flux_skew_passband_2,3982.494776,250,4270.187508,292,5320.523335,398,5266.573731,241,4587.800732,352
flux_min_passband_0_over_2,2834.990274,107,3447.124215,93,4297.143382,180,3590.38031,97,3408.261813,137
flux_err_min,2632.004965,483,2123.980595,482,2716.727026,468,2082.917799,265,2198.62296,627
mjd_std_detected_1,2262.214768,248,2704.072266,258,1921.866449,319,2428.730858,187,2292.656471,238
mjd_log_sum_detected_0,2057.940808,17,1848.916353,35,2169.629236,26,1405.156013,27,1542.423048,32
flux_max_passband_1,1718.892053,33,1306.910803,35,1333.320803,47,1674.271403,30,1367.01833,57
flux_count_below_0_detected_1,1310.818477,194,1182.586931,170,1294.995076,258,376.224001,55,1283.301037,223
flux_skew_passband_1,1289.606077,153,883.03173,121,997.062292,194,984.490647,57,1024.829394,228


## Extragalactic objects

Select the extragalactic objects.

In [15]:
X_train_ex = X_train[~train['is_galactic']]
y_train_ex = y_train[~train['is_galactic']]
X_test_ex = X_test[~test['is_galactic']]

class_to_int = {c: i for i, c in enumerate(y_train_ex.unique())}
int_to_class = {i: c for c, i in class_to_int.items()}

In [16]:
import gc
gc.collect()

508

In [30]:
import lightgbm as lgbm
import numpy as np
from sklearn import model_selection


params = {
    'application': 'multiclass',
    'boosting_type': 'gbdt',
    'num_classes': y_train_ex.nunique(),
    'metric': 'None',
    'num_threads': 8,
    'num_leaves': 8,
    'max_bin': 127,
    'min_data_in_bin': 40,
    'min_data_in_leaf': 200,
    'min_sum_hessian_in_leaf': 5e-3,
    'learning_rate': 0.009,
    'feature_fraction': 0.5,
    'feature_fraction_seed': 42,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'bagging_seed': 42,
    'lambda_l1': 0.9,
    'lambda_l2': 0.3,
    'verbosity': 2,
}
        

cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
feature_importances = pd.DataFrame(index=X_train_ex.columns)
ex_fit_scores = np.zeros(cv.n_splits)
ex_val_scores = np.zeros(cv.n_splits)
submission.loc[X_test_ex.index, y_train_ex.unique()] = 0.0

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train_ex, y_train_ex)):
    
    X_fit = X_train_ex.iloc[fit_idx]
    y_fit = y_train_ex.iloc[fit_idx].map(class_to_int)
    w_fit = y_train_ex.iloc[fit_idx].map(class_weights)
    X_val = X_train_ex.iloc[val_idx]
    y_val = y_train_ex.iloc[val_idx].map(class_to_int)
    w_val = y_train_ex.iloc[val_idx].map(class_weights)
    
    # Train the model
    fit_set = lgbm.Dataset(X_fit.values.astype(np.float32), y_fit, weight=w_fit)
    val_set = lgbm.Dataset(X_val.values.astype(np.float32), y_val, reference=fit_set, weight=w_val)
    
    evals_result = {}
    model = lgbm.train(
        params=params,
        train_set=fit_set,
        feval=make_metric(class_to_int),
        num_boost_round=10000,
        valid_sets=(fit_set, val_set),
        valid_names=('fit', 'val'),
        verbose_eval=50,
        early_stopping_rounds=80,
        evals_result=evals_result
    )
    
    # Store the feature importances
    feature_importances[f'gain_{i}'] = model.feature_importance('gain')
    feature_importances[f'split_{i}'] = model.feature_importance('split')
    
    # Store the predictions
    y_pred = pd.DataFrame(model.predict(X_test_ex.values), index=X_test_ex.index)
    y_pred.columns = y_pred.columns.map(int_to_class)
    submission.loc[y_pred.index, y_pred.columns] += y_pred / cv.n_splits
    
    # Store the scores
    ex_fit_scores[i] = evals_result['fit']['wloss'][-1]
    ex_val_scores[i] = evals_result['val']['wloss'][-1]

print(f'- Train loss: {ex_fit_scores.mean():.3f} (±{ex_fit_scores.std():.3f})')
print(f'- Valid loss: {ex_val_scores.mean():.3f} (±{ex_val_scores.std():.3f})')

Training until validation scores don't improve for 80 rounds.
[50]	fit's wloss: 1.80873	val's wloss: 1.83889
[100]	fit's wloss: 1.57926	val's wloss: 1.63718
[150]	fit's wloss: 1.42249	val's wloss: 1.50661
[200]	fit's wloss: 1.30185	val's wloss: 1.41208
[250]	fit's wloss: 1.20573	val's wloss: 1.33951
[300]	fit's wloss: 1.12816	val's wloss: 1.28422
[350]	fit's wloss: 1.06237	val's wloss: 1.24069
[400]	fit's wloss: 1.00418	val's wloss: 1.2039
[450]	fit's wloss: 0.951977	val's wloss: 1.17116
[500]	fit's wloss: 0.904057	val's wloss: 1.14392
[550]	fit's wloss: 0.860995	val's wloss: 1.1217
[600]	fit's wloss: 0.820714	val's wloss: 1.10296
[650]	fit's wloss: 0.783221	val's wloss: 1.08676
[700]	fit's wloss: 0.748002	val's wloss: 1.0736
[750]	fit's wloss: 0.714665	val's wloss: 1.06085
[800]	fit's wloss: 0.684151	val's wloss: 1.05078
[850]	fit's wloss: 0.65532	val's wloss: 1.04177
[900]	fit's wloss: 0.628315	val's wloss: 1.03309
[950]	fit's wloss: 0.60257	val's wloss: 1.02675
[1000]	fit's wloss: 0

- Train loss: 0.370 (±0.050)
- Valid loss: 1.009 (±0.030)

In [32]:
feature_importances.sort_values('gain_0', ascending=False).head(10)

Unnamed: 0,gain_0,split_0,gain_1,split_1,gain_2,split_2,gain_3,split_3,gain_4,split_4
mjd_std_detected_1,68929.928689,2957,70180.651254,3198,65198.82548,2647,72502.307879,2313,68567.893714,2282
mjd_ptp_detected_1,45836.932639,1560,53035.260549,2277,54984.337504,1802,48864.111935,1363,46688.045109,1677
distmod,28237.066009,1949,26550.965956,2300,29086.714767,1727,29031.92096,1705,26861.621283,1855
flux_std_passband_0_over_4,26444.65667,365,15089.335243,451,27698.749547,238,20989.777837,289,26255.323286,496
hostgal_photoz,22620.694042,2103,24771.698331,2229,20675.136265,1464,22068.279785,1585,26007.411474,1700
flux_mean_passband_0_over_3,16304.811904,539,13491.408776,425,13169.102713,440,8792.992204,357,13216.973542,354
flux_max_passband_2_over_5,14251.786914,1199,12168.499917,982,14545.011847,804,15360.334924,936,14675.149568,936
hostgal_photoz_over_err,14106.226134,1374,13075.737177,1326,13237.352333,1018,13953.648799,1160,14373.640184,1113
flux_max_passband_0_over_5,12435.119509,600,15062.336098,970,11845.87965,702,7733.862014,476,10119.325628,551
flux_max_passband_0_over_3,12421.782012,451,11791.682442,657,25517.9152,422,31280.769616,620,17661.723337,417


## Class 99

Olivier's way.

In [33]:
submission['class_99'] = 1

for col in submission.drop(columns='class_99'):
    submission['class_99'] *= (1 - submission[col])

submission['class_99'] = 0.14 * submission['class_99'] / submission['class_99'].mean()

## Saving the submission

In [34]:
submission.head()

Unnamed: 0_level_0,class_15,class_16,class_42,class_52,class_53,class_6,class_62,class_64,class_65,class_67,class_88,class_90,class_92,class_95,class_99
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
13,0.001607,0.0,0.530873,0.071716,0.0,0.0,0.038014,0.000274,0.0,0.002476,0.000375,0.352534,0.0,0.00213,0.194726
14,0.013442,0.0,0.132546,0.026439,0.0,0.0,0.017516,0.000506,0.0,0.004486,0.005049,0.795895,0.0,0.004123,0.119069
17,0.002984,0.0,0.054874,0.011877,0.0,0.0,0.019501,0.006394,0.0,0.008041,0.002934,0.88797,0.0,0.005424,0.072261
23,0.001694,0.0,0.10049,0.010732,0.0,0.0,0.055102,0.004483,0.0,0.07594,0.001061,0.746153,0.0,0.004347,0.140925
34,0.002052,0.0,0.039355,0.034385,0.0,0.0,0.009705,8.6e-05,0.0,0.004054,0.000148,0.909925,0.0,0.00029,0.059416


Sanity checks.

In [35]:
assert submission[X_test['hostgal_photoz'] == 0][y_train_ex.unique().categories].sum().sum() == 0
assert submission[X_test['hostgal_photoz'] > 0][y_train_gal.unique().categories].sum().sum() == 0

Save the submission. We align with the sample submission just to make sure.

In [36]:
name = f'{gal_val_scores.mean():.3f}_{gal_val_scores.std():.3f}_{ex_val_scores.mean():.3f}_{ex_val_scores.std():.3f}'

sample_sub = pd.read_csv('data/sample_submission.csv').set_index('object_id')

submission.loc[sample_sub.index, sample_sub.columns].to_csv(f'submissions/{name}.csv.gz', compression='gzip')