# Feature extraction

Load the data from the HDF5 files (it weights much less than the initial files).

In [1]:
import pandas as pd

df = pd.concat(
    (
        pd.read_csv('data/training_set_metadata.csv'),
        pd.read_csv('data/test_set_metadata.csv')
    ),
    ignore_index=True,
    sort=False
)
df['is_train'] = df['target'].notnull()

Polar coordinates.

In [2]:
df['rho'] = df.eval('sqrt(gal_l ** 2 + gal_b ** 2)')
df['phi'] = df.eval('arctan2(gal_b, gal_l)')

Photometric redshift.

In [3]:
df['hostgal_photoz_min'] = df.eval('hostgal_photoz - hostgal_photoz_err')
df['hostgal_photoz_max'] = df.eval('hostgal_photoz + hostgal_photoz_err')
df['hostgal_photoz_over_err'] = df.eval('hostgal_photoz / (hostgal_photoz_err + 1)')

Load light curve features.

In [4]:
import numpy as np

with pd.HDFStore('data/features.h5') as store:
    for key in store:
        df = df.join(store.get(key).astype(np.float32), on='object_id')

Build computed features.

In [5]:
#('u', 'g', 'r', 'i', 'z', 'y')

for i in range(6):
    
    for p1, p2 in [(95, 5), (90, 10), (75, 25), (50, 10), (90, 50)]:
        df[f'flux_p{p1}_minus_p{p2}_{i}'] = df.eval(f'flux_p{p1}_{i} - flux_p{p2}_{i}')
        
    df[f'flux_max_minus_p95_{i}'] = df.eval(f'flux_max_{i} - flux_p95_{i}')
    df[f'flux_p5_minus_min_{i}'] = df.eval(f'flux_p5_{i} - flux_min_{i}')
    
    df[f'flux_count_ratio_{i}'] = df.eval(f'flux_count_above_mean_{i} / flux_count_below_mean_{i}') 
    
    df[f'flux_mean_over_flux_err_mean_{i}'] = df.eval(f'flux_mean_{i} / flux_err_mean_{i}') 
    df[f'flux_mean_over_flux_std_{i}'] = df.eval(f'flux_mean_{i} / flux_std_{i}') 
    
    df[f'flux_fft_amp_0_over_fft_amp_1_{i}'] = df.eval(f'flux_fft_amp_0_{i} / flux_fft_amp_1_{i}') 
    df[f'flux_fft_amp_0_over_fft_amp_2_{i}'] = df.eval(f'flux_fft_amp_0_{i} / flux_fft_amp_2_{i}')

Interactions between passbands.

In [6]:
import itertools

for (i, j) in itertools.combinations(range(6), 2):
    for stat in ('mean', 'min', 'max', 'std', 'skew', 'kurtosis'):
        df[f'flux_{stat}_{i}_over_{j}'] = df.eval(f'flux_{stat}_{i} / (flux_{stat}_{j} + 1)')

See what we got.

In [7]:
df.head()

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,...,flux_max_3_over_5,flux_std_3_over_5,flux_skew_3_over_5,flux_kurtosis_3_over_5,flux_mean_4_over_5,flux_min_4_over_5,flux_max_4_over_5,flux_std_4_over_5,flux_skew_4_over_5,flux_kurtosis_4_over_5
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,...,1.175504,1.134038,0.260496,2.445696,1.204623,1.000876,1.007293,0.986557,0.17782,2.611787
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,...,0.718432,0.79693,-0.137234,-5.631334,1.133582,0.930032,0.623169,0.791566,-0.052647,-5.864635
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,...,0.69493,0.571557,0.993181,1.55476,0.777834,0.321386,0.851996,0.747263,0.946817,1.326837
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,...,1.426188,1.292222,0.89948,1.248039,1.118192,1.675188,1.288533,1.222058,0.833502,1.046656
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,...,1.269258,1.170987,0.773668,0.913272,1.254454,1.677773,1.303589,1.19752,0.838255,1.06107


# Learning

In [8]:
to_drop = ['is_train', 'target', 'hostgal_specz']

train = df[df['is_train']].set_index('object_id')
test = df[~df['is_train']].set_index('object_id')

X_train = train.drop(columns=to_drop)
y_train = train['target'].apply(lambda x: f'class_{int(x)}').astype('category')
X_test = test.drop(columns=to_drop)

submission = pd.DataFrame(0.0, index=test.index, columns=y_train.cat.categories)
submission['class_99'] = 0.0

class_weights = {c: 1 for c in y_train.cat.categories}
class_weights['class_64'] = 2
class_weights['class_15'] = 2

In [9]:
assert len(X_train.columns) == len(X_test.columns)
assert len(X_train) == len(y_train)
assert len(X_test) == 3492890
assert len(submission) == 3492890

In [10]:
def make_metric(class_to_int):

    def metric(y_pred, y_true):
        """
        @author olivier https://www.kaggle.com/ogrellier
        multi logloss for PLAsTiCC challenge
        """
        class_weight = {
            class_to_int[k]: v
            for k, v in class_weights.items()
            if k in class_to_int
        }
        
        y_true = y_true.get_label()    
        y_p = y_pred.reshape(y_true.shape[0], len(class_weight), order='F')
        
        # Trasform y_true in dummies
        y_ohe = pd.get_dummies(y_true)
        # Normalize rows and limit y_preds to 1e-15, 1-1e-15
        y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)
        # Transform to log
        y_p_log = np.log(y_p)
        # Get the log for ones, .values is used to drop the index of DataFrames
        # Exclude class 99 for now, since there is no class99 in the training set
        # we gave a special process for that class
        y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)
        # Get the number of positives for each class
        nb_pos = y_ohe.sum(axis=0).values.astype(float)
        # Weight average and divide by the number of positives
        class_arr = np.array([class_weight[k] for k in sorted(class_weight.keys())])
        y_w = y_log_ones * class_arr / nb_pos

        loss = - np.sum(y_w) / np.sum(class_arr)
        return 'wloss', loss, False
    
    return metric

## Galactic objects

Select the galactic objects.

In [11]:
X_train_gal = X_train[X_train['hostgal_photoz'] == 0]
y_train_gal = y_train[X_train['hostgal_photoz'] == 0]
X_test_gal = X_test[X_test['hostgal_photoz'] == 0]

class_to_int = {c: i for i, c in enumerate(y_train_gal.unique())}
int_to_class = {i: c for c, i in class_to_int.items()}

Train the model.

In [12]:
import lightgbm as lgbm
import numpy as np
from sklearn import model_selection


params = {
    'application': 'multiclass',
    'boosting_type': 'gbdt',
    'num_classes': y_train_gal.nunique(),
    'metric': 'None',
    'num_threads': -1,
    'num_leaves': 7,
    'max_bin': 127,
    'min_data_in_bin': 20,
    'min_data_in_leaf': 340,
    'min_sum_hessian_in_leaf': 5e-2,
    'learning_rate': 0.07,
    'feature_fraction': 0.7,
    'feature_fraction_seed': 42,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'bagging_seed': 42,
    'lambda_l1': 0.6,
    'lambda_l2': 0.1,
    'verbosity': 2,
}
        

cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
feature_importances = pd.DataFrame(index=X_train_gal.columns)
gal_fit_scores = np.zeros(cv.n_splits)
gal_val_scores = np.zeros(cv.n_splits)
submission.loc[X_test_gal.index, y_train_gal.unique()] = 0.0

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train_gal, y_train_gal)):
    
    X_fit = X_train_gal.iloc[fit_idx]
    y_fit = y_train_gal.iloc[fit_idx].map(class_to_int)
    w_fit = y_train_gal.iloc[fit_idx].map(class_weights)
    X_val = X_train_gal.iloc[val_idx]
    y_val = y_train_gal.iloc[val_idx].map(class_to_int)
    w_val = y_train_gal.iloc[val_idx].map(class_weights)
    
    # Train the model
    fit_set = lgbm.Dataset(X_fit, y_fit, weight=w_fit)
    val_set = lgbm.Dataset(X_val, y_val, reference=fit_set, weight=w_val)

    evals_result = {}
    model = lgbm.train(
        params=params,
        train_set=fit_set,
        feval=make_metric(class_to_int),
        num_boost_round=10000,
        valid_sets=(fit_set, val_set),
        valid_names=('fit', 'val'),
        verbose_eval=50,
        early_stopping_rounds=80,
        evals_result=evals_result
    )
    
    # Store the feature importances
    feature_importances[f'gain_{i}'] = model.feature_importance('gain')
    feature_importances[f'split_{i}'] = model.feature_importance('split')
    
    # Store the predictions
    y_pred = pd.DataFrame(model.predict(X_test_gal), index=X_test_gal.index)
    y_pred.columns = y_pred.columns.map(int_to_class)
    submission.loc[y_pred.index, y_pred.columns] += y_pred / cv.n_splits
    
    # Store the scores
    gal_fit_scores[i] = evals_result['fit']['wloss'][-1]
    gal_val_scores[i] = evals_result['val']['wloss'][-1]

print(f'- Train loss: {gal_fit_scores.mean():.3f} (±{gal_fit_scores.std():.3f})')
print(f'- Valid loss: {gal_val_scores.mean():.3f} (±{gal_val_scores.std():.3f})')

Training until validation scores don't improve for 80 rounds.
[50]	fit's wloss: 0.607848	val's wloss: 0.702398
[100]	fit's wloss: 0.203374	val's wloss: 0.338498
[150]	fit's wloss: 0.0829563	val's wloss: 0.235889
[200]	fit's wloss: 0.0435351	val's wloss: 0.209353
[250]	fit's wloss: 0.0283632	val's wloss: 0.199604
[300]	fit's wloss: 0.0216562	val's wloss: 0.195329
[350]	fit's wloss: 0.0177593	val's wloss: 0.19166
[400]	fit's wloss: 0.0156739	val's wloss: 0.192049
[450]	fit's wloss: 0.0148484	val's wloss: 0.193344
Early stopping, best iteration is:
[397]	fit's wloss: 0.0157378	val's wloss: 0.1914
Training until validation scores don't improve for 80 rounds.
[50]	fit's wloss: 0.590795	val's wloss: 0.629526
[100]	fit's wloss: 0.196801	val's wloss: 0.249022
[150]	fit's wloss: 0.0844492	val's wloss: 0.1484
[200]	fit's wloss: 0.0442697	val's wloss: 0.117733
[250]	fit's wloss: 0.0288777	val's wloss: 0.108254
[300]	fit's wloss: 0.0214874	val's wloss: 0.1028
[350]	fit's wloss: 0.0174044	val's wlo

- Train loss: 0.014 (±0.001)
- Valid loss: 0.131 (±0.037)

In [13]:
feature_importances.sort_values('gain_0', ascending=False)

Unnamed: 0,gain_0,split_0,gain_1,split_1,gain_2,split_2,gain_3,split_3,gain_4,split_4
flux_skew,5337.756124,122,5243.309733,161,4595.404502,117,4567.239498,84,5064.072288,126
flux_p50_minus_p10_2,2967.843339,32,2670.645252,8,2551.749651,10,2726.583953,8,2670.414921,10
flux_p50_minus_p10_1,2887.791971,32,2731.102019,40,3112.725825,30,3212.960105,36,2985.769070,31
flux_skew_2,1653.315620,120,1825.814212,127,2486.642952,136,2403.556493,126,1961.579174,81
flux_p75_minus_p25_2,1370.013326,76,1435.155902,74,2097.973082,80,1559.328748,38,1397.300181,45
flux_p90_minus_p10_2,1272.906927,24,1230.608916,19,630.495808,17,920.000295,18,806.843810,9
flux_skew_3,1113.082481,75,1613.125132,76,1219.693988,78,1549.509502,74,1563.612063,67
flux_p75_minus_p25_1,1019.511474,31,1183.154529,15,725.504536,9,1040.269548,27,1433.660596,30
detected_mean,984.986689,82,656.185008,67,807.478966,65,450.185205,76,8.972844,24
flux_count_ratio_2,976.201955,45,765.555220,7,824.088241,9,772.166214,6,905.869490,21


## Extragalactic objects

Select the extragalactic objects.

In [14]:
X_train_ex = X_train[X_train['hostgal_photoz'] > 0]
y_train_ex = y_train[X_train['hostgal_photoz'] > 0]
X_test_ex = X_test[X_test['hostgal_photoz'] > 0]

class_to_int = {c: i for i, c in enumerate(y_train_ex.unique())}
int_to_class = {i: c for c, i in class_to_int.items()}

In [15]:
import lightgbm as lgbm
import numpy as np
from sklearn import model_selection


params = {
    'application': 'multiclass',
    'boosting_type': 'gbdt',
    'num_classes': y_train_ex.nunique(),
    'metric': 'None',
    'num_threads': 8,
    'num_leaves': 8,
    'max_bin': 255,
    'min_data_in_bin': 40,
    'min_data_in_leaf': 420,
    'min_sum_hessian_in_leaf': 2e-4,
    'learning_rate': 0.07,
    'feature_fraction': 0.7,
    'feature_fraction_seed': 42,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'bagging_seed': 42,
    'lambda_l1': 1.5,
    'lambda_l2': 0.2,
    'verbosity': 2,
}
        

cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
feature_importances = pd.DataFrame(index=X_train_ex.columns)
ex_fit_scores = np.zeros(cv.n_splits)
ex_val_scores = np.zeros(cv.n_splits)
submission.loc[X_test_ex.index, y_train_ex.unique()] = 0.0

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train_ex, y_train_ex)):
    
    X_fit = X_train_ex.iloc[fit_idx]
    y_fit = y_train_ex.iloc[fit_idx].map(class_to_int)
    w_fit = y_train_ex.iloc[fit_idx].map(class_weights)
    X_val = X_train_ex.iloc[val_idx]
    y_val = y_train_ex.iloc[val_idx].map(class_to_int)
    w_val = y_train_ex.iloc[val_idx].map(class_weights)
    
    # Train the model
    fit_set = lgbm.Dataset(X_fit.values.astype(np.float32), y_fit, weight=w_fit)
    val_set = lgbm.Dataset(X_val.values.astype(np.float32), y_val, reference=fit_set, weight=w_val)
    
    evals_result = {}
    model = lgbm.train(
        params=params,
        train_set=fit_set,
        feval=make_metric(class_to_int),
        num_boost_round=10000,
        valid_sets=(fit_set, val_set),
        valid_names=('fit', 'val'),
        verbose_eval=50,
        early_stopping_rounds=80,
        evals_result=evals_result
    )
    
    # Store the feature importances
    feature_importances[f'gain_{i}'] = model.feature_importance('gain')
    feature_importances[f'split_{i}'] = model.feature_importance('split')
    
    # Store the predictions
    y_pred = pd.DataFrame(model.predict(X_test_ex.values), index=X_test_ex.index)
    y_pred.columns = y_pred.columns.map(int_to_class)
    submission.loc[y_pred.index, y_pred.columns] += y_pred / cv.n_splits
    
    # Store the scores
    ex_fit_scores[i] = evals_result['fit']['wloss'][-1]
    ex_val_scores[i] = evals_result['val']['wloss'][-1]

print(f'- Train loss: {ex_fit_scores.mean():.3f} (±{ex_fit_scores.std():.3f})')
print(f'- Valid loss: {ex_val_scores.mean():.3f} (±{ex_val_scores.std():.3f})')

Training until validation scores don't improve for 80 rounds.
[50]	fit's wloss: 1.24248	val's wloss: 1.39995
[100]	fit's wloss: 0.869457	val's wloss: 1.15991
[150]	fit's wloss: 0.645273	val's wloss: 1.08193
[200]	fit's wloss: 0.499194	val's wloss: 1.05429
[250]	fit's wloss: 0.397396	val's wloss: 1.04881
[300]	fit's wloss: 0.323924	val's wloss: 1.05547
Early stopping, best iteration is:
[231]	fit's wloss: 0.431817	val's wloss: 1.04769
Training until validation scores don't improve for 80 rounds.
[50]	fit's wloss: 1.22598	val's wloss: 1.36647
[100]	fit's wloss: 0.867542	val's wloss: 1.15939
[150]	fit's wloss: 0.645449	val's wloss: 1.07038
[200]	fit's wloss: 0.50244	val's wloss: 1.03164
[250]	fit's wloss: 0.401582	val's wloss: 1.01316
[300]	fit's wloss: 0.328943	val's wloss: 1.00895
[350]	fit's wloss: 0.274781	val's wloss: 1.01144
Early stopping, best iteration is:
[274]	fit's wloss: 0.364134	val's wloss: 1.00657
Training until validation scores don't improve for 80 rounds.
[50]	fit's wlo

- Train loss: 0.329 (±0.036)
- Valid loss: 1.072 (±0.040)

In [16]:
feature_importances.sort_values('gain_0', ascending=False)

Unnamed: 0,gain_0,split_0,gain_1,split_1,gain_2,split_2,gain_3,split_3,gain_4,split_4
mjd_ptp_1,10906.069974,351,11527.523372,405,11283.694047,364,11033.527175,267,10942.484801,321
distmod,4659.675314,212,3999.719491,233,4481.286495,195,4909.251560,207,4150.148583,202
flux_std_0_over_4,3582.236764,37,1163.795200,56,3410.899868,15,2146.156751,24,3050.102678,46
hostgal_photoz,2518.901788,170,3313.502169,183,2411.539048,129,2199.965317,133,3515.064532,194
flux_kurtosis,2114.570258,190,1532.873134,181,1924.711674,182,1582.037115,131,2005.482486,184
flux_mean_0_over_3,1930.621483,47,2140.398599,50,1932.042197,42,1617.373653,41,1761.165118,28
flux_skew,1854.555177,68,2021.340467,87,1877.928994,83,1795.231695,65,1958.088205,87
flux_p50_minus_p10_1,1788.741887,19,1707.466975,29,1535.179367,22,2033.096161,20,1957.423987,19
flux_max_2_over_5,1743.826954,92,1547.583392,106,1824.644907,75,1847.315703,114,1675.847648,99
flux_mean_0_over_4,1599.627129,21,1930.138823,45,1241.789357,24,1427.245720,9,1024.558751,16


## Class 99

In [17]:
submission['class_99'] = (1 - submission[submission.columns.drop('class_99')].max(axis='columns')) / 2.3

In [22]:
preds_99 = np.ones(submission.shape[0])
for col in submission[submission.columns.drop('class_99')].columns:
    preds_99 *= (1 - submission[col])
submission['class_99'] = 0.14 * preds_99 / np.mean(preds_99) 

class_15
class_16
class_42
class_52
class_53
class_6
class_62
class_64
class_65
class_67
class_88
class_90
class_92
class_95


## Saving the submission

In [23]:
submission.head()

Unnamed: 0_level_0,class_15,class_16,class_42,class_52,class_53,class_6,class_62,class_64,class_65,class_67,class_88,class_90,class_92,class_95,class_99
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
13,0.001491,0.0,0.482992,0.097757,0.0,0.0,0.021793,0.000234,0.0,0.00287,0.000443,0.389068,0.0,0.00335,0.19437
14,0.008019,0.0,0.171066,0.016562,0.0,0.0,0.018951,0.000544,0.0,0.003063,0.006354,0.768539,0.0,0.006901,0.126948
17,0.003069,0.0,0.096649,0.011633,0.0,0.0,0.018848,0.009686,0.0,0.007451,0.003548,0.819525,0.0,0.029592,0.105332
23,0.001128,0.0,0.06938,0.012166,0.0,0.0,0.031437,0.005518,0.0,0.063092,0.003907,0.806339,0.0,0.007033,0.111608
34,0.000593,0.0,0.042397,0.038988,0.0,0.0,0.01071,6.7e-05,0.0,0.0052,0.000209,0.901524,0.0,0.000314,0.062636


In [18]:
submission.head()

Unnamed: 0_level_0,class_15,class_16,class_42,class_52,class_53,class_6,class_62,class_64,class_65,class_67,class_88,class_90,class_92,class_95,class_99
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
13,0.001491,0.0,0.482992,0.097757,0.0,0.0,0.021793,0.000234,0.0,0.00287,0.000443,0.389068,0.0,0.00335,0.224786
14,0.008019,0.0,0.171066,0.016562,0.0,0.0,0.018951,0.000544,0.0,0.003063,0.006354,0.768539,0.0,0.006901,0.100635
17,0.003069,0.0,0.096649,0.011633,0.0,0.0,0.018848,0.009686,0.0,0.007451,0.003548,0.819525,0.0,0.029592,0.078467
23,0.001128,0.0,0.06938,0.012166,0.0,0.0,0.031437,0.005518,0.0,0.063092,0.003907,0.806339,0.0,0.007033,0.0842
34,0.000593,0.0,0.042397,0.038988,0.0,0.0,0.01071,6.7e-05,0.0,0.0052,0.000209,0.901524,0.0,0.000314,0.042816


Sanity checks.

In [24]:
assert submission[X_test['hostgal_photoz'] == 0][y_train_ex.unique().categories].sum().sum() == 0
assert submission[X_test['hostgal_photoz'] > 0][y_train_gal.unique().categories].sum().sum() == 0

Save the submission. We align with the sample submission just to make sure.

In [25]:
name = f'{gal_val_scores.mean():.3f}_{gal_val_scores.std():.3f}_{ex_val_scores.mean():.3f}_{ex_val_scores.std():.3f}'

sample_sub = pd.read_csv('data/sample_submission.csv').set_index('object_id')

submission.loc[sample_sub.index, sample_sub.columns].to_csv(f'submissions/lol_{name}.csv.gz', compression='gzip')