# Feature extraction

**You only have to run the following cell once**.

In [1]:
import numpy as np
import pandas as pd

dtypes = {
    'object_id': np.uint32,
    'mjd': np.float32,
    'passband': np.uint8,
    'flux': np.float32,
    'flux_err': np.float32,
    'detected': bool
}
lcs = pd.concat(
    (
        pd.read_csv('data/training_set.csv', dtype=dtypes),
        pd.read_csv('data/test_set.csv', dtype=dtypes)
    ),
    sort=False,
    ignore_index=True
)
lcs.to_hdf('data/data.h5', 'light_curves')

dtypes = {
    'object_id': np.uint32
}
df = pd.concat(
    (
        pd.read_csv('data/training_set_metadata.csv', dtype=dtypes),
        pd.read_csv('data/test_set_metadata.csv', dtype=dtypes)
    ),
    sort=False,
    ignore_index=True
)
df['is_train'] = df['target'].notnull()
df.to_hdf('data/data.h5', 'meta')

KeyboardInterrupt: 

Load the data from the HDF5 files (it weights much less than the initial files).

In [1]:
import pandas as pd
import numpy as np

lcs = pd.read_hdf('data/data.h5', 'light_curves')
df = pd.read_hdf('data/data.h5', 'meta')

Parse the time.

In [2]:
def mjd_to_unix(mjd):
    return (mjd - 40587) * 86400

Object/passband features.

In [2]:
import numpy as np

stats = pd.read_csv('data/features/flux_stats.csv')\
          .pivot(index='object_id', columns='passband')\
          .astype(np.float32)

# Collapse the column names
names = stats.columns.get_level_values(0)
passbands = stats.columns.get_level_values(1).astype(str)
stats.columns = ['_'.join(pair) for pair in zip(names, passbands)]

stats['flux_diff_min_0'].replace(np.inf, stats['flux_diff_min_0'][stats['flux_diff_min_0'] != np.inf].max(), inplace=True)

df = df.join(stats, on='object_id')

Object features.

In [3]:
stats2 = pd.read_csv('data/features/flux_stats2.csv').set_index('object_id')
df = df.join(stats2, on='object_id')

Compute ratios because why not.

In [4]:
import itertools

for (a, b) in itertools.combinations(range(6), 2):
    for stat in ('bfr', 'mean', 'min', 'max', 'ptp', 'skew', 'kurtosis'):
        df[f'flux_{stat}_{a}_{b}'] = df[f'flux_{stat}_{a}'] / (df[f'flux_{stat}_{b}'] + 1)

See what we got.

In [5]:
df.head()

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,...,flux_ptp_3_5,flux_skew_3_5,flux_kurtosis_3_5,flux_bfr_4_5,flux_mean_4_5,flux_min_4_5,flux_max_4_5,flux_ptp_4_5,flux_skew_4_5,flux_kurtosis_4_5
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,...,1.217429,0.254461,2.553047,0.940293,1.204624,1.000876,1.007293,1.002662,0.173701,2.714094
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,...,0.791304,-0.134179,-6.874031,0.031504,1.133581,0.930032,0.623169,0.737599,-0.051475,-7.132717
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,...,0.578153,0.982566,1.537941,2.819986,0.777834,0.321386,0.851996,0.696552,0.936143,1.305303
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,...,1.362864,0.893881,1.243282,0.960026,1.118192,1.675188,1.288533,1.303509,0.828314,1.041245
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,...,1.17849,0.769534,0.908654,8.674057,1.254454,1.677773,1.303589,1.323305,0.833776,1.056873


# Learning

In [6]:
to_drop = ['is_train', 'target', 'hostgal_specz']

train = df[df['is_train']].set_index('object_id')
test = df[~df['is_train']].set_index('object_id')

X_train = train.drop(columns=to_drop)
y_train = train['target'].apply(lambda x: f'class_{int(x)}').astype('category')
X_test = test.drop(columns=to_drop)
submission = pd.DataFrame(0.0, index=test.index, columns=y_train.cat.categories)
submission['class_99'] = 0.0

class_weights = {c: 1 for c in y_train.cat.categories}
class_weights['class_64'] = 2
class_weights['class_15'] = 2

In [7]:
assert len(X_train.columns) == len(X_test.columns)
assert len(X_train) == len(y_train)
assert len(X_test) == 3492890
assert len(submission) == 3492890

## Galactic objects

Select the galactic objects.

In [8]:
X_train_gal = X_train[X_train['hostgal_photoz'] == 0]
y_train_gal = y_train[X_train['hostgal_photoz'] == 0]
X_test_gal = X_test[X_test['hostgal_photoz'] == 0]

class_to_int = {c: i for i, c in enumerate(y_train_gal.unique())}
int_to_class = {i: c for c, i in class_to_int.items()}

Train the model.

In [9]:
import lightgbm as lgbm
import numpy as np
from sklearn import model_selection


params = {
    'application': 'multiclass',
    'boosting_type': 'gbdt',
    'num_classes': y_train_gal.nunique(),
    'metric': 'multi_logloss',
    'num_threads': 8,
    'num_leaves': 2 ** 3,
    'min_data_per_group': 300,
    'max_cat_threshold': 32,
    'max_cat_to_onehot': 6,
    'cat_smooth': 30,
    'cat_l2': 10,
    'max_bin': 255,
    'min_data_in_bin': 20,
    'min_data_in_leaf': 100,
    'learning_rate': 0.08,
    'feature_fraction': 0.7,
    'feature_fraction_seed': 42,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'bagging_seed': 42,
    'lambda_l1': 0,
    'lambda_l2': 0,
    'verbosity': 2,
}
        

cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
feature_importances = pd.DataFrame(index=X_train_gal.columns)
gal_fit_scores = np.zeros(cv.n_splits)
gal_val_scores = np.zeros(cv.n_splits)
submission.loc[X_test_gal.index, y_train_gal.unique()] = 0.0

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train_gal, y_train_gal)):
    
    X_fit = X_train_gal.iloc[fit_idx]
    y_fit = y_train_gal.iloc[fit_idx].map(class_to_int)
    w_fit = y_train_gal.iloc[fit_idx].map(class_weights)
    X_val = X_train_gal.iloc[val_idx]
    y_val = y_train_gal.iloc[val_idx].map(class_to_int)
    w_val = y_train_gal.iloc[val_idx].map(class_weights)
    
    # Train the model
    fit_set = lgbm.Dataset(X_fit, y_fit, weight=w_fit)
    val_set = lgbm.Dataset(X_val, y_val, reference=fit_set, weight=w_val)

    evals_result = {}
    model = lgbm.train(
        params=params,
        train_set=fit_set,
        num_boost_round=10000,
        valid_sets=(fit_set, val_set),
        valid_names=('fit', 'val'),
        verbose_eval=50,
        early_stopping_rounds=50,
        evals_result=evals_result
    )
    
    # Store the feature importances
    feature_importances[f'gain_{i}'] = model.feature_importance('gain')
    feature_importances[f'split_{i}'] = model.feature_importance('split')
    
    # Store the predictions
    y_pred = pd.DataFrame(model.predict(X_test_gal), index=X_test_gal.index)
    y_pred.columns = y_pred.columns.map(int_to_class)
    submission.loc[y_pred.index, y_pred.columns] += y_pred / cv.n_splits
    
    # Store the scores
    gal_fit_scores[i] = evals_result['fit']['multi_logloss'][-1]
    gal_val_scores[i] = evals_result['val']['multi_logloss'][-1]

print(f'- Train logloss: {gal_fit_scores.mean():.3f} (±{gal_fit_scores.std():.3f})')
print(f'- Valid logloss: {gal_val_scores.mean():.3f} (±{gal_val_scores.std():.3f})')

Training until validation scores don't improve for 50 rounds.
[50]	fit's multi_logloss: 0.128621	val's multi_logloss: 0.200793
[100]	fit's multi_logloss: 0.0209878	val's multi_logloss: 0.0940356
[150]	fit's multi_logloss: 0.00421758	val's multi_logloss: 0.0783682
[200]	fit's multi_logloss: 0.000938744	val's multi_logloss: 0.0772804
Early stopping, best iteration is:
[198]	fit's multi_logloss: 0.000998732	val's multi_logloss: 0.0772504
Training until validation scores don't improve for 50 rounds.
[50]	fit's multi_logloss: 0.13312	val's multi_logloss: 0.135975
[100]	fit's multi_logloss: 0.0232558	val's multi_logloss: 0.0486849
[150]	fit's multi_logloss: 0.00516147	val's multi_logloss: 0.0361724
[200]	fit's multi_logloss: 0.00119667	val's multi_logloss: 0.0326759
[250]	fit's multi_logloss: 0.000292402	val's multi_logloss: 0.0281451
[300]	fit's multi_logloss: 7.41583e-05	val's multi_logloss: 0.028934
Early stopping, best iteration is:
[256]	fit's multi_logloss: 0.000248029	val's multi_logl

- Train logloss: 0.001 (±0.001)
- Valid logloss: 0.072 (±0.028)

In [10]:
feature_importances.sort_values('gain_0', ascending=False).head()

Unnamed: 0,gain_0,split_0,gain_1,split_1,gain_2,split_2,gain_3,split_3,gain_4,split_4
flux_skew_2,3923.312595,245,3864.632253,263,3936.560117,226,3991.861857,192,3846.072873,248
flux_min_0_2,3275.177206,110,2515.655173,112,3731.89179,99,2308.091591,57,2628.702468,89
flux_max_1,1713.459198,46,1292.323942,55,1463.907245,39,1684.991553,43,1435.815768,72
flux_err_mean_1,1059.900253,126,1226.947777,102,823.756809,107,814.487945,64,1161.369968,93
flux_min_2_5,785.890369,90,598.553954,150,579.932184,122,529.984494,88,144.868597,99


## Extragalactic objects

Select the extragalactic objects.

In [11]:
X_train_ex = X_train[X_train['hostgal_photoz'] > 0]
y_train_ex = y_train[X_train['hostgal_photoz'] > 0]
X_test_ex = X_test[X_test['hostgal_photoz'] > 0]

class_to_int = {c: i for i, c in enumerate(y_train_ex.unique())}
int_to_class = {i: c for c, i in class_to_int.items()}

In [12]:
import lightgbm as lgbm
import numpy as np
from sklearn import model_selection


params = {
    'application': 'multiclass',
    'boosting_type': 'gbdt',
    'num_classes': y_train_ex.nunique(),
    'metric': 'multi_logloss',
    'num_threads': 8,
    'num_leaves': 2 ** 4,
    'min_data_per_group': 300,
    'max_cat_threshold': 32,
    'max_cat_to_onehot': 6,
    'cat_smooth': 30,
    'cat_l2': 10,
    'max_bin': 255,
    'min_data_in_bin': 20,
    'min_data_in_leaf': 100,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'feature_fraction_seed': 42,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'bagging_seed': 42,
    'lambda_l1': 1,
    'lambda_l2': 2,
    'verbosity': 2,
}
        

cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
feature_importances = pd.DataFrame(index=X_train_ex.columns)
ex_fit_scores = np.zeros(cv.n_splits)
ex_val_scores = np.zeros(cv.n_splits)
submission.loc[X_test_ex.index, y_train_ex.unique()] = 0.0

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train_ex, y_train_ex)):
    
    X_fit = X_train_ex.iloc[fit_idx]
    y_fit = y_train_ex.iloc[fit_idx].map(class_to_int)
    w_fit = y_train_ex.iloc[fit_idx].map(class_weights)
    X_val = X_train_ex.iloc[val_idx]
    y_val = y_train_ex.iloc[val_idx].map(class_to_int)
    w_val = y_train_ex.iloc[val_idx].map(class_weights)
    
    # Train the model
    fit_set = lgbm.Dataset(X_fit.values.astype(np.float32), y_fit, weight=w_fit)
    val_set = lgbm.Dataset(X_val.values.astype(np.float32), y_val, reference=fit_set, weight=w_val)
    
    evals_result = {}
    model = lgbm.train(
        params=params,
        train_set=fit_set,
        num_boost_round=10000,
        valid_sets=(fit_set, val_set),
        valid_names=('fit', 'val'),
        verbose_eval=50,
        early_stopping_rounds=50,
        evals_result=evals_result
    )
    
    # Store the feature importances
    feature_importances[f'gain_{i}'] = model.feature_importance('gain')
    feature_importances[f'split_{i}'] = model.feature_importance('split')
    
    # Store the predictions
    y_pred = pd.DataFrame(model.predict(X_test_ex.values.astype(np.float32)), index=X_test_ex.index)
    y_pred.columns = y_pred.columns.map(int_to_class)
    submission.loc[y_pred.index, y_pred.columns] += y_pred / cv.n_splits
    
    # Store the scores
    ex_fit_scores[i] = evals_result['fit']['multi_logloss'][-1]
    ex_val_scores[i] = evals_result['val']['multi_logloss'][-1]

print(f'- Train logloss: {ex_fit_scores.mean():.3f} (±{ex_fit_scores.std():.3f})')
print(f'- Valid logloss: {ex_val_scores.mean():.3f} (±{ex_val_scores.std():.3f})')

Training until validation scores don't improve for 50 rounds.
[50]	fit's multi_logloss: 0.855813	val's multi_logloss: 1.0364
[100]	fit's multi_logloss: 0.615097	val's multi_logloss: 0.906576
[150]	fit's multi_logloss: 0.484772	val's multi_logloss: 0.868299
[200]	fit's multi_logloss: 0.39619	val's multi_logloss: 0.853095
[250]	fit's multi_logloss: 0.330429	val's multi_logloss: 0.850368
[300]	fit's multi_logloss: 0.279836	val's multi_logloss: 0.848187
[350]	fit's multi_logloss: 0.239848	val's multi_logloss: 0.849843
Early stopping, best iteration is:
[323]	fit's multi_logloss: 0.260222	val's multi_logloss: 0.847664
Training until validation scores don't improve for 50 rounds.
[50]	fit's multi_logloss: 0.86565	val's multi_logloss: 0.985339
[100]	fit's multi_logloss: 0.622524	val's multi_logloss: 0.856409
[150]	fit's multi_logloss: 0.490689	val's multi_logloss: 0.81911
[200]	fit's multi_logloss: 0.401764	val's multi_logloss: 0.802765
[250]	fit's multi_logloss: 0.335043	val's multi_logloss:

- Train logloss: 0.228 (±0.022)
- Valid logloss: 0.839 (±0.035)

In [13]:
feature_importances.sort_values('gain_0', ascending=False).head()

Unnamed: 0,gain_0,split_0,gain_1,split_1,gain_2,split_2,gain_3,split_3,gain_4,split_4
distmod,5232.722202,553,4395.977741,512,5112.973466,539,6137.207058,508,5003.321872,461
hostgal_photoz,4423.98232,556,5319.86055,671,4676.300651,648,3706.277157,467,4649.583332,587
flux_min_1,3432.067098,176,3038.369926,148,2614.41882,169,3070.556864,154,3163.811519,123
flux_max_0_4,2505.937873,93,1695.758131,81,2352.623481,120,2312.764972,92,3119.292677,68
flux_mean_0_4,2318.959552,86,1465.185663,73,1367.641956,86,1616.38034,59,748.188956,69


## Class 99

In [14]:
submission['class_99'] = (1 - submission[submission.columns.drop('class_99')].max(axis='columns')) / 2

## Putting it all together

In [15]:
submission.head()

Unnamed: 0_level_0,class_15,class_16,class_42,class_52,class_53,class_6,class_62,class_64,class_65,class_67,class_88,class_90,class_92,class_95,class_99
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
13,0.000559,0.0,0.538966,0.032829,0.0,0.0,0.040014,5.3e-05,0.0,0.004167,0.000176,0.382606,0.0,0.000631,0.230517
14,0.010043,0.0,0.192385,0.028432,0.0,0.0,0.01703,0.001124,0.0,0.003811,0.003508,0.739839,0.0,0.003828,0.13008
17,0.002264,0.0,0.084542,0.01549,0.0,0.0,0.017709,0.00073,0.0,0.0037,0.001085,0.868628,0.0,0.005852,0.065686
23,0.000852,0.0,0.134251,0.010185,0.0,0.0,0.050753,0.000431,0.0,0.061123,0.00077,0.730264,0.0,0.01137,0.134868
34,0.001118,0.0,0.034112,0.063115,0.0,0.0,0.007006,7.4e-05,0.0,0.001586,8.3e-05,0.892646,0.0,0.000259,0.053677


Sanity checks.

In [16]:
assert submission[X_test['hostgal_photoz'] == 0][y_train_ex.unique().categories].sum().sum() == 0
assert submission[X_test['hostgal_photoz'] > 0][y_train_gal.unique().categories].sum().sum() == 0

Save the submission. We align with the sample submission just to make sure.

In [17]:
name = f'{gal_val_scores.mean():.3f}_{gal_val_scores.std():.3f}_{ex_val_scores.mean():.3f}_{ex_val_scores.std():.3f}'

sample_sub = pd.read_csv('data/sample_submission.csv').set_index('object_id')

submission.loc[sample_sub.index, sample_sub.columns].to_csv(f'submissions/{name}.csv.gz', compression='gzip')

In [48]:
from sklearn import metrics

metrics.log_loss(y_fit, y_pred)

1.2026580625061984

In [18]:
from sklearn import preprocessing


def weighted_log_loss(y_true, y_pred, class_weights, eps=10e-15):
    y_true = preprocessing.LabelBinarizer().fit_transform(y_true)
    
    # Normalize row-wise
    y_pred /= y_pred.sum(axis=1)[:, np.newaxis]
    
    # Limit 0s and 1s
    y_pred = np.clip(y_pred, eps, 1-eps)
    
    return -(y_true * np.log(y_pred)).sum(axis=1).mean()
    
    
class_weights = np.array([1] * 14)
weighted_log_loss(y_fit, y_pred, class_weights)

ValueError: Unable to coerce to DataFrame, shape must be (3102380, 9): given (3102380, 1)