# Feature extraction

**You only have to run the following cell once**.

In [1]:
import numpy as np
import pandas as pd

dtypes = {
    'object_id': np.uint32,
    'mjd': np.float32,
    'passband': np.uint8,
    'flux': np.float32,
    'flux_err': np.float32,
    'detected': bool
}
lcs = pd.concat(
    (
        pd.read_csv('data/training_set.csv', dtype=dtypes),
        pd.read_csv('data/test_set.csv', dtype=dtypes)
    ),
    sort=False,
    ignore_index=True
)
lcs.to_hdf('data/data.h5', 'light_curves')

dtypes = {
    'object_id': np.uint32
}
df = pd.concat(
    (
        pd.read_csv('data/training_set_metadata.csv', dtype=dtypes),
        pd.read_csv('data/test_set_metadata.csv', dtype=dtypes)
    ),
    sort=False,
    ignore_index=True
)
df['is_train'] = df['target'].notnull()
df.to_hdf('data/data.h5', 'meta')

KeyboardInterrupt: 

Load the data from the HDF5 files (it weights much less than the initial files).

In [3]:
import pandas as pd
import numpy as np

lcs = pd.read_hdf('data/data.h5', 'light_curves')
df = pd.read_hdf('data/data.h5', 'meta')

Parse the time.

In [2]:
def mjd_to_unix(mjd):
    return (mjd - 40587) * 86400

#lcs['mjd'] = pd.to_datetime(lcs['mjd'].apply(mjd_to_unix), unit='s')

Object/passband features.

In [17]:
import numpy as np

stats = pd.read_csv('data/features/flux_stats.csv')\
          .pivot(index='object_id', columns='passband')\
          .astype(np.float32)

# Collapse the column names
names = stats.columns.get_level_values(0)
passbands = stats.columns.get_level_values(1).astype(str)
stats.columns = ['_'.join(pair) for pair in zip(names, passbands)]

stats['flux_diff_min_0'].replace(np.inf, stats['flux_diff_min_0'][stats['flux_diff_min_0'] != np.inf].max(), inplace=True)

df = df.join(stats, on='object_id')

ValueError: columns overlap but no suffix specified: Index(['bfr_0', 'bfr_1', 'bfr_2', 'bfr_3', 'bfr_4', 'bfr_5',
       'flux_diff_kurtosis_0', 'flux_diff_kurtosis_1', 'flux_diff_kurtosis_2',
       'flux_diff_kurtosis_3',
       ...
       'flux_ptp_2', 'flux_ptp_3', 'flux_ptp_4', 'flux_ptp_5', 'flux_skew_0',
       'flux_skew_1', 'flux_skew_2', 'flux_skew_3', 'flux_skew_4',
       'flux_skew_5'],
      dtype='object', length=114)

Object features.

In [17]:
stats2 = pd.read_csv('data/features/flux_stats2.csv').set_index('object_id')
df = df.join(stats2, on='object_id')

See what we got.

In [18]:
df.head()

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,...,flux_ptp_4,flux_ptp_5,flux_skew_0,flux_skew_1,flux_skew_2,flux_skew_3,flux_skew_4,flux_skew_5,count,passband_n_unique
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,...,804.138245,801.003235,0.125827,0.404755,0.331063,0.285492,0.194883,0.121948,352,6
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,...,22.114735,28.98205,0.254446,-0.085494,-0.022066,-0.162664,-0.062403,0.212294,350,6
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,...,46.996292,66.469872,0.349431,0.457635,2.315707,2.584661,2.462542,1.63052,330,6
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,...,199.127579,151.76268,1.980815,6.817995,5.534683,3.650356,3.3826,3.083716,351,6
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,...,160.143936,120.018127,-0.324207,2.315295,2.995322,3.509344,3.802308,3.560348,352,6


# Learning

In [35]:
to_drop = ['is_train', 'target', 'hostgal_specz']

train = df.query('is_train == True').set_index('object_id')
test = df.query('is_train == False').set_index('object_id')

X_train = train.drop(columns=to_drop)
y_train = train['target'].apply(lambda x: f'class_{int(x)}').astype('category')
X_test = test.drop(columns=to_drop)
submission = pd.DataFrame(0.0, index=test.index, columns=y_train.cat.categories)
submission['class_99'] = 0.0

class_weights = {c: 1 for c in y_train.cat.categories}
class_weights['class_64'] = 2
class_weights['class_15'] = 2

In [36]:
assert len(X_train.columns) == len(X_test.columns)
assert len(X_train) == len(y_train)
assert len(X_train) == len(w_train)
assert len(X_test) == 3492890
assert len(submission) == 3492890

## Galactic objects

Select the galactic objects.

In [40]:
X_train_gal = X_train[X_train['hostgal_photoz'] == 0]
y_train_gal = y_train[X_train['hostgal_photoz'] == 0]
X_test_gal = X_test[X_test['hostgal_photoz'] == 0]

class_to_int = {c: i for i, c in enumerate(y_train_gal.unique())}
int_to_class = {i: c for c, i in class_to_int.items()}

Train the model.

In [42]:
import lightgbm as lgbm
import numpy as np
from sklearn import model_selection


params = {
    'application': 'multiclass',
    'boosting_type': 'gbdt',
    'num_classes': y_train_gal.nunique(),
    'metric': 'multi_logloss',
    'num_threads': 8,
    'num_leaves': 2 ** 3,
    'min_data_per_group': 300,
    'max_cat_threshold': 32,
    'max_cat_to_onehot': 6,
    'cat_smooth': 30,
    'cat_l2': 10,
    'max_bin': 255,
    'min_data_in_bin': 20,
    'min_data_in_leaf': 100,
    'learning_rate': 0.08,
    'feature_fraction': 1,
    'feature_fraction_seed': 42,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'bagging_seed': 42,
    'lambda_l1': 0.01,
    'lambda_l2': 0.001,
    'verbosity': 2,
}
        

cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
feature_importances = pd.DataFrame(index=X_train_gal.columns)
gal_fit_scores = np.zeros(cv.n_splits)
gal_val_scores = np.zeros(cv.n_splits)
submission.loc[X_test_gal.index, y_train_gal.unique()] = 0.0

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train_gal, y_train_gal)):
    
    X_fit = X_train_gal.iloc[fit_idx]
    y_fit = y_train_gal.iloc[fit_idx].map(class_to_int)
    w_fit = y_train_gal.iloc[fit_idx].map(class_weights)
    X_val = X_train_gal.iloc[val_idx]
    y_val = y_train_gal.iloc[val_idx].map(class_to_int)
    w_val = y_train_gal.iloc[val_idx].map(class_weights)
    
    # Train the model
    fit_set = lgbm.Dataset(X_fit, y_fit, weight=w_fit)
    val_set = lgbm.Dataset(X_val, y_val, reference=fit_set, weight=w_val)

    evals_result = {}
    model = lgbm.train(
        params=params,
        train_set=fit_set,
        num_boost_round=10000,
        valid_sets=(fit_set, val_set),
        valid_names=('fit', 'val'),
        verbose_eval=50,
        early_stopping_rounds=50,
        evals_result=evals_result
    )
    
    # Store the feature importances
    feature_importances[f'gain_{i}'] = model.feature_importance('gain')
    feature_importances[f'split_{i}'] = model.feature_importance('split')
    
    # Store the predictions
    y_pred = pd.DataFrame(model.predict(X_test_gal), index=X_test_gal.index)
    y_pred.columns = y_pred.columns.map(int_to_class)
    submission.loc[y_pred.index, y_pred.columns] += y_pred / cv.n_splits
    
    # Store the scores
    gal_fit_scores[i] = evals_result['fit']['multi_logloss'][-1]
    gal_val_scores[i] = evals_result['val']['multi_logloss'][-1]

print(f'- Train logloss: {gal_fit_scores.mean():.3f} (±{gal_fit_scores.std():.3f})')
print(f'- Valid logloss: {gal_val_scores.mean():.3f} (±{gal_val_scores.std():.3f})')

Training until validation scores don't improve for 50 rounds.
[50]	fit's multi_logloss: 0.124001	val's multi_logloss: 0.231071
[100]	fit's multi_logloss: 0.021045	val's multi_logloss: 0.133079
[150]	fit's multi_logloss: 0.00448423	val's multi_logloss: 0.118232
[200]	fit's multi_logloss: 0.00113722	val's multi_logloss: 0.121249
Early stopping, best iteration is:
[160]	fit's multi_logloss: 0.00335769	val's multi_logloss: 0.117226
Training until validation scores don't improve for 50 rounds.
[50]	fit's multi_logloss: 0.138861	val's multi_logloss: 0.140161
[100]	fit's multi_logloss: 0.0253423	val's multi_logloss: 0.0554612
[150]	fit's multi_logloss: 0.0059239	val's multi_logloss: 0.0446788
[200]	fit's multi_logloss: 0.00165123	val's multi_logloss: 0.0426712
Early stopping, best iteration is:
[186]	fit's multi_logloss: 0.00232408	val's multi_logloss: 0.0421209
Training until validation scores don't improve for 50 rounds.
[50]	fit's multi_logloss: 0.127218	val's multi_logloss: 0.193191
[100]

- Train logloss: 0.002 (±0.001)
- Valid logloss: 0.091 (±0.028)

In [43]:
feature_importances.sort_values('gain_0', ascending=False).head()

Unnamed: 0,gain_0,split_0,gain_1,split_1,gain_2,split_2,gain_3,split_3,gain_4,split_4
flux_skew_2,6268.3425,274,6185.21221,258,6429.423737,209,6223.756176,238,6167.066547,221
flux_min_2,4398.438483,77,3199.334359,95,3744.41771,91,4597.056659,89,4469.617465,102
flux_max_1,2673.007779,70,2307.7075,81,2585.771134,58,2324.724651,64,2345.841771,85
flux_err_mean_1,1813.655786,55,2452.766638,33,2260.552261,40,1094.681437,39,1411.550384,34
flux_skew_1,1103.994437,222,868.260065,231,890.892121,171,885.035262,165,870.010742,207


## Extragalactic objects

Select the extragalactic objects.

In [44]:
X_train_ex = X_train[X_train['hostgal_photoz'] > 0]
y_train_ex = y_train[X_train['hostgal_photoz'] > 0]
X_test_ex = X_test[X_test['hostgal_photoz'] > 0]

class_to_int = {c: i for i, c in enumerate(y_train_ex.unique())}
int_to_class = {i: c for c, i in class_to_int.items()}

In [45]:
import lightgbm as lgbm
import numpy as np
from sklearn import model_selection


params = {
    'application': 'multiclass',
    'boosting_type': 'gbdt',
    'num_classes': y_train_ex.nunique(),
    'metric': 'multi_logloss',
    'num_threads': 8,
    'num_leaves': 2 ** 4,
    'min_data_per_group': 300,
    'max_cat_threshold': 32,
    'max_cat_to_onehot': 6,
    'cat_smooth': 30,
    'cat_l2': 10,
    'max_bin': 255,
    'min_data_in_bin': 20,
    'min_data_in_leaf': 100,
    'learning_rate': 0.1,
    'feature_fraction': 0.7,
    'feature_fraction_seed': 42,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'bagging_seed': 42,
    'lambda_l1': 1,
    'lambda_l2': 2,
    'verbosity': 2,
}
        

cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
feature_importances = pd.DataFrame(index=X_train_ex.columns)
ex_fit_scores = np.zeros(cv.n_splits)
ex_val_scores = np.zeros(cv.n_splits)
submission.loc[X_test_ex.index, y_train_ex.unique()] = 0.0

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train_ex, y_train_ex)):
    
    X_fit = X_train_ex.iloc[fit_idx]
    y_fit = y_train_ex.iloc[fit_idx].map(class_to_int)
    w_fit = y_train_ex.iloc[fit_idx].map(class_weights)
    X_val = X_train_ex.iloc[val_idx]
    y_val = y_train_ex.iloc[val_idx].map(class_to_int)
    w_val = y_train_ex.iloc[val_idx].map(class_weights)
    
    # Train the model
    fit_set = lgbm.Dataset(X_fit, y_fit, weight=w_fit)
    val_set = lgbm.Dataset(X_val, y_val, reference=fit_set, weight=w_val)
    
    evals_result = {}
    model = lgbm.train(
        params=params,
        train_set=fit_set,
        num_boost_round=10000,
        valid_sets=(fit_set, val_set),
        valid_names=('fit', 'val'),
        verbose_eval=50,
        early_stopping_rounds=50,
        evals_result=evals_result
    )
    
    # Store the feature importances
    feature_importances[f'gain_{i}'] = model.feature_importance('gain')
    feature_importances[f'split_{i}'] = model.feature_importance('split')
    
    # Store the predictions
    y_pred = pd.DataFrame(model.predict(X_test_ex), index=X_test_ex.index)
    y_pred.columns = y_pred.columns.map(int_to_class)
    submission.loc[y_pred.index, y_pred.columns] += y_pred / cv.n_splits
    
    # Store the scores
    ex_fit_scores[i] = evals_result['fit']['multi_logloss'][-1]
    ex_val_scores[i] = evals_result['val']['multi_logloss'][-1]

print(f'- Train logloss: {ex_fit_scores.mean():.3f} (±{ex_fit_scores.std():.3f})')
print(f'- Valid logloss: {ex_val_scores.mean():.3f} (±{ex_val_scores.std():.3f})')

Training until validation scores don't improve for 50 rounds.
[50]	fit's multi_logloss: 0.643759	val's multi_logloss: 1.0054
[100]	fit's multi_logloss: 0.37406	val's multi_logloss: 0.937066
[150]	fit's multi_logloss: 0.243032	val's multi_logloss: 0.928959
Early stopping, best iteration is:
[141]	fit's multi_logloss: 0.261112	val's multi_logloss: 0.928419
Training until validation scores don't improve for 50 rounds.
[50]	fit's multi_logloss: 0.64593	val's multi_logloss: 0.965183
[100]	fit's multi_logloss: 0.376948	val's multi_logloss: 0.89567
[150]	fit's multi_logloss: 0.245991	val's multi_logloss: 0.893025
Early stopping, best iteration is:
[136]	fit's multi_logloss: 0.274855	val's multi_logloss: 0.889508
Training until validation scores don't improve for 50 rounds.
[50]	fit's multi_logloss: 0.649332	val's multi_logloss: 0.96035
[100]	fit's multi_logloss: 0.381577	val's multi_logloss: 0.886196
[150]	fit's multi_logloss: 0.250663	val's multi_logloss: 0.866362
[200]	fit's multi_logloss: 

- Train logloss: 0.163 (±0.013)
- Valid logloss: 0.870 (±0.030)

In [47]:
feature_importances.sort_values('gain_0', ascending=False).head()

Unnamed: 0,gain_0,split_0,gain_1,split_1,gain_2,split_2,gain_3,split_3,gain_4,split_4
hostgal_photoz,5032.842919,557,5096.438541,607,3807.106237,662,3865.95182,484,5714.119225,610
flux_mean_0,4904.933324,399,4818.442032,339,5150.897088,416,4569.698603,317,4515.560972,363
distmod,4112.976403,443,3997.78239,404,5305.585292,483,5424.734604,495,3682.956371,433
flux_min_1,4035.697156,251,3685.435178,212,3450.278714,331,3659.994542,258,3727.274163,235
flux_max_0,2649.276457,235,2155.395661,222,2224.260391,265,2437.38951,215,2739.504185,240


## Novelty detection

http://scikit-learn.org/stable/modules/outlier_detection.html

In [61]:
submission['class_99'] = 1 - submission[submission.columns.drop('class_99')].max(axis='columns')

## Putting it all together

In [51]:
submission.head()

Unnamed: 0_level_0,class_15,class_16,class_42,class_52,class_53,class_6,class_62,class_64,class_65,class_67,class_88,class_90,class_92,class_95,class_99
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
13,0.001499,0.0,0.415565,0.083925,0.0,0.0,0.022008,0.000227,0.0,0.00348,0.000508,0.471693,0.0,0.001096,0.047189
14,0.002266,0.0,0.059256,0.018871,0.0,0.0,0.045229,0.000887,0.0,0.005896,0.008468,0.85537,0.0,0.003757,0.606019
17,0.001606,0.0,0.02976,0.005006,0.0,0.0,0.018507,0.000812,0.0,0.023316,0.002987,0.911846,0.0,0.006159,0.763314
23,0.001093,0.0,0.092174,0.005154,0.0,0.0,0.018885,0.00103,0.0,0.042884,0.00162,0.830709,0.0,0.00645,0.549133
34,0.005132,0.0,0.149135,0.030653,0.0,0.0,0.013828,0.000539,0.0,0.005821,0.000302,0.793711,0.0,0.000879,0.029262


Sanity checks.

In [52]:
assert submission[X_test['hostgal_photoz'] == 0][y_train_ex.unique().categories].sum().sum() == 0
assert submission[X_test['hostgal_photoz'] > 0][y_train_gal.unique().categories].sum().sum() == 0

Save the submission. We align with the sample submission just to make sure.

In [60]:
name = f'{gal_val_scores.mean():.3f}_{ex_val_scores.mean():.3f}_{out_val_scores.mean():.3f}'

sample_sub = pd.read_csv('data/sample_submission.csv').set_index('object_id')

submission.loc[sample_sub.index, sample_sub.columns].to_csv(f'submissions/{name}.csv.gz', compression='gzip')

In [48]:
from sklearn import metrics

metrics.log_loss(y_fit, y_pred)

1.2026580625061984

In [18]:
from sklearn import preprocessing


def weighted_log_loss(y_true, y_pred, class_weights, eps=10e-15):
    y_true = preprocessing.LabelBinarizer().fit_transform(y_true)
    
    # Normalize row-wise
    y_pred /= y_pred.sum(axis=1)[:, np.newaxis]
    
    # Limit 0s and 1s
    y_pred = np.clip(y_pred, eps, 1-eps)
    
    return -(y_true * np.log(y_pred)).sum(axis=1).mean()
    
    
class_weights = np.array([1] * 14)
weighted_log_loss(y_fit, y_pred, class_weights)

ValueError: Unable to coerce to DataFrame, shape must be (3102380, 9): given (3102380, 1)