In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import gc
import os
import matplotlib.pyplot as plt
import seaborn as sns 
import lightgbm as lgb
from catboost import Pool, CatBoostClassifier
import itertools
import pickle, gzip
import glob
from sklearn.preprocessing import StandardScaler

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
DATA_DIR = '../../data/raw'

In [4]:
gc.enable() # включение garbage collector

train = pd.read_csv(os.path.join(DATA_DIR, 'training_set.csv')) # считывание тренировочных данных (кривые)
train['flux_ratio_sq'] = np.power(train['flux'] / train['flux_err'], 2.0)
train['flux_by_flux_ratio_sq'] = train['flux'] * train['flux_ratio_sq']

aggs = {
    'mjd': ['min', 'max', 'size'],
    'flux': ['min', 'max', 'mean', 'median', 'std','skew'],
    'flux_err': ['min', 'max', 'mean', 'median', 'std','skew'],
    'detected': ['mean'],
    'flux_ratio_sq':['sum','skew'],
    'flux_by_flux_ratio_sq':['sum','skew'],
}

agg_train = train.groupby(['object_id', 'passband']).agg(aggs)
new_columns = [
    k + '_' + agg for k in aggs.keys() for agg in aggs[k]
]
agg_train.columns = new_columns
agg_train['mjd_diff'] = agg_train['mjd_max'] - agg_train['mjd_min']
agg_train['flux_diff'] = agg_train['flux_max'] - agg_train['flux_min']
agg_train['flux_dif2'] = (agg_train['flux_max'] - agg_train['flux_min']) / agg_train['flux_mean']
agg_train['flux_w_mean'] = agg_train['flux_by_flux_ratio_sq_sum'] / agg_train['flux_ratio_sq_sum']
agg_train['flux_dif3'] = (agg_train['flux_max'] - agg_train['flux_min']) / agg_train['flux_w_mean']

del train
gc.collect() # сбор мусора

# convert from multindex df to single index
agg_train = agg_train.reset_index(level=[0,1])


In [5]:
features = agg_train.columns.drop(['passband', 'object_id'])

agg_train = agg_train.groupby('object_id')[features].apply(lambda x: pd.DataFrame(x.values)).unstack().reset_index()


In [6]:
agg_train.head()

Unnamed: 0_level_0,object_id,0,0,0,0,0,0,1,1,1,...,23,23,23,23,24,24,24,24,24,24
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1,2,3,4,5,0,1,2,...,2,3,4,5,0,1,2,3,4,5
0,615,59819.1532,59750.4306,59750.4229,59750.4383,59750.445,59752.4435,60617.0295,60624.1836,60624.176,...,-214.947803,-232.265606,-111.843477,-126.326197,-12.552744,-2.790325,-6.019338,-4.203729,-7.189854,-6.340753
1,713,59851.2006,59825.2676,59825.26,59825.2752,59825.2862,59825.2971,60674.0798,60668.0723,60668.0647,...,-3.821839,-4.894058,-4.179287,-4.90486,-4.110659,-4.106502,-5.38928,-4.847697,-5.291509,-5.908843
2,730,59818.274,59798.3281,59798.3205,59798.3357,59798.3466,59798.3576,60648.0642,60652.1365,60652.1289,...,16.797566,26.385068,33.303042,32.979248,216.22279,26.022646,1.419465,1.478408,1.411171,2.015506
3,745,59818.2219,59770.374,59770.3662,59770.3817,59770.3928,59770.4039,60620.1257,60624.0425,60624.0348,...,166.409065,138.122853,121.77942,74.94827,1.831855,1.115757,1.339801,1.507316,1.63515,2.024899
4,1124,59819.1532,59750.4306,59750.4229,59750.4383,59750.445,59752.4435,60617.0295,60624.1836,60624.176,...,75.730405,105.099501,115.267128,86.915714,7.993349,1.601824,1.436097,1.356987,1.389329,1.380856


In [7]:
new_columns = []

for feat in features:
    for i in range(0, 6):
        new_columns.append(feat + "_pb" + str(i))
        
new_columns = ['object_id'] + new_columns

In [8]:
agg_train.columns = new_columns
agg_train.head()

Unnamed: 0,object_id,mjd_min_pb0,mjd_min_pb1,mjd_min_pb2,mjd_min_pb3,mjd_min_pb4,mjd_min_pb5,mjd_max_pb0,mjd_max_pb1,mjd_max_pb2,...,flux_w_mean_pb2,flux_w_mean_pb3,flux_w_mean_pb4,flux_w_mean_pb5,flux_dif3_pb0,flux_dif3_pb1,flux_dif3_pb2,flux_dif3_pb3,flux_dif3_pb4,flux_dif3_pb5
0,615,59819.1532,59750.4306,59750.4229,59750.4383,59750.445,59752.4435,60617.0295,60624.1836,60624.176,...,-214.947803,-232.265606,-111.843477,-126.326197,-12.552744,-2.790325,-6.019338,-4.203729,-7.189854,-6.340753
1,713,59851.2006,59825.2676,59825.26,59825.2752,59825.2862,59825.2971,60674.0798,60668.0723,60668.0647,...,-3.821839,-4.894058,-4.179287,-4.90486,-4.110659,-4.106502,-5.38928,-4.847697,-5.291509,-5.908843
2,730,59818.274,59798.3281,59798.3205,59798.3357,59798.3466,59798.3576,60648.0642,60652.1365,60652.1289,...,16.797566,26.385068,33.303042,32.979248,216.22279,26.022646,1.419465,1.478408,1.411171,2.015506
3,745,59818.2219,59770.374,59770.3662,59770.3817,59770.3928,59770.4039,60620.1257,60624.0425,60624.0348,...,166.409065,138.122853,121.77942,74.94827,1.831855,1.115757,1.339801,1.507316,1.63515,2.024899
4,1124,59819.1532,59750.4306,59750.4229,59750.4383,59750.445,59752.4435,60617.0295,60624.1836,60624.176,...,75.730405,105.099501,115.267128,86.915714,7.993349,1.601824,1.436097,1.356987,1.389329,1.380856


In [9]:
meta_train = pd.read_csv(os.path.join(DATA_DIR, 'training_set_metadata.csv')) # считывание тренировочных данных (метадата)

full_train = agg_train.reset_index().merge(
    right=meta_train,
    how='outer',
    on='object_id'
)

if 'target' in full_train:
    y = full_train['target']
    del full_train['target']
classes = sorted(y.unique())

# Taken from Giba's topic : https://www.kaggle.com/titericz
# https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
# with Kyle Boone's post https://www.kaggle.com/kyleboone
# определение веса каждого класса
class_weight = {
    c: 1 for c in classes
}
for c in [64, 15]:
    class_weight[c] = 2

print('Unique classes : ', classes)

Unique classes :  [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]


In [10]:
meta_train.head()

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,0.007,88
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,0.021,42
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,0.007,90
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,0.024,90


In [11]:
if 'object_id' in full_train:
    oof_df = full_train[['object_id']]
    del full_train['object_id'], full_train['distmod'], full_train['hostgal_specz'] # удаление колонок
    del full_train['ra'], full_train['decl'], full_train['gal_l'], full_train['gal_b'], full_train['ddf'] # удаление колонок
    

# заполнение NaN
train_mean = full_train.mean(axis=0)
full_train.fillna(train_mean, inplace=True)

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1) # деление данных на фолды для кросс-валидации

In [12]:
full_train_new = full_train.copy()
ss = StandardScaler()
full_train_ss = ss.fit_transform(full_train_new)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
